FLIGHT TICKET PRICE PREDICTION - Approach



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Read the labelled training workbook and the unlabelled test workbook.
train_data, test_data = (
    pd.read_excel(workbook) for workbook in ('Data_Train.xlsx', 'Test_Set.xlsx')
)

In [None]:
# Stack train and test rows into one frame so every cleaning/encoding step
# below is applied to both; the rows are separated again later by whether
# 'Price' is present.
combined_df = pd.concat((train_data, test_data), ignore_index=True)

# Checking and imputing null values

In [None]:
combined_df.isnull().sum()

In [None]:
combined_df[combined_df['Route'].isnull()].head()

In [None]:
combined_df[(combined_df['Source'] == 'Delhi') &
                           (combined_df['Destination'] == 'Cochin') & 
                           (combined_df['Price']== 7480.0)].head(5)

In [None]:
# Impute the missing Route / Total_Stops values with the itinerary of the
# comparable Delhi -> Cochin fare looked up in the previous cell.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which pandas deprecates and which
# silently leaves combined_df unchanged once copy-on-write is active.
combined_df['Route'] = combined_df['Route'].fillna('DEL → MAA → COK')
combined_df['Total_Stops'] = combined_df['Total_Stops'].fillna('1 stop')

# Data Cleaning

In [None]:
# 'Arrival_Time' may look like "<time> <day> <month>": tokenise it once, keep the
# first token as the clock time and join the next two as the arrival date.
# Rows with no date suffix end up with a missing Arrival_Date.
arrival_tokens = combined_df['Arrival_Time'].str.split(' ')
combined_df['Arrival_Date'] = arrival_tokens.str[1] + ' ' + arrival_tokens.str[2]
combined_df['Arrival_Time'] = arrival_tokens.str[0]

In [None]:
# Parse the two date columns with their explicit formats. The arrival format
# carries no year component; the year is forced to 2019 further down.
date_formats = {'Date_of_Journey': '%d/%m/%Y', 'Arrival_Date': '%d %b'}
for date_col, date_fmt in date_formats.items():
    combined_df[date_col] = pd.to_datetime(combined_df[date_col], format=date_fmt)

In [None]:
# Calendar day and month of departure as separate numeric features.
journey_parts = combined_df['Date_of_Journey'].dt
combined_df['Journey_Day'] = journey_parts.day
combined_df['Journey_Month'] = journey_parts.month

In [None]:
# Where no arrival date was given, use the journey date (same-day arrival).
missing_arrival = combined_df['Arrival_Date'].isnull()
combined_df.loc[missing_arrival, 'Arrival_Date'] = combined_df.loc[missing_arrival, 'Date_of_Journey']

In [None]:
# Calendar day and month of arrival, cast to plain integers.
arrival_parts = combined_df['Arrival_Date'].dt
combined_df['Arrival_Day'] = arrival_parts.day.astype(int)
combined_df['Arrival_Month'] = arrival_parts.month.astype(int)

In [None]:
combined_df[combined_df['Arrival_Day'] < combined_df['Journey_Day']]

In [None]:
# Bar chart of how many flights depart on each journey date.
journey_counts = combined_df['Date_of_Journey'].value_counts().sort_index()
journey_dates = journey_counts.index
tick_labels = journey_dates.strftime('%Y-%m-%d')

plt.figure(figsize=(10, 5))
plt.bar(journey_dates, journey_counts.values, color='blue')

# One tick per date, rotated so the labels do not overlap.
plt.xticks(journey_dates, tick_labels, rotation=45)
plt.xlabel('Date_of_Journey')
plt.ylabel('Number of Flights')
plt.title('Number of Flights by Date_of_Journey')

plt.tight_layout()
plt.show()


In [None]:
# # Swap dates where Arrival date is earlier than Journey date
# condition = combined_df['Arrival_Day'] < combined_df['Journey_Day']
# combined_df.loc[condition, 'Arrival_Day'] = combined_df.loc[condition, 'Journey_Day']

In [None]:
# combined_df.drop(['Arrival_Date','Date_of_Journey'], axis=1, inplace=True)

In [None]:
combined_df['Total_Stops'].unique()

In [None]:
# Encode the textual stop count as an integer (labels listed in ascending order,
# so each label's position is its numeric value). Unlisted labels map to NaN.
stop_labels = ['non-stop', '1 stop', '2 stops', '3 stops', '4 stops']
stops_mapping = dict(zip(stop_labels, range(len(stop_labels))))
combined_df['Total_Stops'] = combined_df['Total_Stops'].map(stops_mapping)

combined_df['Additional_Info'].unique()

In [None]:
# Merge the two spellings of the "no information" category into one.
info_aliases = {'No info': 'No Info'}
combined_df['Additional_Info'] = combined_df['Additional_Info'].replace(info_aliases)

In [None]:
# Convert both clock-time columns to datetime so they can be subtracted.
# NOTE(review): the inputs appear to be time-only strings, so the date part of
# the result is a placeholder chosen by pandas, not the journey date.
for time_col in ('Dep_Time', 'Arrival_Time'):
    combined_df[time_col] = pd.to_datetime(combined_df[time_col])


In [None]:
# combined_df.head()

In [None]:
# Elapsed time implied by the two clock times alone (negative when the arrival
# clock time is earlier than the departure clock time).
combined_df['Diff_Duration'] = combined_df['Arrival_Time'].sub(combined_df['Dep_Time'])

In [None]:
# Parse the stated flight duration into a timedelta so it can be compared with
# the clock-time difference.
stated_duration = combined_df['Duration']
combined_df['Duration'] = pd.to_timedelta(stated_duration)

In [None]:
# combined_df.head(100)

In [None]:
combined_df[(combined_df['Duration'] != combined_df['Diff_Duration'])]

In [None]:
# Reconcile rows whose clock-time difference disagrees with the stated Duration
# (e.g. flights that land on a later calendar day than they depart).
condition = combined_df['Duration'] != combined_df['Diff_Duration']
rows_to_update = combined_df[condition]

# Arrival expressed on the same placeholder date as Dep_Time; only its offset
# from Dep_Time is meaningful.
new_arrival_time = rows_to_update['Dep_Time'] + rows_to_update['Duration']

# Dep_Time carries a placeholder date rather than the journey date, so the day
# of month of new_arrival_time is unrelated to the real arrival day. Work out
# how many midnights the flight crosses and add that to Date_of_Journey.
days_elapsed = new_arrival_time.dt.normalize() - rows_to_update['Dep_Time'].dt.normalize()
actual_arrival_date = rows_to_update['Date_of_Journey'] + days_elapsed

combined_df.loc[condition, 'Arrival_Day'] = actual_arrival_date.dt.day
combined_df.loc[condition, 'Arrival_Time'] = new_arrival_time
# Recompute so the follow-up check shows whether any mismatch remains.
combined_df['Diff_Duration'] = combined_df['Arrival_Time'] - combined_df['Dep_Time']

In [None]:
combined_df[(combined_df['Duration'] != combined_df['Diff_Duration'])]

In [None]:
# The helper column was only needed for the duration consistency check.
combined_df.drop(columns=['Diff_Duration'], inplace=True)

In [None]:
combined_df.head()

In [None]:
combined_df.info()

In [None]:
# Extract hour and minute from the departure and arrival datetimes.
for leg in ('Dep', 'Arrival'):
    clock = combined_df[f'{leg}_Time'].dt
    combined_df[f'{leg}_Hour'] = clock.hour
    combined_df[f'{leg}_Minute'] = clock.minute

# Replace the timedelta Duration with its length in minutes.
elapsed = combined_df['Arrival_Time'] - combined_df['Dep_Time']
combined_df['Duration'] = elapsed.dt.total_seconds() / 60

# The raw datetime columns are no longer needed once the parts are extracted.
combined_df.drop(columns=['Dep_Time', 'Arrival_Time'], inplace=True)


In [None]:
# Selecting columns to plot Boxplot
numerical_columns = ['Duration', 'Price']

# Lay the plots out two per row. Round the row count up so an odd number of
# columns still gets an axis (floor division would leave the last one without
# a slot and raise IndexError).
plots_per_row = 2
n_rows = (len(numerical_columns) + plots_per_row - 1) // plots_per_row
fig, axes = plt.subplots(nrows=n_rows, ncols=plots_per_row, figsize=(14, 3 * n_rows), squeeze=False)
axes = axes.flatten()
for i, col in enumerate(numerical_columns):
    sns.boxplot(x=combined_df[col], ax=axes[i])
    axes[i].set_title(f'Boxplot of {col}')
    axes[i].set_xlabel(col)

# Hide any trailing axis that has no column assigned to it.
for spare_ax in axes[len(numerical_columns):]:
    spare_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
combined_df['Additional_Info'].value_counts()

In [None]:
def _cabin_class(airline_name):
    """Return the cabin label embedded in an airline name, or '0' if none."""
    # 'Premium economy' is checked before 'Business', matching the original order.
    for cabin in ('Premium economy', 'Business'):
        if cabin in airline_name:
            return cabin
    return '0'


combined_df['airline_speciality'] = combined_df['Airline'].apply(_cabin_class)

In [None]:
# Remove the cabin suffix from the airline name now that it lives in its own column.
for cabin_suffix in (' Premium economy', ' Business'):
    combined_df['Airline'] = combined_df['Airline'].str.replace(cabin_suffix, '')

In [None]:
# 0 when the ticket explicitly excludes the meal, 1 for every other value.
meal_excluded = combined_df['Additional_Info'] == 'In-flight meal not included'
combined_df['In-flight_meal'] = np.where(meal_excluded, 0, 1)


In [None]:
# 1 when Additional_Info mentions a layover (case-insensitive), else 0.
# The flag was previously inverted (0 for rows that DO mention a layover),
# which contradicted both the column name and the In-flight_meal convention
# where 1 means "present".
mentions_layover = combined_df['Additional_Info'].astype(str).str.contains('layover', case=False)
combined_df['layover'] = np.where(mentions_layover, 1, 0)


In [None]:
# combined_df[combined_df['Additional_Info'].str.contains('layover', case=False, na=False)]

In [None]:
combined_df.head()

In [None]:
# Per-column overview: distinct-value count, dtype, minimum and maximum.
header = f'Column_name\t unique_values \t {"DataType".rjust(10)} \t {"Minimum".rjust(15)} \t {"Maximum".rjust(10)} '
print(header)
for col_name in combined_df.columns:
    series = combined_df[col_name]
    cells = (
        col_name.center(20),
        str(len(series.unique())).rjust(5),
        str(series.dtype).rjust(10),
        str(series.min()).rjust(15),
        str(series.max()).rjust(10),
    )
    print(f'{cells[0]} {cells[1]} \t {cells[2]} \t {cells[3]} \t {cells[4]}')

In [None]:
# Whole days between each journey date and a fixed reference date.
reference_date = pd.to_datetime('1/1/2020')
time_to_reference = reference_date - combined_df['Date_of_Journey']
combined_df['day_diff'] = time_to_reference.dt.days

In [None]:
def _force_year_2019(stamp):
    """Return the timestamp unchanged if it is in 2019, else moved to 2019."""
    return stamp if stamp.year == 2019 else stamp.replace(year=2019)


combined_df['Arrival_Date'] = combined_df['Arrival_Date'].apply(_force_year_2019)


In [None]:
import matplotlib.pyplot as plt

# Histogram of the raw ticket price.
hist_style = {'bins': 50, 'color': 'skyblue', 'edgecolor': 'black'}

plt.figure(figsize=(10, 6))
plt.hist(combined_df['Price'], **hist_style)
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.title('Distribution of Flight Prices')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Model target: log(1 + Price). The inverse (expm1) is applied to the final
# predictions before they are written out.
combined_df['Log_Price'] = np.log1p(combined_df['Price'])

# Histogram of the transformed target, for comparison with the raw price.
log_hist_style = {'bins': 50, 'color': 'skyblue', 'edgecolor': 'black'}

plt.figure(figsize=(10, 6))
plt.hist(combined_df['Log_Price'], **log_hist_style)
plt.xlabel('Log(Price + 1)')
plt.ylabel('Frequency')
plt.title('Distribution of Log-Transformed Flight Prices')
plt.show()


In [None]:
combined_df.head()

In [None]:
# Each '→' in Route is one flight leg, so the number of layovers (intermediate
# airports) is the arrow count minus one; e.g. 'DEL → MAA → COK' has two legs
# and one layover. Clip at zero so a route without any arrow does not go negative.
route_legs = combined_df['Route'].str.count('→')
combined_df['Num_Layovers'] = (route_legs - 1).clip(lower=0)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode every object-typed column. The single encoder is refit for
# each column, so after the loop it only holds the classes of the last one.
le = LabelEncoder()
categorical_columns = list(combined_df.select_dtypes(include=['object']).columns)
print(categorical_columns)
for cat_col in categorical_columns:
    encoded_values = le.fit_transform(combined_df[cat_col])
    combined_df[cat_col] = encoded_values

In [None]:
# Annotated heatmap of pairwise correlations between all columns.
correlation_matrix = combined_df.corr()
heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'fmt': ".2f", 'linewidths': .5}

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Remove two features from the modelling frame.
combined_df.drop(columns=['Total_Stops', 'Arrival_Month'], inplace=True)

In [None]:
# Move the Price column to the last position.
price_column = combined_df['Price']
del combined_df['Price']
combined_df.insert(len(combined_df.columns), 'Price', price_column)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardise the feature columns, leaving both target columns untouched.
# Only int64/float64 columns are picked up; columns of any other dtype
# (including other integer widths) are left unscaled.
# NOTE(review): the scaler is fitted on train and test rows together.
features_to_scale = list(combined_df.select_dtypes(include=['int64', 'float64']).columns)
for target_col in ('Price', 'Log_Price'):
    features_to_scale.remove(target_col)

scaler = StandardScaler()
scaled_values = scaler.fit_transform(combined_df[features_to_scale])
combined_df[features_to_scale] = scaled_values


In [None]:
# The raw datetime columns are dropped; their day/month parts remain as features.
combined_df.drop(columns=['Date_of_Journey', 'Arrival_Date'], inplace=True)

In [None]:
# Separate the frame again: rows with a Price are training data, rows without
# one are the test set to predict.
has_price = combined_df['Price'].notnull()
train_data = combined_df[has_price]
test_data = combined_df[~has_price]

In [None]:
# The test rows have no target values; keep only the feature columns.
test_data = test_data.drop(['Price', 'Log_Price'], axis=1)

In [None]:
test_data.head()

In [None]:
train_data.head()

In [None]:
# Target: the log-transformed price. Features: everything except both price columns.
y = train_data['Log_Price']
X = train_data.drop(['Price', 'Log_Price'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
# 85/15 hold-out split for the baseline model comparison. The seed is fixed,
# like every other split in this notebook, so the comparison is reproducible
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [None]:


# Baseline comparison: fit every candidate with (near-)default settings and
# rank them by hold-out MSE on the log-price target.
from sklearn.metrics import r2_score

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Support Vector Machine': SVR(),
    'k-Nearest Neighbors': KNeighborsRegressor(),
    'Neural Network': MLPRegressor(max_iter=3000),
    'Gaussian Process': GaussianProcessRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor(verbose=0),
    'CatBoost': CatBoostRegressor(verbose=0),
}

# Train the models and evaluate.
# best_score is initialised up front so the summary line cannot hit an unbound
# name when no model produces a finite MSE.
best_model = None
best_score = None
best_mse = float('inf')

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # R^2 computed from the predictions already in hand; model.score() would
    # run a second full predict pass to obtain the same number.
    score = r2_score(y_test, y_pred)

    print(f"Mean Squared Error ({name}): {mse} , Score : {score}")

    if mse < best_mse:
        best_mse = mse
        best_score = score
        best_model = model

print(f"The best model is {type(best_model).__name__} with an MSE of {best_mse} and score {best_score}")


In [None]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Fixed-seed 80/20 hold-out split used to score every XGBoost trial.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# List of (mse, params) tuples, one appended per trial by objective_xgboost;
# the three best entries are read from the front once the study has finished.
top_params_xgboost = []

# Define the objective function for XGBoost optimization
def objective_xgboost(trial):
    """Optuna objective: fit an XGBoost regressor and return its hold-out MSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on ``X_test`` / ``y_test``.

    Side effects:
        Appends ``(mse, params)`` to the module-level ``top_params_xgboost``
        list and keeps that list sorted by ascending MSE.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 5000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    model = XGBRegressor(**params, objective='reg:squarederror', verbosity=0)

    # Fit on the training split and score on the hold-out split.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)

    # Sort on the MSE only: plain tuple ordering falls through to comparing the
    # params dicts when two trials tie, and dicts do not support '<'.
    top_params_xgboost.append((mse, params))
    top_params_xgboost.sort(key=lambda entry: entry[0])

    return mse

# Minimise the hold-out MSE over 150 trials of the XGBoost objective.
study_xgboost = optuna.create_study(direction='minimize')
study_xgboost.optimize(objective_xgboost, n_trials=150)

# Report the three lowest-MSE parameter sets recorded by the objective.
print("Top 3 Best Parameters for XGBoost:")
for rank, (trial_mse, trial_params) in enumerate(top_params_xgboost[:3], start=1):
    print(f"  Rank {rank}: Mean Squared Error = {trial_mse}, Parameters = {trial_params}")


In [None]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

# Fixed-seed 80/20 hold-out split used to score every LightGBM trial.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# List of (mse, params) tuples, one appended per trial by objective_lightgbm;
# the three best entries are read from the front once the study has finished.
top_params_lightgbm = []

# Define the objective function for LightGBM optimization
def objective_lightgbm(trial):
    """Optuna objective: fit a LightGBM regressor and return its hold-out MSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on ``X_test`` / ``y_test``.

    Side effects:
        Appends ``(mse, params)`` to the module-level ``top_params_lightgbm``
        list and keeps that list sorted by ascending MSE.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 5000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.35),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; with the default of 0 the tuned 'subsample' has no effect.
        # Stored in params so downstream models rebuilt from it behave the same.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    model = LGBMRegressor(**params, verbose=-1)

    # Fit on the training split and score on the hold-out split.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)

    # Sort on the MSE only: plain tuple ordering falls through to comparing the
    # params dicts when two trials tie, and dicts do not support '<'.
    top_params_lightgbm.append((mse, params))
    top_params_lightgbm.sort(key=lambda entry: entry[0])

    return mse

# Minimise the hold-out MSE over 150 trials of the LightGBM objective.
study_lightgbm = optuna.create_study(direction='minimize')
study_lightgbm.optimize(objective_lightgbm, n_trials=150)

# Report the three lowest-MSE parameter sets recorded by the objective.
print("Top 3 Best Parameters for LightGBM:")
for rank, (trial_mse, trial_params) in enumerate(top_params_lightgbm[:3], start=1):
    print(f"  Rank {rank}: Mean Squared Error = {trial_mse}, Parameters = {trial_params}")


In [None]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

# Fixed-seed 80/20 hold-out split used to score every CatBoost trial.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# List of (mse, params) tuples, one appended per trial by objective_catboost;
# the three best entries are read from the front once the study has finished.
top_params_catboost = []

# Define the objective function for CatBoost optimization
def objective_catboost(trial):
    """Optuna objective: fit a CatBoost regressor and return its hold-out MSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on ``X_test`` / ``y_test``.

    Side effects:
        Appends ``(mse, params)`` to the module-level ``top_params_catboost``
        list and keeps that list sorted by ascending MSE.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 50, 5000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'random_strength': trial.suggest_float('random_strength', 0.1, 1.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1.0),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10.0),
    }
    model = CatBoostRegressor(**params, verbose=0)

    # Fit on the training split and score on the hold-out split.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)

    # Sort on the MSE only: plain tuple ordering falls through to comparing the
    # params dicts when two trials tie, and dicts do not support '<'.
    top_params_catboost.append((mse, params))
    top_params_catboost.sort(key=lambda entry: entry[0])

    return mse

# Minimise the hold-out MSE over 150 trials of the CatBoost objective.
study_catboost = optuna.create_study(direction='minimize')
study_catboost.optimize(objective_catboost, n_trials=150)

# Report the three lowest-MSE parameter sets recorded by the objective.
print("Top 3 Best Parameters for CatBoost:")
for rank, (trial_mse, trial_params) in enumerate(top_params_catboost[:3], start=1):
    print(f"  Rank {rank}: Mean Squared Error = {trial_mse}, Parameters = {trial_params}")


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import optuna

# Fixed-seed 80/20 hold-out split used to score every Random Forest trial.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# List of (mse, params) tuples, one appended per trial by
# objective_random_forest; the three best entries are read from the front once
# the study has finished.
top_params_random_forest = []

# Define the objective function for Random Forest optimization
def objective_random_forest(trial):
    """Optuna objective: fit a random forest and return its hold-out MSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on ``X_test`` / ``y_test``.

    Side effects:
        Appends ``(mse, params)`` to the module-level
        ``top_params_random_forest`` list and keeps that list sorted by
        ascending MSE.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'criterion': trial.suggest_categorical('criterion', ['squared_error', 'absolute_error']),
        'max_samples': trial.suggest_float('max_samples', 0.1, 1.0, step=0.1),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5, step=0.1),
    }
    model = RandomForestRegressor(**params, random_state=42)

    # Fit on the training split and score on the hold-out split.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)

    # Sort on the MSE only. With a fixed seed and a discretised search space a
    # repeated configuration yields an identical MSE, and plain tuple ordering
    # would then try to compare the params dicts, which do not support '<'.
    top_params_random_forest.append((mse, params))
    top_params_random_forest.sort(key=lambda entry: entry[0])

    return mse

# Minimise the hold-out MSE over 150 trials of the Random Forest objective.
study_random_forest = optuna.create_study(direction='minimize')
study_random_forest.optimize(objective_random_forest, n_trials=150)

# Report the three lowest-MSE parameter sets recorded by the objective.
print("Top 3 Best Parameters for Random Forest:")
for rank, (trial_mse, trial_params) in enumerate(top_params_random_forest[:3], start=1):
    print(f"  Rank {rank}: Mean Squared Error = {trial_mse}, Parameters = {trial_params}")


In [None]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

# Relies on the feature matrix X and target y built earlier in the notebook.

# Fixed-seed 80/20 hold-out split used to score every KNN trial.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# List of (mse, params) tuples, one appended per trial by objective_knn;
# the three best entries are read from the front once the study has finished.
top_params_knn = []

# Define the objective function for KNN optimization
def objective_knn(trial):
    """Optuna objective: fit a k-NN regressor and return its hold-out MSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on ``X_test`` / ``y_test``.

    Side effects:
        Appends ``(mse, params)`` to the module-level ``top_params_knn`` list
        and keeps that list sorted by ascending MSE.
    """
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 200),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        # Minkowski power (1 = Manhattan, 2 = Euclidean). Previously sampled
        # into a one-element tuple by a stray trailing comma and never used.
        'p': trial.suggest_int('p', 1, 2),
    }
    # leaf_size is only sampled for the tree-based search structures.
    if params['algorithm'] in ('ball_tree', 'kd_tree'):
        params['leaf_size'] = trial.suggest_int('leaf_size', 1, 100)

    # Every sampled value is passed to the model and recorded, so the search
    # actually explores p / leaf_size and the best settings can be rebuilt
    # later with KNeighborsRegressor(**params).
    model = KNeighborsRegressor(**params)

    # Fit on the training split and score on the hold-out split.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)

    top_params_knn.append((mse, params))
    top_params_knn.sort(key=lambda entry: entry[0])

    return mse

# Minimise the hold-out MSE over 150 trials of the KNN objective.
study_knn = optuna.create_study(direction='minimize')
study_knn.optimize(objective_knn, n_trials=150)

# Report the three lowest-MSE parameter sets recorded by the objective.
print("Top 3 Best Parameters for KNN:")
for rank, (trial_mse, trial_params) in enumerate(top_params_knn[:3], start=1):
    print(f"  Rank {rank}: Mean Squared Error = {trial_mse}, Parameters = {trial_params}")


In [None]:
def _best_three(ranked_trials):
    """Return the params dicts of the first three (mse, params) entries."""
    return [entry[1] for entry in ranked_trials[:3]]


# Keep only the parameter dictionaries of the three best trials per model family.
best_params_catboost = _best_three(top_params_catboost)
best_params_xgboost = _best_three(top_params_xgboost)
best_params_lightgbm = _best_three(top_params_lightgbm)
best_params_random_forest = _best_three(top_params_random_forest)
best_params_knn = _best_three(top_params_knn)


In [None]:

# One entry per model family: (base name, ranked params, constructor).
# The best configuration keeps the bare name; the runners-up get suffix 1 and 2.
model_families = (
    ('CatBoost', best_params_catboost,
     lambda p: CatBoostRegressor(**p, verbose=0)),
    ('XGBoost', best_params_xgboost,
     lambda p: XGBRegressor(**p, objective='reg:squarederror', verbosity=0)),
    ('LightGBM', best_params_lightgbm,
     lambda p: LGBMRegressor(**p, verbose=-1)),
    ('RandomForest', best_params_random_forest,
     lambda p: RandomForestRegressor(**p, random_state=42)),
    ('knn', best_params_knn,
     lambda p: KNeighborsRegressor(**p)),
)

models = {}
for family_name, ranked_params, build_model in model_families:
    for rank in range(3):
        model_key = family_name if rank == 0 else f'{family_name}{rank}'
        models[model_key] = build_model(ranked_params[rank])


In [None]:
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


# Two ensemble strategies built from the same tuned base models: stacking with
# a Ridge meta-learner (5-fold) and plain prediction averaging.
base_estimators = list(models.items())
ensembles = {
    'Stacking': StackingRegressor(estimators=base_estimators, final_estimator=Ridge(), cv=5),
    'Voting': VotingRegressor(estimators=base_estimators),
}

# Same fixed-seed 80/20 hold-out split as used during tuning.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Fit each ensemble and report its hold-out MSE.
for ensemble_name, ensemble_model in ensembles.items():
    ensemble_model.fit(X_train, y_train)
    y_pred = ensemble_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error ({ensemble_name}): {mse}")


In [None]:
# Fit every tuned model on the training split, keep its hold-out predictions
# and report its individual MSE.
model_predictions = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_predictions[model_name] = predictions
    mse = mean_squared_error(y_test, predictions)
    print(f'mean squared error - {model_name}: {mse}')

# Unweighted mean of all model predictions as a simple ensemble.
sum_prediction = sum(model_predictions.values())
Ensemble_pred = sum_prediction / len(models)
Ensemble_mse = mean_squared_error(y_test, Ensemble_pred)
print(f'mean squared error - Ensemble: {Ensemble_mse}')

In [None]:
import itertools

# Alias the per-model hold-out predictions (same dict object, not a copy).
prediction_sets = model_predictions

# Names of the models whose predictions can be combined.
prediction_set_names = [*prediction_sets]

# Define a function to compute the average of a list of prediction sets
def compute_average(prediction_sets_list):
    """Return the element-wise mean of the given prediction sets.

    Args:
        prediction_sets_list: non-empty list of numbers or equally shaped
            arrays.

    Raises:
        ZeroDivisionError: if the list is empty.
    """
    running_total = 0
    for prediction_set in prediction_sets_list:
        running_total = running_total + prediction_set
    return running_total / len(prediction_sets_list)

# Exhaustively score the plain average of every non-empty subset of models and
# remember the subset with the lowest hold-out MSE.
# best_combo is initialised so the summary line cannot hit an unbound name.
# NOTE(review): the subset is chosen on the same hold-out split used for
# tuning, so best_combo_mse is an optimistic estimate.
best_combo = None
best_combo_mse = float('inf')
for r in range(1, len(prediction_set_names) + 1):
    for combo in itertools.combinations(prediction_set_names, r):
        combination_prediction_sets = [prediction_sets[member] for member in combo]
        average_prediction = compute_average(combination_prediction_sets)
        Ensemble_mse = mean_squared_error(y_test, average_prediction)
        if Ensemble_mse < best_combo_mse:
            best_combo_mse = Ensemble_mse
            best_combo = combo
print(f'\nbest combo: {best_combo} \t mse {best_combo_mse}')

In [None]:
# Refit each member of the best averaging subset on all labelled rows and
# average their predictions for the unlabelled test set.
sum_prediction = 0
final_model_predictions = {}
for model_name, model in models.items():
    if model_name not in best_combo:
        continue
    model.fit(X, y)
    predictions = model.predict(test_data)
    # Store in the dict created for the final predictions. These were being
    # written into model_predictions, mixing test-set predictions with the
    # hold-out predictions and leaving final_model_predictions empty.
    final_model_predictions[model_name + '_pred'] = predictions
    sum_prediction = predictions + sum_prediction

Ensemble_pred = sum_prediction / len(best_combo)


In [None]:
Ensemble_pred

In [None]:
# Invert the log1p transform so the predictions are back in price units.
predicted_prices = np.expm1(Ensemble_pred)
predictions_df = pd.Series(predicted_prices, name='Price').to_frame()

# Show the predictions and write them to an Excel workbook (no index column).
print(predictions_df)
predictions_df.to_excel("log_predictions.xlsx", index=False)