In [92]:
# Essential Imports
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [93]:
# Imports for feature engineering
from sklearn.model_selection import train_test_split

# Preprocessors
from sklearn.preprocessing import OneHotEncoder 

from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling

## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [94]:
#Model 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [95]:
## Evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [96]:
# Importing the data
df = pd.read_csv(r"data/cleaned_flight_fare.csv")

In [97]:
df.drop("Unnamed: 0",axis=1,inplace=True)

In [98]:
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Weekdays,Time_Of_The_Day
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20:00,01:10:00,170,0.0,No info,3897,24,3,2019,Sunday,Night
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50:00,13:15:00,445,2.0,No info,7662,1,5,2019,Wednesday,Night
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → MUM → COK,09:25:00,04:25:00,1140,2.0,No info,13882,9,6,2019,Sunday,Morning
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05:00,23:30:00,325,1.0,No info,6218,12,5,2019,Sunday,Evening
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50:00,21:35:00,285,1.0,No info,13302,1,3,2019,Friday,Afternoon


In [105]:
# Splitting the data into train test split
X = df.drop("Price",axis=1)
y = df["Price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Feature Engineering Pipeline

In [106]:
# Creating categorical and numerical columns list
categorical_cols=X.select_dtypes(include='object').columns
numerical_cols=X.select_dtypes(exclude='object').columns

In [107]:
num_pipeline=Pipeline([
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(sparse=False,handle_unknown="ignore"))
])

preprocessor=ColumnTransformer(
    [
        ('num_pipeline',num_pipeline,numerical_cols),
        ('cat_pipeline',cat_pipeline,categorical_cols),
    ]
)

In [108]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

## Model Selection

In [109]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_regression_model(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    mae_train = mean_absolute_error(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(y_train, y_train_pred)

    mae_test = mean_absolute_error(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test, y_test_pred)

    evaluation_results = {
        'Training MAE': mae_train,
        # 'Training MSE': mse_train,
        'Training RMSE': rmse_train,
        'Training R-squared': r2_train,
        # 'Test MAE': mae_test,
        'Test MSE': mse_test,
        'Test RMSE': rmse_test,
        'Test R-squared': r2_test
    }

    return evaluation_results



In [110]:
def result_printer(result):
    for metric, value in result.items():
      print(f"{metric}: {value}")

### Checking the baseline performance

In [112]:
# Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_evaluation_results = evaluate_regression_model(linear_model, X_train, X_test, y_train, y_test)
print("Linear Regression Model Evaluation:")
result_printer(linear_evaluation_results)

Linear Regression Model Evaluation:
Training MAE: 1017.5067770009987
Training RMSE: 1609.3906482477855
Training R-squared: 0.8809296544577384
Test MSE: 7.811446109586316e+27
Test RMSE: 88382385742784.27
Test R-squared: -3.7875345710908015e+20


In [114]:
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
ridge_evaluation_results = evaluate_regression_model(ridge_model, X_train, X_test, y_train, y_test)
print("Ridge Model Evaluation")
result_printer(ridge_evaluation_results)

Ridge Model Evaluation
Training MAE: 1044.6745239231097
Training RMSE: 1657.525081856292
Training R-squared: 0.8737007170396457
Test MSE: 3870253.3478472685
Test RMSE: 1967.295948210962
Test R-squared: 0.8123430905340042


In [115]:
lasso_model = Lasso(alpha=1.0)  
lasso_model.fit(X_train, y_train)
lasso_evaluation_results = evaluate_regression_model(lasso_model, X_train, X_test, y_train, y_test)
print("Lasso Model Evaluation")
result_printer(lasso_evaluation_results)

Lasso Model Evaluation
Training MAE: 1113.319620516553
Training RMSE: 1725.3440247038254
Training R-squared: 0.8631540101403228
Test MSE: 4063980.4425462964
Test RMSE: 2015.9316562191032
Test R-squared: 0.802949848127466


In [116]:
decision_tree_model = DecisionTreeRegressor()
decision_tree_model.fit(X_train, y_train)

decision_tree_results =  evaluate_regression_model(decision_tree_model, X_train, X_test, y_train, y_test)
print("Decision Tree Regressor Model Evaluation:")
result_printer(decision_tree_results)

Decision Tree Regressor Model Evaluation:
Training MAE: 34.44918438198506
Training RMSE: 274.38710574984873
Training R-squared: 0.9965389484759885
Test MSE: 3432986.225078032
Test RMSE: 1852.8319473384606
Test R-squared: 0.8335448542158584


In [119]:
svr_model = SVR(kernel='sigmoid')
svr_model.fit(X_train, y_train)

svr_evaluation_results = evaluate_regression_model(svr_model, X_train, X_test, y_train, y_test)
print("Support Vector Regression (SVR) Model Evaluation:")
result_printer(svr_evaluation_results)

Support Vector Regression (SVR) Model Evaluation:
Training MAE: 3231.0456285881514
Training RMSE: 4385.430281633586
Training R-squared: 0.11589247038398354
Test MSE: 18009985.302055735
Test RMSE: 4243.817303095851
Test R-squared: 0.1267501433228756


In [120]:
# Random Forest Regressor model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the Random Forest Regressor model
rf_evaluation_results = evaluate_regression_model(rf_model, X_train, X_test, y_train, y_test)
print("Random Forest Regressor Model Evaluation:")
result_printer(rf_evaluation_results)

Random Forest Regressor Model Evaluation:
Training MAE: 241.45725273571168
Training RMSE: 637.3495533376487
Training R-squared: 0.9813260565771924
Test MSE: 2415764.0922070597
Test RMSE: 1554.272849987112
Test R-squared: 0.882866886790587


In [121]:
 #  K-Nearest Neighbors Regression model
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)

# Evaluate the model
knn_evaluation_results = evaluate_regression_model(knn_model, X_train, X_test, y_train, y_test)

print("K-Nearest Neighbors (KNN) Regression Model Evaluation:")
result_printer(knn_evaluation_results)

K-Nearest Neighbors (KNN) Regression Model Evaluation:
Training MAE: 854.6222570980169
Training RMSE: 1578.8351978415023
Training R-squared: 0.885408008611483
Test MSE: 3580453.4512945265
Test RMSE: 1892.2086172762574
Test R-squared: 0.8263946132801578


In [122]:
# AdaBoost
adaboost_model = AdaBoostRegressor(n_estimators=50, random_state=42)
adaboost_model.fit(X_train, y_train)

adaboost_evaluation_results = evaluate_regression_model(adaboost_model, X_train, X_test, y_train, y_test)
print("AdaBoost Model Evaluation:")
result_printer(decision_tree_results)

AdaBoost Model Evaluation:
Training MAE: 34.44918438198506
Training RMSE: 274.38710574984873
Training R-squared: 0.9965389484759885
Test MSE: 3432986.225078032
Test RMSE: 1852.8319473384606
Test R-squared: 0.8335448542158584


In [123]:
# Gradient Boosting
gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gradient_boosting_model.fit(X_train, y_train)

gradient_boostin_evaluation_results = evaluate_regression_model(gradient_boosting_model, X_train, X_test, y_train, y_test)
print("Gradient Boosting Model Evaluation:")
result_printer(gradient_boostin_evaluation_results)

Gradient Boosting Model Evaluation:
Training MAE: 1207.0001179612557
Training RMSE: 1749.275926701553
Training R-squared: 0.8593313543676896
Test MSE: 4076912.6858616103
Test RMSE: 2019.1366189194853
Test R-squared: 0.8023228026617307
