## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import copy
import datetime

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
import math

import warnings
warnings.filterwarnings('ignore')

## Data Preprocessing

In [11]:
# Flight fare prediction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import datetime

def CheckWeekend(year, month, day):
    convertDate = datetime.date(year, month, day)
    convertDateResult = convertDate.weekday()
    if convertDateResult <= 4:
        return False
    return True

# Preprocessing
# import dataset
df = pd.read_excel("Data_Train.xlsx")

# drop null value
df.dropna(inplace=True)

# drop duplicates
df.drop_duplicates(keep='first', inplace=True)

# convert Dep_Time and Arrival_Time into hour and minute
# Dep_Time
df["Dep_hour"] = pd.to_datetime(df["Dep_Time"]).dt.hour
df["Dep_minute"] = pd.to_datetime(df["Dep_Time"]).dt.minute
df = df.drop(columns="Dep_Time")

# Arrival_Time
df["Arr_hour"] = pd.to_datetime(df["Arrival_Time"]).dt.hour
df["Arr_minute"] = pd.to_datetime(df["Arrival_Time"]).dt.minute
df = df.drop(columns="Arrival_Time")

# convert Date_of_Journey into timestamp
df["Date_of_Journey_Year"] = pd.to_datetime(
    df["Date_of_Journey"], format="%d/%m/%Y").dt.year
# ps. I don't think "year" is mandatory
df["Date_of_Journey_Month"] = pd.to_datetime(
    df["Date_of_Journey"], format="%d/%m/%Y").dt.month
df["Date_of_Journey_Day"] = pd.to_datetime(
    df["Date_of_Journey"], format="%d/%m/%Y").dt.day
df = df.drop(columns="Date_of_Journey")


# convert Duration into minute
df['Duration'] = df['Duration']\
    .str.replace("h", '*60')\
    .str.replace(' ', '+')\
    .str.replace('m', '*1')\
    .apply(eval)

# drop Additional_Info 
df.drop(['Additional_Info'], axis=1, inplace=True)

# drop Route
df.drop(['Route'], axis=1, inplace=True) 

# Encode categorical attributes
# Select categorical data
df_categorical = df[['Airline', 'Source', 'Destination']]
df.drop(['Airline', 'Source', 'Destination'], axis=1, inplace=True)


# Encode
df_categorical = pd.get_dummies(df_categorical, drop_first=True)
df['Total_Stops'] = df['Total_Stops'].map({"non-stop":0, "1 stop": 1, "2 stops": 2, "3 stops":3, "4 stops": 4})

# Concat categorical and numerical data
preprocessed = pd.concat([df, df_categorical], axis=1)
target = df['Price']

# Drop 'Price' because price is the target variable
preprocessed.drop(['Price'], axis=1, inplace=True)

# Train_Test_Split
data_train, data_test, target_train, target_test = train_test_split(
    preprocessed, target,test_size=0.3, random_state=42)

## Feature Engineering - check if date is weekday or weekend
data_train["IsWeekend"] = data_train.apply(lambda x: 1 if CheckWeekend(x["Date_of_Journey_Year"], x["Date_of_Journey_Month"], x["Date_of_Journey_Day"]) else 0, axis=1)
data_test["IsWeekend"] = data_test.apply(lambda x: 1 if CheckWeekend(x["Date_of_Journey_Year"], x["Date_of_Journey_Month"], x["Date_of_Journey_Day"]) else 0, axis=1)


In [14]:
'''
scaler = MinMaxScaler()
data_train[['Duration']] = scaler.fit_transform(data_train[['Duration']])
data_test[['Duration']] = scaler.fit_transform(data_test[['Duration']])
'''

"\nscaler = MinMaxScaler()\ndata_train[['Duration']] = scaler.fit_transform(data_train[['Duration']])\ndata_test[['Duration']] = scaler.fit_transform(data_test[['Duration']])\n"

## Regression Tree

In [15]:
# Initializing the Decision Tree Regression model
rg_tree = DecisionTreeRegressor(random_state = 42)


#use cross-validation to estimate model performance
cross_regression_tree = cross_val_score(rg_tree, data_train, target_train, cv=10, scoring='r2')
print('Average Cross-Validation R-squared score: ', cross_regression_tree.mean())


# Fitting the Decision Tree Regression model to the data
rg_tree.fit(data_train, target_train)

# Predicting the target values of the test set
y_pred = rg_tree.predict(data_test)


#Evaluate the model on the testing set
#r_squared
r2_regression_tree = r2_score(target_test, y_pred)
print('R-Squared:', r2_regression_tree)

#MSE
mean_squared_error_regression_tree = mean_squared_error(target_test, y_pred)
print('MSE:', mean_squared_error_regression_tree)

Average Cross-Validation R-squared score:  0.6883334583334569
R-Squared: 0.6903080744088547
MSE: 6431864.595777582


## Hyperparameter Tuning

In [16]:
# Using RandomizedSearchCV for hyperparameter tuning

params = {'max_features': ['auto', 'sqrt'],
          'max_depth':[int(x) for x in np.linspace(start = 1, stop = 20, num = 20)],
          'min_samples_leaf':[int(x) for x in np.linspace(start = 1, stop = 20, num = 20)],
          'min_samples_split':[int(x) for x in np.linspace(start = 1, stop = 20, num = 20)]
         }

RS_regression_tree = RandomizedSearchCV(random_state = 42, estimator=rg_tree,param_distributions=params,cv=5,n_iter=10,n_jobs=1, verbose=True, scoring='r2')
RS_regression_tree.fit(data_train, target_train)
print('Best parameters: ',RS_regression_tree.best_params_)
print('Best R-sqaured score: ',RS_regression_tree.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters:  {'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 15}
Best R-sqaured score:  0.7030393085336621


In [17]:
rg_tree = DecisionTreeRegressor(random_state = 42, min_samples_split = 5, min_samples_leaf = 5, max_features = 'auto', max_depth = 15)

#use cross-validation to estimate model performance
cross_regression_tree = cross_val_score(rg_tree, data_train, target_train, cv=10, scoring='r2')
print('Average Cross-Validation R-squared score: ', cross_regression_tree.mean())

# Fitting the Decision Tree Regression model to the data
rg_tree.fit(data_train, target_train)

# Predicting the target values of the test set
y_pred = rg_tree.predict(data_test)


#Evaluate the model on the testing set
#r_squared
r2_regression_tree = r2_score(target_test, y_pred)
print('R-Squared:', r2_regression_tree)

#MSE
mean_squared_error_regression_tree = mean_squared_error(target_test, y_pred)
print('MSE:', mean_squared_error_regression_tree)

#RMSE
root_mean_squared_error_regression_tree = math.sqrt(mean_squared_error_regression_tree)
print('RMSE:', root_mean_squared_error_regression_tree)

Average Cross-Validation R-squared score:  0.7260194936813701
R-Squared: 0.7308717527849882
MSE: 5589414.195031904
RMSE: 2364.194195710645
