## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

## Data Preprocessing

In [2]:
# Flight fare prediction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Preprocessing
# Load the raw training data, then discard incomplete and duplicated rows.
df = pd.read_excel("Data_Train.xlsx")
df = df.dropna().drop_duplicates(keep='first')

# Split Dep_Time and Arrival_Time into hour and minute features.
# Each column is parsed once and the parsed values are reused for both parts.
# NOTE(review): no explicit format is passed, so pandas infers it from the
# data -- confirm every value in these columns parses the way you expect.
dep_time = pd.to_datetime(df["Dep_Time"])
df["Dep_hour"] = dep_time.dt.hour
df["Dep_minute"] = dep_time.dt.minute
df = df.drop(columns="Dep_Time")

arrival_time = pd.to_datetime(df["Arrival_Time"])
df["Arr_hour"] = arrival_time.dt.hour
df["Arr_minute"] = arrival_time.dt.minute
df = df.drop(columns="Arrival_Time")

# Split Date_of_Journey (day/month/year text) into year, month and day.
# ps. I don't think "year" is mandatory
journey_date = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
df["Date_of_Journey_Year"] = journey_date.dt.year
df["Date_of_Journey_Month"] = journey_date.dt.month
df["Date_of_Journey_Day"] = journey_date.dt.day
df = df.drop(columns="Date_of_Journey")

# Convert Duration (text such as "2h 50m", "19h" or "45m") into total minutes.
# The hour and minute parts are pulled out with regular expressions instead of
# rewriting the text into an arithmetic expression and passing it to eval():
# eval() would execute whatever a spreadsheet cell happens to contain.
# A missing part (no "h" or no "m") counts as 0.
duration_hours = pd.to_numeric(
    df['Duration'].str.extract(r'(\d+)\s*h', expand=False)).fillna(0)
duration_minutes = pd.to_numeric(
    df['Duration'].str.extract(r'(\d+)\s*m', expand=False)).fillna(0)
df['Duration'] = (duration_hours * 60 + duration_minutes).astype('int64')

# Additional_Info is not used as a feature.
df = df.drop(columns=['Additional_Info'])

# Encode categorical attributes
# One-hot encode the nominal columns (the first level of each is dropped) and
# remove the original text columns from df.
categorical_columns = ['Airline', 'Source', 'Destination']
df_categorical = pd.get_dummies(df[categorical_columns], drop_first=True)
df = df.drop(columns=categorical_columns)

# Total_Stops is ordinal, so map its labels straight to the number of stops.
# Labels missing from this mapping become NaN.
df['Total_Stops'] = df['Total_Stops'].map(
    {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4})

# 'Price' is the target variable: keep it aside and leave it out of the
# feature matrix built from the numerical and one-hot encoded columns.
target = df['Price']
preprocessed = pd.concat([df, df_categorical], axis=1).drop(columns=['Price'])

# Train_Test_Split: hold out 30% of the rows; the fixed seed keeps the split
# reproducible. The Route column is still present in both feature splits here.
data_train, data_test, target_train, target_test = train_test_split(
    preprocessed, target, test_size=0.3, random_state=42)

- Drop the `Route` column from the training and test features

In [3]:
# Remove the Route column from both feature splits; the preprocessing cell
# never converts it to a numeric feature.
# Rebinding (instead of inplace=True) avoids mutating the objects returned by
# train_test_split, and errors='ignore' lets this cell be re-run safely after
# Route has already been removed.
data_train = data_train.drop(columns=['Route'], errors='ignore')
data_test = data_test.drop(columns=['Route'], errors='ignore')

## Regression Tree

In [4]:
# Preview the first five rows of each split: features first, then targets.
for split_preview in (data_train, data_test, target_train, target_test):
    display(split_preview.head(5))

Unnamed: 0,Duration,Total_Stops,Dep_hour,Dep_minute,Arr_hour,Arr_minute,Date_of_Journey_Year,Date_of_Journey_Month,Date_of_Journey_Day,Airline_Air India,...,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
5050,1830,3,16,45,23,15,2019,4,1,1,...,0,0,0,1,0,0,0,0,0,0
4879,480,1,13,0,21,0,2019,6,1,0,...,0,0,1,0,0,1,0,0,0,0
208,570,1,14,5,23,35,2019,5,21,0,...,0,0,0,1,0,0,0,0,0,0
8561,930,1,10,0,1,30,2019,6,9,0,...,0,0,1,0,0,1,0,0,0,0
7265,165,0,22,10,0,55,2019,3,21,0,...,0,0,0,1,0,0,0,0,0,0


Unnamed: 0,Duration,Total_Stops,Dep_hour,Dep_minute,Arr_hour,Arr_minute,Date_of_Journey_Year,Date_of_Journey_Month,Date_of_Journey_Day,Airline_Air India,...,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
2150,1455,1,8,0,8,15,2019,3,6,0,...,0,0,0,0,0,0,0,0,0,1
3784,140,0,22,20,0,40,2019,6,6,0,...,0,0,0,1,0,0,0,0,0,0
714,170,0,5,30,8,20,2019,3,18,0,...,0,0,0,1,0,0,0,0,0,0
7558,90,0,15,50,17,20,2019,3,24,0,...,0,0,0,0,1,0,0,1,0,0
7413,170,0,9,30,12,20,2019,4,27,0,...,0,0,0,0,0,0,1,0,0,0


5050     8607
4879    13587
208     10844
8561    13377
7265     4148
Name: Price, dtype: int64

2150    17996
3784     3873
714      4462
7558     2228
7413     4991
Name: Price, dtype: int64

In [5]:
# Baseline model: a decision tree regressor with default hyperparameters
# (seeded for reproducibility), trained on the training split.
rg_tree = DecisionTreeRegressor(random_state=0).fit(data_train, target_train)

'''
#use cross-validation to estimate model performance
cross_regression_tree = cross_val_score(rg_tree, data_train, target_train, cv=10, scoring='r2')
print('Average Cross-Validation R-squared score: ', cross_regression_tree.mean())
'''

# Predict prices for the held-out test rows.
y_pred = rg_tree.predict(data_test)

# Test-set evaluation: coefficient of determination first ...
r2_regression_tree = r2_score(y_true=target_test, y_pred=y_pred)
print('R-Squared:', r2_regression_tree)

# ... then mean squared error.
mean_squared_error_regression_tree = mean_squared_error(
    y_true=target_test, y_pred=y_pred)
print('MSE:', mean_squared_error_regression_tree)

R-Squared: 0.6836740393021428
MSE: 6569644.150244681


## Hyperparameter Tuning