## Import Libraries

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## Data Preprocessing

In [9]:
# Flight fare prediction
# Preprocessing
# Load the raw training sheet, then discard incomplete rows and exact
# duplicate rows (the first occurrence of each duplicate is kept).
df = (
    pd.read_excel("Data_Train.xlsx")
    .dropna()
    .drop_duplicates(keep='first')
)

# Convert Dep_Time and Arrival_Time into hour and minute features.
# Each source column is parsed exactly once and the parsed Series is reused,
# instead of calling pd.to_datetime again for every derived feature.
# NOTE(review): no explicit `format` is passed for the two time columns, so
# parsing relies on pandas' format inference -- confirm the raw values are
# uniformly formatted.

# Dep_Time
dep_time = pd.to_datetime(df["Dep_Time"])
df["Dep_hour"] = dep_time.dt.hour
df["Dep_minute"] = dep_time.dt.minute
df = df.drop(columns="Dep_Time")

# Arrival_Time
arrival_time = pd.to_datetime(df["Arrival_Time"])
df["Arr_hour"] = arrival_time.dt.hour
df["Arr_minute"] = arrival_time.dt.minute
df = df.drop(columns="Arrival_Time")

# Date_of_Journey (day/month/year strings) -> year / month / day features
journey_date = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
# NOTE(review): the year may be constant across the dataset, in which case
# this feature carries no signal -- confirm before deciding to keep it.
df["Date_of_Journey_Year"] = journey_date.dt.year
df["Date_of_Journey_Month"] = journey_date.dt.month
df["Date_of_Journey_Day"] = journey_date.dt.day
df = df.drop(columns="Date_of_Journey")


# Convert Duration into a total number of minutes.
# Assumes values look like "2h 50m", "19h" or "5m" -- TODO confirm against
# the raw sheet. The hour and minute parts are extracted with regular
# expressions rather than rewriting the string into arithmetic and calling
# eval() on it: eval() executes whatever text is in the data file, raises
# SyntaxError on zero-padded numbers such as "05m", and silently produces a
# wrong value when the separating space is missing ("2h50m" -> "2*6050*1").
duration_text = df['Duration'].str.strip()

# Fail loudly on anything that is not "<n>h", "<n>m" or "<n>h <n>m" so that a
# malformed value cannot be turned into a bogus duration.
duration_is_valid = (
    duration_text.str.fullmatch(r'(?:\d+\s*h)?\s*(?:\d+\s*m)?')
    & duration_text.ne('')
)
if not duration_is_valid.all():
    raise ValueError(
        "Unrecognised Duration value(s): "
        f"{duration_text[~duration_is_valid].unique()[:5].tolist()}"
    )

# A missing part (e.g. no minutes in "19h") is extracted as NaN -> counts as 0.
duration_hours = pd.to_numeric(
    duration_text.str.extract(r'(\d+)\s*h', expand=False)
).fillna(0).astype('int64')
duration_minutes = pd.to_numeric(
    duration_text.str.extract(r'(\d+)\s*m', expand=False)
).fillna(0).astype('int64')

df['Duration'] = duration_hours * 60 + duration_minutes

# Encode categorical attributes

# One-hot encode the nominal columns; drop_first=True leaves out the first
# level of each column so it acts as the baseline category.
categorical_columns = ['Airline', 'Source', 'Destination']
df_categorical = pd.get_dummies(df[categorical_columns], drop_first=True)
df = df.drop(columns=categorical_columns)

# Replace the stop-count labels with their integer value.
stops_to_int = {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}
df['Total_Stops'] = df['Total_Stops'].map(stops_to_int)

# 'Price' is the target variable: keep it aside ...
target = df['Price']

# ... and build the feature matrix from the remaining columns plus the
# one-hot columns, without 'Price'.
preprocessed = pd.concat([df, df_categorical], axis=1).drop(columns=['Price'])

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
data_train, data_test, target_train, target_test = train_test_split(
    preprocessed,
    target,
    test_size=0.3,
    random_state=42,
)

## Regression Tree

In [10]:
# Preview the first five rows of each split: features first, then targets.
display(
    data_train.head(),
    data_test.head(),
    target_train.head(),
    target_test.head(),
)

Unnamed: 0,Route,Duration,Total_Stops,Additional_Info,Dep_hour,Dep_minute,Arr_hour,Arr_minute,Date_of_Journey_Year,Date_of_Journey_Month,...,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
5050,CCU → GAU → IMF → DEL → BLR,1830,3,No info,16,45,23,15,2019,4,...,0,0,0,1,0,0,0,0,0,0
4879,DEL → BOM → COK,480,1,No info,13,0,21,0,2019,6,...,0,0,1,0,0,1,0,0,0,0
208,CCU → BOM → BLR,570,1,In-flight meal not included,14,5,23,35,2019,5,...,0,0,0,1,0,0,0,0,0,0
8561,DEL → BOM → COK,930,1,No info,10,0,1,30,2019,6,...,0,0,1,0,0,1,0,0,0,0
7265,CCU → BLR,165,0,No info,22,10,0,55,2019,3,...,0,0,0,1,0,0,0,0,0,0


Unnamed: 0,Route,Duration,Total_Stops,Additional_Info,Dep_hour,Dep_minute,Arr_hour,Arr_minute,Date_of_Journey_Year,Date_of_Journey_Month,...,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
2150,BLR → BOM → DEL,1455,1,No info,8,0,8,15,2019,3,...,0,0,0,0,0,0,0,0,0,1
3784,CCU → BLR,140,0,No info,22,20,0,40,2019,6,...,0,0,0,1,0,0,0,0,0,0
714,CCU → BLR,170,0,No info,5,30,8,20,2019,3,...,0,0,0,1,0,0,0,0,0,0
7558,BOM → HYD,90,0,In-flight meal not included,15,50,17,20,2019,3,...,0,0,0,0,1,0,0,1,0,0
7413,BLR → DEL,170,0,No info,9,30,12,20,2019,4,...,0,0,0,0,0,0,1,0,0,0


5050     8607
4879    13587
208     10844
8561    13377
7265     4148
Name: Price, dtype: int64

2150    17996
3784     3873
714      4462
7558     2228
7413     4991
Name: Price, dtype: int64

- Drop `Route` and `Additional_Info`: both are still raw text columns and are not used as model features.

In [13]:
# Remove the two remaining text columns from both feature splits.
# The frames are rebound instead of mutated in place, and errors='ignore'
# makes the cell safe to execute more than once: an in-place drop raises
# KeyError on a second run because the columns are already gone, and
# mutating the frames returned by train_test_split in place may also emit
# SettingWithCopyWarning.
text_columns = ['Route', 'Additional_Info']
data_train = data_train.drop(columns=text_columns, errors='ignore')
data_test = data_test.drop(columns=text_columns, errors='ignore')

<bound method DataFrame.info of       Duration  Total_Stops  Dep_hour  Dep_minute  Arr_hour  Arr_minute  \
2150      1455            1         8           0         8          15   
3784       140            0        22          20         0          40   
714        170            0         5          30         8          20   
7558        90            0        15          50        17          20   
7413       170            0         9          30        12          20   
...        ...          ...       ...         ...       ...         ...   
4600       300            1        14           0        19           0   
5310      1115            2        13           5         7          40   
1956      1795            2        17           0        22          55   
2330       175            0         7          10        10           5   
2311      1375            2         5          30         4          25   

      Date_of_Journey_Year  Date_of_Journey_Month  Date_of_Journey_

In [29]:
# Build the decision-tree regressor (seeded so results are repeatable) and
# fit it on the training split; fit() returns the fitted estimator itself.
model = DecisionTreeRegressor(random_state=0).fit(data_train, target_train)

# Predictions for the held-out rows
y_pred = model.predict(data_test)

# Error metrics on the test split
mean_squared_error_regression_tree = mean_squared_error(
    y_true=target_test, y_pred=y_pred)
r2_score_regression_tree = r2_score(y_true=target_test, y_pred=y_pred)

print('MSE:', mean_squared_error_regression_tree)
print('R Squared:', r2_score_regression_tree)

# R squared on the training split, for comparison with the test-split value
training_r2 = model.score(data_train, target_train)
print(training_r2)

MSE: 6569644.150244681
R Squared: 0.6836740393021428
0.9741256432975037
