In [211]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error


In [212]:
df_data_train=pd.read_csv("Data_train.csv")

In [213]:
df_data_train

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU ? BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU ? BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR ? DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR ? DEL,11:30,14:10,2h 40m,non-stop,No info,12648


In [214]:
df_data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [215]:
df_data_train.describe()

Unnamed: 0,Price
count,10683.0
mean,9087.064121
std,4611.359167
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [216]:
df_data_train.duplicated().sum()

220

In [217]:
df_data_train.drop_duplicates(inplace=True)

In [218]:
(df_data_train.isna().sum()/df_data_train.shape[0])*100

Airline            0.000000
Date_of_Journey    0.000000
Source             0.000000
Destination        0.000000
Route              0.009557
Dep_Time           0.000000
Arrival_Time       0.000000
Duration           0.000000
Total_Stops        0.009557
Additional_Info    0.000000
Price              0.000000
dtype: float64

In [219]:
route_mode= df_data_train["Route"].mode()[0]
df_data_train["Route"].fillna(route_mode,inplace=True)

In [220]:
total_Stops_mode= df_data_train["Total_Stops"].mode()[0]
df_data_train["Total_Stops"].fillna(total_Stops_mode,inplace=True)

In [221]:
df_data_train.shape

(10463, 11)

## Feature Engineering- 
- manipulating existing columns to create a new one by use of domain knowledge

In [222]:
df_data_train["Date_of_Journey"]=pd.to_datetime(df_data_train["Date_of_Journey"], format="%d/%m/%Y")

In [223]:
df_data_train["month_of_journey"]=df_data_train["Date_of_Journey"].dt.month
df_data_train["days_of_journey"]=df_data_train["Date_of_Journey"].dt.day_name()
df_data_train["year_of_journey"]=df_data_train["Date_of_Journey"].dt.year

In [224]:
df_data_train

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month_of_journey,days_of_journey,year_of_journey
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,Sunday,2019
1,Air India,2019-05-01,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,5,Wednesday,2019
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,6,Sunday,2019
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218,5,Sunday,2019
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,3,Friday,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,2019-04-09,Kolkata,Banglore,CCU ? BLR,19:55,22:25,2h 30m,non-stop,No info,4107,4,Tuesday,2019
10679,Air India,2019-04-27,Kolkata,Banglore,CCU ? BLR,20:45,23:20,2h 35m,non-stop,No info,4145,4,Saturday,2019
10680,Jet Airways,2019-04-27,Banglore,Delhi,BLR ? DEL,08:20,11:20,3h,non-stop,No info,7229,4,Saturday,2019
10681,Vistara,2019-03-01,Banglore,New Delhi,BLR ? DEL,11:30,14:10,2h 40m,non-stop,No info,12648,3,Friday,2019


In [225]:
df_data_train.drop(columns=["Date_of_Journey", "Dep_Time", "Arrival_Time","Route" ], inplace=True)

In [226]:
def extract(x):
    hour=0
    minute =0
    x=x.split()
    if len(x)==1:
        if x[0].endswith("h"):
            hour=int (x[0].strip("h"))
        else:
            minute=int (x[0].strip("m"))
    else:
        hour=int (x[0].strip("h"))
        minute=int (x[1].strip("m"))
        
    return hour, minute
        

In [227]:
hour=df_data_train["Duration"].apply(lambda p:extract(p)[0])*60
minute=df_data_train["Duration"].apply(lambda p:extract(p)[1])
df_data_train["duration_in_minute"]=hour+minute

In [228]:
df_data_train.drop(columns=["Duration" ], inplace=True)

In [229]:
# def split_duration(Duration):
#     hours = 0
#     minutes = 0
#     if "h" in Duration:
#         hours = int(Duration.split('h')[0].strip())
#         if "m" in Duration:
#             minutes = int(Duration.split('h')[1].strip(' m'))
#     return hours, minutes
# df_data_train["Hours"], df_data_train["Minutes"] = zip(*df_data_train["Duration"].apply(split_duration))


In [230]:
df_data_train["Airline"].value_counts()

Jet Airways                          3700
IndiGo                               2043
Air India                            1695
Multiple carriers                    1196
SpiceJet                              815
Vistara                               478
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: Airline, dtype: int64

In [231]:
df_data_train["Airline"].replace(["Jet Airways Business", "Multiple carriers Premium economy", "Vistara Premium economy"],
                                ["Jet Airways","Multiple carriers", "Vistara"], inplace=True)

In [232]:
X_dummies=df_data_train_d=pd.get_dummies(df_data_train, columns=["Airline","Source","Destination","Total_Stops","Additional_Info"]
                               ,drop_first=True)

In [233]:
le=LabelEncoder()
df_data_train_d["days_of_journey"]=le.fit_transform(df_data_train_d["days_of_journey"])

In [234]:
df_data_train_d

Unnamed: 0,Price,month_of_journey,days_of_journey,year_of_journey,duration_in_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Multiple carriers,...,Total_Stops_non-stop,Additional_Info_1 Short layover,Additional_Info_2 Long layover,Additional_Info_Business class,Additional_Info_Change airports,Additional_Info_In-flight meal not included,Additional_Info_No Info,Additional_Info_No check-in baggage included,Additional_Info_No info,Additional_Info_Red-eye flight
0,3897,3,3,2019,170,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,7662,5,6,2019,445,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,13882,6,3,2019,1140,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,6218,5,3,2019,325,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,13302,3,0,2019,285,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,4107,4,5,2019,150,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
10679,4145,4,2,2019,155,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
10680,7229,4,2,2019,180,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
10681,12648,3,0,2019,160,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [235]:
df_data_train_d.to_csv("clean_train.csv", index=False)

In [236]:
X= df_data_train_d.copy()
y=X.pop("Price")

In [237]:
X

Unnamed: 0,month_of_journey,days_of_journey,year_of_journey,duration_in_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Multiple carriers,Airline_SpiceJet,...,Total_Stops_non-stop,Additional_Info_1 Short layover,Additional_Info_2 Long layover,Additional_Info_Business class,Additional_Info_Change airports,Additional_Info_In-flight meal not included,Additional_Info_No Info,Additional_Info_No check-in baggage included,Additional_Info_No info,Additional_Info_Red-eye flight
0,3,3,2019,170,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,5,6,2019,445,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,6,3,2019,1140,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,5,3,2019,325,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,0,2019,285,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,4,5,2019,150,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
10679,4,2,2019,155,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
10680,4,2,2019,180,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
10681,3,0,2019,160,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [238]:
x_train, x_test, y_train, y_test= train_test_split(X_dummies, y, test_size=0.2, random_state=42)

In [239]:
dt=DecisionTreeClassifier(random_state=42)
model_1= dt.fit(x_train, y_train ) 

In [240]:
le=LabelEncoder()
model_2= dt.fit(x_train, y_train ) 

In [241]:
abr=AdaBoostRegressor(random_state=42)
model_3= dt.fit(x_train, y_train ) 

In [244]:
prediction_1=model_1.predict(x_test)
eval_1=mean_absolute_error(y_test, prediction_1)
eval_1_m=mean_squared_error(y_test, prediction_1)
eval_1_r=r2_score(y_test, prediction_1)

print(f"the mean absolute error by {dt} is {eval_1}\n")
print(f"the mean squared error by {dt} is {eval_1}\n")
print(f"the r2 score by {dt} is {eval_1}\n")

the mean absolute error by DecisionTreeClassifier(random_state=42) is 89.45819397993311

the mean squared error by DecisionTreeClassifier(random_state=42) is 89.45819397993311

the r2 score by DecisionTreeClassifier(random_state=42) is 89.45819397993311



In [246]:
prediction_2=model_1.predict(x_test)
eval_2=mean_absolute_error(y_test, prediction_2)
eval_2_m=mean_squared_error(y_test, prediction_2)
eval_2_r=r2_score(y_test, prediction_2)

print(f"the mean absolute error by {le} is {eval_2}\n")
print(f"the mean squared error by {le} is {eval_2}\n")
print(f"the r2 score by {le} is {eval_2}\n")

the mean absolute error by LabelEncoder() is 89.45819397993311

the mean squared error by LabelEncoder() is 89.45819397993311

the r2 score by LabelEncoder() is 89.45819397993311



In [247]:
prediction_3=model_1.predict(x_test)
eval_3=mean_absolute_error(y_test, prediction_3)
eval_3_m=mean_squared_error(y_test, prediction_3)
eval_3_r=r2_score(y_test, prediction_3)

print(f"the mean absolute error by {abr} is {eval_3}\n")
print(f"the mean squared error by {abr} is {eval_3}\n")
print(f"the r2 score by {abr} is {eval_3}\n")

the mean absolute error by AdaBoostRegressor(random_state=42) is 89.45819397993311

the mean squared error by AdaBoostRegressor(random_state=42) is 89.45819397993311

the r2 score by AdaBoostRegressor(random_state=42) is 89.45819397993311



In [248]:
df_data_test=pd.read_csv("Test_set.csv")

In [249]:
df_data_test

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL ? BOM ? COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? MAA ? BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL ? BOM ? COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL ? BOM ? COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR ? DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info
...,...,...,...,...,...,...,...,...,...,...
2666,Air India,6/06/2019,Kolkata,Banglore,CCU ? DEL ? BLR,20:30,20:25 07 Jun,23h 55m,1 stop,No info
2667,IndiGo,27/03/2019,Kolkata,Banglore,CCU ? BLR,14:20,16:55,2h 35m,non-stop,No info
2668,Jet Airways,6/03/2019,Delhi,Cochin,DEL ? BOM ? COK,21:50,04:25 07 Mar,6h 35m,1 stop,No info
2669,Air India,6/03/2019,Delhi,Cochin,DEL ? BOM ? COK,04:00,19:15,15h 15m,1 stop,No info


In [250]:
df_data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2671 entries, 0 to 2670
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          2671 non-null   object
 1   Date_of_Journey  2671 non-null   object
 2   Source           2671 non-null   object
 3   Destination      2671 non-null   object
 4   Route            2671 non-null   object
 5   Dep_Time         2671 non-null   object
 6   Arrival_Time     2671 non-null   object
 7   Duration         2671 non-null   object
 8   Total_Stops      2671 non-null   object
 9   Additional_Info  2671 non-null   object
dtypes: object(10)
memory usage: 208.8+ KB


In [253]:
df_data_test.duplicated().sum()

0

In [252]:
df_data_test.drop_duplicates(inplace=True)

In [254]:
(df_data_test.isna().sum()/df_data_test.shape[0])*100

Airline            0.0
Date_of_Journey    0.0
Source             0.0
Destination        0.0
Route              0.0
Dep_Time           0.0
Arrival_Time       0.0
Duration           0.0
Total_Stops        0.0
Additional_Info    0.0
dtype: float64

In [255]:
df_data_test.shape

(2645, 10)

In [257]:
df_data_test["Date_of_Journey"]=pd.to_datetime(df_data_test["Date_of_Journey"], format="%d/%m/%Y")

In [259]:
df_data_test["month_of_journey"]=df_data_test["Date_of_Journey"].dt.month
df_data_test["days_of_journey"]=df_data_test["Date_of_Journey"].dt.day_name()
df_data_test["year_of_journey"]=df_data_test["Date_of_Journey"].dt.year

In [261]:
df_data_test.drop(columns=["Date_of_Journey", "Dep_Time", "Arrival_Time","Route" ], inplace=True)

In [263]:
hour=df_data_test["Duration"].apply(lambda p:extract(p)[0])*60
minute=df_data_test["Duration"].apply(lambda p:extract(p)[1])
df_data_test["duration_in_minute"]=hour+minute

In [265]:
df_data_test.drop(columns=["Duration" ], inplace=True)

In [266]:
df_data_test["Airline"].value_counts()

Jet Airways                          886
IndiGo                               509
Air India                            432
Multiple carriers                    343
SpiceJet                             208
Vistara                              129
Air Asia                              86
GoAir                                 45
Multiple carriers Premium economy      3
Vistara Premium economy                2
Jet Airways Business                   2
Name: Airline, dtype: int64

In [268]:
df_data_test["Airline"].replace(["Jet Airways Business", "Multiple carriers Premium economy", "Vistara Premium economy"],
                                ["Jet Airways","Multiple carriers", "Vistara"], inplace=True)

In [270]:
X_dummies=df_data_test=pd.get_dummies(df_data_train, columns=["Airline","Source","Destination","Total_Stops","Additional_Info"]
                               ,drop_first=True)

In [273]:
le=LabelEncoder()
df_data_test["days_of_journey"]=le.fit_transform(df_data_test["days_of_journey"])

In [275]:
X= df_data_test.copy()
y=X.pop("Price")

In [276]:
df_data_test

Unnamed: 0,Price,month_of_journey,days_of_journey,year_of_journey,duration_in_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Multiple carriers,...,Total_Stops_non-stop,Additional_Info_1 Short layover,Additional_Info_2 Long layover,Additional_Info_Business class,Additional_Info_Change airports,Additional_Info_In-flight meal not included,Additional_Info_No Info,Additional_Info_No check-in baggage included,Additional_Info_No info,Additional_Info_Red-eye flight
0,3897,3,3,2019,170,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,7662,5,6,2019,445,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,13882,6,3,2019,1140,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,6218,5,3,2019,325,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,13302,3,0,2019,285,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,4107,4,5,2019,150,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
10679,4145,4,2,2019,155,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
10680,7229,4,2,2019,180,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
10681,12648,3,0,2019,160,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
