In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error


In [45]:
train=pd.read_csv("Data_Train.csv")
test=pd.read_csv("Test_set.csv")

In [46]:
test.duplicated().sum()

26

In [47]:
test.drop_duplicates(inplace=True)

In [48]:
train.duplicated().sum()

220

In [49]:
train.drop_duplicates(inplace=True)

In [50]:
train_len=len(train)

In [51]:
full_df=pd.concat([train,test], ignore_index=True)

In [52]:
full_df

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0
...,...,...,...,...,...,...,...,...,...,...,...
13103,Air India,6/06/2019,Kolkata,Banglore,CCU ? DEL ? BLR,20:30,20:25 07 Jun,23h 55m,1 stop,No info,
13104,IndiGo,27/03/2019,Kolkata,Banglore,CCU ? BLR,14:20,16:55,2h 35m,non-stop,No info,
13105,Jet Airways,6/03/2019,Delhi,Cochin,DEL ? BOM ? COK,21:50,04:25 07 Mar,6h 35m,1 stop,No info,
13106,Air India,6/03/2019,Delhi,Cochin,DEL ? BOM ? COK,04:00,19:15,15h 15m,1 stop,No info,


In [53]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13108 entries, 0 to 13107
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13108 non-null  object 
 1   Date_of_Journey  13108 non-null  object 
 2   Source           13108 non-null  object 
 3   Destination      13108 non-null  object 
 4   Route            13107 non-null  object 
 5   Dep_Time         13108 non-null  object 
 6   Arrival_Time     13108 non-null  object 
 7   Duration         13108 non-null  object 
 8   Total_Stops      13107 non-null  object 
 9   Additional_Info  13108 non-null  object 
 10  Price            10463 non-null  float64
dtypes: float64(1), object(10)
memory usage: 1.1+ MB


In [54]:
full_df[full_df.duplicated()]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price


In [55]:
full_df.drop_duplicates(inplace=True)

In [56]:
(full_df.isna().sum()/full_df.shape[0])*100

Airline             0.000000
Date_of_Journey     0.000000
Source              0.000000
Destination         0.000000
Route               0.007629
Dep_Time            0.000000
Arrival_Time        0.000000
Duration            0.000000
Total_Stops         0.007629
Additional_Info     0.000000
Price              20.178517
dtype: float64

In [57]:
route_mode= full_df["Route"].mode()[0]
full_df["Route"].fillna(route_mode,inplace=True)

In [58]:
total_Stops_mode= full_df["Total_Stops"].mode()[0]
full_df["Total_Stops"].fillna(total_Stops_mode,inplace=True)

In [59]:
full_df.shape

(13108, 11)

## Feature Engineering- 
- manipulating existing columns to create a new one by use of domain knowledge

In [60]:
full_df["Date_of_Journey"]=pd.to_datetime(full_df["Date_of_Journey"], format="%d/%m/%Y")

In [61]:
full_df["month_of_journey"]=full_df["Date_of_Journey"].dt.month
full_df["days_of_journey"]=full_df["Date_of_Journey"].dt.day_name()
full_df["year_of_journey"]=full_df["Date_of_Journey"].dt.year

In [62]:
full_df

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month_of_journey,days_of_journey,year_of_journey
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0,3,Sunday,2019
1,Air India,2019-05-01,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0,5,Wednesday,2019
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0,6,Sunday,2019
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0,5,Sunday,2019
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0,3,Friday,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13103,Air India,2019-06-06,Kolkata,Banglore,CCU ? DEL ? BLR,20:30,20:25 07 Jun,23h 55m,1 stop,No info,,6,Thursday,2019
13104,IndiGo,2019-03-27,Kolkata,Banglore,CCU ? BLR,14:20,16:55,2h 35m,non-stop,No info,,3,Wednesday,2019
13105,Jet Airways,2019-03-06,Delhi,Cochin,DEL ? BOM ? COK,21:50,04:25 07 Mar,6h 35m,1 stop,No info,,3,Wednesday,2019
13106,Air India,2019-03-06,Delhi,Cochin,DEL ? BOM ? COK,04:00,19:15,15h 15m,1 stop,No info,,3,Wednesday,2019


In [63]:
full_df.drop(columns=["Date_of_Journey", "Dep_Time", "Arrival_Time","Route" ], inplace=True)

In [64]:
def extract(x):
    hour=0
    minute =0
    x=x.split()
    if len(x)==1:
        if x[0].endswith("h"):
            hour=int (x[0].strip("h"))
        else:
            minute=int (x[0].strip("m"))
    else:
        hour=int (x[0].strip("h"))
        minute=int (x[1].strip("m"))
        
    return hour, minute
        

In [65]:
hour=full_df["Duration"].apply(lambda p:extract(p)[0])*60
minute=full_df["Duration"].apply(lambda p:extract(p)[1])
full_df["duration_in_minute"]=hour+minute

In [66]:
full_df.drop(columns=["Duration" ], inplace=True)

In [67]:
# def split_duration(Duration):
#     hours = 0
#     minutes = 0
#     if "h" in Duration:
#         hours = int(Duration.split('h')[0].strip())
#         if "m" in Duration:
#             minutes = int(Duration.split('h')[1].strip(' m'))
#     return hours, minutes
# df_data_train["Hours"], df_data_train["Minutes"] = zip(*df_data_train["Duration"].apply(split_duration))


In [68]:
full_df["Airline"].value_counts()

Jet Airways                          4586
IndiGo                               2552
Air India                            2127
Multiple carriers                    1539
SpiceJet                             1023
Vistara                               607
Air Asia                              405
GoAir                                 239
Multiple carriers Premium economy      16
Jet Airways Business                    8
Vistara Premium economy                 5
Trujet                                  1
Name: Airline, dtype: int64

In [69]:
full_df["Airline"].replace(["Jet Airways Business", "Multiple carriers Premium economy", "Vistara Premium economy"],
                                ["Jet Airways","Multiple carriers", "Vistara"], inplace=True)

In [70]:
df_d=df_data_train_d=pd.get_dummies(full_df, columns=["Airline","Source","Destination","Total_Stops","Additional_Info"]
                               ,drop_first=True)

In [71]:
le=LabelEncoder()
df_d["days_of_journey"]=le.fit_transform(df_d["days_of_journey"])

In [72]:
df_d

Unnamed: 0,Price,month_of_journey,days_of_journey,year_of_journey,duration_in_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Multiple carriers,...,Total_Stops_non-stop,Additional_Info_1 Short layover,Additional_Info_2 Long layover,Additional_Info_Business class,Additional_Info_Change airports,Additional_Info_In-flight meal not included,Additional_Info_No Info,Additional_Info_No check-in baggage included,Additional_Info_No info,Additional_Info_Red-eye flight
0,3897.0,3,3,2019,170,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,7662.0,5,6,2019,445,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,13882.0,6,3,2019,1140,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,6218.0,5,3,2019,325,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,13302.0,3,0,2019,285,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13103,,6,4,2019,1435,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
13104,,3,6,2019,155,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
13105,,3,6,2019,395,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
13106,,3,6,2019,915,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [73]:
new_train=df_d[:train_len]
new_train.shape

(10463, 35)

In [74]:
new_test=df_d[train_len:].drop(columns=["Price"])
new_test.shape

(2645, 34)

In [75]:
df_d.to_csv("clean_train.csv", index=False)

In [76]:
X= new_train.copy()
y=X.pop("Price")

In [77]:
X

Unnamed: 0,month_of_journey,days_of_journey,year_of_journey,duration_in_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Multiple carriers,Airline_SpiceJet,...,Total_Stops_non-stop,Additional_Info_1 Short layover,Additional_Info_2 Long layover,Additional_Info_Business class,Additional_Info_Change airports,Additional_Info_In-flight meal not included,Additional_Info_No Info,Additional_Info_No check-in baggage included,Additional_Info_No info,Additional_Info_Red-eye flight
0,3,3,2019,170,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,5,6,2019,445,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,6,3,2019,1140,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,5,3,2019,325,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,0,2019,285,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10458,4,5,2019,150,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
10459,4,2,2019,155,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
10460,4,2,2019,180,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
10461,3,0,2019,160,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [78]:
x_train, x_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
dt=DecisionTreeClassifier(random_state=42)
model_1= dt.fit(x_train, y_train ) 

In [80]:
le=LinearRegression()
model_2= le.fit(x_train, y_train ) 

In [81]:
abr=AdaBoostRegressor(random_state=42)
model_3= abr.fit(x_train, y_train ) 

In [82]:
prediction_1=model_1.predict(x_test)
eval_1=mean_absolute_error(y_test, prediction_1)
eval_1_m=mean_squared_error(y_test, prediction_1)
eval_1_r=r2_score(y_test, prediction_1)

print(f"the mean absolute error by {dt} is {eval_1}\n")
print(f"the mean squared error by {dt} is {eval_1_m}\n")
print(f"the r2 score by {dt} is {eval_1_r}\n")

the mean absolute error by DecisionTreeClassifier(random_state=42) is 1345.0086000955566

the mean squared error by DecisionTreeClassifier(random_state=42) is 8681849.68418538

the r2 score by DecisionTreeClassifier(random_state=42) is 0.5838975703984588



In [83]:
prediction_2=model_2.predict(x_test)
eval_2=mean_absolute_error(y_test, prediction_2)
eval_2_m=mean_squared_error(y_test, prediction_2)
eval_2_r=r2_score(y_test, prediction_2)

print(f"the mean absolute error by {le} is {eval_2}\n")
print(f"the mean squared error by {le} is {eval_2_m}\n")
print(f"the r2 score by {le} is {eval_2_r}\n")

the mean absolute error by LinearRegression() is 1795.02580028667

the mean squared error by LinearRegression() is 7475980.542283803

the r2 score by LinearRegression() is 0.6416922913368751



In [84]:
prediction_3=model_3.predict(x_test)
eval_3=mean_absolute_error(y_test, prediction_3)
eval_3_m=mean_squared_error(y_test, prediction_3)
eval_3_r=r2_score(y_test, prediction_3)

print(f"the mean absolute error by {abr} is {eval_3}\n")
print(f"the mean squared error by {abr} is {eval_3_m}\n")
print(f"the r2 score by {abr} is {eval_3_r}\n")

the mean absolute error by AdaBoostRegressor(random_state=42) is 2037.3317619399552

the mean squared error by AdaBoostRegressor(random_state=42) is 8966456.55389754

the r2 score by AdaBoostRegressor(random_state=42) is 0.5702569737195922



In [86]:
new_test["Predicted_price"]=model_1.predict(new_test)


In [87]:
new_test

Unnamed: 0,month_of_journey,days_of_journey,year_of_journey,duration_in_minute,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Multiple carriers,Airline_SpiceJet,...,Additional_Info_1 Short layover,Additional_Info_2 Long layover,Additional_Info_Business class,Additional_Info_Change airports,Additional_Info_In-flight meal not included,Additional_Info_No Info,Additional_Info_No check-in baggage included,Additional_Info_No info,Additional_Info_Red-eye flight,Predicted_price
10463,6,4,2019,655,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,14714.0
10464,5,3,2019,240,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,4226.0
10465,5,5,2019,1425,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,12898.0
10466,5,5,2019,780,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,14277.0
10467,6,1,2019,170,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,4282.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13103,6,4,2019,1435,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,10203.0
13104,3,6,2019,155,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,5618.0
13105,3,6,2019,395,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,12242.0
13106,3,6,2019,915,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,5998.0
