In [1]:
import pandas as pd
import numpy as np

In [2]:
#importing excel file (train)
train = pd.read_excel('Data_Train.xlsx')
#importing excel file (test)
test = pd.read_excel('Test_set.xlsx') 
pd.set_option('display.max_columns', None)

In [3]:
train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
train['Additional_Info'].value_counts()

No info                         8345
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
2 Long layover                     1
Red-eye flight                     1
1 Short layover                    1
Name: Additional_Info, dtype: int64

In [5]:
train['Destination'].value_counts()

Cochin       4537
Banglore     2871
Delhi        1265
New Delhi     932
Hyderabad     697
Kolkata       381
Name: Destination, dtype: int64

In [6]:
test.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [7]:
train.shape, test.shape

((10683, 11), (2671, 10))

In [8]:
train.isnull().sum().sum(), test.isnull().sum().sum()

(2, 0)

In [9]:
# Handling missing values 
train['Total_Stops'].fillna(train['Total_Stops'].mode()[0], inplace=True)
train['Route'].fillna(train['Route'].mode()[0], inplace=True)

# EDA

In [10]:
# Checking that which airlines price are comparatively higher than the others
train.groupby('Airline')['Price'].mean()

Airline
Air Asia                              5590.260188
Air India                             9611.210616
GoAir                                 5861.056701
IndiGo                                5673.682903
Jet Airways                          11643.923357
Jet Airways Business                 58358.666667
Multiple carriers                    10902.678094
Multiple carriers Premium economy    11418.846154
SpiceJet                              4338.284841
Trujet                                4140.000000
Vistara                               7796.348643
Vistara Premium economy               8962.333333
Name: Price, dtype: float64

In [11]:
train.groupby('Source')['Price'].mean()

Source
Banglore     8017.464269
Chennai      4789.892388
Delhi       10539.439057
Kolkata      9158.389411
Mumbai       5059.708752
Name: Price, dtype: float64

In [12]:
train['Destination'].value_counts(normalize=True)

Cochin       0.424693
Banglore     0.268745
Delhi        0.118412
New Delhi    0.087241
Hyderabad    0.065244
Kolkata      0.035664
Name: Destination, dtype: float64

In [13]:
train['Route'].value_counts(normalize=True)

DEL → BOM → COK                0.222503
BLR → DEL                      0.145278
CCU → BOM → BLR                0.091641
CCU → BLR                      0.067771
BOM → HYD                      0.058130
                                 ...   
BOM → RPR → VTZ → HYD          0.000094
BOM → CCU → HYD                0.000094
BOM → COK → MAA → HYD          0.000094
BOM → VNS → DEL → HYD          0.000094
BLR → HBX → BOM → BHO → DEL    0.000094
Name: Route, Length: 128, dtype: float64

In [14]:
train['Duration'].value_counts(normalize=True)

2h 50m     0.051484
1h 30m     0.036132
2h 55m     0.031545
2h 45m     0.031545
2h 35m     0.030797
             ...   
4h 10m     0.000094
32h 20m    0.000094
30h 15m    0.000094
35h 20m    0.000094
33h 20m    0.000094
Name: Duration, Length: 368, dtype: float64

In [15]:
train['Total_Stops'].value_counts(normalize = True)

1 stop      0.526631
non-stop    0.326781
2 stops     0.142282
3 stops     0.004212
4 stops     0.000094
Name: Total_Stops, dtype: float64

# Feature Engineering for Training dataset

## 1 : Date_of_Journey

In [16]:
# Extracting day & month of journey from Date of Journey feature
train['Day_of_Journey'] = pd.to_datetime(train['Date_of_Journey'], format = '%d/%m/%Y').dt.day
train['Month_of_Journey'] = pd.to_datetime(train['Date_of_Journey'], format = '%d/%m/%Y').dt.month

In [17]:
# Dropping the Date_of_Journey feature as we don't need it anymore
train.drop('Date_of_Journey', axis = 1, inplace = True)

## 2 : Arrival_Time

In [18]:
train['Arrival_Time'] = pd.to_datetime(train['Arrival_Time'])

train['Arrival_hour'] = pd.to_datetime(train['Arrival_Time']).dt.hour
train['Arrival_mins'] = pd.to_datetime(train['Arrival_Time']).dt.minute

train.drop(['Arrival_Time'], axis = 1, inplace = True)

## 3 : Dep_Time

In [19]:
train['Dep_Time'] = pd.to_datetime(train['Dep_Time'])

train['Dep_Hour'] = pd.to_datetime(train['Dep_Time']).dt.hour
train['Dep_Min'] = pd.to_datetime(train['Dep_Time']).dt.minute

train.drop(['Dep_Time'], axis = 1, inplace = True)

In [20]:
train.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,1,10,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,1,5,13,15,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882,9,6,4,25,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,12,5,23,30,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,1,3,21,35,16,50


## 4 Total_Stops

In [21]:
train['Total_Stops'].value_counts()

1 stop      5626
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: Total_Stops, dtype: int64

In [22]:
train['Total_Stops'] = train['Total_Stops'].replace({'1 stop': 1, 'non-stop': 0, '2 stops': 2, '3 stops': 3, '4 stops': 4})

In [23]:
train.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,0,No info,3897,24,3,1,10,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2,No info,7662,1,5,13,15,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2,No info,13882,9,6,4,25,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1,No info,6218,12,5,23,30,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1,No info,13302,1,3,21,35,16,50


## 6 : Duration

In [24]:
Duration_list = train["Duration"].tolist()

for i in range(0, len(Duration_list)):
    if len(Duration_list[i].split()) != 2:
        if 'h' in Duration_list[i]:
            Duration_list[i] = Duration_list[i] + ' 0m'
        elif 'm' in Duration_list[i]:
            Duration_list[i] = '0h ' + Duration_list[i]
            
# Extracting features in hour and minutes
Duration_hour = []
Duration_minutes = []

for i in range(0, len(Duration_list)):
    Duration_hour.append(int(Duration_list[i].split(sep = 'h')[0]))
    Duration_minutes.append(int(Duration_list[i].split()[1].split(sep = 'm')[0]))

In [25]:
# Adding lists Duration_hour & Duration_minutes to train dataframe

train['Duration_hour'] = Duration_hour
train['Duration_mins'] = Duration_minutes

In [26]:
# Dropping Duration column as it is of no use now
train.drop('Duration', axis = 1, inplace = True)

In [27]:
train.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins
0,IndiGo,Banglore,New Delhi,BLR → DEL,0,No info,3897,24,3,1,10,22,20,2,50
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2,No info,7662,1,5,13,15,5,50,7,25
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2,No info,13882,9,6,4,25,9,25,19,0
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1,No info,6218,12,5,23,30,18,5,5,25
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1,No info,13302,1,3,21,35,16,50,4,45


## 4 : Route

In [28]:
# Dropping Route feature as it is of no use now beacause of total_stops column and we can't extract any extra information from it
train.drop('Route', axis = 1, inplace = True)

In [29]:
train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins
0,IndiGo,Banglore,New Delhi,0,No info,3897,24,3,1,10,22,20,2,50
1,Air India,Kolkata,Banglore,2,No info,7662,1,5,13,15,5,50,7,25
2,Jet Airways,Delhi,Cochin,2,No info,13882,9,6,4,25,9,25,19,0
3,IndiGo,Kolkata,Banglore,1,No info,6218,12,5,23,30,18,5,5,25
4,IndiGo,Banglore,New Delhi,1,No info,13302,1,3,21,35,16,50,4,45


# Test Dataset

In [30]:
test.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


# Handling missing values

In [31]:
test.isnull().sum().sum()

0

### There are no null values present in test dataset so now we can move towards feature engineering

# Feature Engineering on test dataset

## 1 : Date_of_Journey

### Extracting day & month from the feature

In [32]:
test['Day_of_Journey'] = pd.to_datetime(test['Date_of_Journey'], format = '%d/%m/%Y').dt.day
test['Month_of_Journey'] = pd.to_datetime(test['Date_of_Journey'], format = '%d/%m/%Y').dt.month

### Dropping the feature 'Date_of_Journey' as it is of no use now

In [33]:
test.drop('Date_of_Journey', axis = 1, inplace = True)

### Checking the dataset

In [34]:
test.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey
0,Jet Airways,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info,6,6
1,IndiGo,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info,12,5
2,Jet Airways,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included,21,5
3,Multiple carriers,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info,21,5
4,Air Asia,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info,24,6


## 2 : Dep_Time

### Extracting hours and minutes from this feature

In [35]:
test['Dep_Hour'] = pd.to_datetime(test['Dep_Time']).dt.hour
test['Dep_Min'] = pd.to_datetime(test['Dep_Time']).dt.minute

### Dropping the feature as it is of no use now

In [36]:
test.drop('Dep_Time', axis = 1, inplace = True)

### Checking the dataset

In [37]:
test.head()

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Dep_Hour,Dep_Min
0,Jet Airways,Delhi,Cochin,DEL → BOM → COK,04:25 07 Jun,10h 55m,1 stop,No info,6,6,17,30
1,IndiGo,Kolkata,Banglore,CCU → MAA → BLR,10:20,4h,1 stop,No info,12,5,6,20
2,Jet Airways,Delhi,Cochin,DEL → BOM → COK,19:00 22 May,23h 45m,1 stop,In-flight meal not included,21,5,19,15
3,Multiple carriers,Delhi,Cochin,DEL → BOM → COK,21:00,13h,1 stop,No info,21,5,8,0
4,Air Asia,Banglore,Delhi,BLR → DEL,02:45 25 Jun,2h 50m,non-stop,No info,24,6,23,55


## 3 : Total_Stops

In [38]:
test['Total_Stops'].value_counts()

1 stop      1431
non-stop     849
2 stops      379
3 stops       11
4 stops        1
Name: Total_Stops, dtype: int64

### Label Encoding is used where the data's are in order (Intuitively). 
### Since this is the case here we will do 'Label Encoding' for this feature. 

In [39]:
test['Total_Stops'] = test['Total_Stops'].replace({'1 stop': 1, 'non-stop': 0, '2 stops': 2, '3 stops': 3, '4 stops': 4})

### Checking the dataset

In [40]:
test.head()

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Dep_Hour,Dep_Min
0,Jet Airways,Delhi,Cochin,DEL → BOM → COK,04:25 07 Jun,10h 55m,1,No info,6,6,17,30
1,IndiGo,Kolkata,Banglore,CCU → MAA → BLR,10:20,4h,1,No info,12,5,6,20
2,Jet Airways,Delhi,Cochin,DEL → BOM → COK,19:00 22 May,23h 45m,1,In-flight meal not included,21,5,19,15
3,Multiple carriers,Delhi,Cochin,DEL → BOM → COK,21:00,13h,1,No info,21,5,8,0
4,Air Asia,Banglore,Delhi,BLR → DEL,02:45 25 Jun,2h 50m,0,No info,24,6,23,55


## 4 : Route

### If we observe our dataset carefully we can see that features Route and Total_Stops are directly correlated.
### Also since Total_Stops is already present we can't extract any more information from this feature.
### Hence we will simply drop this column and go ahead.

### Dropping the feature

In [41]:
test.drop('Route', axis = 1, inplace = True)

### Checking the dataset

In [42]:
test.head()

Unnamed: 0,Airline,Source,Destination,Arrival_Time,Duration,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Dep_Hour,Dep_Min
0,Jet Airways,Delhi,Cochin,04:25 07 Jun,10h 55m,1,No info,6,6,17,30
1,IndiGo,Kolkata,Banglore,10:20,4h,1,No info,12,5,6,20
2,Jet Airways,Delhi,Cochin,19:00 22 May,23h 45m,1,In-flight meal not included,21,5,19,15
3,Multiple carriers,Delhi,Cochin,21:00,13h,1,No info,21,5,8,0
4,Air Asia,Banglore,Delhi,02:45 25 Jun,2h 50m,0,No info,24,6,23,55


## 5 : Arrival_Time

### Extracting hours & minutes from this feature

In [43]:
test['Arrival_hour'] = pd.to_datetime(test['Arrival_Time']).dt.hour
test['Arrival_mins'] = pd.to_datetime(test['Arrival_Time']).dt.minute

### Dropping the feature as we don't need it anymore

In [44]:
test.drop('Arrival_Time', axis = 1, inplace=True)

### Checking the dataset

In [45]:
test.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Dep_Hour,Dep_Min,Arrival_hour,Arrival_mins
0,Jet Airways,Delhi,Cochin,10h 55m,1,No info,6,6,17,30,4,25
1,IndiGo,Kolkata,Banglore,4h,1,No info,12,5,6,20,10,20
2,Jet Airways,Delhi,Cochin,23h 45m,1,In-flight meal not included,21,5,19,15,19,0
3,Multiple carriers,Delhi,Cochin,13h,1,No info,21,5,8,0,21,0
4,Air Asia,Banglore,Delhi,2h 50m,0,No info,24,6,23,55,2,45


## 6 : Duration

In [46]:
Duration_list_1 = test["Duration"].tolist()

for i in range(0, len(Duration_list_1)):
    if len(Duration_list_1[i].split()) != 2:
        if 'h' in Duration_list_1[i]:
            Duration_list_1[i] = Duration_list_1[i] + ' 0m' # Adding 0 minutes
        elif 'm' in Duration_list_1[i]:
            Duration_list_1[i] = '0h ' + Duration_list_1[i] # Adding 0 hours
            
# Extracting features in hour and minutes
Duration_hour_1 = []
Duration_minutes_1 = []

for i in range(0, len(Duration_list_1)):
    Duration_hour_1.append(int(Duration_list_1[i].split(sep = 'h')[0]))
    Duration_minutes_1.append(int(Duration_list_1[i].split()[1].split(sep = 'm')[0]))

### Adding the lists in the test dataframe

In [47]:
test['Duration_hour'] = Duration_hour_1
test['Duration_mins'] = Duration_minutes_1

### Dropping the 'Duration' column as we don't need it anymore

In [48]:
test.drop('Duration', axis = 1, inplace = True)

### Checking the dataset

In [49]:
test.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Dep_Hour,Dep_Min,Arrival_hour,Arrival_mins,Duration_hour,Duration_mins
0,Jet Airways,Delhi,Cochin,1,No info,6,6,17,30,4,25,10,55
1,IndiGo,Kolkata,Banglore,1,No info,12,5,6,20,10,20,4,0
2,Jet Airways,Delhi,Cochin,1,In-flight meal not included,21,5,19,15,19,0,23,45
3,Multiple carriers,Delhi,Cochin,1,No info,21,5,8,0,21,0,13,0
4,Air Asia,Banglore,Delhi,0,No info,24,6,23,55,2,45,2,50


### Now that we have done some feature engineering in both our datasets let's take a look at them 

In [50]:
train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins
0,IndiGo,Banglore,New Delhi,0,No info,3897,24,3,1,10,22,20,2,50
1,Air India,Kolkata,Banglore,2,No info,7662,1,5,13,15,5,50,7,25
2,Jet Airways,Delhi,Cochin,2,No info,13882,9,6,4,25,9,25,19,0
3,IndiGo,Kolkata,Banglore,1,No info,6218,12,5,23,30,18,5,5,25
4,IndiGo,Banglore,New Delhi,1,No info,13302,1,3,21,35,16,50,4,45


In [51]:
test.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Dep_Hour,Dep_Min,Arrival_hour,Arrival_mins,Duration_hour,Duration_mins
0,Jet Airways,Delhi,Cochin,1,No info,6,6,17,30,4,25,10,55
1,IndiGo,Kolkata,Banglore,1,No info,12,5,6,20,10,20,4,0
2,Jet Airways,Delhi,Cochin,1,In-flight meal not included,21,5,19,15,19,0,23,45
3,Multiple carriers,Delhi,Cochin,1,No info,21,5,8,0,21,0,13,0
4,Air Asia,Banglore,Delhi,0,No info,24,6,23,55,2,45,2,50


### Both our datasets look pretty good now, all that is remaining is to encode the remaining categorical features.
### So now lets concatenate them and do the remaining work.

### Lets first check the shape of our datasets

In [52]:
train.shape, test.shape

((10683, 14), (2671, 13))

### In order to concatenate these two datsets (row wise) no. of features must be same.
### So we first take out our 'Price' (target) feature from our train dataset then do the concatenating.

### Taking out the target feature : 'Price' from our training dataset

In [53]:
Y = train['Price']
# Dropping out the target feature
train.drop('Price', axis = 1 , inplace = True)

In [54]:
Y

0         3897
1         7662
2        13882
3         6218
4        13302
         ...  
10678     4107
10679     4145
10680     7229
10681    12648
10682    11753
Name: Price, Length: 10683, dtype: int64

### Checking out the shapes

In [55]:
train.shape, test.shape

((10683, 13), (2671, 13))

### Concatenating

In [56]:
all_data = pd.concat([train, test], axis = 0, sort = False)

In [57]:
train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins
0,IndiGo,Banglore,New Delhi,0,No info,24,3,1,10,22,20,2,50
1,Air India,Kolkata,Banglore,2,No info,1,5,13,15,5,50,7,25
2,Jet Airways,Delhi,Cochin,2,No info,9,6,4,25,9,25,19,0
3,IndiGo,Kolkata,Banglore,1,No info,12,5,23,30,18,5,5,25
4,IndiGo,Banglore,New Delhi,1,No info,1,3,21,35,16,50,4,45


In [58]:
test.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Dep_Hour,Dep_Min,Arrival_hour,Arrival_mins,Duration_hour,Duration_mins
0,Jet Airways,Delhi,Cochin,1,No info,6,6,17,30,4,25,10,55
1,IndiGo,Kolkata,Banglore,1,No info,12,5,6,20,10,20,4,0
2,Jet Airways,Delhi,Cochin,1,In-flight meal not included,21,5,19,15,19,0,23,45
3,Multiple carriers,Delhi,Cochin,1,No info,21,5,8,0,21,0,13,0
4,Air Asia,Banglore,Delhi,0,No info,24,6,23,55,2,45,2,50


# all_data dataset

### Checking the shape of our new dataset

In [59]:
all_data.shape

(13354, 13)

In [60]:
all_data

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins
0,IndiGo,Banglore,New Delhi,0,No info,24,3,1,10,22,20,2,50
1,Air India,Kolkata,Banglore,2,No info,1,5,13,15,5,50,7,25
2,Jet Airways,Delhi,Cochin,2,No info,9,6,4,25,9,25,19,0
3,IndiGo,Kolkata,Banglore,1,No info,12,5,23,30,18,5,5,25
4,IndiGo,Banglore,New Delhi,1,No info,1,3,21,35,16,50,4,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2666,Air India,Kolkata,Banglore,1,No info,6,6,20,25,20,30,23,55
2667,IndiGo,Kolkata,Banglore,0,No info,27,3,16,55,14,20,2,35
2668,Jet Airways,Delhi,Cochin,1,No info,6,3,4,25,21,50,6,35
2669,Air India,Delhi,Cochin,1,No info,6,3,19,15,4,0,15,15


### Using dummy variables for categorical variables

In [61]:
dummy_1 = pd.get_dummies(all_data['Airline'], drop_first=True)
dummy_2 = pd.get_dummies(all_data['Source'], drop_first=True)
dummy_3 = pd.get_dummies(all_data['Destination'], drop_first=True)
dummy_4 = pd.get_dummies(all_data['Additional_Info'], drop_first=True)

### Concatenating dummy variables into the dataset

In [62]:
all_data = pd.concat([all_data, dummy_1, dummy_2, dummy_3, dummy_4], axis = 1)

In [63]:
all_data.shape

(13354, 42)

In [64]:
all_data.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Delhi.1,Hyderabad,Kolkata.1,New Delhi,1 Short layover,2 Long layover,Business class,Change airports,In-flight meal not included,No Info,No check-in baggage included,No info,Red-eye flight
0,IndiGo,Banglore,New Delhi,0,No info,24,3,1,10,22,20,2,50,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,Air India,Kolkata,Banglore,2,No info,1,5,13,15,5,50,7,25,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,Jet Airways,Delhi,Cochin,2,No info,9,6,4,25,9,25,19,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
3,IndiGo,Kolkata,Banglore,1,No info,12,5,23,30,18,5,5,25,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,IndiGo,Banglore,New Delhi,1,No info,1,3,21,35,16,50,4,45,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


### Deleting duplicate columns

In [65]:
all_data = all_data.loc[:,~all_data.columns.duplicated()]

### Deleting the remaining categorical variables as we don't need it anymore

In [66]:
all_data.drop(['Airline', 'Source', 'Destination', 'Additional_Info'], axis = 1, inplace = True)

In [67]:
all_data.shape

(13354, 36)

In [68]:
all_data

Unnamed: 0,Total_Stops,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Hyderabad,New Delhi,1 Short layover,2 Long layover,Business class,Change airports,In-flight meal not included,No Info,No check-in baggage included,No info,Red-eye flight
0,0,24,3,1,10,22,20,2,50,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,2,1,5,13,15,5,50,7,25,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,2,9,6,4,25,9,25,19,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
3,1,12,5,23,30,18,5,5,25,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
4,1,1,3,21,35,16,50,4,45,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2666,1,6,6,20,25,20,30,23,55,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2667,0,27,3,16,55,14,20,2,35,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2668,1,6,3,4,25,21,50,6,35,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
2669,1,6,3,19,15,4,0,15,15,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0


### Now that our all datasets are processed we will again seperate it back into training and test datasets

In [69]:
new_train = all_data.iloc[:10683]
new_test = all_data.iloc[10683:]

In [70]:
new_train.shape, new_test.shape

((10683, 36), (2671, 36))

In [71]:
new_train.head()

Unnamed: 0,Total_Stops,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Hyderabad,New Delhi,1 Short layover,2 Long layover,Business class,Change airports,In-flight meal not included,No Info,No check-in baggage included,No info,Red-eye flight
0,0,24,3,1,10,22,20,2,50,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,2,1,5,13,15,5,50,7,25,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,2,9,6,4,25,9,25,19,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
3,1,12,5,23,30,18,5,5,25,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
4,1,1,3,21,35,16,50,4,45,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


In [72]:
new_test.head()

Unnamed: 0,Total_Stops,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Hyderabad,New Delhi,1 Short layover,2 Long layover,Business class,Change airports,In-flight meal not included,No Info,No check-in baggage included,No info,Red-eye flight
0,1,6,6,4,25,17,30,10,55,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1,1,12,5,10,20,6,20,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,21,5,19,0,19,15,23,45,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,1,21,5,21,0,8,0,13,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4,0,24,6,2,45,23,55,2,50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


### Now that our both train & test datasets are ready finally we can start doing modelling.

# Modelling

### Splitting the new_train dataset 

In [73]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
X_train, X_valid, y_train, y_valid = train_test_split(new_train, Y, test_size = 0.3)

In [74]:
X_train.shape, y_train.shape

((7478, 36), (7478,))

In [75]:
X_train

Unnamed: 0,Total_Stops,Day_of_Journey,Month_of_Journey,Arrival_hour,Arrival_mins,Dep_Hour,Dep_Min,Duration_hour,Duration_mins,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,Vistara Premium economy,Chennai,Delhi,Kolkata,Mumbai,Cochin,Hyderabad,New Delhi,1 Short layover,2 Long layover,Business class,Change airports,In-flight meal not included,No Info,No check-in baggage included,No info,Red-eye flight
996,1,21,5,4,40,20,0,8,40,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1838,1,1,5,8,15,20,0,12,15,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
10668,2,9,5,19,0,11,40,7,20,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4604,0,6,6,17,10,14,10,3,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
7543,0,1,5,14,5,11,10,2,55,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10527,1,18,5,1,30,17,10,8,20,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
2693,1,1,3,0,45,18,40,6,5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1496,2,27,3,19,15,5,55,37,20,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
805,2,6,3,21,20,13,5,8,15,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0


In [76]:
y_train

996      10844
1838     13941
10668    21219
4604      6094
7543      7229
         ...  
10527     7081
2693     35185
1496      8446
805      10759
8549      7198
Name: Price, Length: 7478, dtype: int64

### Regression models to be used here
####  1 : Decision Tree Regressor
####  2 : Random Forest Regressor
####  3 : Linear Regressor
####  4 : Logistic Regressor
####  5 : ElasticNet Regressor
####  6 : Ridge Regressor
####  7 : Lasso Regressor
####  8 : Gradient Boosting Regressor
####  9 : Extreme Gradient Boosting Regressor
#### 10 : Light Gradient Boosting Regressor
#### 11 : Supprt Vector Machine Regressor

## Our Models

In [77]:
pip install xgboost




In [78]:
pip install lightgbm




In [79]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor

In [80]:
model1 = DecisionTreeRegressor()
model2 = RandomForestRegressor()
model3 = LinearRegression()
model4 = LogisticRegression()
model5 = ElasticNet()
model6 = KernelRidge()
model7 = Lasso()
model8 = GradientBoostingRegressor()
model9 = XGBRegressor()
model10 = LGBMRegressor()
model11 = SVR()

### Training the models

In [81]:
model1.fit(X_train, y_train)

DecisionTreeRegressor()

In [82]:
model2.fit(X_train, y_train)

RandomForestRegressor()

In [83]:
model3.fit(X_train, y_train)

LinearRegression()

In [84]:
model5.fit(X_train, y_train)

ElasticNet()

In [85]:
model6.fit(X_train, y_train)

KernelRidge()

In [86]:
model7.fit(X_train, y_train)

Lasso()

In [87]:
model8.fit(X_train, y_train)

GradientBoostingRegressor()

In [88]:
model9.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [89]:
model10.fit(X_train, y_train)

LGBMRegressor()

### Checking cross-validation scores

In [90]:
cross_val_score(model1, X_train, y_train, cv=5).mean()

0.7649945519432926

In [91]:
cross_val_score(model2, X_train, y_train, cv=5).mean()

0.8522447273631399

In [92]:
cross_val_score(model3, X_train, y_train, cv=5).mean()

0.6391349559554602

In [93]:
cross_val_score(model5, X_train, y_train, cv=5).mean()

0.4382370658103459

In [94]:
cross_val_score(model6, X_train, y_train, cv=5).mean()

0.6057219880868818

In [95]:
cross_val_score(model7, X_train, y_train, cv=5).mean()

0.6378722235141202

In [96]:
cross_val_score(model8, X_train, y_train, cv=5).mean()

0.7774347514735526

In [97]:
cross_val_score(model9, X_train, y_train, cv=5).mean()

0.8455524099634489

In [98]:
cross_val_score(model10, X_train, y_train, cv=5).mean()

0.8431579006074978

In [99]:
cross_val_score(model11, X_train, y_train, cv=5).mean()

-0.0066513733241519764

### We can see from the above cross-validation scores that model9 i.e., 'XGBRegressor' is performing best.
### Hence we will now proceed with XGBRegressor & tune it.

# Hyperparmeter Tuning 

In [100]:
Params_grid = {'n_estimators':[10,100,1000], 'max_depth':range(5,36), 'learning_rate':[0.001,0.01,0.1,1], 'subsample': [0.5,0.8,1],
              'min_child_weight':range(1,10)}

# Using GridSearchCV to find the optimum parameters
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Tuned_XGBR = RandomizedSearchCV(model9, Params_grid, cv=5)

# Training the model
Tuned_XGBR.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.300000012,
                                          max_delta_step=0, max_depth=6,
                                          min_child_weight=1, missing=nan,
                                          monotone_constraints='()',
                                          n_estimators=100, n_jobs=0,
                                          num_parallel_tree=1, random_state=0,
                                          reg_alpha=0, reg_lambda=1,
                                          scale_pos_

In [101]:
Tuned_XGBR.best_params_

{'subsample': 1,
 'n_estimators': 100,
 'min_child_weight': 3,
 'max_depth': 11,
 'learning_rate': 0.1}

In [102]:
Tuned_XGBR.best_score_

0.8589624533449317

In [103]:
prediction = Tuned_XGBR.predict(X_valid)

In [104]:
cross_val_score(Tuned_XGBR, X_train, y_train, cv=5).mean()

0.8479072487919126

In [105]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Checking the score
mean_squared_error(y_valid, prediction)

2441608.338535296

In [106]:
mean_absolute_error(y_valid, prediction)

678.7294725724576

In [107]:
r2_score(y_valid, prediction)

0.8809704454206979

# Saving the model to reuse it again

In [108]:
import pickle

# Opening a file to store the data
File = open('flightfare.pkl', 'wb')

# Dumping information in that file
pickle.dump(Tuned_XGBR, File)

# Closing the file
File.close()

In [109]:
# Opening the pickle file
FFKPL = 'flightfare.pkl'
File_obj = open(FFKPL, 'rb')

# Retrieving our model from pickle file
Model = pickle.load(File_obj)
print(Model)

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.300000012,
                                          max_delta_step=0, max_depth=6,
                                          min_child_weight=1, missing=nan,
                                          monotone_constraints='()',
                                          n_estimators=100, n_jobs=0,
                                          num_parallel_tree=1, random_state=0,
                                          reg_alpha=0, reg_lambda=1,
                                          scale_pos_