In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
df_train = pd.read_excel("Data_Train.xlsx")
df_to_predict = pd.read_excel("Test_set.xlsx")

In [3]:
print("Number of rows in training dataset",df_train.shape)
print("Number of rows in prediction dataset",df_to_predict.shape)

Number of rows in training dataset (10683, 11)
Number of rows in prediction dataset (2671, 10)


In [4]:
df_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


### Removing null values

In [5]:
#Check for null values in the dataset
df_train.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [6]:
#Check null column values
null_columns=df_train.columns[df_train.isnull().any()]
print(df_train[df_train.isnull().any(axis=1)][null_columns].head())

     Route Total_Stops
9039   NaN         NaN


In [7]:
#drop null column values
df_train.dropna(inplace=True)

In [8]:
# Select duplicate rows except first occurrence based on all columns
duplicateRowsDF = df_train[df_train.duplicated()]
print("Total Duplicate Rows except first occurrence based on all columns are :")
duplicateRowsDF.shape

Total Duplicate Rows except first occurrence based on all columns are :


(220, 11)

In [9]:
#remove duplicate rows in training dataset
df_train.drop_duplicates(keep='first',inplace=True)

In [10]:
#See values in columns of dataset 
df_to_predict.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [11]:
df_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


## Data cleaning

In [12]:
print("Train set:\n",df_train["Additional_Info"].value_counts())
print("\nPrediction set:\n",df_to_predict["Additional_Info"].value_counts())

Train set:
 No info                         8182
In-flight meal not included     1926
No check-in baggage included     318
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
2 Long layover                     1
Red-eye flight                     1
1 Short layover                    1
Name: Additional_Info, dtype: int64

Prediction set:
 No info                         2148
In-flight meal not included      444
No check-in baggage included      76
1 Long layover                     1
Change airports                    1
Business class                     1
Name: Additional_Info, dtype: int64


In [13]:
# replace repeating value in train set
df_train["Additional_Info"] = df_train["Additional_Info"].replace({'No Info': 'No info'})

In [14]:
df_to_predict["Additional_Info"].value_counts()

No info                         2148
In-flight meal not included      444
No check-in baggage included      76
1 Long layover                     1
Change airports                    1
Business class                     1
Name: Additional_Info, dtype: int64

In [15]:
df_to_predict["Destination"].value_counts()

Cochin       1145
Banglore      710
Delhi         317
New Delhi     238
Hyderabad     186
Kolkata        75
Name: Destination, dtype: int64

In [17]:
df_train["Destination"] = df_train["Destination"].replace({'New Delhi': 'Delhi'})
df_to_predict["Destination"] = df_to_predict["Destination"].replace({'New Delhi': 'Delhi'})

In [18]:
# Assign values manually to Total_stops column since label encoder might assign 
# wrong values. Like it can assign value 1 to 'non-stop' or 2 to '3 stops' 
df_train.Total_Stops.replace(['1 stop', 'non-stop', '2 stops', '3 stops', '4 stops'], [1, 0, 2, 3, 4], inplace=True)
df_to_predict.Total_Stops.replace(['1 stop', 'non-stop', '2 stops', '3 stops', '4 stops'], [1, 0, 2, 3, 4], inplace=True)
df_train["Total_Stops"] = df_train["Total_Stops"].astype(int)
df_to_predict["Total_Stops"] = df_to_predict["Total_Stops"].astype(int)

In [19]:
df_train["Total_Stops"].dtype

dtype('int32')

In [20]:
# Extract day and month of journey from the above column in both train and test dataset

df_train["isWeekend"] = ((pd.to_datetime(df_train["Date_of_Journey"], format = '%d/%m/%Y').dt.dayofweek) // 5 == 1).astype(int)
df_train["Day_of_Week"] = pd.to_datetime(df_train["Date_of_Journey"], format = '%d/%m/%Y').dt.day_name()
df_train["Day_Of_Journey"] = pd.to_datetime(df_train["Date_of_Journey"], format = '%d/%m/%Y').dt.day
df_train["Month_of_Journey"] = pd.to_datetime(df_train["Date_of_Journey"], format = '%d/%m/%Y').dt.month

df_to_predict["isWeekend"] = ((pd.to_datetime(df_to_predict["Date_of_Journey"], format = '%d/%m/%Y').dt.dayofweek) // 5 == 1).astype(int)
df_to_predict["Day_Of_Journey"] = pd.to_datetime(df_to_predict["Date_of_Journey"], format = '%d/%m/%Y').dt.day
df_to_predict["Day_of_Week"] = pd.to_datetime(df_to_predict["Date_of_Journey"], format = '%d/%m/%Y').dt.day_name()
df_to_predict["Month_of_Journey"] = pd.to_datetime(df_to_predict["Date_of_Journey"], format = '%d/%m/%Y').dt.month


In [21]:
# Drop original feature from train and test dataset 
df_train.drop(labels = 'Date_of_Journey', axis = 1, inplace = True)
df_to_predict.drop(labels = 'Date_of_Journey', axis = 1, inplace = True)

In [22]:
# cleaning duration column in train dataset
duration = list(df_train["Duration"])

for i in range(len(duration)) :
    if len(duration[i].split()) != 2:
        if 'h' in duration[i] :
            duration[i] = duration[i].strip() + ' 0m'
        elif 'm' in duration[i] :
            duration[i] = '0h {}'.format(duration[i].strip())

dur_hours = []
dur_minutes = []  
dur_seconds = []
for i in range(len(duration)) :
    dur_minutes.append(int(duration[i].split()[0][:-1])*60 + int(duration[i].split()[1][:-1]))
    dur_seconds.append(int(duration[i].split()[0][:-1])*60*60 + int(duration[i].split()[1][:-1])*60)
df_train["Duration_minutes"] = dur_minutes
df_train.drop(["Duration"], axis=1, inplace = True)

In [23]:
# cleaning duration column in dataset to predict
duration = list(df_to_predict["Duration"])

for i in range(len(duration)) :
    if len(duration[i].split()) != 2:
        if 'h' in duration[i] :
            duration[i] = duration[i].strip() + ' 0m'
        elif 'm' in duration[i] :
            duration[i] = '0h {}'.format(duration[i].strip())

dur_hours = []
dur_minutes = []  
dur_seconds = []
for i in range(len(duration)) :
    dur_minutes.append(int(duration[i].split()[0][:-1])*60 + int(duration[i].split()[1][:-1]))
    dur_seconds.append(int(duration[i].split()[0][:-1])*60*60 + int(duration[i].split()[1][:-1])*60)
df_to_predict["Duration_minutes"] = dur_minutes
df_to_predict.drop(labels = 'Duration', axis = 1, inplace = True)

In [24]:
#Cleaning Departure and Arrival Times
# Training Set
df_train['Depart_Time_Hour'] = pd.to_datetime(df_train.Dep_Time).dt.hour
df_train['Depart_Time_Minutes'] = pd.to_datetime(df_train.Dep_Time).dt.minute
df_train.drop(labels = 'Dep_Time', axis = 1, inplace = True)
df_train['Arr_Time_Hour'] = pd.to_datetime(df_train.Arrival_Time).dt.hour
df_train['Arr_Time_Minutes'] = pd.to_datetime(df_train.Arrival_Time).dt.minute
df_train.drop(labels = 'Arrival_Time', axis = 1, inplace = True)

# Prediction Set
df_to_predict['Depart_Time_Hour'] = pd.to_datetime(df_to_predict.Dep_Time).dt.hour
df_to_predict['Depart_Time_Minutes'] = pd.to_datetime(df_to_predict.Dep_Time).dt.minute
df_to_predict.drop(labels = 'Dep_Time', axis = 1, inplace = True)
df_to_predict['Arr_Time_Hour'] = pd.to_datetime(df_to_predict.Arrival_Time).dt.hour
df_to_predict['Arr_Time_Minutes'] = pd.to_datetime(df_to_predict.Arrival_Time).dt.minute
df_to_predict.drop(labels = 'Arrival_Time', axis = 1, inplace = True)

In [25]:
bins = [-1, 3, 6, 9, 12, 15, 18, 21, 24]
labels = ['late evening','magical hours','early morning','morning','early afternoon','afternoon','early evening','evening']
df_train['part_of_day'] = pd.cut(df_train['Depart_Time_Hour'], bins=bins, labels=labels)
df_to_predict['part_of_day'] = pd.cut(df_to_predict['Depart_Time_Hour'], bins=bins, labels=labels)

df_train['part_of_day'] = df_train['part_of_day'].astype('category')
df_to_predict['part_of_day'] = df_to_predict['part_of_day'].astype('category')

In [26]:
df_train['part_of_day'].value_counts()

early morning      2443
early evening      1682
afternoon          1591
magical hours      1426
morning            1284
early afternoon    1235
evening             506
late evening        295
Name: part_of_day, dtype: int64

In [38]:
# We created X and y columns for training dataset while y column for dataset to predict will
# be generated by our model in final step
X = df_train.drop(["Price"], axis=1)
y = np.log1p(df_train["Price"])
#y = df_train["Price"]
X_to_predict = df_to_predict

In [39]:
X.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,isWeekend,Day_of_Week,Day_Of_Journey,Month_of_Journey,Duration_minutes,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes,part_of_day
0,IndiGo,Banglore,Delhi,BLR → DEL,0,No info,1,Sunday,24,3,170,22,20,1,10,evening
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2,No info,0,Wednesday,1,5,445,5,50,13,15,magical hours
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2,No info,1,Sunday,9,6,1140,9,25,4,25,early morning
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1,No info,1,Sunday,12,5,325,18,5,23,30,afternoon
4,IndiGo,Banglore,Delhi,BLR → NAG → DEL,1,No info,0,Friday,1,3,285,16,50,21,35,afternoon


In [40]:
#Separate categorical and numerical columns in dataframe
X_categorical = X.select_dtypes(exclude=['int', 'float'])
X_numerical = X.select_dtypes(include=['int', 'float'])

X_to_predict_categorical = X_to_predict.select_dtypes(exclude=['int', 'float'])
X_to_predict_numerical = X_to_predict.select_dtypes(include=['int', 'float'])

In [41]:
# Check if categorical columns are assigned properly
X_categorical.head()

#Create a back up copy specifically for plotting graphs
X_numerical_graph = X_numerical.copy()
X_categorical_graph = X_categorical.copy()

In [42]:
#Check if numerical columns are assigned properly
X_categorical.shape

(10462, 14)

In [43]:
X_numerical.head()

Unnamed: 0,Total_Stops,isWeekend
0,0,1
1,2,0
2,2,1
3,1,1
4,1,0


In [44]:
#We replace values based on insights gathered from graphs generated in further steps
X_categorical["Airline"].replace(to_replace={'Multiple carriers Premium economy':'Other', 
                                                        'Jet Airways Business':'Other',
                                                        'Vistara Premium economy':'Other',
                                                        'Trujet':'Other'
                                                   },    
                                        inplace=True)

X_to_predict_categorical["Airline"].replace(to_replace={'Multiple carriers Premium economy':'Other', 
                                                        'Jet Airways Business':'Other',
                                                        'Vistara Premium economy':'Other',
                                                        'Trujet':'Other'
                                                   },    
                                        inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [45]:
#We replace values based on insights gathered from graphs generated in further steps
X_categorical["Additional_Info"].replace(to_replace={'Change airports':'Other', 
                                                        'Business class':'Other',
                                                        '1 Short layover':'Other',
                                                        'Red-eye flight':'Other',
                                                        '2 Long layover':'Other',   
                                                   },    
                                        inplace=True)
X_to_predict_categorical["Additional_Info"].replace(to_replace={'Change airports':'Other', 
                                                        'Business class':'Other',
                                                        '1 Short layover':'Other',
                                                        'Red-eye flight':'Other',
                                                        '2 Long layover':'Other',   
                                                   },    
                                        inplace=True)

In [46]:
X_categorical.head()

Unnamed: 0,Airline,Source,Destination,Route,Additional_Info,Day_of_Week,Day_Of_Journey,Month_of_Journey,Duration_minutes,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes,part_of_day
0,IndiGo,Banglore,Delhi,BLR → DEL,No info,Sunday,24,3,170,22,20,1,10,evening
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,No info,Wednesday,1,5,445,5,50,13,15,magical hours
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,No info,Sunday,9,6,1140,9,25,4,25,early morning
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,No info,Sunday,12,5,325,18,5,23,30,afternoon
4,IndiGo,Banglore,Delhi,BLR → NAG → DEL,No info,Friday,1,3,285,16,50,21,35,afternoon


In [47]:
X_categorical["Additional_Info"].value_counts()

No info                         8185
In-flight meal not included     1926
No check-in baggage included     318
1 Long layover                    19
Other                             14
Name: Additional_Info, dtype: int64

In [48]:
X_categorical.head()

Unnamed: 0,Airline,Source,Destination,Route,Additional_Info,Day_of_Week,Day_Of_Journey,Month_of_Journey,Duration_minutes,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes,part_of_day
0,IndiGo,Banglore,Delhi,BLR → DEL,No info,Sunday,24,3,170,22,20,1,10,evening
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,No info,Wednesday,1,5,445,5,50,13,15,magical hours
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,No info,Sunday,9,6,1140,9,25,4,25,early morning
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,No info,Sunday,12,5,325,18,5,23,30,afternoon
4,IndiGo,Banglore,Delhi,BLR → NAG → DEL,No info,Friday,1,3,285,16,50,21,35,afternoon


In [49]:
#Label encode and hot encode categorical columns
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X_categorical = X_categorical.apply(LabelEncoder().fit_transform)
X_to_predict_categorical = X_to_predict_categorical.apply(LabelEncoder().fit_transform)

In [50]:
#Check values after label encoding
X_categorical.head()

Unnamed: 0,Airline,Source,Destination,Route,Additional_Info,Day_of_Week,Day_Of_Journey,Month_of_Journey,Duration_minutes,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes,part_of_day
0,3,0,2,18,3,3,8,0,13,22,4,1,2,4
1,1,3,0,84,3,6,0,2,65,5,10,13,3,6
2,4,2,1,118,3,3,3,3,202,9,5,4,5,3
3,3,3,0,91,3,3,4,2,41,18,1,23,6,0
4,3,0,2,29,3,0,0,0,33,16,10,21,7,0


In [51]:
#Check values in numerical columns
X_numerical.head()

Unnamed: 0,Total_Stops,isWeekend
0,0,1
1,2,0
2,2,1
3,1,1
4,1,0


In [52]:
X_numerical.head()

Unnamed: 0,Total_Stops,isWeekend
0,0,1
1,2,0
2,2,1
3,1,1
4,1,0


In [53]:
# Check the skew of all numerical features
from scipy.stats import skew
skewed_feats = X_numerical.apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))


Skew in numerical features: 

There are 2 skewed numerical features to Box Cox transform


In [54]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
cols_train = list(X_numerical.columns)
index_train = X_numerical.index.tolist()
cols_test = list(X_to_predict_numerical.columns)
index_test = X_to_predict_numerical.index.tolist()

# We can also use standard scaler if required...The code is commented for now
# Get column names first
X_numerical_names = X_numerical.columns
X_numerical_index = X_numerical.index
X_to_predict_numerical_names = X_to_predict_numerical.columns
# Create the Scaler object
scaler = StandardScaler()
# Fit your data on the scaler object
scaled_df = scaler.fit_transform(X_numerical)
X_numerical = pd.DataFrame(scaled_df, columns=X_numerical_names, index=index_train)
scaled_df_to_predict = scaler.fit_transform(X_to_predict_numerical)
X_to_predict_numerical = pd.DataFrame(scaled_df_to_predict, columns=X_to_predict_numerical_names, index = index_test)

mmx = MinMaxScaler()
df_minmax = mmx.fit_transform(X_numerical)
df_minmax_test = mmx.fit_transform(X_to_predict_numerical)
X_numerical = pd.DataFrame(data=df_minmax,columns=cols_train,index=index_train)
X_to_predict_numerical = pd.DataFrame(data=df_minmax_test,columns=cols_test,index=index_test)

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.5
for feat in skewed_features:
    #all_data[feat] += 1
    X_numerical[feat] = boxcox1p(X_numerical[feat], lam)
    X_to_predict_numerical[feat] = boxcox1p(X_to_predict_numerical[feat], lam)


In [55]:
#Check values in numerical columns after minmax scaling
X_numerical.head(5)

Unnamed: 0,Total_Stops,isWeekend
0,0.0,0.828427
1,0.44949,0.0
2,0.44949,0.828427
3,0.236068,0.828427
4,0.236068,0.0


In [56]:
#Merge categorical and numerical columns back into respective X and X_to_predict
X = pd.concat([X_categorical, X_numerical], axis=1)
X_to_predict = pd.concat([X_to_predict_categorical, X_to_predict_numerical], axis=1)

In [57]:
#Check shape of all three to verify that merge was done properly in above step
print(X.shape, X_numerical.shape, X_categorical.shape)
print(X_to_predict.shape, X_to_predict_numerical.shape, X_to_predict_categorical.shape)
print(y.shape)

(10462, 16) (10462, 2) (10462, 14)
(2671, 16) (2671, 2) (2671, 14)
(10462,)


In [219]:
X_categorical_graph["Additional_Info"].value_counts()

No info                         8185
In-flight meal not included     1926
No check-in baggage included     318
1 Long layover                    19
Change airports                    7
Business class                     4
2 Long layover                     1
Red-eye flight                     1
1 Short layover                    1
Name: Additional_Info, dtype: int64

In [58]:
X_categorical_graph["Airline"].replace(to_replace={'Multiple carriers Premium economy':'Other', 
                                                        'Jet Airways Business':'Other',
                                                        'Vistara Premium economy':'Other',
                                                        'Trujet':'Other'
                                                   },    
                                        inplace=True)
#We have done same step back in the notebook above before label encoding to improve accuracy

In [60]:
X_categorical_graph["Additional_Info"].replace(to_replace={'Change airports':'Other', 
                                                        'Business class':'Other',
                                                        '1 Short layover':'Other',
                                                        'Red-eye flight':'Other',
                                                        '2 Long layover':'Other',   
                                                   },    
                                        inplace=True)
#We have done same step back in the notebook above before label encoding to improve accuracy

In [62]:
df_train.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,isWeekend,Day_of_Week,Day_Of_Journey,Month_of_Journey,Duration_minutes,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes,part_of_day
0,IndiGo,Banglore,Delhi,BLR → DEL,0,No info,3897,1,Sunday,24,3,170,22,20,1,10,evening
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2,No info,7662,0,Wednesday,1,5,445,5,50,13,15,magical hours
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2,No info,13882,1,Sunday,9,6,1140,9,25,4,25,early morning
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1,No info,6218,1,Sunday,12,5,325,18,5,23,30,afternoon
4,IndiGo,Banglore,Delhi,BLR → NAG → DEL,1,No info,13302,0,Friday,1,3,285,16,50,21,35,afternoon


In [63]:
df_to_predict.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,isWeekend,Day_Of_Journey,Day_of_Week,Month_of_Journey,Duration_minutes,Depart_Time_Hour,Depart_Time_Minutes,Arr_Time_Hour,Arr_Time_Minutes,part_of_day
0,Jet Airways,Delhi,Cochin,DEL → BOM → COK,1,No info,0,6,Thursday,6,655,17,30,4,25,afternoon
1,IndiGo,Kolkata,Banglore,CCU → MAA → BLR,1,No info,1,12,Sunday,5,240,6,20,10,20,magical hours
2,Jet Airways,Delhi,Cochin,DEL → BOM → COK,1,In-flight meal not included,0,21,Tuesday,5,1425,19,15,19,0,early evening
3,Multiple carriers,Delhi,Cochin,DEL → BOM → COK,1,No info,0,21,Tuesday,5,780,8,0,21,0,early morning
4,Air Asia,Banglore,Delhi,BLR → DEL,0,No info,0,24,Monday,6,170,23,55,2,45,evening


In [64]:
df_train.isnull().sum()

Airline                0
Source                 0
Destination            0
Route                  0
Total_Stops            0
Additional_Info        0
Price                  0
isWeekend              0
Day_of_Week            0
Day_Of_Journey         0
Month_of_Journey       0
Duration_minutes       0
Depart_Time_Hour       0
Depart_Time_Minutes    0
Arr_Time_Hour          0
Arr_Time_Minutes       0
part_of_day            0
dtype: int64

In [65]:
df_to_predict.isnull().sum()

Airline                0
Source                 0
Destination            0
Route                  0
Total_Stops            0
Additional_Info        0
isWeekend              0
Day_Of_Journey         0
Day_of_Week            0
Month_of_Journey       0
Duration_minutes       0
Depart_Time_Hour       0
Depart_Time_Minutes    0
Arr_Time_Hour          0
Arr_Time_Minutes       0
part_of_day            0
dtype: int64