In [1]:
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
import joblib

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import missingno
import pandas_profiling
from sklearn import metrics
from scipy.stats import zscore
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("Flight_Prediction.csv")

In [3]:
df # checking the first 5 and last 5 rows

Unnamed: 0,Airline_Names,Departure_Time,Arrival_Time,Flight_Duration,Source_Place,Destination_Place,Meal_Availability,Number_Of_Stops,Flight_Prices
0,Air Asia,12:40,20:15,7h 35m,New Delhi,Mumbai,No Meal Fare,1 Stop,5953
1,Air Asia,11:55,20:15,8h 20m,New Delhi,Mumbai,No Meal Fare,1 Stop,5953
2,Air Asia,16:15,06:20,14h 05m,New Delhi,Mumbai,No Meal Fare,1 Stop,5953
3,Go First,18:50,20:45,1h 55m,New Delhi,Mumbai,No Meal Fare,Non Stop,5954
4,Go First,09:05,11:05,2h 00m,New Delhi,Mumbai,No Meal Fare,Non Stop,5954
...,...,...,...,...,...,...,...,...,...
5800,Air India,08:55,08:20,23h 25m,Lucknow,Jaipur,No Meal Fare,1 Stop,9302
5801,Air India,08:55,09:20,24h 25m,Lucknow,Jaipur,No Meal Fare,2 Stop(s),16287
5802,Air India,14:45,09:20,18h 35m,Lucknow,Jaipur,No Meal Fare,2 Stop(s),16865
5803,Air India,08:55,09:20,24h 25m,Lucknow,Jaipur,No Meal Fare,2 Stop(s),16865


### Exploratory Data Analysis (EDA)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5805 entries, 0 to 5804
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Airline_Names      5805 non-null   object
 1   Departure_Time     5805 non-null   object
 2   Arrival_Time       5805 non-null   object
 3   Flight_Duration    5805 non-null   object
 4   Source_Place       5805 non-null   object
 5   Destination_Place  5805 non-null   object
 6   Meal_Availability  5805 non-null   object
 7   Number_Of_Stops    5805 non-null   object
 8   Flight_Prices      5805 non-null   object
dtypes: object(9)
memory usage: 408.3+ KB


In [5]:
print("We have {} Rows and {} Columns in our dataframe".format(df.shape[0], df.shape[1]))
df.head(10)

We have 5805 Rows and 9 Columns in our dataframe


Unnamed: 0,Airline_Names,Departure_Time,Arrival_Time,Flight_Duration,Source_Place,Destination_Place,Meal_Availability,Number_Of_Stops,Flight_Prices
0,Air Asia,12:40,20:15,7h 35m,New Delhi,Mumbai,No Meal Fare,1 Stop,5953
1,Air Asia,11:55,20:15,8h 20m,New Delhi,Mumbai,No Meal Fare,1 Stop,5953
2,Air Asia,16:15,06:20,14h 05m,New Delhi,Mumbai,No Meal Fare,1 Stop,5953
3,Go First,18:50,20:45,1h 55m,New Delhi,Mumbai,No Meal Fare,Non Stop,5954
4,Go First,09:05,11:05,2h 00m,New Delhi,Mumbai,No Meal Fare,Non Stop,5954
5,Go First,06:15,08:20,2h 05m,New Delhi,Mumbai,eCash 250,Non Stop,5954
6,Go First,14:20,16:25,2h 05m,New Delhi,Mumbai,eCash 250,Non Stop,5954
7,Go First,20:30,22:35,2h 05m,New Delhi,Mumbai,eCash 250,Non Stop,5954
8,Go First,22:45,01:00,2h 15m,New Delhi,Mumbai,eCash 250,Non Stop,5954
9,Go First,17:45,22:25,4h 40m,New Delhi,Mumbai,No Meal Fare,1 Stop,5954


In [6]:
df.isna().sum() # checking for missing values

Airline_Names        0
Departure_Time       0
Arrival_Time         0
Flight_Duration      0
Source_Place         0
Destination_Place    0
Meal_Availability    0
Number_Of_Stops      0
Flight_Prices        0
dtype: int64

In [7]:
missingno.bar(df, figsize = (25,5), color="tab:green")

<AxesSubplot:>

In [8]:
print("Missing column values in percentage:")
for col in df:
    percentage = np.round((df[col].isnull().sum()/df.shape[0])*100, 6)
    print(col, ":".format(), percentage, '%')

Missing column values in percentage:
Airline_Names : 0.0 %
Departure_Time : 0.0 %
Arrival_Time : 0.0 %
Flight_Duration : 0.0 %
Source_Place : 0.0 %
Destination_Place : 0.0 %
Meal_Availability : 0.0 %
Number_Of_Stops : 0.0 %
Flight_Prices : 0.0 %


In [9]:
df.dtypes

Airline_Names        object
Departure_Time       object
Arrival_Time         object
Flight_Duration      object
Source_Place         object
Destination_Place    object
Meal_Availability    object
Number_Of_Stops      object
Flight_Prices        object
dtype: object

In [10]:
df.nunique().sort_values().to_frame("Unique Values")

Unnamed: 0,Unique Values
Meal_Availability,3
Number_Of_Stops,5
Airline_Names,6
Source_Place,9
Destination_Place,9
Departure_Time,223
Arrival_Time,232
Flight_Duration,409
Flight_Prices,1571


In [11]:
df.describe(include="all").T

Unnamed: 0,count,unique,top,freq
Airline_Names,5805,6,IndiGo,1782
Departure_Time,5805,223,14:00,162
Arrival_Time,5805,232,19:40,147
Flight_Duration,5805,409,2h 05m,61
Source_Place,5805,9,Mumbai,806
Destination_Place,5805,9,Mumbai,805
Meal_Availability,5805,3,No Meal Fare,4252
Number_Of_Stops,5805,5,1 Stop,3696
Flight_Prices,5805,1571,7425,111


In [12]:
value = ["Meal_Availability", "Number_Of_Stops", "Airline_Names", "Source_Place", "Destination_Place"]
for col in value:
    print(col)
    print(df[col].value_counts())
    print("="*120)


Meal_Availability
No Meal Fare    4252
Free Meal       1014
eCash 250        539
Name: Meal_Availability, dtype: int64
Number_Of_Stops
1 Stop       3696
2 Stop(s)    1154
Non Stop      693
3 Stop(s)     244
4 Stop(s)      18
Name: Number_Of_Stops, dtype: int64
Airline_Names
IndiGo       1782
Air India    1638
Vistara      1514
Go First      492
Air Asia      213
SpiceJet      166
Name: Airline_Names, dtype: int64
Source_Place
Mumbai       806
Bangalore    755
New Delhi    748
Kolkata      744
Hyderabad    614
Goa          612
Chennai      580
Lucknow      519
Jaipur       427
Name: Source_Place, dtype: int64
Destination_Place
Mumbai       805
Bangalore    786
Hyderabad    759
New Delhi    727
Kolkata      710
Chennai      648
Goa          545
Lucknow      521
Jaipur       304
Name: Destination_Place, dtype: int64


### Data Preprocessing

In [13]:
# Meal_Availability 

df.Meal_Availability.replace({"No Meal Fare": "No Meals", "Free Meal": "Free Meals", "eCash 250": "eCash Meals"}, 
                             inplace = True)
df["Meal_Availability"].value_counts()

No Meals       4252
Free Meals     1014
eCash Meals     539
Name: Meal_Availability, dtype: int64

In [14]:
# Number_Of_Stops

df.Number_Of_Stops.replace({"Non Stop": 0, "1 Stop": 1, "2 Stop(s)": 2, "3 Stop(s)": 3, "4 Stop(s)": 4}, 
                           inplace = True)
df["Number_Of_Stops"].value_counts()

1    3696
2    1154
0     693
3     244
4      18
Name: Number_Of_Stops, dtype: int64


Now we are changing the object datatype "Number_Of_Stops" column to integer values by giving them the numeric stops values instead of a descriptive one.

In [15]:
df

Unnamed: 0,Airline_Names,Departure_Time,Arrival_Time,Flight_Duration,Source_Place,Destination_Place,Meal_Availability,Number_Of_Stops,Flight_Prices
0,Air Asia,12:40,20:15,7h 35m,New Delhi,Mumbai,No Meals,1,5953
1,Air Asia,11:55,20:15,8h 20m,New Delhi,Mumbai,No Meals,1,5953
2,Air Asia,16:15,06:20,14h 05m,New Delhi,Mumbai,No Meals,1,5953
3,Go First,18:50,20:45,1h 55m,New Delhi,Mumbai,No Meals,0,5954
4,Go First,09:05,11:05,2h 00m,New Delhi,Mumbai,No Meals,0,5954
...,...,...,...,...,...,...,...,...,...
5800,Air India,08:55,08:20,23h 25m,Lucknow,Jaipur,No Meals,1,9302
5801,Air India,08:55,09:20,24h 25m,Lucknow,Jaipur,No Meals,2,16287
5802,Air India,14:45,09:20,18h 35m,Lucknow,Jaipur,No Meals,2,16865
5803,Air India,08:55,09:20,24h 25m,Lucknow,Jaipur,No Meals,2,16865


In [16]:
# Departure_Time

df["Dep_Hour"] = pd.to_datetime(df.Departure_Time, format="%H:%M").dt.hour
df["Dep_Min"] = pd.to_datetime(df.Departure_Time, format="%H:%M").dt.minute
df["Departure_Time"] = df['Dep_Hour'] + df['Dep_Min'] / 60
#df.drop(columns = ['Dep_Hour','Dep_Min'], inplace=True) 
df.head()

Unnamed: 0,Airline_Names,Departure_Time,Arrival_Time,Flight_Duration,Source_Place,Destination_Place,Meal_Availability,Number_Of_Stops,Flight_Prices,Dep_Hour,Dep_Min
0,Air Asia,12.666667,20:15,7h 35m,New Delhi,Mumbai,No Meals,1,5953,12,40
1,Air Asia,11.916667,20:15,8h 20m,New Delhi,Mumbai,No Meals,1,5953,11,55
2,Air Asia,16.25,06:20,14h 05m,New Delhi,Mumbai,No Meals,1,5953,16,15
3,Go First,18.833333,20:45,1h 55m,New Delhi,Mumbai,No Meals,0,5954,18,50
4,Go First,9.083333,11:05,2h 00m,New Delhi,Mumbai,No Meals,0,5954,9,5


In [17]:
# Arrival_Time

df["Arr_Hour"] = pd.to_datetime(df.Arrival_Time, format="%H:%M").dt.hour
df["Arr_Min"] = pd.to_datetime(df.Arrival_Time, format="%H:%M").dt.minute
df["Arrival_Time"] = df['Arr_Hour'] + df['Arr_Min'] / 60
#df.drop(columns = ['Arr_Hour','Arr_Min'], inplace=True)  
df.head()

Unnamed: 0,Airline_Names,Departure_Time,Arrival_Time,Flight_Duration,Source_Place,Destination_Place,Meal_Availability,Number_Of_Stops,Flight_Prices,Dep_Hour,Dep_Min,Arr_Hour,Arr_Min
0,Air Asia,12.666667,20.25,7h 35m,New Delhi,Mumbai,No Meals,1,5953,12,40,20,15
1,Air Asia,11.916667,20.25,8h 20m,New Delhi,Mumbai,No Meals,1,5953,11,55,20,15
2,Air Asia,16.25,6.333333,14h 05m,New Delhi,Mumbai,No Meals,1,5953,16,15,6,20
3,Go First,18.833333,20.75,1h 55m,New Delhi,Mumbai,No Meals,0,5954,18,50,20,45
4,Go First,9.083333,11.083333,2h 00m,New Delhi,Mumbai,No Meals,0,5954,9,5,11,5


In [18]:
# Flight_Duration

df["FD_Hour"] = df.Flight_Duration.str.split('h').str.get(0)
df["FD_Min"] = df.Flight_Duration.str.split('h').str.get(1)
df["FD_Min"] = df["FD_Min"].str.split('m').str.get(0)
df["FD_Hour"] = df['FD_Hour'].astype('float')
df["FD_Min"] = df['FD_Min'].astype('float')
df["Flight_Duration"] = df["FD_Hour"] + df["FD_Min"] / 60
#df.drop(columns = ["FD_Hour","FD_Min"], inplace=True)
df.head()

Unnamed: 0,Airline_Names,Departure_Time,Arrival_Time,Flight_Duration,Source_Place,Destination_Place,Meal_Availability,Number_Of_Stops,Flight_Prices,Dep_Hour,Dep_Min,Arr_Hour,Arr_Min,FD_Hour,FD_Min
0,Air Asia,12.666667,20.25,7.583333,New Delhi,Mumbai,No Meals,1,5953,12,40,20,15,7.0,35.0
1,Air Asia,11.916667,20.25,8.333333,New Delhi,Mumbai,No Meals,1,5953,11,55,20,15,8.0,20.0
2,Air Asia,16.25,6.333333,14.083333,New Delhi,Mumbai,No Meals,1,5953,16,15,6,20,14.0,5.0
3,Go First,18.833333,20.75,1.916667,New Delhi,Mumbai,No Meals,0,5954,18,50,20,45,1.0,55.0
4,Go First,9.083333,11.083333,2.0,New Delhi,Mumbai,No Meals,0,5954,9,5,11,5,2.0,0.0


In [19]:
# Flight_Prices

df['Flight_Prices'] = df['Flight_Prices'].str.replace(',','')
df['Flight_Prices'] = df['Flight_Prices'].astype('float')
df

Unnamed: 0,Airline_Names,Departure_Time,Arrival_Time,Flight_Duration,Source_Place,Destination_Place,Meal_Availability,Number_Of_Stops,Flight_Prices,Dep_Hour,Dep_Min,Arr_Hour,Arr_Min,FD_Hour,FD_Min
0,Air Asia,12.666667,20.250000,7.583333,New Delhi,Mumbai,No Meals,1,5953.0,12,40,20,15,7.0,35.0
1,Air Asia,11.916667,20.250000,8.333333,New Delhi,Mumbai,No Meals,1,5953.0,11,55,20,15,8.0,20.0
2,Air Asia,16.250000,6.333333,14.083333,New Delhi,Mumbai,No Meals,1,5953.0,16,15,6,20,14.0,5.0
3,Go First,18.833333,20.750000,1.916667,New Delhi,Mumbai,No Meals,0,5954.0,18,50,20,45,1.0,55.0
4,Go First,9.083333,11.083333,2.000000,New Delhi,Mumbai,No Meals,0,5954.0,9,5,11,5,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5800,Air India,8.916667,8.333333,23.416667,Lucknow,Jaipur,No Meals,1,9302.0,8,55,8,20,23.0,25.0
5801,Air India,8.916667,9.333333,24.416667,Lucknow,Jaipur,No Meals,2,16287.0,8,55,9,20,24.0,25.0
5802,Air India,14.750000,9.333333,18.583333,Lucknow,Jaipur,No Meals,2,16865.0,14,45,9,20,18.0,35.0
5803,Air India,8.916667,9.333333,24.416667,Lucknow,Jaipur,No Meals,2,16865.0,8,55,9,20,24.0,25.0


In [20]:
df.dtypes

Airline_Names         object
Departure_Time       float64
Arrival_Time         float64
Flight_Duration      float64
Source_Place          object
Destination_Place     object
Meal_Availability     object
Number_Of_Stops        int64
Flight_Prices        float64
Dep_Hour               int64
Dep_Min                int64
Arr_Hour               int64
Arr_Min                int64
FD_Hour              float64
FD_Min               float64
dtype: object

In [21]:
df.describe(include="all").T


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Airline_Names,5805.0,6.0,IndiGo,1782.0,,,,,,,
Departure_Time,5805.0,,,,13.024591,4.887243,0.583333,8.916667,12.916667,16.833333,23.416667
Arrival_Time,5805.0,,,,15.823572,5.543619,0.083333,11.5,16.666667,20.416667,23.916667
Flight_Duration,5805.0,,,,12.452728,8.137841,0.833333,6.333333,10.25,17.833333,41.083333
Source_Place,5805.0,9.0,Mumbai,806.0,,,,,,,
Destination_Place,5805.0,9.0,Mumbai,805.0,,,,,,,
Meal_Availability,5805.0,3.0,No Meals,4252.0,,,,,,,
Number_Of_Stops,5805.0,,,,1.172782,0.696018,0.0,1.0,1.0,1.0,4.0
Flight_Prices,5805.0,,,,10046.68062,3667.752804,3361.0,7425.0,9747.0,12249.0,38348.0
Dep_Hour,5805.0,,,,12.585185,4.881136,0.0,8.0,12.0,16.0,23.0


In [22]:
# Visualizing the statistical description of numeric datatype columns

plt.figure(figsize = (10,7))
sns.heatmap(round(df.describe()[1:].transpose(),2), linewidth = 2, annot= True, fmt = ".2f", cmap="hot")
plt.title("Satistical Report of Numerical Columns\n", fontsize = 20)
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.show()

## Visualization

In [23]:
try:
    x = 'Meal_Availability'
    k=0
    plt.figure(figsize=[8,8])
    axes = sns.countplot(df[x])
    for i in axes.patches:
        ht = i.get_height()
        mr = len(df[x])
        st = f"{ht} ({round(ht*100/mr,2)}%)"
        plt.text(k, ht/2, st, ha='center', fontweight='bold')
        k += 1
    plt.ylim(0,4500)
    plt.title(f'Count Plot for {x} column\n', fontsize = 20)
    plt.ylabel(f'Total number of rows covered\n')
    plt.show()
    
except Exception as e:
    print("Error:", e)
    pass

In [24]:
try:
    x = 'Airline_Names'
    k=0
    plt.figure(figsize=[15,8])
    axes = sns.countplot(df[x])
    for i in axes.patches:
        ht = i.get_height()
        mr = len(df[x])
        st = f"{ht} ({round(ht*100/mr,2)}%)"
        plt.text(k, ht/2, st, ha='center', fontweight='bold')
        k += 1
    plt.ylim(0,2000)
    plt.title(f'Count Plot for {x} column\n', fontsize = 20)
    plt.ylabel(f'Total number of rows covered\n')
    plt.show()
    
except Exception as e:
    print("Error:", e)
    pass


In [25]:
try:
    x = 'Source_Place'
    k=0
    plt.figure(figsize=[15,8])
    axes = sns.countplot(df[x])
    for i in axes.patches:
        ht = i.get_height()
        mr = len(df[x])
        st = f"{ht} ({round(ht*100/mr,2)}%)"
        plt.text(k, ht/2, st, ha='center', fontweight='bold')
        k += 1
    plt.ylim(0,900)
    plt.title(f'Count Plot for {x} column\n', fontsize = 20)
    plt.ylabel(f'Total number of rows covered\n')
    plt.show()
    
except Exception as e:
    print("Error:", e)
    pass


In [26]:
try:
    x = 'Destination_Place'
    k=0
    plt.figure(figsize=[15,8])
    axes = sns.countplot(df[x])
    for i in axes.patches:
        ht = i.get_height()
        mr = len(df[x])
        st = f"{ht} ({round(ht*100/mr,2)}%)"
        plt.text(k, ht/2, st, ha='center', fontweight='bold')
        k += 1
    plt.ylim(0,900)
    plt.title(f'Count Plot for {x} column\n', fontsize = 20)
    plt.ylabel(f'Total number of rows covered\n')
    plt.show()
    
except Exception as e:
    print("Error:", e)
    pass


In [27]:
try:
    x = 'Number_Of_Stops'
    k=0
    plt.figure(figsize=[15,8])
    axes = sns.countplot(df[x])
    for i in axes.patches:
        ht = i.get_height()
        mr = len(df[x])
        st = f"{ht} ({round(ht*100/mr,2)}%)"
        plt.text(k, ht/2, st, ha='center', fontweight='bold')
        k += 1
    plt.ylim(0,4000)
    plt.title(f'Count Plot for {x} column\n', fontsize = 20)
    plt.ylabel(f'Total number of rows covered\n')
    plt.show()
    
except Exception as e:
    print("Error:", e)
    pass


In [28]:
y = 'Airline_Names'

x = 'Departure_Time'
plt.figure(figsize=[15,7])
sns.barplot(x,y,data=df,orient='h')
plt.title(f"Barplot for {x} column vs {y} column\n", fontsize = 20)
plt.show()

x = 'Arrival_Time'
plt.figure(figsize=[15,7])
sns.barplot(x,y,data=df,orient='h')
plt.title(f"Barplot for {x} column vs {y} column\n", fontsize = 20)
plt.show()

x = 'Flight_Duration'
plt.figure(figsize=[15,7])
sns.barplot(x,y,data=df,orient='h')
plt.title(f"Barplot for {x} column vs {y} column\n", fontsize = 20)
plt.show()

x = 'Flight_Prices'
plt.figure(figsize=[15,7])
sns.barplot(x,y,data=df,orient='h')
plt.title(f"Barplot for {x} column vs {y} column\n", fontsize = 20)
plt.show()

In [29]:
x = "Source_Place"
plt.figure(figsize=(18,10))
sns.countplot(x = x, hue = "Airline_Names", data = df)
plt.title(f"Countplot for {x} column\n", fontsize = 20)
plt.show()

x = "Destination_Place"
plt.figure(figsize=(18,10))
sns.countplot(x = x, hue = "Airline_Names", data = df)
plt.title(f"Countplot for {x} column\n", fontsize = 20)
plt.show()


In [30]:
x = "Flight_Prices"
y = "Airline_Names"

plt.figure(figsize=(18,9))
sns.barplot(x=df[x], y=df[y], hue=df['Meal_Availability'], palette="Set1")
plt.title(f"Barplot for {x} column vs {y} columns with respect to total number of stops\n", fontsize = 20)
plt.show()

plt.figure(figsize=(18,9))
sns.barplot(x=df[x], y=df[y], hue=df['Number_Of_Stops'], palette="Set1")
plt.title(f"Barplot for {x} column vs {y} columns with respect to total number of stops\n", fontsize = 20)
plt.show()


In [31]:
y = "Flight_Prices"

x = "Airline_Names"
plt.figure(figsize = (15,8))
sns.barplot(data = df, y = y, x = x)
plt.title("Prices according to different Airlines\n", fontsize = 20)
plt.show()

x = "Source_Place"
plt.figure(figsize = (15,8))
sns.barplot(data = df, y = y, x = x)
plt.title("Prices according to different Source places\n", fontsize = 20)
plt.show()

x = "Destination_Place"
plt.figure(figsize = (15,8))
sns.barplot(data = df, y = y, x = x)
plt.title("Prices according to different Destination places\n", fontsize = 20)
plt.show()

x = "Number_Of_Stops"
plt.figure(figsize = (8,8))
sns.barplot(data = df, y = y, x = x)
plt.title("Prices according to different Number of layover stops\n", fontsize = 20)
plt.show()

x = "Meal_Availability"
plt.figure(figsize = (8,8))
sns.barplot(data = df, y = y, x = x)
plt.title("Prices according to different Meal options\n", fontsize = 20)
plt.show()

In [32]:
y = "Flight_Prices"

x = "Departure_Time"
plt.figure(figsize = (15,8))
sns.scatterplot(x=x,y=y,data=df)
plt.title(f"Scatterplot for {x} column vs {y} column\n", fontsize = 20)
plt.show()

x = "Arrival_Time"
plt.figure(figsize = (15,8))
sns.scatterplot(x=x,y=y,data=df)
plt.title(f"Scatterplot for {x} column vs {y} column\n", fontsize = 20)
plt.show()

x = "Flight_Duration"
plt.figure(figsize = (15,8))
sns.scatterplot(x=x,y=y,data=df)
plt.title(f"Scatterplot for {x} column vs {y} column\n", fontsize = 20)
plt.show()


In [33]:
print("*************************************** Pair Plot with Meal Type Legend ***************************************")
sns.pairplot(df, hue='Meal_Availability', diag_kind="kde", kind="scatter", palette="Set1", height=3.5)
plt.show()

*************************************** Pair Plot with Meal Type Legend ***************************************


### Encoding the categorical object datatype columns

In [34]:
# Ordinal Encoder

oe = OrdinalEncoder()
def ordinal_encode(df, column):
    df[column] = oe.fit_transform(df[column])
    return df

column=["Meal_Availability", "Airline_Names", "Source_Place", "Destination_Place"]
df=ordinal_encode(df, column)
df

Unnamed: 0,Airline_Names,Departure_Time,Arrival_Time,Flight_Duration,Source_Place,Destination_Place,Meal_Availability,Number_Of_Stops,Flight_Prices,Dep_Hour,Dep_Min,Arr_Hour,Arr_Min,FD_Hour,FD_Min
0,0.0,12.666667,20.250000,7.583333,8.0,7.0,1.0,1,5953.0,12,40,20,15,7.0,35.0
1,0.0,11.916667,20.250000,8.333333,8.0,7.0,1.0,1,5953.0,11,55,20,15,8.0,20.0
2,0.0,16.250000,6.333333,14.083333,8.0,7.0,1.0,1,5953.0,16,15,6,20,14.0,5.0
3,2.0,18.833333,20.750000,1.916667,8.0,7.0,1.0,0,5954.0,18,50,20,45,1.0,55.0
4,2.0,9.083333,11.083333,2.000000,8.0,7.0,1.0,0,5954.0,9,5,11,5,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5800,1.0,8.916667,8.333333,23.416667,6.0,4.0,1.0,1,9302.0,8,55,8,20,23.0,25.0
5801,1.0,8.916667,9.333333,24.416667,6.0,4.0,1.0,2,16287.0,8,55,9,20,24.0,25.0
5802,1.0,14.750000,9.333333,18.583333,6.0,4.0,1.0,2,16865.0,14,45,9,20,18.0,35.0
5803,1.0,8.916667,9.333333,24.416667,6.0,4.0,1.0,2,16865.0,8,55,9,20,24.0,25.0


In [35]:
print("Shape of our data frame post encoding shows {} Rows and {} columns\n".format(df.shape[0], df.shape[1]))
df

Shape of our data frame post encoding shows 5805 Rows and 15 columns



Unnamed: 0,Airline_Names,Departure_Time,Arrival_Time,Flight_Duration,Source_Place,Destination_Place,Meal_Availability,Number_Of_Stops,Flight_Prices,Dep_Hour,Dep_Min,Arr_Hour,Arr_Min,FD_Hour,FD_Min
0,0.0,12.666667,20.250000,7.583333,8.0,7.0,1.0,1,5953.0,12,40,20,15,7.0,35.0
1,0.0,11.916667,20.250000,8.333333,8.0,7.0,1.0,1,5953.0,11,55,20,15,8.0,20.0
2,0.0,16.250000,6.333333,14.083333,8.0,7.0,1.0,1,5953.0,16,15,6,20,14.0,5.0
3,2.0,18.833333,20.750000,1.916667,8.0,7.0,1.0,0,5954.0,18,50,20,45,1.0,55.0
4,2.0,9.083333,11.083333,2.000000,8.0,7.0,1.0,0,5954.0,9,5,11,5,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5800,1.0,8.916667,8.333333,23.416667,6.0,4.0,1.0,1,9302.0,8,55,8,20,23.0,25.0
5801,1.0,8.916667,9.333333,24.416667,6.0,4.0,1.0,2,16287.0,8,55,9,20,24.0,25.0
5802,1.0,14.750000,9.333333,18.583333,6.0,4.0,1.0,2,16865.0,14,45,9,20,18.0,35.0
5803,1.0,8.916667,9.333333,24.416667,6.0,4.0,1.0,2,16865.0,8,55,9,20,24.0,25.0


In [36]:
plt.style.use('seaborn-bright')

df.hist(figsize=(20,20))
plt.show()


### Correlation using a Heatmap

In [37]:
upper_triangle = np.triu(df.corr())
plt.figure(figsize=(15,10))
sns.heatmap(df.corr(), vmin=-1, vmax=1, annot=True, square=True, fmt='0.3f', 
            annot_kws={'size':10}, cmap="magma", mask=upper_triangle)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

### Correlation Bar Plot comparing target column with the feature columns

In [38]:
df_corr = df.corr()
plt.figure(figsize=(14,7))
df_corr['Flight_Prices'].sort_values(ascending=False).drop('Flight_Prices').plot.bar()
plt.title("Correlation of Feature columns vs Label column\n", fontsize=20)
plt.xlabel("\nFeatures List", fontsize=14)
plt.ylabel("Correlation Value", fontsize=14)
plt.show()

### Outliers

In [39]:
plt.figure(figsize=(14,7))
outl_df = df.columns.values
for i in range(0, len(outl_df)):
    plt.subplot(3, 5, i+1)
    ax = sns.boxenplot(df[outl_df[i]], color='purple')
    plt.tight_layout()

### SKEWNESS

In [40]:
fig, ax = plt.subplots(ncols=5, nrows=3, figsize=(15,9))
index = 0
ax = ax.flatten()
for col, value in df.items():
    sns.distplot(value, ax=ax[index], hist=False, color="g", kde_kws={"shade": True})
    index += 1
plt.tight_layout(pad=0.8, w_pad=0.8, h_pad=2.0)
plt.show()

### Splitting the dataset into 2 variables namely 'X' and 'Y' for feature and label

In [41]:
X = df.drop('Flight_Prices', axis=1)
Y = df['Flight_Prices']

### Finding the best random state for building Regression Models

In [42]:
maxAccu=0
maxRS=0

for i in range(1, 1000):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=i)
    lr=LinearRegression()
    lr.fit(X_train, Y_train)
    pred = lr.predict(X_test)
    r2 = r2_score(Y_test, pred)
    
    if r2>maxAccu:
        maxAccu=r2
        maxRS=i

print("Best R2 score is", maxAccu*100,"on Random State", maxRS)


Best R2 score is 41.44799084752707 on Random State 3


### Feature importance bar graph

In [43]:
rf=RandomForestRegressor()
rf.fit(X_train, Y_train)
importances = pd.DataFrame({'Features':X.columns, 'Importance':np.round(rf.feature_importances_,3)})
importances = importances.sort_values('Importance', ascending=False).set_index('Features')
plt.rcParams["figure.figsize"] = (15,8)
importances.plot.bar(color='teal')
importances


Unnamed: 0_level_0,Importance
Features,Unnamed: 1_level_1
Number_Of_Stops,0.229
Flight_Duration,0.165
Airline_Names,0.089
Meal_Availability,0.088
Destination_Place,0.068
Source_Place,0.066
Departure_Time,0.058
Arrival_Time,0.055
Dep_Min,0.046
Arr_Min,0.038


### Machine Learning Model for Regression with Evaluation Metrics

In [44]:
# Regression Model Function

def reg(model, X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=638)
    
    # Training the model
    model.fit(X_train, Y_train)
    
    # Predicting Y_test
    pred = model.predict(X_test)
    
    # RMSE - a lower RMSE score is better than a higher one
    rmse = mean_squared_error(Y_test, pred, squared=False)
    print("RMSE Score is:", rmse)
    
    # R2 score
    r2 = r2_score(Y_test, pred, multioutput='variance_weighted')*100
    print("R2 Score is:", r2)
    
    # Cross Validation Score
    cv_score = (cross_val_score(model, X, Y, cv=5).mean())*100
    print("Cross Validation Score:", cv_score)
    
    # Result of r2 score minus cv score
    result = r2 - cv_score
    print("R2 Score - Cross Validation Score is", result)

In [45]:
# Linear Regression Model

model=LinearRegression()
reg(model, X, Y)

RMSE Score is: 2741.3845061868797
R2 Score is: 41.332305408059746
Cross Validation Score: 33.95569338340015
R2 Score - Cross Validation Score is 7.376612024659593


In [46]:
# Ridge Regularization

model=Ridge(alpha=1e-2, normalize=True)
reg(model, X, Y)

RMSE Score is: 2743.4497644670882
R2 Score is: 41.24387594331852
Cross Validation Score: 33.96445373909664
R2 Score - Cross Validation Score is 7.279422204221881


In [47]:
# Lasso Regularization

model=Lasso(alpha=1e-2, normalize=True, max_iter=1e5)
reg(model, X, Y)


RMSE Score is: 2742.032210252989
R2 Score is: 41.304579422683574
Cross Validation Score: 33.960257819050575
R2 Score - Cross Validation Score is 7.344321603632999


In [48]:
# Decision Tree Regressor

model=DecisionTreeRegressor(criterion="poisson", random_state=111)
reg(model, X, Y)


RMSE Score is: 3411.501062644662
R2 Score is: 9.144727246307482
Cross Validation Score: -34.33257186084151
R2 Score - Cross Validation Score is 43.477299107148994


In [49]:
# Random Forest Regressor

model=RandomForestRegressor(max_depth=2, max_features="sqrt")
reg(model, X, Y)


RMSE Score is: 2883.6621891401373
R2 Score is: 35.084578605849956
Cross Validation Score: 30.07204608209267
R2 Score - Cross Validation Score is 5.012532523757287


In [50]:
# K Neighbors Regressor

KNeighborsRegressor(n_neighbors=2, algorithm='kd_tree')
reg(model, X, Y)

RMSE Score is: 2898.2557275120967
R2 Score is: 34.425872573038305
Cross Validation Score: 30.053755860673654
R2 Score - Cross Validation Score is 4.372116712364651


In [51]:
# Gradient Boosting Regressor

model=GradientBoostingRegressor(loss='quantile', n_estimators=200, max_depth=5)
reg(model, X, Y)


RMSE Score is: 3396.362678311959
R2 Score is: 9.949270683166189
Cross Validation Score: -8.689931605785498
R2 Score - Cross Validation Score is 18.639202288951687


In [52]:
# Ada Boost Regressor

model=AdaBoostRegressor(n_estimators=300, learning_rate=1.05, random_state=42)
reg(model, X, Y)


RMSE Score is: 4227.642017872282
R2 Score is: -39.526122771423644
Cross Validation Score: -11.214507216922213
R2 Score - Cross Validation Score is -28.31161555450143


In [53]:
# Extra Trees Regressor

model=ExtraTreesRegressor(n_estimators=200, max_features='sqrt', n_jobs=6)
reg(model, X, Y)


RMSE Score is: 2049.177883647143
R2 Score is: 67.21928404324086
Cross Validation Score: 52.11514607177289
R2 Score - Cross Validation Score is 15.10413797146797


### Hyper parameter tuning

In [54]:
# Choosing Extra Trees Regressor

fmod_param = {'n_estimators' : [100, 200, 300],
              'criterion' : ['squared_error', 'mse', 'absolute_error', 'mae'],
              'n_jobs' : [-2, -1, 1],
              'random_state' : [42, 251, 340]
             }

GSCV = GridSearchCV(ExtraTreesRegressor(), fmod_param, cv=5)
GSCV.fit(X_train,Y_train)


GridSearchCV(cv=5, estimator=ExtraTreesRegressor(),
             param_grid={'criterion': ['squared_error', 'mse', 'absolute_error',
                                       'mae'],
                         'n_estimators': [100, 200, 300], 'n_jobs': [-2, -1, 1],
                         'random_state': [42, 251, 340]})

In [55]:
GSCV.best_params_


{'criterion': 'mae', 'n_estimators': 300, 'n_jobs': -2, 'random_state': 251}

In [56]:
Final_Model = ExtraTreesRegressor(criterion='mae', n_estimators=300, n_jobs=-2, random_state=251)
Model_Training = Final_Model.fit(X_train, Y_train)
fmod_pred = Final_Model.predict(X_test)
fmod_r2 = r2_score(Y_test, fmod_pred, multioutput='variance_weighted')*100
print("R2 score for the Best Model is:", fmod_r2)


R2 score for the Best Model is: 72.4377190016525


### Prediction of Flight Prices

In [57]:
Predicted_Price = Final_Model.predict(X)
# Checking the predicted price details in dataframe format
predicted_output = pd.DataFrame()
predicted_output['Flight Price Predicted'] = Predicted_Price
predicted_output['Flight Price Actual'] = df["Flight_Prices"]
predicted_output


Unnamed: 0,Flight Price Predicted,Flight Price Actual
0,5953.000000,5953.0
1,5953.000000,5953.0
2,5965.956667,5953.0
3,5590.056667,5954.0
4,5954.086667,5954.0
...,...,...
5800,9284.330000,9302.0
5801,16865.000000,16287.0
5802,16865.000000,16865.0
5803,16865.000000,16865.0


### Saving the best model

In [58]:
filename = "FlightPricePrediction.pkl"
joblib.dump(Final_Model, filename)

['FlightPricePrediction.pkl']