In [109]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [110]:
# Load the flight records from the CSV file into a DataFrame.
dataset = pd.read_csv('../input/Flight_Data.csv')

In [111]:
# Preview the first rows of the loaded frame.
dataset.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [112]:
# List the distinct values of the Airline column.
dataset.Airline.unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [113]:
# Count the missing values in each column.
dataset.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [114]:
# Show the rows whose Route value is missing.
dataset[dataset.Route.isna()]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


### As we can see, a single row has both Route and Total_Stops missing.
Since it is only one row, we can safely drop it.

In [115]:
# Remove every row that contains a missing value; this modifies dataset in place.
dataset.dropna(inplace=True)

In [116]:
# Re-count the missing values per column to check the result of the drop.
dataset.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

In [117]:
# Convert the date and time text columns into numeric feature columns.

# Date_of_Journey is written as day/month/year. Parse it once and reuse the
# parsed series for both derived columns.
journey_date = pd.to_datetime(dataset["Date_of_Journey"], format="%d/%m/%Y")
dataset["Journey_day"] = journey_date.dt.day
dataset["Journey_month"] = journey_date.dt.month
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas >= 2.0 no longer accepts.
dataset.drop(columns=["Date_of_Journey"], inplace=True)


# Dep_Time holds a clock time such as "22:20". Only the leading "HH:MM" token
# is parsed, with an explicit format, so the result does not depend on
# pandas' version-specific format inference.
dep_time = pd.to_datetime(dataset["Dep_Time"].str.split().str[0], format="%H:%M")
dataset["Dep_hour"] = dep_time.dt.hour
dataset["Dep_min"] = dep_time.dt.minute
dataset.drop(columns=["Dep_Time"], inplace=True)


# Arrival_Time is either "HH:MM" or "HH:MM DD Mon" (e.g. "01:10 22 Mar").
# The optional date suffix is discarded because only the hour and minute are
# kept as features.
arr_time = pd.to_datetime(dataset["Arrival_Time"].str.split().str[0], format="%H:%M")
dataset["Arr_hour"] = arr_time.dt.hour
dataset["Arr_min"] = arr_time.dt.minute
dataset.drop(columns=["Arrival_Time"], inplace=True)

In [118]:
# Display the frame to inspect the derived day/month/hour/minute columns.
dataset

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_month,Dep_hour,Dep_min,Arr_hour,Arr_min
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882,9,6,9,25,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,12,5,18,5,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,1,3,16,50,21,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13349,Air India,Kolkata,Banglore,CCU → DEL → BLR,23h 55m,1 stop,No info,14046,6,6,20,30,20,25
13350,IndiGo,Kolkata,Banglore,CCU → BLR,2h 35m,non-stop,No info,14086,27,3,14,20,16,55
13351,Jet Airways,Delhi,Cochin,DEL → BOM → COK,6h 35m,1 stop,No info,22720,6,3,21,50,4,25
13352,Air India,Delhi,Cochin,DEL → BOM → COK,15h 15m,1 stop,No info,23544,6,3,4,0,19,15


In [119]:
def _normalise_duration(text):
    """Return a duration string in the two-token "<H>h <M>m" form.

    Strings that already split into two whitespace-separated tokens are
    returned unchanged. An hours-only value gets " 0m" appended; any other
    value is treated as minutes-only and gets "0h " prepended.
    """
    if len(text.split()) == 2:
        return text
    if "h" in text:
        return text.strip() + " 0m"
    return "0h " + text


# Normalised duration strings, one per row of dataset, in row order.
duration = [_normalise_duration(raw) for raw in dataset['Duration']]

# Hours are the integer before the "h"; minutes are the last token before the "m".
duration_hours = [int(entry.split(sep="h")[0]) for entry in duration]
duration_mins = [int(entry.split(sep="m")[0].split()[-1]) for entry in duration]

In [120]:
# Attach the parsed duration components as numeric feature columns.
# Plain lists are assigned by position, so they must be in the same row order
# as dataset.
dataset["Duration_hrs"] = duration_hours
dataset["Duration_mins"] = duration_mins

# Remove the raw Duration text column. `columns=` replaces the positional axis
# argument (`drop(label, 1)`), which pandas >= 2.0 no longer accepts.
dataset.drop(columns=['Duration'], inplace=True)

In [121]:
# Display the frame to inspect the Duration_hrs and Duration_mins columns.
dataset

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Journey_day,Journey_month,Dep_hour,Dep_min,Arr_hour,Arr_min,Duration_hrs,Duration_mins
0,IndiGo,Banglore,New Delhi,BLR → DEL,non-stop,No info,3897,24,3,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2 stops,No info,7662,1,5,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2 stops,No info,13882,9,6,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1 stop,No info,6218,12,5,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1 stop,No info,13302,1,3,16,50,21,35,4,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13349,Air India,Kolkata,Banglore,CCU → DEL → BLR,1 stop,No info,14046,6,6,20,30,20,25,23,55
13350,IndiGo,Kolkata,Banglore,CCU → BLR,non-stop,No info,14086,27,3,14,20,16,55,2,35
13351,Jet Airways,Delhi,Cochin,DEL → BOM → COK,1 stop,No info,22720,6,3,21,50,4,25,6,35
13352,Air India,Delhi,Cochin,DEL → BOM → COK,1 stop,No info,23544,6,3,4,0,19,15,15,15


In [122]:
# One-hot encode the nominal columns and append the indicator columns to the
# frame. drop_first=True omits the first level of each encoded column.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (pandas >= 2.0 would otherwise produce booleans).
categorical_dummies = pd.get_dummies(
    dataset[['Airline', 'Source', 'Destination']],
    drop_first=True,
    dtype='uint8',
)
dataset = pd.concat([dataset, categorical_dummies], axis=1)

In [123]:
# Remove the original text columns for Airline, Source and Destination.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas >= 2.0 no longer accepts.
dataset.drop(columns=['Airline', 'Source', 'Destination'], inplace=True)

In [124]:
# Drop the Route and Additional_Info columns.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas >= 2.0 no longer accepts.
dataset.drop(columns=['Route', 'Additional_Info'], inplace=True)

In [125]:
# Display the frame to inspect the remaining columns.
dataset

Unnamed: 0,Total_Stops,Price,Journey_day,Journey_month,Dep_hour,Dep_min,Arr_hour,Arr_min,Duration_hrs,Duration_mins,...,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
0,non-stop,3897,24,3,22,20,1,10,2,50,...,0,0,0,0,0,0,0,0,0,1
1,2 stops,7662,1,5,5,50,13,15,7,25,...,0,0,0,1,0,0,0,0,0,0
2,2 stops,13882,9,6,9,25,4,25,19,0,...,0,0,1,0,0,1,0,0,0,0
3,1 stop,6218,12,5,18,5,23,30,5,25,...,0,0,0,1,0,0,0,0,0,0
4,1 stop,13302,1,3,16,50,21,35,4,45,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13349,1 stop,14046,6,6,20,30,20,25,23,55,...,0,0,0,1,0,0,0,0,0,0
13350,non-stop,14086,27,3,14,20,16,55,2,35,...,0,0,0,1,0,0,0,0,0,0
13351,1 stop,22720,6,3,21,50,4,25,6,35,...,0,0,1,0,0,1,0,0,0,0
13352,1 stop,23544,6,3,4,0,19,15,15,15,...,0,0,1,0,0,1,0,0,0,0


In [126]:
# List the distinct values of the Total_Stops column.
dataset.Total_Stops.unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

In [129]:
# Lookup from each stop label to an integer code. The labels are listed in
# ascending order so that the position of a label is its code.
stops_mapping = {
    label: code
    for code, label in enumerate(
        ['non-stop', '1 stop', '2 stops', '3 stops', '4 stops']
    )
}

In [134]:
# Encode the Total_Stops labels as integers through stops_mapping. A label
# that is not a key of the mapping becomes NaN.
dataset['Stops'] = dataset.Total_Stops.map(stops_mapping)
# Remove the text column. `columns=` replaces the positional axis argument
# (`drop(label, 1)`), which pandas >= 2.0 no longer accepts.
dataset.drop(columns=['Total_Stops'], inplace=True)

Unnamed: 0,Price,Journey_day,Journey_month,Dep_hour,Dep_min,Arr_hour,Arr_min,Duration_hrs,Duration_mins,Airline_Air India,...,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi,Stops
0,3897,24,3,22,20,1,10,2,50,0,...,0,0,0,0,0,0,0,0,1,0
1,7662,1,5,5,50,13,15,7,25,1,...,0,0,1,0,0,0,0,0,0,2
2,13882,9,6,9,25,4,25,19,0,0,...,0,1,0,0,1,0,0,0,0,2
3,6218,12,5,18,5,23,30,5,25,0,...,0,0,1,0,0,0,0,0,0,1
4,13302,1,3,16,50,21,35,4,45,0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13349,14046,6,6,20,30,20,25,23,55,1,...,0,0,1,0,0,0,0,0,0,1
13350,14086,27,3,14,20,16,55,2,35,0,...,0,0,1,0,0,0,0,0,0,0
13351,22720,6,3,21,50,4,25,6,35,0,...,0,1,0,0,1,0,0,0,0,1
13352,23544,6,3,4,0,19,15,15,15,1,...,0,1,0,0,1,0,0,0,0,1
