In [288]:
import numpy as np
import pandas as pd


In [289]:
# Load the raw flight records from the Excel workbook into a DataFrame.
df = pd.read_excel('flights.xlsx')


In [290]:
# Preview the first rows to check that the columns were read as expected.
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3898
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7663
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13883
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6219
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13303


In [291]:
# Size of the raw frame as (rows, columns).
df.shape

(10683, 11)

In [292]:
# Per-column dtype and non-null count; every column except Price is read as text (object).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


## Convert column types

In [293]:
# Parse Date_of_Journey from day/month/year text into a datetime column.
df['Date_of_Journey']=pd.to_datetime(df['Date_of_Journey'], format = "%d/%m/%Y")





In [294]:
# divide Dep_Time to hours and minutes 

def get_dep_hour(x):
    """Return the hour part of a clock string such as '22:20'.

    The result is the text before the first ':' and is returned as a string;
    the caller is responsible for casting it to a number.
    """
    hour_text, _, _ = x.partition(':')
    return hour_text

def get_dep_min(x):
    """Return the minute part of a clock string such as '22:20'.

    The result is the text between the first and second ':' (or up to the end
    of the string) and is returned as a string. An IndexError is raised when
    the input contains no ':'.
    """
    pieces = x.split(':', 2)
    return pieces[1]

# Split Dep_Time into integer hour and minute columns.
for column, extractor in (('dep_hour', get_dep_hour), ('dep_min', get_dep_min)):
    df[column] = df['Dep_Time'].apply(extractor).astype(int)

In [295]:
# Break the parsed journey date into separate year, month and day columns.
for part in ('year', 'month', 'day'):
    df[f'journey_{part}'] = getattr(df['Date_of_Journey'].dt, part)

In [296]:

#Add columns: arrival hours, minutes, and Arrived_next_day
def get_arr_hour(x):
    """Return the hour part of an arrival string such as '01:10 22 Mar'.

    Only the clock token before the first space is considered; the hour is the
    text before its first ':' and is returned as a string.
    """
    return x.partition(" ")[0].partition(":")[0]
def get_arr_min(x):
    """Return the minute part of an arrival string such as '01:10 22 Mar'.

    Only the clock token before the first space is considered; the minute is
    the text after its first ':' and is returned as a string. An IndexError is
    raised when that token contains no ':'.
    """
    clock = x.partition(" ")[0]
    return clock.split(":", 2)[1]


def arrived_next_day(x):
    """Tell whether an arrival string carries a date suffix after the clock time.

    A value like '01:10 22 Mar' has three space-separated tokens and yields
    True; a bare clock time like '13:15' yields False.
    """
    # Two or more single spaces means more than two space-separated tokens.
    return x.count(" ") >= 2



# Derive the integer arrival hour and minute, plus a flag for arrival strings
# that carry a date suffix after the clock time.
arrival_text = df['Arrival_Time']
df['Arrival_hour'] = arrival_text.apply(get_arr_hour).astype(int)
df['Arrival_min'] = arrival_text.apply(get_arr_min).astype(int)
df['Arrived_next_day'] = arrival_text.apply(arrived_next_day)

In [297]:
#create columns for 1- hours, 2- mins, 3- duration converted into mins 

def get_hour(x):
    """Return the whole-hour component of a duration string as an int.

    Parameters
    ----------
    x : str
        Duration text such as '2h 50m', '19h' or '5m'.

    Returns
    -------
    int
        The number before the first 'h', or 0 when the string has no hour part.

    Raises
    ------
    ValueError
        If the text before the first 'h' is not an integer.
    """
    if "h" not in x:
        return 0
    # Always hand back an int so the derived column has one numeric dtype
    # instead of a mix of the int 0 and digit strings.
    return int(x.split(sep="h")[0])

def get_min(x):
    """Return the minute component of a duration string as an int.

    Parameters
    ----------
    x : str
        Duration text such as '2h 50m', '19h' or '5m'.

    Returns
    -------
    int
        The number immediately before the first 'm', or 0 when the string has
        no minute part.

    Raises
    ------
    ValueError
        If the text between the hour marker and the first 'm' is not an integer.
    """
    if "m" not in x:
        return 0
    # Keep only what follows the hour marker (if any); int() tolerates the
    # surrounding whitespace, so both '2h 50m' and '2h50m' parse to 50.
    minutes_text = x.split(sep="m")[0].split(sep="h")[-1]
    return int(minutes_text)

def convert_to_mins(x):
    """Return the total length of a duration string in minutes.

    The hour and minute components are obtained from get_hour and get_min,
    cast to int, and combined as hours * 60 + minutes.
    """
    minutes_from_hours = 60 * int(get_hour(x))
    return minutes_from_hours + int(get_min(x))

# Derive the hour part, minute part and total minutes from the Duration text.
for column, parser in (
    ('duration_hours', get_hour),
    ('duration_mins', get_min),
    ('Duration_in_mins', convert_to_mins),
):
    df[column] = df['Duration'].apply(parser)


Weekday encoding used below: Mon = 0, Tue = 1, Wed = 2, Thu = 3, Fri = 4, Sat = 5, Sun = 6

In [298]:
# Weekday of the journey date as an integer (Monday = 0 ... Sunday = 6).
df['weekday'] = df['Date_of_Journey'].dt.weekday

## Handling null values

In [299]:
# Count the missing values in each column.
df.isna().sum()

Airline             0
Date_of_Journey     0
Source              0
Destination         0
Route               1
Dep_Time            0
Arrival_Time        0
Duration            0
Total_Stops         1
Additional Info     0
Price               0
dep_hour            0
dep_min             0
journey_year        0
journey_month       0
journey_day         0
Arrival_hour        0
Arrival_min         0
Arrived_next_day    0
duration_hours      0
duration_mins       0
Duration_in_mins    0
weekday             0
dtype: int64

In [300]:
# Show every row that has at least one missing value.
has_missing = df.isna().any(axis='columns')
df1 = df[has_missing]
df1

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional Info,...,journey_year,journey_month,journey_day,Arrival_hour,Arrival_min,Arrived_next_day,duration_hours,duration_mins,Duration_in_mins,weekday
9039,Air India,2019-05-06,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,...,2019,5,6,9,25,True,23,40,1420,0


In [301]:
# Drop the one row with missing values (it lacks both Route and Total_Stops).
df.dropna(inplace=True)

In [302]:
# Rename 'Additional Info' to a snake_case identifier so the column can be
# reached with attribute access (df.additional_info).
df.rename(columns={'Additional Info': 'additional_info'}, inplace=True)

In [303]:
# 'No info' in additional_info is a placeholder that counts as a missing value;
# tally every category to see how dominant it is.
df['additional_info'].value_counts()


No info                         8344
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
1 Short layover                    1
Red-eye flight                     1
2 Long layover                     1
Name: additional_info, dtype: int64

In [304]:
#'No info' makes up approximately 78% of the column and appears to be missing at random, but we are still waiting for more information.
#Meanwhile, we keep the column but will not include it in the training set.

## Check for duplicates

In [305]:
# Count how many rows are exact repeats of an earlier row (True) versus not (False).
df.duplicated().value_counts()

False    10462
True       220
dtype: int64

In [306]:
# Display the rows that repeat an earlier row exactly.
df.loc[df.duplicated()]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,additional_info,...,journey_year,journey_month,journey_day,Arrival_hour,Arrival_min,Arrived_next_day,duration_hours,duration_mins,Duration_in_mins,weekday
683,Jet Airways,2019-06-01,Delhi,Cochin,DEL → NAG → BOM → COK,14:35,04:25 02 Jun,13h 50m,2 stops,No info,...,2019,6,1,4,25,True,13,50,830,5
1061,Air India,2019-05-21,Delhi,Cochin,DEL → GOI → BOM → COK,22:00,19:15 22 May,21h 15m,2 stops,No info,...,2019,5,21,19,15,True,21,15,1275,1
1348,Air India,2019-05-18,Delhi,Cochin,DEL → HYD → BOM → COK,17:15,19:15 19 May,26h,2 stops,No info,...,2019,5,18,19,15,True,26,0,1560,5
1418,Jet Airways,2019-06-06,Delhi,Cochin,DEL → JAI → BOM → COK,05:30,04:25 07 Jun,22h 55m,2 stops,In-flight meal not included,...,2019,6,6,4,25,True,22,55,1375,3
1674,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,18:25,21:20,2h 55m,non-stop,No info,...,2019,3,24,21,20,False,2,55,175,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10594,Jet Airways,2019-06-27,Delhi,Cochin,DEL → AMD → BOM → COK,23:05,12:35 28 Jun,13h 30m,2 stops,No info,...,2019,6,27,12,35,True,13,30,810,3
10616,Jet Airways,2019-06-01,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,12:35 02 Jun,26h 55m,2 stops,No info,...,2019,6,1,12,35,True,26,55,1615,5
10634,Jet Airways,2019-06-06,Delhi,Cochin,DEL → JAI → BOM → COK,09:40,12:35 07 Jun,26h 55m,2 stops,In-flight meal not included,...,2019,6,6,12,35,True,26,55,1615,3
10672,Jet Airways,2019-06-27,Delhi,Cochin,DEL → AMD → BOM → COK,23:05,19:00 28 Jun,19h 55m,2 stops,In-flight meal not included,...,2019,6,27,19,0,True,19,55,1195,3


In [307]:
# Remove the repeated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first',inplace=True)

## Feature Engineering

Define a function that maps arrival and departure hours to parts of the day

In [308]:
#define hours of the day
def ptday(x):
    """Map an hour of the day to a part-of-day label.

    Buckets (upper bound inclusive): 0-4 'late night', 5-8 'early morning',
    9-12 'morning', 13-16 'After noon', 17-20 'evening', 21-23 'night'.
    Returns None for values below 0 or above 23.
    """
    # Anything that is not >= 0 (negative values, NaN) matches no bucket.
    if not x >= 0:
        return None
    buckets = (
        (4, 'late night'),
        (8, 'early morning'),
        (12, 'morning'),
        (16, 'After noon'),
        (20, 'evening'),
        (23, 'night'),
    )
    # Buckets are ordered, so the first upper bound that fits decides the label.
    for upper, label in buckets:
        if x <= upper:
            return label
    return None

In [309]:
# Label the arrival and departure hours with their part of the day.
for period_column, hour_column in (
    ('arrival_period', 'Arrival_hour'),
    ('departure_period', 'dep_hour'),
):
    df[period_column] = df[hour_column].apply(ptday)

## Drop redundant columns

In [310]:
# The raw date, time and duration strings have been decomposed into numeric
# features above, so the original text columns are dropped in a single call.
df = df.drop(columns=['Date_of_Journey', 'Dep_Time', 'Arrival_Time', 'Duration'])

# Route repeats information already carried by Total_Stops, Destination and
# Source (the cities and the number of stops), which makes it a candidate for
# removal. It is NOT dropped here, and neither is duration_mins: both columns
# are still selected when the columns are reordered before saving.

In [311]:
# Inspect the first ten rows after the engineered columns were added and the raw ones removed.
df.head(10)

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,additional_info,Price,dep_hour,dep_min,journey_year,...,journey_day,Arrival_hour,Arrival_min,Arrived_next_day,duration_hours,duration_mins,Duration_in_mins,weekday,arrival_period,departure_period
0,IndiGo,Banglore,New Delhi,BLR → DEL,non-stop,No info,3898,22,20,2019,...,24,1,10,True,2,50,170,6,late night,night
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2 stops,No info,7663,5,50,2019,...,1,13,15,False,7,25,445,2,After noon,early morning
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2 stops,No info,13883,9,25,2019,...,9,4,25,True,19,0,1140,6,late night,morning
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1 stop,No info,6219,18,5,2019,...,12,23,30,False,5,25,325,6,night,evening
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1 stop,No info,13303,16,50,2019,...,1,21,35,False,4,45,285,4,night,After noon
5,SpiceJet,Kolkata,Banglore,CCU → BLR,non-stop,No info,3874,9,0,2019,...,24,11,25,False,2,25,145,0,morning,morning
6,Jet Airways,Banglore,New Delhi,BLR → BOM → DEL,1 stop,In-flight meal not included,11088,18,55,2019,...,12,10,25,True,15,30,930,1,morning,evening
7,Jet Airways,Banglore,New Delhi,BLR → BOM → DEL,1 stop,No info,22271,8,0,2019,...,1,5,5,True,21,5,1265,4,early morning,early morning
8,Jet Airways,Banglore,New Delhi,BLR → BOM → DEL,1 stop,In-flight meal not included,11088,8,55,2019,...,12,10,25,True,25,30,1530,1,morning,early morning
9,Multiple carriers,Delhi,Cochin,DEL → BOM → COK,1 stop,No info,8626,11,25,2019,...,27,19,15,False,7,50,470,0,evening,morning


## Rearrange and rename columns

**Renaming columns**

In [312]:
# Lower-case the remaining capitalised column names; Source becomes
# 'embarked' and Total_Stops becomes 'stops'.
column_renames = {
    'Airline': "airline",
    'Source': 'embarked',
    'Destination': 'destination',
    'Price': 'price',
    'Total_Stops': 'stops',
}
df.rename(columns=column_renames, inplace=True)

**Rearranging columns**

In [313]:
# Final column order: route details, journey date parts, departure, arrival,
# duration, then the price and the free-text additional_info column.
column_order = [
    'airline', 'embarked', 'destination', 'Route', 'stops',
    'journey_year', 'journey_month', 'journey_day', 'weekday',
    'dep_hour', 'dep_min', 'departure_period',
    'Arrival_hour', 'Arrival_min', 'arrival_period', 'Arrived_next_day',
    'duration_hours', 'duration_mins', 'Duration_in_mins',
    'price', 'additional_info',
]
df = df[column_order]

## Save cleaned data to new file

In [314]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv("./df_cleaned.csv", index = False)

In [315]:
# Read the saved file back and display it to check the export.
cleaned = pd.read_csv("./df_cleaned.csv")
cleaned

Unnamed: 0,airline,embarked,destination,Route,stops,journey_year,journey_month,journey_day,weekday,dep_hour,...,departure_period,Arrival_hour,Arrival_min,arrival_period,Arrived_next_day,duration_hours,duration_mins,Duration_in_mins,price,additional_info
0,IndiGo,Banglore,New Delhi,BLR → DEL,non-stop,2019,3,24,6,22,...,night,1,10,late night,True,2,50,170,3898,No info
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2 stops,2019,5,1,2,5,...,early morning,13,15,After noon,False,7,25,445,7663,No info
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2 stops,2019,6,9,6,9,...,morning,4,25,late night,True,19,0,1140,13883,No info
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1 stop,2019,5,12,6,18,...,evening,23,30,night,False,5,25,325,6219,No info
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1 stop,2019,3,1,4,16,...,After noon,21,35,night,False,4,45,285,13303,No info
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10457,Air Asia,Kolkata,Banglore,CCU → BLR,non-stop,2019,4,9,1,19,...,evening,22,25,night,False,2,30,150,4108,No info
10458,Air India,Kolkata,Banglore,CCU → BLR,non-stop,2019,4,27,5,20,...,evening,23,20,night,False,2,35,155,4146,No info
10459,Jet Airways,Banglore,Delhi,BLR → DEL,non-stop,2019,4,27,5,8,...,early morning,11,20,morning,False,3,0,180,7230,No info
10460,Vistara,Banglore,New Delhi,BLR → DEL,non-stop,2019,3,1,4,11,...,morning,14,10,After noon,False,2,40,160,12649,No info
