In [1]:
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import requests
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedShuffleSplit
    import category_encoders as ce
except ImportError as e:
    print(e)

# read data

In [2]:
df = pd.read_csv(r"Datasets\flight_data.csv",index_col=0)
df

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265
300149,Vistara,UK-826,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105
300150,Vistara,UK-832,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099
300151,Vistara,UK-828,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.00,49,81585


In [3]:
df.describe()

Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0
mean,12.221021,26.004751,20889.660523
std,7.191997,13.561004,22697.767366
min,0.83,1.0,1105.0
25%,6.83,15.0,4783.0
50%,11.25,26.0,7425.0
75%,16.17,38.0,42521.0
max,49.83,49.0,123071.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   flight            300153 non-null  object 
 2   source_city       300153 non-null  object 
 3   departure_time    300153 non-null  object 
 4   stops             300153 non-null  object 
 5   arrival_time      300153 non-null  object 
 6   destination_city  300153 non-null  object 
 7   class             300153 non-null  object 
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 27.5+ MB


In [5]:
df.shape

(300153, 11)

In [6]:
df.isna().sum()

airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [7]:
df.nunique()

airline                 6
flight               1561
source_city             6
departure_time          6
stops                   3
arrival_time            6
destination_city        6
class                   2
duration              476
days_left              49
price               12157
dtype: int64

In [8]:
df

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265
300149,Vistara,UK-826,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105
300150,Vistara,UK-832,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099
300151,Vistara,UK-828,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.00,49,81585


In [9]:
df['stops'].unique()

array(['zero', 'one', 'two_or_more'], dtype=object)

In [10]:
stops_mapping = {'zero': 0,'one': 1,'two_or_more': 2}
df['stops'] = df['stops'].replace(stops_mapping).astype(int)
df

  df['stops'] = df['stops'].replace(stops_mapping).astype(int)


Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,0,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,0,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,0,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,0,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,0,Morning,Mumbai,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,Chennai,Morning,1,Evening,Hyderabad,Business,10.08,49,69265
300149,Vistara,UK-826,Chennai,Afternoon,1,Night,Hyderabad,Business,10.42,49,77105
300150,Vistara,UK-832,Chennai,Early_Morning,1,Night,Hyderabad,Business,13.83,49,79099
300151,Vistara,UK-828,Chennai,Early_Morning,1,Evening,Hyderabad,Business,10.00,49,81585


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300153 entries, 0 to 300152
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   flight            300153 non-null  object 
 2   source_city       300153 non-null  object 
 3   departure_time    300153 non-null  object 
 4   stops             300153 non-null  int64  
 5   arrival_time      300153 non-null  object 
 6   destination_city  300153 non-null  object 
 7   class             300153 non-null  object 
 8   duration          300153 non-null  float64
 9   days_left         300153 non-null  int64  
 10  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 27.5+ MB


In [12]:
encoder= ce.BaseNEncoder(cols=["source_city",'destination_city'],return_df=True,base=5)


In [13]:
data_encoded=encoder.fit_transform(df)
data_encoded

Unnamed: 0,airline,flight,source_city_0,source_city_1,departure_time,stops,arrival_time,destination_city_0,destination_city_1,class,duration,days_left,price
0,SpiceJet,SG-8709,0,1,Evening,0,Night,0,1,Economy,2.17,1,5953
1,SpiceJet,SG-8157,0,1,Early_Morning,0,Morning,0,1,Economy,2.33,1,5953
2,AirAsia,I5-764,0,1,Early_Morning,0,Early_Morning,0,1,Economy,2.17,1,5956
3,Vistara,UK-995,0,1,Morning,0,Afternoon,0,1,Economy,2.25,1,5955
4,Vistara,UK-963,0,1,Morning,0,Morning,0,1,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,1,1,Morning,1,Evening,0,4,Business,10.08,49,69265
300149,Vistara,UK-826,1,1,Afternoon,1,Night,0,4,Business,10.42,49,77105
300150,Vistara,UK-832,1,1,Early_Morning,1,Night,0,4,Business,13.83,49,79099
300151,Vistara,UK-828,1,1,Early_Morning,1,Evening,0,4,Business,10.00,49,81585


In [14]:
data_encoded=encoder.fit_transform(df)
data_encoded

Unnamed: 0,airline,flight,source_city_0,source_city_1,departure_time,stops,arrival_time,destination_city_0,destination_city_1,class,duration,days_left,price
0,SpiceJet,SG-8709,0,1,Evening,0,Night,0,1,Economy,2.17,1,5953
1,SpiceJet,SG-8157,0,1,Early_Morning,0,Morning,0,1,Economy,2.33,1,5953
2,AirAsia,I5-764,0,1,Early_Morning,0,Early_Morning,0,1,Economy,2.17,1,5956
3,Vistara,UK-995,0,1,Morning,0,Afternoon,0,1,Economy,2.25,1,5955
4,Vistara,UK-963,0,1,Morning,0,Morning,0,1,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,1,1,Morning,1,Evening,0,4,Business,10.08,49,69265
300149,Vistara,UK-826,1,1,Afternoon,1,Night,0,4,Business,10.42,49,77105
300150,Vistara,UK-832,1,1,Early_Morning,1,Night,0,4,Business,13.83,49,79099
300151,Vistara,UK-828,1,1,Early_Morning,1,Evening,0,4,Business,10.00,49,81585
