In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression


import joblib

In [2]:
data = pd.read_csv('Flight_Train.csv')

In [3]:
data.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662


In [5]:
data.duplicated().sum()

220

In [6]:
# Duplicate handling
data.drop_duplicates(inplace = True)

In [7]:
data.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [8]:
# Null Value dropping
data.dropna(subset = ['Route','Total_Stops'],inplace = True)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator,TransformerMixin

In [16]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('\n>>>>>>>init() called.\n')
    def fit(self, X, y = None):
        print('\n>>>>>>>fit() called.\n')
        return self
    def transform(self, X, y = None):
        print('\n>>>>>>>transform() called.\n')
        print("\n>>>> Input : ",X)
        X.drop(columns = 'Route',inplace = True)
        X['Source'] = X['Source'].apply(lambda x: x.lower())
        X['Destination'] = X['Destination'].replace({'New Delhi':'Delhi'})
        X['Destination'] = X['Destination'].apply(lambda x: x.lower())
        X['Date_of_Journey'] = pd.to_datetime(X['Date_of_Journey'],dayfirst = True)
        X['Day_of_Journey'] = X['Date_of_Journey'].dt.weekday
        X['Day_of_Journey'] = X['Day_of_Journey'].replace({0:'Monday',1:'Tuesday',2:'Wednesday',3:'Thursday',4:'Friday',5:'Saturday',6:'Sunday'})
        X['Month'] = X['Date_of_Journey'].dt.month
        X['Month'] = X['Month'].replace({1:'January',2:'February',3:'March',4:'April',5:'May',6:'June',7:'July',8:'August',9:'September',10:'October',11:'November',12:'December'})
        X.drop(columns = 'Date_of_Journey',inplace = True)
        X.loc[X['Dep_Time'] < '12:00','Dep_Time']='AM'
        X.loc[X['Dep_Time'] !='AM','Dep_Time']='PM'
        X['Arrival_Time'] = X['Arrival_Time'].apply(lambda x: x.split()[0])
        X.loc[X['Arrival_Time'] < '12:00','Arrival_Time']='AM'
        X.loc[X['Arrival_Time'] !='AM','Arrival_Time']='PM'
        X['Additional_Info'].replace({'No info':'No Info'},inplace = True)
        print("\n>>>> Output : ", X)
        return X

In [34]:
class CustomTransformer1(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('\n>>>>>>>init() called.\n numerical')
    def fit(self, X, y = None):
        print('\n>>>>>>>fit() called.\n numerical')
        return self
    def transform(self, X, y = None):
        print('\n>>>>>>>transform() called.\n numerical')
        print("\n>>>> Input : ",X)
        X['Duration'] = (pd.to_timedelta(X['Duration']).dt.seconds // 60).astype(int)
        #X.drop(columns = 'Duration',inplace = True)
        #DF = pd.DataFrame(X['Duration'])
        print("\n>>>> Output : ",X)
        return X

In [42]:
categorical_preprocessing = Pipeline([
    ('custom transforming',CustomTransformer()),
    ('encoding',OneHotEncoder(sparse = False))    
])


>>>>>>>init() called.



In [43]:
numerical_preprocessing = Pipeline([
    ('custom transforming',CustomTransformer1()),
    ('scaling',MinMaxScaler())
])


>>>>>>>init() called.
 numerical


In [44]:
Final_preprocessing = ColumnTransformer(
transformers = 
    [
    ('categorical preprocessing',categorical_preprocessing,['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
    'Dep_Time', 'Arrival_Time','Total_Stops','Additional_Info']),
    ('numerical preprocessing',numerical_preprocessing,['Duration'])      
])

In [45]:
package = Pipeline([
    ('preprocess',Final_preprocessing),
    ('model',LinearRegression())
])

In [46]:
x = data.drop(columns = ['Price'])

In [47]:
y = data['Price']

In [48]:
package.fit(x,y)


>>>>>>>init() called.


>>>>>>>fit() called.


>>>>>>>transform() called.


>>>> Input :             Airline Date_of_Journey    Source Destination  \
0           IndiGo      24/03/2019  Banglore   New Delhi   
1        Air India       1/05/2019   Kolkata    Banglore   
2      Jet Airways       9/06/2019     Delhi      Cochin   
3           IndiGo      12/05/2019   Kolkata    Banglore   
4           IndiGo      01/03/2019  Banglore   New Delhi   
...            ...             ...       ...         ...   
10678     Air Asia       9/04/2019   Kolkata    Banglore   
10679    Air India      27/04/2019   Kolkata    Banglore   
10680  Jet Airways      27/04/2019  Banglore       Delhi   
10681      Vistara      01/03/2019  Banglore   New Delhi   
10682    Air India       9/05/2019     Delhi      Cochin   

                       Route Dep_Time  Arrival_Time Total_Stops  \
0                  BLR ? DEL    22:20  01:10 22 Mar    non-stop   
1      CCU ? IXR ? BBI ? BLR    05:50         13:15   



In [49]:
package.predict(x)


>>>>>>>transform() called.


>>>> Input :             Airline Date_of_Journey    Source Destination  \
0           IndiGo      24/03/2019  Banglore   New Delhi   
1        Air India       1/05/2019   Kolkata    Banglore   
2      Jet Airways       9/06/2019     Delhi      Cochin   
3           IndiGo      12/05/2019   Kolkata    Banglore   
4           IndiGo      01/03/2019  Banglore   New Delhi   
...            ...             ...       ...         ...   
10678     Air Asia       9/04/2019   Kolkata    Banglore   
10679    Air India      27/04/2019   Kolkata    Banglore   
10680  Jet Airways      27/04/2019  Banglore       Delhi   
10681      Vistara      01/03/2019  Banglore   New Delhi   
10682    Air India       9/05/2019     Delhi      Cochin   

                       Route Dep_Time  Arrival_Time Total_Stops  \
0                  BLR ? DEL    22:20  01:10 22 Mar    non-stop   
1      CCU ? IXR ? BBI ? BLR    05:50         13:15     2 stops   
2      DEL ? LKO ? BOM ? COK    09

array([ 6147., 10773., 15802., ...,  9165.,  8687., 10403.])

In [50]:
test_data = pd.read_csv('Flight_Test.csv')

In [51]:
test_data.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL ? BOM ? COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? MAA ? BLR,06:20,10:20,4h,1 stop,No info


In [52]:
package.predict(test_data)


>>>>>>>transform() called.


>>>> Input :                  Airline Date_of_Journey    Source Destination  \
0           Jet Airways       6/06/2019     Delhi      Cochin   
1                IndiGo      12/05/2019   Kolkata    Banglore   
2           Jet Airways      21/05/2019     Delhi      Cochin   
3     Multiple carriers      21/05/2019     Delhi      Cochin   
4              Air Asia      24/06/2019  Banglore       Delhi   
...                 ...             ...       ...         ...   
2666          Air India       6/06/2019   Kolkata    Banglore   
2667             IndiGo      27/03/2019   Kolkata    Banglore   
2668        Jet Airways       6/03/2019     Delhi      Cochin   
2669          Air India       6/03/2019     Delhi      Cochin   
2670  Multiple carriers      15/06/2019     Delhi      Cochin   

                Route Dep_Time  Arrival_Time Total_Stops  \
0     DEL ? BOM ? COK    17:30  04:25 07 Jun      1 stop   
1     CCU ? MAA ? BLR    06:20         10:20      1 sto

array([12971.,  7901., 10069., ..., 14836., 10385., 10984.])

In [53]:
joblib.dump(package,'FlightPrice.pkl')

['FlightPrice.pkl']

In [54]:
model_load = joblib.load('FlightPrice.pkl')

In [55]:
model_load.predict(test_data)


>>>>>>>transform() called.


>>>> Input :                  Airline Date_of_Journey    Source Destination  \
0           Jet Airways       6/06/2019     Delhi      Cochin   
1                IndiGo      12/05/2019   Kolkata    Banglore   
2           Jet Airways      21/05/2019     Delhi      Cochin   
3     Multiple carriers      21/05/2019     Delhi      Cochin   
4              Air Asia      24/06/2019  Banglore       Delhi   
...                 ...             ...       ...         ...   
2666          Air India       6/06/2019   Kolkata    Banglore   
2667             IndiGo      27/03/2019   Kolkata    Banglore   
2668        Jet Airways       6/03/2019     Delhi      Cochin   
2669          Air India       6/03/2019     Delhi      Cochin   
2670  Multiple carriers      15/06/2019     Delhi      Cochin   

                Route Dep_Time  Arrival_Time Total_Stops  \
0     DEL ? BOM ? COK    17:30  04:25 07 Jun      1 stop   
1     CCU ? MAA ? BLR    06:20         10:20      1 sto

array([12971.,  7901., 10069., ..., 14836., 10385., 10984.])

In [24]:
testsample = ['IndiGo','12/05/2019','Kolkata','Banglore','CCU ? MAA ? BLR','06:20','10:20','4h','1 stop','No info']

In [25]:
testsample = pd.DataFrame(testsample).T

In [26]:
testsample.columns = test_data.columns

In [27]:
testsample

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? MAA ? BLR,06:20,10:20,4h,1 stop,No info


In [28]:
package.predict(testsample)


>>>>>>>transform() called.


>>>> Input :    Airline Date_of_Journey   Source Destination            Route Dep_Time  \
0  IndiGo      12/05/2019  Kolkata    Banglore  CCU ? MAA ? BLR    06:20   

  Arrival_Time Total_Stops Additional_Info  
0        10:20      1 stop         No info  

>>>> Output :    Airline   Source Destination Dep_Time Arrival_Time Total_Stops  \
0  IndiGo  kolkata    banglore       AM           AM      1 stop   

  Additional_Info Day_of_Journey Month  
0         No Info         Sunday   May  

>>>>>>>transform() called.


>>>> Input :    Duration
0       4h

>>>> Output :     Duration
0       240


array([7901.])