In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# import some libery

from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OneHotEncoder # encoder
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split # train test split

 #Model Training

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

import warnings
warnings.simplefilter(action='ignore')
%matplotlib inline


In [4]:
df = pd.read_csv('data/flight_price_clean_data')

In [5]:
df

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Journey_Day,Journey_Month,Dep_hour,Dep_min,Arrival_hour,Arrival_min,Duration_hour,Duration_min
0,IndiGo,Banglore,New Delhi,non-stop,3897,24,3,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,2 stops,7662,1,5,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,2 stops,13882,9,6,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,1 stop,6218,12,5,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,1 stop,13302,1,3,16,50,21,35,4,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10675,Air Asia,Kolkata,Banglore,non-stop,4107,9,4,19,55,22,25,2,30
10676,Air India,Kolkata,Banglore,non-stop,4145,27,4,20,45,23,20,2,35
10677,Jet Airways,Banglore,Delhi,non-stop,7229,27,4,8,20,11,20,3,0
10678,Vistara,Banglore,New Delhi,non-stop,12648,1,3,11,30,14,10,2,40


In [6]:
# Stops maping non-stop : 0 , 1-stop : 1 , 2-stop : 2 , 3-stop : 3 , 4-stop : 4
total_stops_map = {'non-stop' : 0 , '1 stop' : 1 , '2 stops' : 2 , '3 stops' : 3 , '4 stops' : 4}

In [7]:
df['Total_Stops'] = df['Total_Stops'].replace(total_stops_map)

In [8]:
# Define independent and dependent feature

X = df.drop(['Price'] , axis = 1)
y = df['Price']

In [9]:
# Define numerical-cols and categorical_cols

numerical_cols = X.select_dtypes(exclude='O').columns
categorical_cols = X.select_dtypes(include='O').columns

In [10]:
# Create pipeline

# Numerical_pipeline

num_pipeline = Pipeline(
                steps = [
                    ('imputer' , SimpleImputer(strategy='median')),
                    ('scaler' ,  StandardScaler())
                ])

cat_pipeline = Pipeline(
                steps=[
                    ('imputer', SimpleImputer(strategy = 'most_frequent')),
                    ('one_encoder' ,OneHotEncoder(sparse=False ,handle_unknown='ignore')),
                    ('scaler' , StandardScaler())
                ]
)


# Columns transformer

preprocessor=ColumnTransformer(
    [
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline' , cat_pipeline , categorical_cols)

    ])

In [11]:
# train test split

X_train,X_test,y_train,y_test=train_test_split(X,y , test_size = 0.3 , random_state = 45)

In [12]:
X_train

Unnamed: 0,Airline,Source,Destination,Total_Stops,Journey_Day,Journey_Month,Dep_hour,Dep_min,Arrival_hour,Arrival_min,Duration_hour,Duration_min
8013,Vistara,Banglore,Delhi,0,21,5,7,0,9,40,2,40
1993,Multiple carriers,Delhi,Cochin,1,12,6,13,0,21,0,8,0
1309,IndiGo,Banglore,Delhi,0,6,5,7,10,10,5,2,55
434,Vistara,Banglore,Delhi,0,1,6,19,30,22,15,2,45
8105,Jet Airways,Delhi,Cochin,1,27,6,17,30,12,35,19,5
...,...,...,...,...,...,...,...,...,...,...,...,...
8772,IndiGo,Delhi,Cochin,1,24,4,5,5,12,10,7,5
163,IndiGo,Delhi,Cochin,1,9,5,6,50,16,10,9,20
6012,SpiceJet,Kolkata,Banglore,0,18,5,9,0,11,25,2,25
6558,Jet Airways,Delhi,Cochin,1,24,6,7,5,19,0,11,55


In [14]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [15]:
model = RandomForestRegressor()

In [16]:
# data fit in model

model.fit(X_train,y_train)

In [18]:
# model Predict

y_pred = model.predict(X_test)

In [20]:
# check model accuracey score
score = r2_score(y_pred,y_test)

In [24]:
print(f'Model Name:- {model}')
print('-------------------------------------------------------')
print(f'Model Score:- {score*100}')

Model Name:- RandomForestRegressor()
-------------------------------------------------------
Model Score:- 70.96997513401556
