### Importing Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv("Dataset/Flight-Price.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [6]:
df.tail()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
300148,300148,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265
300149,300149,Vistara,UK-826,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105
300150,300150,Vistara,UK-832,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099
300151,300151,Vistara,UK-828,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.0,49,81585
300152,300152,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,81585


In [7]:
df.describe()

Unnamed: 0.1,Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0,300153.0
mean,150076.0,12.221021,26.004751,20889.660523
std,86646.852011,7.191997,13.561004,22697.767366
min,0.0,0.83,1.0,1105.0
25%,75038.0,6.83,15.0,4783.0
50%,150076.0,11.25,26.0,7425.0
75%,225114.0,16.17,38.0,42521.0
max,300152.0,49.83,49.0,123071.0


In [8]:
df.drop(["Unnamed: 0", "flight"], axis = 1, inplace = True)

In [9]:
df.isnull().sum()

airline             0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [10]:
df.duplicated().sum()

2213

In [11]:
df.drop_duplicates(inplace = True)

In [12]:
df.duplicated().sum()

0

## Machine Learning

### Importing Libraries

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score

### Stratified Shuffle Split

In [14]:
df['price_cat'] = pd.cut(df['price'], bins = [0, 5000, 10000, 20000, 40000, np.inf], labels = [1, 2, 3, 4, 5])
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_set, test_set in split.split(df, df['price_cat']):
    strat_train_set = df.iloc[train_set]
    strat_test_set = df.iloc[test_set]

### Copying the Training Set

In [13]:
flight_train = strat_train_set.copy()

### Separating the Features and Labels

In [14]:
flight_train_features = flight_train.drop(["price", "price_cat"], axis = 1)
flight_train_labels = flight_train['price']

### Separating the Numerical and Categorical Values

In [15]:
flight_train_num = ['duration', 'days_left']
flight_train_cat = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

## Let's Start Making Pipelines

### Numerical and Categorical Pipelines

In [16]:
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy = "median")),
    ("scaler", StandardScaler())
])
cat_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown = "ignore")),
])
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, flight_train_num),
    ("cat", cat_pipeline, flight_train_cat),
])

### Transforming the Data

In [17]:
flight_prepared_train = full_pipeline.fit_transform(flight_train_features)

## Training the Models

### Linear Regression Model for Training Data

In [18]:
lin_model_train = LinearRegression()
print("Training the Model........")
lin_model_train.fit(flight_prepared_train, flight_train_labels)
lin_predict = lin_model_train.predict(flight_prepared_train)
lin_R2 = r2_score(flight_train_labels, lin_predict)
lin_MAE = mean_absolute_error(flight_train_labels, lin_predict)
lin_RMSE = root_mean_squared_error(flight_train_labels, lin_predict)
print("Running Cross Validation........")
lin_CV = cross_val_score(lin_model_train, flight_prepared_train, flight_train_labels, scoring = "neg_root_mean_squared_error", cv = 10)

print("Linear Regression Model for Training Data -:")
print(pd.Series(lin_CV).agg(['count', 'mean', 'std']))
print(f"\nR2 Score:{lin_R2:.2f}")
print(f"MAE Score:{lin_MAE:.2f}")
print(f"RMSE Score:{lin_RMSE:.2f}")

Training the Model........
Running Cross Validation........
Linear Regression Model for Training Data -:
count      10.000000
mean    -6761.018630
std        64.213674
dtype: float64

R2 Score:0.91
MAE Score:4566.01
RMSE Score:6760.56


### Random Forest Model for Training Data

In [19]:
forest_model_train = RandomForestRegressor(n_estimators = 100, max_depth = 15, n_jobs = -1)
print("Training the Model........")
forest_model_train.fit(flight_prepared_train, flight_train_labels)
forest_predict = forest_model_train.predict(flight_prepared_train)
forest_R2 = r2_score(flight_train_labels, forest_predict)
forest_MAE = mean_absolute_error(flight_train_labels, forest_predict)
forest_RMSE = root_mean_squared_error(flight_train_labels, forest_predict)
# print("Running Cross Validation........")
# forest_CV = cross_val_score(forest_model_train, flight_prepared_train, flight_train_labels, scoring = "neg_root_mean_squared_error", cv = 10)

print("Random Forest Regression Model for Training Data -:")
# print(pd.Series(forest_CV).agg(['count', 'mean', 'std']))
print(f"\nR2 Score:{forest_R2:.2f}")
print(f"MAE Score:{forest_MAE:.2f}")
print(f"RMSE Score:{forest_RMSE:.2f}")

Training the Model........
Random Forest Regression Model for Training Data -:

R2 Score:0.98
MAE Score:1609.73
RMSE Score:3055.52


### Gradient Boosting for Training Data

In [21]:
grad_model_train = GradientBoostingRegressor()
print("Training the Model........")
grad_model_train.fit(flight_prepared_train, flight_train_labels)
grad_predict = grad_model_train.predict(flight_prepared_train)
grad_R2 = r2_score(flight_train_labels, grad_predict)
grad_MAE = mean_absolute_error(flight_train_labels, grad_predict)
grad_RMSE = root_mean_squared_error(flight_train_labels, grad_predict)
# print("Running Cross Validation........")
# grad_CV = cross_val_score(grad_model_train, flight_prepared_train, flight_train_labels, scoring = "neg_root_mean_squared_error", cv = 10)

print("Gradient Boosting Regression Model for Training Data -:")
# print(pd.Series(grad_CV).agg(['count', 'mean', 'std']))
print(f"\nR2 Score:{grad_R2:.2f}")
print(f"MAE Score:{grad_MAE:.2f}")
print(f"RMSE Score:{grad_RMSE:.2f}")

Training the Model........
Gradient Boosting Regression Model for Training Data -:

R2 Score:0.95
MAE Score:2955.10
RMSE Score:4967.25


## Testing the Model - For the Best Training Model which is Gradient Boosting and Random Forest

### Preprocessing the Data

In [24]:
flight_test = strat_test_set.copy()
flight_test_features = flight_test.drop(['price', 'price_cat'], axis = 1)
flight_test_labels = flight_test['price']
flight_test_num = ['duration', 'days_left']
flight_test_cat = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
flight_prepared_test = full_pipeline.transform(flight_test_features)

### Testing the Model: Random Forest

In [26]:
forest_predict_test = forest_model_train.predict(flight_prepared_test)
forest_R2_test = r2_score(flight_test_labels, forest_predict_test)
forest_MAE_test = mean_absolute_error(flight_test_labels, forest_predict_test)
forest_RMSE_test = root_mean_squared_error(flight_test_labels, forest_predict_test)
# print("Running Cross Validation........")
# forest_CV = cross_val_score(forest_imodel_train, flight_prepared_train, flight_train_labels, scoring = "neg_root_mean_squared_error", cv = 10)

print("Random Forest Regression Model for Testing Data -:")
# print(pd.Series(forest_CV).agg(['count', 'mean', 'std']))
print(f"\nR2 Score:{forest_R2_test:.2f}")
print(f"MAE Score:{forest_MAE_test:.2f}")
print(f"RMSE Score:{forest_RMSE_test:.2f}")

Random Forest Regression Model for Testing Data -:

R2 Score:0.98
MAE Score:1766.13
RMSE Score:3377.25
