# SkyInsight: Predictive Analytics for Cost-Effective Air Travel"

by: Laundry Houston, Mark Dunlea Tate, Anthony Amadasun
 

---

### Introduction

**Imports/Load datasets**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeCV, LassoCV, Lasso, Ridge 
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import pickle

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

In [2]:
df = pd.read_csv('../data/clean_dataset.csv')
df.head()

Unnamed: 0,airline,flight,origin,departure_time,stops,arrival_time,class,duration,price,duration_range,departure_time_encoded,duration_range_encoded,destination_Bangalore,destination_Chennai,destination_Delhi,destination_Hyderabad,destination_Kolkata,destination_Mumbai
0,SpiceJet,SG-8709,Delhi,Evening,0,Night,0,130,71.44,Short,2,0,0,0,0,0,0,1
1,SpiceJet,SG-8157,Delhi,Early_Morning,0,Morning,0,140,71.44,Short,1,0,0,0,0,0,0,1
2,AirAsia,I5-764,Delhi,Early_Morning,0,Early_Morning,0,130,71.47,Short,1,0,0,0,0,0,0,1
3,Vistara,UK-995,Delhi,Morning,0,Afternoon,0,135,71.46,Short,4,0,0,0,0,0,0,1
4,Vistara,UK-963,Delhi,Morning,0,Morning,0,140,71.46,Short,4,0,0,0,0,0,0,1


---

### 2.1 Feature Engineering

In [3]:
df.shape

(300153, 18)

In [4]:
df.dtypes

airline                    object
flight                     object
origin                     object
departure_time             object
stops                       int64
arrival_time               object
class                       int64
duration                    int64
price                     float64
duration_range             object
departure_time_encoded      int64
duration_range_encoded      int64
destination_Bangalore       int64
destination_Chennai         int64
destination_Delhi           int64
destination_Hyderabad       int64
destination_Kolkata         int64
destination_Mumbai          int64
dtype: object

In [5]:
#additional one hot encoding
ohe_columns = ['airline', 'origin', 'arrival_time', 'departure_time', 'duration_range']
ohe_df = pd.get_dummies(df, columns=ohe_columns, prefix=ohe_columns, dtype=int, drop_first=True)

In [6]:
ohe_df.shape

(300153, 36)

In [7]:
ohe_df.columns

Index(['flight', 'stops', 'class', 'duration', 'price',
       'departure_time_encoded', 'duration_range_encoded',
       'destination_Bangalore', 'destination_Chennai', 'destination_Delhi',
       'destination_Hyderabad', 'destination_Kolkata', 'destination_Mumbai',
       'airline_Air_India', 'airline_GO_FIRST', 'airline_Indigo',
       'airline_SpiceJet', 'airline_Vistara', 'origin_Chennai', 'origin_Delhi',
       'origin_Hyderabad', 'origin_Kolkata', 'origin_Mumbai',
       'arrival_time_Early_Morning', 'arrival_time_Evening',
       'arrival_time_Late_Night', 'arrival_time_Morning', 'arrival_time_Night',
       'departure_time_Early_Morning', 'departure_time_Evening',
       'departure_time_Late_Night', 'departure_time_Morning',
       'departure_time_Night', 'duration_range_Medium', 'duration_range_Short',
       'duration_range_Very Long'],
      dtype='object')

---

### 2.2 Linear Regression Algorithm

In [8]:
X = ohe_df.drop(columns=['flight', 'price'])#'departure_time', 'duration_range'
y = ohe_df['price']

In [9]:
poly = PolynomialFeatures()
X_poly = poly.fit_transform(X)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, 
                                                    random_state=42)

In [11]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [12]:
print(f"Training Score: {lr.score(X_train, y_train):.4f}")
print(f"Testing Score: {lr.score(X_test, y_test):.4f}")

y_poly_pred = lr.predict(X_test)

print(f"RMSE Score: {np.sqrt(mean_squared_error(y_test, y_poly_pred))}")

Training Score: 0.9428
Testing Score: 0.9420
RMSE Score: 65.63709916908026


**Interpretation:**

The provided results suggest that the model performs well in terms of R-squared on both the training and testing sets, indicating that the model explains a significant portion of the variability in flight prices based on the selected features. The Training Score of 0.9428 shows that the model fits the training data welll and the Testing Score of 0.9420 shows that the model generalizes well to new, unseen data. The lower RMSE score of 65.63 indicates that the model predictions are close to the actual prices and suggest that the selected features, preprocessing, and cleaning methods contribute to accurate predictions of flight prices.





---

### 2.3 Model Training

**Random Forest**

In [25]:
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

In [28]:
print(f"Training Score: {rf.score(X_train, y_train):.4f}")
print(f"Testing Score: {rf.score(X_test, y_test):.4f}")

y_pred = rf.predict(X_test)

print(f"RMSE Score: {np.sqrt(mean_squared_error(y_test, y_pred))}")

Training Score: 0.9793
Testing Score: 0.9754
RMSE Score: 42.754946001290214


In [None]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42, n_jobs=-1), param_grid, cv=5)
grid_search_rf.fit(X_train, y_train)

print("Best Parameters:", grid_search_rf.best_params_)

best_rf = grid_search_rf.best_estimator_
print(f"Training Score: {best_rf.score(X_train, y_train):.4f}")
print(f"Testing Score: {best_rf.score(X_test, y_test):.4f}")

y_pred_best_rf = best_rf.predict(X_test)
print(f"RMSE Score: {np.sqrt(mean_squared_error(y_test, y_pred_best_rf))}")

In [32]:
with open('../models/random_forest_model.pkl', 'wb') as f:
    pickle.dump(grid_search_rf, f)

NameError: name 'pickle' is not defined

**Interpretation**

In [29]:
dt = DecisionTreeRegressor(max_depth=3, min_samples_split=10,
                           min_samples_leaf=5)
dt.fit(X_train, y_train)


In [30]:
print(f"Training Score: {dt.score(X_train, y_train):.4f}")
print(f"Testing Score: {dt.score(X_test, y_test):.4f}")

y_pred = dt.predict(X_test)

print(f"RMSE Score: {np.sqrt(mean_squared_error(y_test, y_pred))}")

Training Score: 0.9291
Testing Score: 0.9283
RMSE Score: 72.94028407664383


In [31]:
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

grid_search_dt = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
grid_search_dt.fit(X_train, y_train)


print("Best Parameters:", grid_search_dt.best_params_)

best_dt = grid_search_dt.best_estimator_
print(f"Training Score: {best_dt.score(X_train, y_train):.4f}")
print(f"Testing Score: {best_dt.score(X_test, y_test):.4f}")

y_pred_best = best_dt.predict(X_test)
print(f"RMSE Score: {np.sqrt(mean_squared_error(y_test, y_pred_best))}")

Best Parameters: {'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 2}
Training Score: 0.9450
Testing Score: 0.9439
RMSE Score: 64.53416228179876


In [None]:
   
with open('../models/decision_tree_model.pkl', 'wb') as f:
    pickle.dump(grid_search_dt, f)

**Gradient Boosting:**

In [None]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

In [13]:
#
# path difference b/c solution code:
#with open('../../models/author_pipe.pkl', 'wb') as pickle_out:
   # pickle_out = pickle.dump(pipe, pickle_out)

In [14]:
# #lasso regression
# lasso_params = {'alpha': [0.01, 0.1, 1, 10],
#                'max_iter': [1000, 2000, 3000]}
# lasso_grid = GridSearchCV(Lasso(), lasso_params, cv=5)
# lasso_grid.fit(X_train, y_train)
# best_lasso = lasso_grid.best_estimator_


In [15]:
# #ridge regression
# ridge_params = {'alpha': [0.01, 0.1, 1, 10]}
# ridge_grid = GridSearchCV(Ridge(), ridge_params, cv=5)
# ridge_grid.fit(X_train, y_train)
# best_ridge = ridge_grid.best_estimator_

---

### 2.4 Model Evaluation

In [16]:
# import folium
# import plotly.express as px

In [17]:
# file_path = '/Users/aamad_000/Downloads/India Cities LatLng.csv'

# df_city_coordinates = pd.read_csv(file_path)

# df_city_coordinates.head()

In [18]:
X_test_df = pd.DataFrame(X_test)
X_test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,620,621,622,623,624,625,626,627,628,629
0,1.0,1.0,0.0,1185.0,4.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,1.0,590.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,630.0,4.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,870.0,5.0,2.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,495.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75034,1.0,1.0,0.0,1455.0,2.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75035,1.0,0.0,0.0,175.0,5.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75036,1.0,1.0,0.0,1040.0,2.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75037,1.0,1.0,1.0,440.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [19]:

# city_df = pd.DataFrame(ohe_df[['destination_Bangalore', 'destination_Chennai', 'destination_Delhi',
#                                 'destination_Hyderabad', 'destination_Kolkata', 'destination_Mumbai', 'price']])

# result_df = pd.concat([city_df, df_city_coordinates], axis=1)


In [20]:
# result_df.head()

In [21]:
# fig = px.scatter(x=result_df['lng'], y=result_df['lat'], color=result_df['price'], title='Predicted Prices on Map')
# fig.show()
# # Assuming 'fig' is your Plotly Express figure
# fig.write_html("predicted_prices_map.html")

In [22]:
# fig.write_html("predicted_prices_map.html", auto_open=True)


In [23]:
# #Visualize the predictions using Plotly Express
# fig = px.scatter(x=X_test['longitude'], y=X_test['latitude'], color=y_poly_pred, title='Predicted Prices on Map')
# fig.show()