# Preprocess

In [1]:
# Flight fare prediction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Notebook display options: cap printed rows at 30, never truncate columns.
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)

# Preprocessing
# Load the training workbook, then discard rows with any missing value and
# exact duplicate rows (the first occurrence of each duplicate is kept).
# The original row labels are retained; the index is not reset.
df = (
    pd.read_excel("Data_Train.xlsx")
    .dropna()
    .drop_duplicates(keep='first')
)

# Split the departure and arrival clock times into separate hour / minute
# columns, then discard the raw text column. Each column is parsed once and
# both components are read from the same parsed Series.
for raw_col, hour_col, minute_col in (
    ("Dep_Time", "Dep_hour", "Dep_minute"),
    ("Arrival_Time", "Arr_hour", "Arr_minute"),
):
    parsed_time = pd.to_datetime(df[raw_col])
    df[hour_col] = parsed_time.dt.hour
    df[minute_col] = parsed_time.dt.minute
    df = df.drop(columns=raw_col)

# Break Date_of_Journey (day/month/year text) into numeric year, month and
# day columns, then drop the original text column.
journey_date = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")

# NOTE(review): the original author doubted that the year column is needed;
# it is kept here so the feature set is unchanged.
df = df.assign(
    Date_of_Journey_Year=journey_date.dt.year,
    Date_of_Journey_Month=journey_date.dt.month,
    Date_of_Journey_Day=journey_date.dt.day,
).drop(columns="Date_of_Journey")


# convert Duration into minute
def _duration_to_minutes(durations):
    """Convert duration strings such as "2h 50m", "19h" or "45m" to minutes.

    The hour and minute parts are pulled out with regular expressions and
    combined arithmetically. This replaces the previous approach of rewriting
    each string into a Python expression and passing it to ``eval``, which
    would execute whatever text happened to be in the spreadsheet cell.

    Parameters
    ----------
    durations : pd.Series
        String Series; each value holds an ``<n>h`` part, an ``<n>m`` part,
        or both.

    Returns
    -------
    pd.Series
        int64 Series of total minutes, aligned on the input index.

    Raises
    ------
    ValueError
        If a value contains neither an hour part nor a minute part.
    """
    # A missing part comes back as NaN and is treated as zero below.
    hours = pd.to_numeric(durations.str.extract(r'(\d+)\s*h', expand=False))
    minutes = pd.to_numeric(durations.str.extract(r'(\d+)\s*m', expand=False))

    unparsed = hours.isna() & minutes.isna()
    if unparsed.any():
        examples = durations[unparsed].unique()[:5].tolist()
        raise ValueError(f"Unrecognised Duration value(s): {examples}")

    return (hours.fillna(0) * 60 + minutes.fillna(0)).astype('int64')


df['Duration'] = _duration_to_minutes(df['Duration'])

# Additional_Info and Route are not used as model inputs; remove both at once.
df = df.drop(columns=['Additional_Info', 'Route'])

# Encode categorical attributes
# Nominal columns are one-hot encoded; the first level of each column is
# dropped so it serves as the reference category.
categorical_columns = ['Airline', 'Source', 'Destination']
df_categorical = pd.get_dummies(df[categorical_columns], drop_first=True)
df = df.drop(columns=categorical_columns)

# Total_Stops is ordinal, so it is mapped to the number of stops instead.
# Labels missing from the mapping become NaN.
stops_to_count = {"non-stop":0, "1 stop": 1, "2 stops": 2, "3 stops":3, "4 stops": 4}
df['Total_Stops'] = df['Total_Stops'].map(stops_to_count)

# Price is the target variable: keep it aside and leave it out of the
# feature matrix (numeric columns first, then the dummy columns).
target = df['Price']
preprocessed = pd.concat([df, df_categorical], axis=1).drop(columns=['Price'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
splits = train_test_split(
    preprocessed,
    target,
    test_size=0.3,
    random_state=42,
)
data_train, data_test, target_train, target_test = splits



In [2]:
# Work on independent copies so the polynomial-regression experiments cannot
# modify the shared train/test splits.
data_train_poly, data_test_poly, target_train_poly, target_test_poly = (
    split.copy()
    for split in (data_train, data_test, target_train, target_test)
)

display(data_train_poly.head())

Unnamed: 0,Duration,Total_Stops,Dep_hour,Dep_minute,Arr_hour,Arr_minute,Date_of_Journey_Year,Date_of_Journey_Month,Date_of_Journey_Day,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_Multiple carriers Premium economy,Airline_SpiceJet,Airline_Trujet,Airline_Vistara,Airline_Vistara Premium economy,Source_Chennai,Source_Delhi,Source_Kolkata,Source_Mumbai,Destination_Cochin,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_New Delhi
5050,1830,3,16,45,23,15,2019,4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4879,480,1,13,0,21,0,2019,6,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0
208,570,1,14,5,23,35,2019,5,21,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
8561,930,1,10,0,1,30,2019,6,9,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0
7265,165,0,22,10,0,55,2019,3,21,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


# Polynomial Regression
## [F regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression)

In [3]:
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features to every degree-2 term (squares and pairwise
# products, no bias column).
transformer = PolynomialFeatures(degree=2, include_bias=False)
poly_train = transformer.fit_transform(data_train_poly)

# Univariate F-test of each expanded feature against the target.
f, pval = f_regression(poly_train, target_train_poly)

# Collect the statistics in one table, one row per expanded feature, with
# p-values rounded to two decimals for readability.
stat = pd.DataFrame({
    'feature': transformer.get_feature_names_out(data_train_poly.columns),
    'F value': f,
    'p value': pval,
})
stat['p value'] = stat['p value'].round(2)

display(stat)

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means ** 2)
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom


Unnamed: 0,feature,F value,p value
0,Duration,2418.001683,0.00
1,Total_Stops,4080.632555,0.00
2,Dep_hour,1.997667,0.16
3,Dep_minute,4.815703,0.03
4,Arr_hour,7.350185,0.01
...,...,...,...
459,Destination_Hyderabad Destination_Kolkata,,
460,Destination_Hyderabad Destination_New Delhi,,
461,Destination_Kolkata^2,226.644207,0.00
462,Destination_Kolkata Destination_New Delhi,,


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, SelectFwe
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Degree-2 polynomial expansion -> F-test feature filter (SelectFwe with
# alpha=0.05) -> ordinary least squares.
transformer = PolynomialFeatures(degree=2, include_bias=False)
best = SelectFwe(f_regression, alpha=0.05)
estimator = LinearRegression()

pipeline = Pipeline(steps=[
    ('transformer', transformer),
    ('feature_selection', best),
    ('estimator', estimator),
])

# Fit on the training split and predict the held-out split in one chain
# (Pipeline.fit returns the fitted pipeline itself).
predictions = pipeline.fit(data_train_poly, target_train_poly).predict(data_test_poly)

# Evaluate the predictions on the test split.
mse = mean_squared_error(target_test_poly, predictions)
r2 = r2_score(target_test_poly, predictions)

for label, value in (("MSE:", mse), ("RMSE:", sqrt(mse)), ("R^2:", r2)):
    print(label, value)


  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means ** 2)
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom


MSE: 6916564.187417173
RMSE: 2629.9361565287422
R^2: 0.6669699665191691


In [5]:
# Boolean mask over the expanded polynomial features: True where the
# feature-selection step of the fitted pipeline kept the feature.
selected_features = pipeline.named_steps['feature_selection'].get_support()

# Names of all expanded features, in the same order as the mask.
feature_names = pipeline.named_steps['transformer'].get_feature_names_out(
    data_train_poly.columns)

# Print the retained features, one per line. The loop variable is not called
# `f`, so the F-statistic array `f` computed by the F-test cell is not
# overwritten.
for feature_name in feature_names[selected_features]:
    print(feature_name)

Duration
Total_Stops
Arr_minute
Date_of_Journey_Month
Date_of_Journey_Day
Airline_Air India
Airline_GoAir
Airline_IndiGo
Airline_Jet Airways
Airline_Jet Airways Business
Airline_Multiple carriers
Airline_SpiceJet
Airline_Vistara
Source_Chennai
Source_Delhi
Source_Mumbai
Destination_Cochin
Destination_Delhi
Destination_Hyderabad
Destination_Kolkata
Destination_New Delhi
Duration^2
Duration Total_Stops
Duration Dep_hour
Duration Dep_minute
Duration Arr_hour
Duration Arr_minute
Duration Date_of_Journey_Year
Duration Date_of_Journey_Month
Duration Date_of_Journey_Day
Duration Airline_Air India
Duration Airline_GoAir
Duration Airline_IndiGo
Duration Airline_Jet Airways
Duration Airline_Jet Airways Business
Duration Airline_Multiple carriers
Duration Airline_SpiceJet
Duration Source_Chennai
Duration Source_Delhi
Duration Source_Kolkata
Duration Destination_Cochin
Duration Destination_Delhi
Duration Destination_Kolkata
Duration Destination_New Delhi
Total_Stops^2
Total_Stops Dep_hour
Total_St