## Model - Neural Nets

### 1. Feature Engineering (continuing after data cleaning)

In [1]:
import pandas as pd
df = pd.read_pickle("../../data/processed/data_cleaned.pkl.gz",compression='gzip')

In [2]:
df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,segmentsDepartureTimeEpochSeconds_Leg1,...,segmentsDurationInSeconds_Leg4,segmentsDistance_Leg1,segmentsDistance_Leg2,segmentsDistance_Leg3,segmentsDistance_Leg4,segmentsCabinCode_Leg1,segmentsCabinCode_Leg2,segmentsCabinCode_Leg3,segmentsCabinCode_Leg4,traveltime_hours
0,2022-04-16,2022-04-17,ATL,BOS,False,False,True,248.600006,947.0,1650215000.0,...,0.0,947.0,0.0,0.0,0.0,coach,0,0,0,2.483333
1,2022-04-16,2022-04-17,ATL,BOS,False,False,True,248.600006,947.0,1650191000.0,...,0.0,947.0,0.0,0.0,0.0,coach,0,0,0,2.5
2,2022-04-16,2022-04-17,ATL,BOS,False,False,True,248.600006,947.0,1650210000.0,...,0.0,947.0,0.0,0.0,0.0,coach,0,0,0,2.5
3,2022-04-16,2022-04-17,ATL,BOS,False,False,True,248.600006,947.0,1650218000.0,...,0.0,947.0,0.0,0.0,0.0,coach,0,0,0,2.533333
4,2022-04-16,2022-04-17,ATL,BOS,False,False,True,248.600006,947.0,1650204000.0,...,0.0,947.0,0.0,0.0,0.0,coach,0,0,0,2.566667


In [3]:
df['flightDate'].min(), df['flightDate'].max()

('2022-04-17', '2022-07-17')

In [4]:
df['segmentsCabinCode_Leg1'].value_counts()

coach            13472653
premium coach       22826
first               21120
business             3400
Name: segmentsCabinCode_Leg1, dtype: int64

In [5]:
# convert departure date to pandas datetime type
df['flightDate'] = pd.to_datetime(df['flightDate'])

### Cleaning time column

In [6]:
from dateutil import parser

# column containing parsed datetime strings as datetime.datetime objects
df['segmentsDepartureTimeRaw_Leg1'] = df['segmentsDepartureTimeRaw_Leg1'].apply(parser.parse)

# Extract 'H:M' format and store in a new column
df['departure_time'] = df['segmentsDepartureTimeRaw_Leg1'].apply(lambda x: x.strftime('%H:%M'))

In [7]:
# Rounding the 'H:M' formatted time to the nearest 30-minute interval
df['departure_time'] = pd.to_datetime(df['departure_time']).dt.round('30min').dt.strftime('%H:%M')

### Taking average price for all the flights for same dates, similar journey and timings

In [8]:
features = ['flightDate', 'startingAirport', 'destinationAirport', 'departure_time', 'segmentsCabinCode_Leg1']
target = 'totalFare'

# Grouping and calculating the average total fare for similar flights
grouped_data = df.groupby(features)[target].mean().reset_index()

In [9]:
grouped_data.dropna(subset=['totalFare'], inplace=True)

In [10]:
grouped_data.shape

(518989, 6)

In [11]:
grouped_data.head()

Unnamed: 0,flightDate,startingAirport,destinationAirport,departure_time,segmentsCabinCode_Leg1,totalFare
189,2022-04-17,ATL,BOS,05:30,coach,271.589996
193,2022-04-17,ATL,BOS,06:00,coach,252.600006
197,2022-04-17,ATL,BOS,06:30,coach,248.600006
201,2022-04-17,ATL,BOS,07:00,coach,251.100006
213,2022-04-17,ATL,BOS,08:30,coach,251.100006


In [13]:
grouped_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 518989 entries, 189 to 3956561
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   flightDate              518989 non-null  datetime64[ns]
 1   startingAirport         518989 non-null  category      
 2   destinationAirport      518989 non-null  category      
 3   departure_time          518989 non-null  object        
 4   segmentsCabinCode_Leg1  518989 non-null  category      
 5   totalFare               518989 non-null  float32       
dtypes: category(3), datetime64[ns](1), float32(1), object(1)
memory usage: 15.3+ MB


In [14]:
grouped_data['flightDate'].min(), grouped_data['flightDate'].max()

(Timestamp('2022-04-17 00:00:00'), Timestamp('2022-07-17 00:00:00'))

In [15]:
# rename column to cabin_type
grouped_data.rename(columns = {'segmentsCabinCode_Leg1':'cabin_type'}, inplace = True)

In [16]:
# Save the processed data file to csv
grouped_data.to_csv("../../data/processed/data_processed.csv", index=False)

### Extracting date and time features
#### Reading saved data

In [1]:
import pandas as pd
flight_data = pd.read_csv("../../data/processed/data_processed.csv")

In [2]:
## convert into pandas data type
dtype_mapping = {'startingAirport': 'category',
        'destinationAirport': 'category',
        'totalFare': 'float32',
        'cabin_type': 'category'}
flight_data = flight_data.astype(dtype_mapping)

In [3]:
import sys
sys.path.append("../../src/features")

# import function from build_features python script 
from build_features import get_date_features, get_time_features
flight_data = get_date_features(flight_data)
flight_data = get_time_features(flight_data)

In [4]:
flight_data.head()

Unnamed: 0,startingAirport,destinationAirport,cabin_type,totalFare,month,day,weekday,departure_time_sin,departure_time_cos,departure_time_category
0,ATL,BOS,coach,271.589996,4,17,6,0.965926,0.258819,night
1,ATL,BOS,coach,252.600006,4,17,6,1.0,6.123234000000001e-17,night
2,ATL,BOS,coach,248.600006,4,17,6,1.0,6.123234000000001e-17,night
3,ATL,BOS,coach,251.100006,4,17,6,0.965926,-0.258819,morning
4,ATL,BOS,coach,251.100006,4,17,6,0.866025,-0.5,morning


In [7]:
flight_data.describe()

Unnamed: 0,totalFare,month,day,weekday,departure_time_sin,departure_time_cos
count,518989.0,518989.0,518989.0,518989.0,518989.0,518989.0
mean,409.114044,5.518647,15.690963,3.000399,-0.046502,-0.3360548
std,268.047119,0.921477,8.733479,2.010741,0.739494,0.5814252
min,23.969999,4.0,1.0,0.0,-1.0,-1.0
25%,249.02858,5.0,8.0,1.0,-0.707107,-0.8660254
50%,359.31427,5.0,16.0,3.0,-0.258819,-0.5
75%,499.096008,6.0,23.0,5.0,0.707107,6.123234000000001e-17
max,8260.610352,7.0,31.0,6.0,1.0,1.0


### 2. Model building

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Splitting into features and target
X = flight_data.drop('totalFare', axis=1)
y = flight_data['totalFare']

# Label encoding categorical variables
label_encoders = {}
for col in ['startingAirport', 'destinationAirport', 'cabin_type', 'departure_time_category']:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True, random_state=23)

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 2.1 Accessing Baseline

In [11]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Calculate the mean of totalFare
mean_total_fare = y_train.mean()

# Create a list with the mean value to match the length of the test set
baseline_predictions = [mean_total_fare] * len(X_test)

# Calculate the Mean Absolute Error of the baseline model
rmse_baseline = np.sqrt(mean_squared_error(y_test, baseline_predictions))
print(f"Root Mean Squared Error of the baseline model: {rmse_baseline}")

Root Mean Squared Error of the baseline model: 268.4026184082031
