# Imports

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from datetime import datetime
# Fixed random seed; passed as `random_state` to the KFold splitter in the
# Linear Regression section so that the fold assignment is repeatable.
seed = 343

In [45]:
# Useful Functions
def cv_rmse(model, x, y, kfolds):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    x, y : features and target handed to ``cross_val_score``.
    kfolds : cross-validation splitter (or fold count), forwarded as ``cv``.

    Returns
    -------
    The square root of the negated ``"neg_mean_squared_error"`` scores.
    """
    # The scorer reports *negative* MSE (higher is better), so flip the sign
    # before taking the square root.
    neg_mse = cross_val_score(
        model, x, y, cv=kfolds, scoring="neg_mean_squared_error"
    )
    return np.sqrt(-neg_mse)

def get_total_minutes(td):
    """Convert an ``"H:M"`` duration string into a total number of minutes.

    ``td`` is split on ``':'`` and must yield exactly two integer fields
    (hours, then minutes); anything else raises ``ValueError``.
    """
    hours_text, minutes_text = td.split(':')
    return 60 * int(hours_text) + int(minutes_text)

# Preprocessing

In [46]:
base_df = pd.read_csv("flights.csv")

# Integer-code the categorical columns. One encoder is fitted per column and
# kept in `label_encoders`, so codes can be mapped back to labels later
# (`lb_encode` still ends up as the encoder of the last column).
# NOTE(review): LabelEncoder assigns arbitrary integer codes, which a linear
# model treats as an ordered numeric scale. One-hot encoding is usually the
# better fit for nominal features such as airline/airport — consider switching.
categorical_variables = ['AirlineName', 'AirportDest', 'AirportOrig']
label_encoders = {}
for col in categorical_variables:
    lb_encode = LabelEncoder()
    base_df[col] = lb_encode.fit_transform(base_df[col])
    label_encoders[col] = lb_encode

# Convert date-time strings to seconds since 1970-01-01. The wall-clock values
# are measured against a fixed epoch instead of `datetime.timestamp()`, which
# interprets naive datetimes in the *local* time zone of the machine running
# the notebook (including its DST rules) and is therefore not reproducible.
date_variables = ['ArrTime', 'DepTime']
epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)
for col in date_variables:
    parsed = pd.to_datetime(base_df[col], format="%Y/%m/%d-%H:%M")
    base_df[col] = (parsed - epoch) / one_second

# "H:M" duration strings -> total minutes.
base_df['FlightDuration'] = base_df['FlightDuration'].map(get_total_minutes)

# Seed the split so that the train/test partition (and every score derived
# from it) is identical on each Restart & Run All.
train_df, test_df = train_test_split(base_df, test_size=0.2, random_state=seed)
train_x, train_y = train_df.drop(['Price'], axis=1), train_df.Price
# Keep the hold-out target before it is dropped from `test_df`, otherwise the
# test set cannot be used for evaluation.
test_y = test_df.Price
test_df = test_df.drop(['Price'], axis=1)

# Linear Regression

In [49]:
# Baseline: ordinary least squares on robust-scaled features, scored by the
# mean RMSE over 50 shuffled folds of the training data.
kfolds = KFold(n_splits=50, shuffle=True, random_state=seed)
lr = LinearRegression()
benchmark_model = make_pipeline(RobustScaler(), lr)
benchmark_model.fit(train_x, train_y)
np.mean(cv_rmse(benchmark_model, train_x, train_y, kfolds))

778.9261502833601