# Flights Capstone Modeling



In [51]:
! pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/13/94/f73d4efcc9a0272ea9f93c03f4744a2b709172309cd0bfde1e9012776330/xgboost-2.0.1-py3-none-macosx_12_0_arm64.whl.metadata
  Downloading xgboost-2.0.1-py3-none-macosx_12_0_arm64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.1-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.1


In [52]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression
import time
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor

In [2]:
# Read the four pre-split CSVs in one pass; the first column of each file is used
# as the row index.
X_train_scaled, X_test_scaled, y_train, y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'flights_X_train_scaled.csv',
        'flights_X_test_scaled.csv',
        'flights_y_train.csv',
        'flights_y_test.csv',
    )
)

# Dummy Regressor
## What if we just guess the mean? 

Baseline to compare with later

In [3]:
# Column-wise mean of the training target: the value a mean-only baseline predicts.
y_train_mean = y_train.mean(axis=0)
y_train_mean

price_usd    255.485841
dtype: float64

In [4]:
# Baseline that ignores the features and always predicts the training-set mean.
dum_reg = DummyRegressor(strategy='mean').fit(X_train_scaled, y_train)
# The fitted constant should match y_train.mean().
dum_reg.constant_

array([[255.48584134]])

In [5]:
# Baseline predictions for the training split (the same constant for every row).
y_train_pred = dum_reg.predict(X=X_train_scaled)
y_train_pred

array([255.48584134, 255.48584134, 255.48584134, ..., 255.48584134,
       255.48584134, 255.48584134])

In [6]:
# R^2 of the mean-only baseline on the training split.
dum_reg.score(X=X_train_scaled, y=y_train)

0.0

In [7]:
# R^2 of the mean-only baseline on the held-out test split.
dum_reg.score(X=X_test_scaled, y=y_test)

-2.5818853610637404e-06

## Metrics for the Dummy Regressor

As expected, this would be a terrible way to predict ticket price. Here are all of the common metrics for this dummy regressor model. We can use these values to gauge how much better our other models are than this dummy model. 

In [8]:
# Baseline predictions for the test split, reused by the metric cells below.
y_test_pred = dum_reg.predict(X=X_test_scaled)

In [9]:
# Baseline R^2 as a (train, test) pair.
(
    r2_score(y_true=y_train, y_pred=y_train_pred),
    r2_score(y_true=y_test, y_pred=y_test_pred),
)

(0.0, -2.5818853610637404e-06)

In [10]:
# Baseline mean absolute error as a (train, test) pair.
(
    mean_absolute_error(y_true=y_train, y_pred=y_train_pred),
    mean_absolute_error(y_true=y_test, y_pred=y_test_pred),
)

(241.68521330855492, 241.47284821915693)

In [11]:
# Baseline mean squared error as a (train, test) pair.
(
    mean_squared_error(y_true=y_train, y_pred=y_train_pred),
    mean_squared_error(y_true=y_test, y_pred=y_test_pred),
)

(77102.76711131452, 76792.4956587832)

In [12]:
# Baseline RMSE as a (train, test) pair. The square root is taken explicitly because
# the `squared` argument of mean_squared_error is deprecated in scikit-learn 1.4 and
# removed in 1.6; np.sqrt works on every version.
np.sqrt(mean_squared_error(y_train, y_train_pred)), np.sqrt(mean_squared_error(y_test, y_test_pred))

(277.6738502475783, 277.1145894008166)

# Linear Regression Models



In [13]:
# IPython-only `?` syntax: opens the docstring for LinearRegression.score in the pager.
# This is a development-time lookup with no effect on the analysis; it can be deleted
# before the notebook is shared.
LinearRegression.score?

In [14]:
# Ordinary least squares on every scaled feature, then predictions for both splits.
lin_reg = LinearRegression().fit(X_train_scaled, y_train)
y_train_pred_lin, y_test_pred_lin = (
    lin_reg.predict(X_train_scaled),
    lin_reg.predict(X_test_scaled),
)

In [15]:
# Linear-regression R^2 as a (train, test) pair.
(
    r2_score(y_true=y_train, y_pred=y_train_pred_lin),
    r2_score(y_true=y_test, y_pred=y_test_pred_lin),
)

(0.9254824492306011, -1.3165964551481275e+20)

In [16]:
# Linear-regression mean absolute error as a (train, test) pair.
(
    mean_absolute_error(y_true=y_train, y_pred=y_train_pred_lin),
    mean_absolute_error(y_true=y_test, y_pred=y_test_pred_lin),
)

(51.38056278820564, 15715810684.837727)

In [17]:
# Linear-regression mean squared error as a (train, test) pair.
(
    mean_squared_error(y_true=y_train, y_pred=y_train_pred_lin),
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_lin),
)

(5745.509362678522, 1.0110446652618985e+25)

In [18]:
# Linear-regression RMSE as a (train, test) pair. The square root is taken explicitly
# because the `squared` argument of mean_squared_error is deprecated in scikit-learn 1.4
# and removed in 1.6.
np.sqrt(mean_squared_error(y_train, y_train_pred_lin)), np.sqrt(mean_squared_error(y_test, y_test_pred_lin))

(75.79913827134529, 3179692855075.626)

## Observations

The scores for the test set are worse than the scores in the dummy regressor, but training scores improved. This is definitely not our model. In the guided capstone they use SelectKBest to subset the features used and hopefully improve the model. 

In [19]:
# Univariate feature selection (F-test scores) feeding a linear regression.
# `k` is not passed, so SelectKBest keeps its library-default number of features.
pipe = make_pipeline(
    SelectKBest(score_func=f_regression),
    LinearRegression(),
)

In [20]:
# Check the target's dimensionality: y_train is a one-column DataFrame (n_rows, 1),
# which is why the pipeline fit below passes the `price_usd` column instead.
y_train.shape

(225195, 1)

In [21]:
# Fit selector + regressor on the 1-D price column rather than the one-column DataFrame.
pipe.fit(X_train_scaled, y_train['price_usd'])

In [22]:
# Predictions from the SelectKBest pipeline for both splits.
# NOTE: this rebinds the names that previously held the plain LinearRegression predictions.
y_train_pred_lin, y_test_pred_lin = (
    pipe.predict(X_train_scaled),
    pipe.predict(X_test_scaled),
)

In [23]:
# SelectKBest + linear regression R^2 as a (train, test) pair.
(
    r2_score(y_true=y_train, y_pred=y_train_pred_lin),
    r2_score(y_true=y_test, y_pred=y_test_pred_lin),
)

(0.9061065842786092, 0.9062376038300555)

In [24]:
# SelectKBest + linear regression mean absolute error as a (train, test) pair.
(
    mean_absolute_error(y_true=y_train, y_pred=y_train_pred_lin),
    mean_absolute_error(y_true=y_test, y_pred=y_test_pred_lin),
)

(55.785391049912924, 55.570766223643034)

In [25]:
# SelectKBest + linear regression mean squared error as a (train, test) pair.
(
    mean_squared_error(y_true=y_train, y_pred=y_train_pred_lin),
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_lin),
)

(7239.442165652228, 7200.229810669629)

In [26]:
# SelectKBest + linear regression RMSE as a (train, test) pair. The square root is taken
# explicitly because the `squared` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_train, y_train_pred_lin)), np.sqrt(mean_squared_error(y_test, y_test_pred_lin))

(85.08491150405122, 84.85416790393757)

## Observations

These scores are much better. Maybe we should include SelectKBest with other models as well. I was surprised that the scores for the test data are actually slightly better than the scores for the training data...I'm not sure why that would be. We could try setting the value for k, but I'll wait and see how other models perform first. 

# Lasso and Ridge Models

Next I wanted to try Lasso and Ridge models, but we need to pick values for alpha. I'm going to test out some different values to see how they change the score. 

In [49]:
# Sweep a few L1 penalty strengths and collect the test-split R^2 for each, in order.
# NOTE: the alphas are compared on the test split here; pick the final alpha with
# cross-validation on the training data before reporting a test score.
lasso_scores = []
for alpha in (1, 5, 10, 20):
    lasso = Lasso(alpha=alpha).fit(X_train_scaled, y_train)
    y_pred = lasso.predict(X_test_scaled)
    score = r2_score(y_true=y_test, y_pred=y_pred)
    lasso_scores += [score]

In [50]:
# Test-split R^2 per Lasso fit, in the order of the alphas swept above: 1, 5, 10, 20.
print(lasso_scores)

[0.9199797921974991, 0.9060540930116233, 0.901156052794979, 0.8879319119850346]


In [47]:
# Sweep a few L2 penalty strengths and collect the test-split R^2 for each, in order.
# NOTE: the alphas are compared on the test split here; pick the final alpha with
# cross-validation on the training data before reporting a test score.
scores = []
for alpha in (10, 50, 100, 1000):
    ridge = Ridge(alpha=alpha).fit(X_train_scaled, y_train)
    y_pred = ridge.predict(X_test_scaled)
    score = r2_score(y_true=y_test, y_pred=y_pred)
    scores += [score]

In [48]:
# Test-split R^2 per Ridge fit, in the order of the alphas swept above: 10, 50, 100, 1000.
print(scores)

[0.9246842784907655, 0.9246849807932151, 0.9246854883857387, 0.9246661032874668]


## Observations

My best scores were Lasso(alpha = 1) and Ridge(alpha = 100). If one of these winds up being our best model, I'll run CV to select alpha. 

# Random Forest Model

Next I wanted to try RandomForestRegressor. First I'll try it with the default settings to see what we get. 

In [39]:
# Fit a random forest with otherwise-default settings and time the whole
# fit -> predict -> score round trip.
start = time.time()
# Fixed seed: the forest's bootstrap samples and feature draws are random, so without
# it the reported scores change on every Restart & Run All.
rf = RandomForestRegressor(random_state = 42)
# Pass the price column as a 1-D Series rather than the one-column DataFrame.
rf.fit(X_train_scaled, y_train.price_usd)
y_pred_rf = rf.predict(X_test_scaled)
score = r2_score(y_test, y_pred_rf)
print(score)
executionTime = (time.time() - start)
print('Execution time in seconds: ' + str(executionTime))

0.9905783194859459
Execution time in seconds: 750.976350069046


In [40]:
# Random forest mean absolute error on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)

10.062227260595428

In [41]:
# Random forest RMSE on the test split. The square root is taken explicitly because the
# `squared` argument of mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, y_pred_rf))

26.898187521608733

## Observations

These are really high scores. This is the best model so far. I wonder if it's overfitting or if I should mess with parameter settings? 

# XGBoost Models

In [54]:
# IPython-only `?` syntax: opens the XGBRegressor docstring in the pager.
# This is a development-time lookup with no effect on the analysis; it can be deleted
# before the notebook is shared.
XGBRegressor?

### Tree Base Learners

In [58]:
# Gradient-boosted trees through the scikit-learn wrapper, timed from fit through scoring.
start = time.time()
xg_reg = XGBRegressor(objective='reg:squarederror').fit(X_train_scaled, y_train)
y_pred_xg = xg_reg.predict(X_test_scaled)
score = r2_score(y_true=y_test, y_pred=y_pred_xg)
print(score)
executionTime = time.time() - start
print('Execution time in seconds: ' + str(executionTime))

0.9771675993941723
Execution time in seconds: 11.939765930175781


In [59]:
# XGBoost (tree learners) mean absolute error on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred_xg)

24.216309484062002

In [60]:
# XGBoost (tree learners) RMSE on the test split. The square root is taken explicitly
# because the `squared` argument of mean_squared_error is deprecated in scikit-learn 1.4
# and removed in 1.6.
np.sqrt(mean_squared_error(y_test, y_pred_xg))

41.87305216302573

## Observations

This is much faster than the Random Forest Model, but the scores are not as good. Maybe tuning will improve the scores. I also want to try XGBoost with linear base learners. 

### Linear Base Learners

In [61]:
# XGBoost native API with linear (gblinear) base learners, timed end to end.
# The number of boosting rounds is not passed, so xgb.train uses its default.
start = time.time()
params = dict(booster='gblinear', objective='reg:squarederror')
DM_train = xgb.DMatrix(data=X_train_scaled, label=y_train)
DM_test = xgb.DMatrix(data=X_test_scaled, label=y_test)
# NOTE: this rebinds `xg_reg`, which earlier held the tree-based XGBRegressor.
xg_reg = xgb.train(params=params, dtrain=DM_train)
preds = xg_reg.predict(DM_test)
score = r2_score(y_true=y_test, y_pred=preds)
print(score)
executionTime = time.time() - start
print('Execution time in seconds: ' + str(executionTime))

0.9243776169041054
Execution time in seconds: 29.132174730300903


In [62]:
# XGBoost (linear learners) mean absolute error on the test split.
mean_absolute_error(y_true=y_test, y_pred=preds)

51.384494647561866

In [63]:
# XGBoost (linear learners) RMSE on the test split. The square root is taken explicitly
# because the `squared` argument of mean_squared_error is deprecated in scikit-learn 1.4
# and removed in 1.6.
np.sqrt(mean_squared_error(y_test, preds))

76.20509518418682

## Observations

The error increased with linear base learners. I think the tree base learners are a better choice. 

# Best Model Selection

Our best model is the Random Forest Model. A close second is the XGBoost tree-based model. The XGBoost model was much faster, but had higher error. 

It took over 12 minutes to train and make predictions with one RF model. Doing CV will take a long time. 