In [1]:
import pandas as pd

from sklearn.linear_model import LinearRegression
import xgboost as xgb
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import uniform, randint

## Baseline model

In [11]:
# Read the raw dataset indexed by its 'key' column, discard every row that has a
# missing value, and remove the 'Unnamed: 0' column.
df = (
    pd.read_csv("Data/NYC_Taxi_Dataset.csv", index_col='key')
    .dropna()
    .drop(columns='Unnamed: 0')
)

In [12]:
# Carve out the validation and test sets before any further preprocessing.
# The dataset is large, so only 4% of the rows are held out; that hold-out is
# then halved into 2% for validation and 2% for testing.
train_df, holdout_df = train_test_split(df, test_size=0.04, shuffle=True, random_state=42)
valid_df, test_df = train_test_split(holdout_df, test_size=0.5, shuffle=True, random_state=42)

# The full frame and the intermediate hold-out are no longer needed; release them.
del df, holdout_df

In [13]:
# Baseline: linear regression on every remaining column once the target
# ('fare_amount') and the 'pickup_datetime' column are removed.
X_train = train_df.drop(['fare_amount', 'pickup_datetime'], axis=1)
y_train = train_df['fare_amount']
X_test = test_df.drop(['fare_amount', 'pickup_datetime'], axis=1)
y_test = test_df['fare_amount']

# Standardise the features. The scaler is fitted on the training rows only and
# re-used for the test rows; the original index is kept so X and y stay aligned.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Fit the linear model and predict on the held-out test rows.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE computed as sqrt(MSE). The `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Root Mean Squared Error (RMSE):", rmse, end='\n\n')

Root Mean Squared Error (RMSE): 9.576859470263193



<p style='color:red;'>So, for the initial model we only dropped missing values and used pickup/dropoff coordinates and passenger count as independent variables. We did not check the validity of the data or look for outliers. Basically, we did nothing. Therefore, performance is terrible. P-values of all the variables except passenger_count are far greater than 0.05, suggesting that there is not enough evidence to reject the null hypothesis of no significant relationship. We will now try to improve this model.</p>

## First Try To Improve

## Data

<p style='color:red;'>For the following steps I will be using dataset cleaned in <b style='font-size:17px;color:yellow;'><i>clean_visualize_engineer.ipynb</i></b> notebook.</p>

<ul style='color:red;'><b style='font-size:17px;color:yellow;'>fare_amount:</b> amount of fare paid for the taxi ride</ul>
<ul style='color:red;'><b style='font-size:17px;color:yellow;'>[pickup_latitude, pickup_longitude]:</b> pickup location coordinates</ul>
<ul style='color:red;'><b style='font-size:17px;color:yellow;'>[dropoff_latitude, dropoff_longitude]:</b> dropoff location coordinates</ul>
<ul style='color:red;'><b style='font-size:17px;color:yellow;'>passenger_count:</b> number of passengers in the taxi</ul>
<ul style='color:red;'><b style='font-size:17px;color:yellow;'>pickup boroughs:</b> 5 pickup borough dummy columns</ul>
<ul style='color:red;'><b style='font-size:17px;color:yellow;'>dropoff boroughs:</b> 5 dropoff borough dummy columns</ul>
<ul style='color:red;'><b style='font-size:17px;color:yellow;'>distance_covered:</b> distance covered during a taxi ride</ul>
<ul style='color:red;'><b style='font-size:17px;color:yellow;'>distance_to_airport:</b> distance from pickup/dropoff location to closest airport</ul>
<ul style='color:red;'><b style='font-size:17px;color:yellow;'>year, month, week, day_of_month, day_of_week, hour, numeric_date:</b> time features describing the taxi pickup period.</ul>
<ul style='color:red;'><b style='font-size:17px;color:yellow;'>after_nov_2012:</b> Because fares jumped up in 2012, this is 0 if the date was before November 2012 and 1 if it was after November 2012</ul>

In [2]:
# Load the three pre-cleaned splits, each indexed by its 'key' column.
train_df, valid_df, test_df = [
    pd.read_csv(csv_path, index_col='key')
    for csv_path in ('Data/train_df.csv', 'Data/valid_df.csv', 'Data/test_df.csv')
]

In [3]:
# Show (rows, columns) for the train, validation and test frames, in that order.
tuple(split.shape for split in (train_df, valid_df, test_df))

((4603014, 26), (97756, 26), (97752, 26))

## Linear Model

In [3]:
# Linear regression on the cleaned, feature-engineered splits: every column
# except the target ('fare_amount') is used as a feature.
X_train = train_df.drop(['fare_amount'], axis=1)
y_train = train_df['fare_amount']
X_test = test_df.drop(['fare_amount'], axis=1)
y_test = test_df['fare_amount']

# Standardise the features. The scaler is fitted on the training rows only and
# re-used for the test rows; the original index is kept so X and y stay aligned.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Create a linear regression model
model = LinearRegression()

# Fit the model to the data
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# RMSE computed as sqrt(MSE). The `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Root Mean Squared Error (RMSE):", rmse, end='\n\n')

Root Mean Squared Error (RMSE): 5.157373169472006



## XGBOOST

### Conduct randomized search to fine tune model

In [14]:
# Hyper-parameter search on a 100,000-row subsample of the training data.
# The rows are drawn once from train_df, so features and target come from the
# same draw instead of two separate .sample() calls that only line up because
# they share a size and seed.
search_df = train_df.sample(100000, random_state=42)

# Hold out 20% of the subsample to score the winning configuration.
X_train, X_test, y_train, y_test = train_test_split(
    search_df.drop(['fare_amount'], axis=1),
    search_df['fare_amount'],
    test_size=0.2,
    random_state=42,
)
del search_df

# Scale the features. The scaler is fitted after the split, on the
# search-training rows only, so the held-out rows do not influence it.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Define the parameter distributions for randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to high - 1.
param_dist = {
    "learning_rate": uniform(0.01, 0.1),     # 0.01 .. 0.11
    "n_estimators": randint(100, 1000),      # 100 .. 999
    "max_depth": randint(3, 8),              # 3 .. 7
    "subsample": uniform(0.5, 0.5),          # 0.5 .. 1.0
    "colsample_bytree": uniform(0.5, 0.5),   # 0.5 .. 1.0
    "gamma": uniform(0, 1),
    "reg_alpha": uniform(0, 1),
    "reg_lambda": uniform(0, 1),
    "min_child_weight": randint(1, 10),      # 1 .. 9
}

# Create the XGBoost regressor
xgb_model = xgb.XGBRegressor()

# Perform randomized search: 10 sampled configurations, each scored with
# 5-fold cross-validation on negative MSE.
random_search = RandomizedSearchCV(
    xgb_model, param_distributions=param_dist, n_iter=10, cv=5, scoring="neg_mean_squared_error", random_state=42
)
random_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Evaluate the model on the held-out part of the subsample
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Print the best hyperparameters and evaluation metrics
print("Best Hyperparameters:", best_params)
print("Root Mean Squared Error:", rmse)


Best Hyperparameters: {'colsample_bytree': 0.8416317594127292, 'gamma': 0.6099966577826209, 'learning_rate': 0.09331949117361643, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 661, 'reg_alpha': 0.662522284353982, 'reg_lambda': 0.31171107608941095, 'subsample': 0.7600340105889054}
Root Mean Squared Error: 2.8867515529670067


### Train model

In [4]:
def _scaled_like(values, template):
    """Wrap a scaled array in a DataFrame carrying `template`'s columns and index."""
    return pd.DataFrame(values, columns=template.columns, index=template.index)


# Separate the target ('fare_amount') from the features in each split.
y_train, y_valid, y_test = [part['fare_amount'] for part in (train_df, valid_df, test_df)]
X_train = train_df.drop(columns=['fare_amount'])
X_valid = valid_df.drop(columns=['fare_amount'])
X_test = test_df.drop(columns=['fare_amount'])

# Standardise: fit the scaler on the training features, then apply the same
# transformation to the validation and test features.
scaler = StandardScaler()
X_train = _scaled_like(scaler.fit_transform(X_train), X_train)
X_valid = _scaled_like(scaler.transform(X_valid), X_valid)
X_test = _scaled_like(scaler.transform(X_test), X_test)

In [5]:
# Hyper-parameters for the final model; the values match the "Best
# Hyperparameters" printed by the randomized search.
tuned_params = {
    "colsample_bytree": 0.8416317594127292,
    "gamma": 0.6099966577826209,
    "learning_rate": 0.09331949117361643,
    "max_depth": 5,
    "min_child_weight": 1,
    "n_estimators": 661,
    "reg_alpha": 0.662522284353982,
    "reg_lambda": 0.31171107608941095,
    "subsample": 0.7600340105889054,
}
my_model = xgb.XGBRegressor(**tuned_params)

# Train on the full training split; the validation split is supplied as the
# evaluation set, which yields the per-round validation RMSE log.
validation_sets = [(X_valid, y_valid)]
my_model.fit(X_train, y_train, eval_set=validation_sets)

[0]	validation_0-rmse:12.99806
[1]	validation_0-rmse:11.92012
[2]	validation_0-rmse:10.95203
[3]	validation_0-rmse:10.08369
[4]	validation_0-rmse:9.30651
[5]	validation_0-rmse:8.61670
[6]	validation_0-rmse:8.00196
[7]	validation_0-rmse:7.48041
[8]	validation_0-rmse:6.99106
[9]	validation_0-rmse:6.56226
[10]	validation_0-rmse:6.18347
[11]	validation_0-rmse:5.85470
[12]	validation_0-rmse:5.56791
[13]	validation_0-rmse:5.32529
[14]	validation_0-rmse:5.11358
[15]	validation_0-rmse:4.92179
[16]	validation_0-rmse:4.76496
[17]	validation_0-rmse:4.62067
[18]	validation_0-rmse:4.50407
[19]	validation_0-rmse:4.39667
[20]	validation_0-rmse:4.30508
[21]	validation_0-rmse:4.22932
[22]	validation_0-rmse:4.16139
[23]	validation_0-rmse:4.10330
[24]	validation_0-rmse:4.05250
[25]	validation_0-rmse:4.00735
[26]	validation_0-rmse:3.96987
[27]	validation_0-rmse:3.93743
[28]	validation_0-rmse:3.91033
[29]	validation_0-rmse:3.88604
[30]	validation_0-rmse:3.86790
[31]	validation_0-rmse:3.85221
[32]	validatio

In [6]:
# Score the trained XGBoost model on the held-out test split.
y_pred = my_model.predict(X_test)
# RMSE computed as sqrt(MSE). The `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Root Mean Squared Error (RMSE):", rmse, end='\n\n')

Root Mean Squared Error (RMSE): 3.6495172908672284



In [8]:
# Save the trained model to a file. The `with` block guarantees the file handle
# is flushed and closed even if pickling raises part-way through.
file_name = "Data/xgb_model.pkl"
with open(file_name, "wb") as model_file:
    pickle.dump(my_model, model_file)