# Importing Libraries

In [79]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import os

# Load Files

In [80]:
test = pd.read_csv('../stage3/test_student.txt', sep=' ', header=None, index_col=0)
test.columns = ['cab_file', 'start_latitude', 'start_longitude', 'source_timestamp','end_latitude', 'end_longitude']
test['source_timestamp'] = pd.to_datetime(test['source_timestamp'], unit='s')
test.head()

Unnamed: 0_level_0,cab_file,start_latitude,start_longitude,source_timestamp,end_latitude,end_longitude
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,edf177ee-7e81-4785-821e-ac3d13094ec9.txt,37.61564,-122.3919,2008-06-09 16:57:34,37.78373,-122.40747
1,edf177ee-7e81-4785-821e-ac3d13094ec9.txt,37.6206,-122.39936,2008-06-09 16:19:15,37.74579,-122.41056
2,edf177ee-7e81-4785-821e-ac3d13094ec9.txt,37.7616,-122.43947,2008-06-09 14:55:29,37.61486,-122.38584
3,edf177ee-7e81-4785-821e-ac3d13094ec9.txt,37.76731,-122.43667,2008-06-09 14:00:35,37.61791,-122.38583
4,edf177ee-7e81-4785-821e-ac3d13094ec9.txt,37.78554,-122.40636,2008-06-09 13:06:06,37.61786,-122.38543


In [81]:
train = pd.read_csv('../stage3/outputs/All_Taxi_Trips.csv')
train.head()

Unnamed: 0,trip_id,start_latitude,start_longitude,source_timestamp,origin_point,end_latitude,end_longitude,destination_timestamp,destination_point,cab_id
0,1,37.96943,-122.31778,2008-05-17 15:20:33,"(37.96943, -122.31778)",37.79119,-122.40449,2008-05-17 15:40:50,"(37.79119, -122.40449)",0
1,2,37.79505,-122.40479,2008-05-17 15:41:28,"(37.79505, -122.40479)",37.78362,-122.40262,2008-05-17 15:46:48,"(37.78362, -122.40262)",0
2,3,37.78363,-122.40261,2008-05-17 15:46:49,"(37.78363, -122.40261)",37.79552,-122.40463,2008-05-17 15:51:49,"(37.79552, -122.40463)",0
3,4,37.79593,-122.40495,2008-05-17 15:52:36,"(37.79593, -122.40495)",37.80647,-122.42048,2008-05-17 15:59:31,"(37.80647, -122.42048)",0
4,5,37.80648,-122.42048,2008-05-17 15:59:39,"(37.80648, -122.42048)",37.80052,-122.4303,2008-05-17 16:16:11,"(37.80052, -122.4303)",0


# Preprocess Data

In [82]:
# make trip length variable
train['destination_timestamp'] = pd.to_datetime(train['destination_timestamp'])
train['source_timestamp'] = pd.to_datetime(train['source_timestamp'])
train['trip_length'] = train['destination_timestamp'] - train['source_timestamp']
y = pd.to_timedelta(train['trip_length'], unit='s').dt.total_seconds()

# drop unnecessary columns
X = train[['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude', 'cab_id', 'source_timestamp']].copy()

# generate time series parameters
X['hour'] = X['source_timestamp'].dt.hour
X['minute'] = X['source_timestamp'].dt.minute
X['second'] = X['source_timestamp'].dt.second
X['weekday'] = X['source_timestamp'].dt.weekday
X['week_of_year'] = X['source_timestamp'].dt.weekofyear
X['day_of_year'] = X['source_timestamp'].dt.dayofyear
X['month'] = X['source_timestamp'].dt.month
X['year'] = X['source_timestamp'].dt.year

# drop timestamp
X.drop('source_timestamp', axis=1, inplace=True)


  X['week_of_year'] = X['source_timestamp'].dt.weekofyear


In [83]:
X.head()

Unnamed: 0,start_latitude,start_longitude,end_latitude,end_longitude,cab_id,hour,minute,second,weekday,week_of_year,day_of_year,month,year
0,37.96943,-122.31778,37.79119,-122.40449,0,15,20,33,5,20,138,5,2008
1,37.79505,-122.40479,37.78362,-122.40262,0,15,41,28,5,20,138,5,2008
2,37.78363,-122.40261,37.79552,-122.40463,0,15,46,49,5,20,138,5,2008
3,37.79593,-122.40495,37.80647,-122.42048,0,15,52,36,5,20,138,5,2008
4,37.80648,-122.42048,37.80052,-122.4303,0,15,59,39,5,20,138,5,2008


In [84]:
y.head()

0    1217.0
1     320.0
2     300.0
3     415.0
4     992.0
Name: trip_length, dtype: float64

# Make Training and Test Sets

In [85]:
X_train, X_train_test, y_train, y_train_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Simple Model

## Train Model

In [86]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100)

In [87]:
xg_reg.fit(X_train,y_train)

In [88]:
xg_reg.score(X_train_test, y_train_test)

0.03010611186058243

## Final Predictions of Model

In [89]:
import pickle

# load taxi dictionary
with open('../stage3/outputs/Taxi_Dictionary.pkl', 'rb') as handle:
    taxi_dict = pickle.load(handle)

taxi_dict = {value: key for key, value in taxi_dict.items()}

print(taxi_dict)

{'db8b8e6c-315b-4a8e-bb27-d077647011c5.txt': 0, 'da1c3cf6-e628-4ca0-935c-5f91aa35db4f.txt': 1, 'dd3eb516-dc20-48e1-aec3-ae7d3f1e9b68.txt': 2, 'ead6caf4-8feb-4ade-831d-abd720fac240.txt': 3, 'fc5a051c-c4fb-4c03-b5e3-ea99d8440d20.txt': 4, 'f1a6a66b-287c-46c8-abe3-a5b7c20ff93f.txt': 5, 'db51a9a0-0ae3-4e32-8b37-3ab8b53d6710.txt': 6, 'daea9817-45a7-443d-843d-7ab2cda4531c.txt': 7, 'fb33e23a-48be-4dba-bee5-bcb74cc77436.txt': 8, 'fa24af3a-f415-4dd7-9f8d-7d631b1be9ec.txt': 9, 'ee6de574-7cd6-43dd-83f1-37728b2d9a9c.txt': 10, 'f0086b28-73de-4163-a5e2-8b52dd63446a.txt': 11, 'ecea2a86-3075-4f21-8a4d-be726d38a518.txt': 12, 'dfb2d390-d52e-45f6-a163-2c895fe76d93.txt': 13, 'd7367c6b-7d8c-4b9f-87ca-d88ceca42629.txt': 14, 'd072ac88-ff62-48e8-a825-530c96a0f6ea.txt': 15, 'fba3897b-9201-47f1-a20f-482894296f93.txt': 16, 'f8c6f1a5-031e-4f4c-a0cb-4c7a6624cbca.txt': 17, 'df28992f-86b9-479e-ae50-fa952fb8934a.txt': 18, 'e2f7f587-4ab9-4040-a903-cd268bbb2505.txt': 19, 'fe6f1c48-d0d1-44ed-ab13-4c05e6eb5a48.txt': 20, '

In [92]:
# load cab file to id dictionary
test['cab_id'] = test['cab_file'].map(taxi_dict)

# generate time series parameters
X_test = test[['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude', 'cab_id', 'source_timestamp']].copy()

# generate time series parameters
X_test['hour'] = X_test['source_timestamp'].dt.hour
X_test['minute'] = X_test['source_timestamp'].dt.minute
X_test['second'] = X_test['source_timestamp'].dt.second
X_test['weekday'] = X_test['source_timestamp'].dt.weekday
X_test['week_of_year'] = X_test['source_timestamp'].dt.isocalendar().week
X_test['day_of_year'] = X_test['source_timestamp'].dt.dayofyear
X_test['month'] = X_test['source_timestamp'].dt.month
X_test['year'] = X_test['source_timestamp'].dt.year

# drop timestamp
X_test.drop('source_timestamp', axis=1, inplace=True)

X_test.head()

Unnamed: 0_level_0,start_latitude,start_longitude,end_latitude,end_longitude,cab_id,hour,minute,second,weekday,week_of_year,day_of_year,month,year
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,37.61564,-122.3919,37.78373,-122.40747,46,16,57,34,0,24,161,6,2008
1,37.6206,-122.39936,37.74579,-122.41056,46,16,19,15,0,24,161,6,2008
2,37.7616,-122.43947,37.61486,-122.38584,46,14,55,29,0,24,161,6,2008
3,37.76731,-122.43667,37.61791,-122.38583,46,14,0,35,0,24,161,6,2008
4,37.78554,-122.40636,37.61786,-122.38543,46,13,6,6,0,24,161,6,2008


In [93]:
X_test.head()

Unnamed: 0_level_0,start_latitude,start_longitude,end_latitude,end_longitude,cab_id,hour,minute,second,weekday,week_of_year,day_of_year,month,year
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,37.61564,-122.3919,37.78373,-122.40747,46,16,57,34,0,24,161,6,2008
1,37.6206,-122.39936,37.74579,-122.41056,46,16,19,15,0,24,161,6,2008
2,37.7616,-122.43947,37.61486,-122.38584,46,14,55,29,0,24,161,6,2008
3,37.76731,-122.43667,37.61791,-122.38583,46,14,0,35,0,24,161,6,2008
4,37.78554,-122.40636,37.61786,-122.38543,46,13,6,6,0,24,161,6,2008


In [94]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1901 entries, 0 to 1900
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   start_latitude   1901 non-null   float64
 1   start_longitude  1901 non-null   float64
 2   end_latitude     1901 non-null   float64
 3   end_longitude    1901 non-null   float64
 4   cab_id           1901 non-null   int64  
 5   hour             1901 non-null   int64  
 6   minute           1901 non-null   int64  
 7   second           1901 non-null   int64  
 8   weekday          1901 non-null   int64  
 9   week_of_year     1901 non-null   UInt32 
 10  day_of_year      1901 non-null   int64  
 11  month            1901 non-null   int64  
 12  year             1901 non-null   int64  
dtypes: UInt32(1), float64(4), int64(8)
memory usage: 202.4 KB


In [95]:
X_test.week_of_year = X_test.week_of_year.astype('int64')

In [98]:
y_pred = xg_reg.predict(X_test)

In [100]:
y_pred = pd.DataFrame(y_pred)
y_pred.head()

Unnamed: 0,0
0,1506.025024
1,1727.622314
2,1747.943848
3,1510.564575
4,905.274597


In [106]:
output = pd.concat([test.source_timestamp, y_pred], axis=1)
output['eta'] = output['source_timestamp'] + pd.to_timedelta(output[0], unit='s')
output.drop([0, 'source_timestamp'], axis=1, inplace=True)
output.head()

Unnamed: 0,eta
0,2008-06-09 17:22:40.025024414
1,2008-06-09 16:48:02.622314453
2,2008-06-09 15:24:36.943847656
3,2008-06-09 14:25:45.564575195
4,2008-06-09 13:21:11.274597168


In [107]:
output.to_csv('../stage3/outputs/xgboost_eta.csv', index=False)

# Steps to improve
- use featurewiz for engineering and selection
- try catboost, lightgbm, and a lazy learn model
- use optuna on best dt model to tune hyperparameters