In [1]:
# import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import datasets
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from scipy import stats
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import math
import statistics

### 1.0 Functions to clean the dfs

In [2]:
# Function to clean the trips df
# Based off of the CleaningNotebook cleaning methods
def clean_trips(x):
    """Clean the trips DataFrame and derive the modelling features.

    Steps, in order:
      1. Parse DAYOFSERVICE and LASTUPDATE as datetimes.
      2. Derive MONTH and DAY (day of week, Monday=0) from DAYOFSERVICE and
         HOUR from LASTUPDATE.
      3. Derive JOURNEY_TIME = ACTUALTIME_ARR - ACTUALTIME_DEP.
      4. Drop rows whose JOURNEY_TIME is missing, zero or negative.
      5. Drop rows flagged by `is_outlier` within their own LINEID group.
      6. Cast the identifier / schedule columns to `category`.
      7. Sort by LASTUPDATE (needed by the later `pd.merge_asof`).

    Parameters
    ----------
    x : pandas.DataFrame
        Trips data containing DAYOFSERVICE, LASTUPDATE, TRIPID, LINEID,
        ROUTEID, DIRECTION, PLANNEDTIME_ARR, PLANNEDTIME_DEP, ACTUALTIME_ARR,
        ACTUALTIME_DEP and NOTE. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame sorted by LASTUPDATE.

    NOTE(review): HOUR is taken from LASTUPDATE, not from the departure time.
    In the sample rows LASTUPDATE is days later than DAYOFSERVICE, so HOUR
    (and the weather later joined on LASTUPDATE) may not describe the trip
    itself - confirm this is intended.
    """
    # Work on a copy so the caller's frame is left untouched and later column
    # assignments never hit a view of another frame.
    x = x.copy()

    # Converting dates to datetime64
    x['DAYOFSERVICE'] = pd.to_datetime(x['DAYOFSERVICE'])
    x['LASTUPDATE'] = pd.to_datetime(x['LASTUPDATE'])

    # Adding new features
    x['MONTH'] = x['DAYOFSERVICE'].dt.month
    x['DAY'] = x['DAYOFSERVICE'].dt.dayofweek
    x['HOUR'] = x['LASTUPDATE'].dt.hour

    # Creating an additional feature called Journey Time
    x['JOURNEY_TIME'] = x['ACTUALTIME_ARR'] - x['ACTUALTIME_DEP']

    # Removing journey times less than or equal to 0 (NaN also fails the test)
    x = x[x['JOURNEY_TIME'] > 0]

    # Flag outliers per LINEID. `transform` keeps the original row index, so
    # the mask lines up with `x` regardless of the pandas `group_keys` default.
    outlier_mask = x.groupby('LINEID')['JOURNEY_TIME'].transform(is_outlier).astype(bool)
    x = x[~outlier_mask].copy()

    # Convert data type to category for these columns
    categorical_columns = [
        "TRIPID", "LINEID", "ROUTEID", "DIRECTION",
        "PLANNEDTIME_ARR", "PLANNEDTIME_DEP",
        "ACTUALTIME_ARR", "ACTUALTIME_DEP", "NOTE",
    ]
    x = x.astype({column: 'category' for column in categorical_columns})

    # Sorting df by LASTUPDATE
    x = x.sort_values('LASTUPDATE')

    return x

In [3]:
# Function to clean the weather df
# Based off of the CleaningNotebook cleaning methods

def cleaning_weather(x):
    """Clean the weather DataFrame.

    Drops the `msl` column, parses `date` as a datetime, turns blank /
    whitespace-only cells into NaN, casts `rain` to float64 and returns the
    rows sorted by `date` (a sorted key is required by `pd.merge_asof`).

    Parameters
    ----------
    x : pandas.DataFrame
        Weather data with at least the columns `msl`, `date` and `rain`.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without `msl`, sorted by `date`.
    """
    # `drop` returns a new frame, so the assignments below do not touch the input
    x = x.drop(columns=["msl"])

    # Converting date to datetime
    x['date'] = pd.to_datetime(x['date'])

    # Convert empty string to null pandas (to convert to float64)
    x = x.replace(r'^\s*$', np.nan, regex=True)

    # Converting rain to float64
    x['rain'] = x['rain'].astype('float64')

    # Sorting df by date; sort_values returns a new frame, so keep the result
    x = x.sort_values(by=['date'])
    return x

In [4]:
# Function to flag outliers (returns a boolean mask; the caller removes the flagged rows)

def is_outlier(s, n_std=3):
    """Flag values lying more than `n_std` standard deviations from the mean.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to test.
    n_std : int or float, default 3
        Half-width of the accepted band, in standard deviations of `s`.

    Returns
    -------
    pandas.Series
        Boolean Series on the same index as `s`; True marks a value outside
        [mean - n_std * std, mean + n_std * std]. When the standard deviation
        is NaN (for example a single-element Series) every value is flagged,
        because `between` is False against NaN bounds.
    """
    centre = s.mean()
    spread = s.std() * n_std
    return ~s.between(centre - spread, centre + spread)

### 2.0 Read In Trips Data

In [5]:
# Load the cleaned 2018 trips CSV; its first column becomes the index
trips_csv_path = r'C:\Users\jason\OneDrive - University College Dublin\Documents\MSc Computer Science\Summer Semester\Data\Notebooks\CleanedCSVs\rt_trips_DB_2018_cleaned.csv'
df_trips = pd.read_csv(trips_csv_path, index_col=[0])

In [6]:
# Pass the trips df through its cleaning function (clean_trips).
# This rebinds df_trips to the cleaned frame, so re-run the read cell first
# if this cell ever needs to be executed again.
df_trips = clean_trips(df_trips)

In [7]:
# Check the column dtypes produced by clean_trips
df_trips.dtypes

DAYOFSERVICE       datetime64[ns]
TRIPID                   category
LINEID                   category
ROUTEID                  category
DIRECTION                category
PLANNEDTIME_ARR          category
PLANNEDTIME_DEP          category
ACTUALTIME_ARR           category
ACTUALTIME_DEP           category
LASTUPDATE         datetime64[ns]
NOTE                     category
MONTH                       int64
DAY                         int64
HOUR                        int64
JOURNEY_TIME              float64
dtype: object

### 3.0 Read in weather CSV

In [8]:
# Load the cleaned 2018 weather CSV; its first column becomes the index
weather_csv_path = r'C:\Users\jason\OneDrive - University College Dublin\Documents\MSc Computer Science\Summer Semester\Data\Notebooks\CleanedCSVs\weather2018_cleaned.csv'
df_weather = pd.read_csv(weather_csv_path, index_col=[0])

In [9]:
# Pass the weather df through its cleaning function (cleaning_weather).
# This rebinds df_weather; the cleaned frame no longer has the `msl` column,
# so re-run the read cell first if this cell needs to be executed again.
df_weather = cleaning_weather(df_weather)

In [10]:
# Check the column dtypes produced by cleaning_weather
df_weather.dtypes

date    datetime64[ns]
rain           float64
temp           float64
dtype: object

### 4.0 Merging Trips and Weather CSV

In [11]:
# Merging weather and trips
# merge_asof (default direction="backward") attaches to each trip the most recent
# weather row whose `date` is at or before the trip's LASTUPDATE.
# Both key columns must be sorted: clean_trips sorts df_trips by LASTUPDATE, and the
# weather frame is sorted here explicitly so the merge does not rely on CSV row order.
merged_data = pd.merge_asof(
    df_trips,
    df_weather.sort_values("date"),
    left_on="LASTUPDATE",
    right_on="date",
)

In [12]:
# Order the merged rows by LINEID, then display the frame
merged_data = merged_data.sort_values(by='LINEID')
merged_data

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,NOTE,MONTH,DAY,HOUR,JOURNEY_TIME,date,rain,temp
665532,2018-06-05,6859600,1,1_40,2,40107,37200,40201.0,37185.0,2018-06-14 14:51:15,",2856269,",6,1,14,3016.0,2018-06-14 14:00:00,0.0,19.0
149413,2018-01-28,6245808,1,1_40,2,59799,57000,59758.0,57049.0,2018-02-06 13:03:54,",2862091,",1,6,13,2709.0,2018-02-06 13:00:00,0.0,3.9
816412,2018-05-10,6745143,1,1_37,1,52254,49200,52401.0,49225.0,2018-06-26 08:24:53,",2860769,",5,3,8,3176.0,2018-06-26 08:00:00,0.0,20.0
1233617,2018-08-25,7326029,1,1_37,1,54463,51600,54481.0,51648.0,2018-09-03 10:39:32,",1936680,",8,5,10,2833.0,2018-09-03 10:00:00,0.0,14.7
1233616,2018-08-25,7323445,1,1_37,1,55663,52800,56263.0,52999.0,2018-09-03 10:39:32,",1940853,",8,5,10,3264.0,2018-09-03 10:00:00,0.0,14.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210538,2018-02-06,6255246,9,9_7,2,69732,65400,69934.0,65422.0,2018-02-28 11:48:21,",3089597,",2,1,11,4512.0,2018-02-28 11:00:00,0.2,-1.5
1294246,2018-09-05,7658886,9,9_5,1,77374,73200,77364.0,73140.0,2018-09-18 11:16:31,",2965811,",9,2,11,4224.0,2018-09-18 11:00:00,0.0,16.5
754339,2018-04-25,6647150,9,9_7,2,72675,68100,72681.0,68069.0,2018-06-25 09:36:53,",3092809,",4,2,9,4612.0,2018-06-25 09:00:00,0.0,21.9
1239562,2018-08-27,7502709,9,9_7,2,84095,80400,83748.0,80383.0,2018-09-05 11:54:42,",3092314,",8,0,11,3365.0,2018-09-05 11:00:00,0.0,17.0


In [13]:
# Check the column dtypes after merging trips with weather
merged_data.dtypes

DAYOFSERVICE       datetime64[ns]
TRIPID                   category
LINEID                   category
ROUTEID                  category
DIRECTION                category
PLANNEDTIME_ARR          category
PLANNEDTIME_DEP          category
ACTUALTIME_ARR           category
ACTUALTIME_DEP           category
LASTUPDATE         datetime64[ns]
NOTE                     category
MONTH                       int64
DAY                         int64
HOUR                        int64
JOURNEY_TIME              float64
date               datetime64[ns]
rain                      float64
temp                      float64
dtype: object

In [14]:
# Collect the distinct LINEID values, in order of first appearance in merged_data
distinct_line_ids = merged_data['LINEID'].unique()

# Keep them both as a one-column df ...
unique_lineid = pd.DataFrame({'LINEID': distinct_line_ids})

# ... and as a plain Python list used to drive the per-line loops
LineList = list(unique_lineid['LINEID'])

### 5.0 Random Forest Statistical Measurements

In [15]:
# Testing r2 score, MAE & RMSE on whole routes
# One Random Forest is fitted per LINEID (both directions together) on a 70/30 split;
# each list below receives one entry per line, in LineList order.
r2_list, MAE_list, RMSE_list = [], [], []
for key in LineList:
    df_r2 = merged_data[merged_data['LINEID'] == key]
    if df_r2.empty:
        continue

    X = df_r2[["MONTH", "DAY", "HOUR", "rain", "temp"]]
    y = df_r2[["JOURNEY_TIME"]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

    randForestModel = RandomForestRegressor(n_estimators=100, random_state=10)
    randForestModel.fit(X_train, y_train.values.ravel())
    predictionTest = randForestModel.predict(X_test)

    # R2 on the held-out 30%
    r2_list.append(r2_score(y_test, predictionTest))
    # Mean absolute error on the held-out 30%
    MAE_list.append(mean_absolute_error(y_test, predictionTest))
    # Root mean squared error = sqrt(MSE) on the held-out 30%
    RMSE_list.append(math.sqrt(mean_squared_error(y_test, predictionTest)))

In [16]:
# Summarise the per-line metrics: best R2, and the mean of MAE / RMSE across lines
print("==================== Random Forest Accuracy - R2 Score, MAE, RMSE =======================\n")
summary_rows = [
    ("The largest r2 score is : ", max(r2_list)),
    ("MAE Average Score : ", statistics.mean(MAE_list)),
    ("RMSE Average Score : ", statistics.mean(RMSE_list)),
]
for label, value in summary_rows:
    print(label, value)


The largest r2 score is :  0.39347495152467515
MAE Average Score :  545.5025699894009
RMSE Average Score :  686.3695450577889


In [17]:
# Checking for important features
# X and randForestModel are the values left over from the most recently run fitting loop,
# so this table describes only the last model that was fitted, not all lines.
feature_list = X.columns.tolist()
importance = pd.DataFrame(
    {
        'feature': feature_list,
        'importance': randForestModel.feature_importances_,
    }
)
importance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
1,DAY,0.565813
4,temp,0.165553
0,MONTH,0.160337
2,HOUR,0.090006
3,rain,0.018291


### 6.0 Random Forest Model Building

In [18]:
# Building Random Forest models for LINEID Direction 1
# One model per LINEID is fitted on the direction-1 trips and pickled into the "Out" folder.
for key in LineList:
    df_out = merged_data[merged_data['LINEID'] == key]
    df_out = df_out[df_out['DIRECTION'] == 1]
    if not df_out.empty:
        X = df_out[["MONTH", "DAY", "HOUR", "rain", "temp"]]
        y = df_out[["JOURNEY_TIME"]]
        # Only the 70% training split is used for the saved model; the held-out 30% is not used here
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=20)
        randForestModel = RandomForestRegressor(n_estimators = 100, random_state=10)
        randForestModel.fit(X_train, y_train.values.ravel())
        model_path = f'C:\\Users\\jason\\OneDrive - University College Dublin\\Documents\\MSc Computer Science\\Summer Semester\\Data\\Notebooks\\RouteModels\\Out\\RandForest_{key}.pkl'
        # Use a context manager so each file is flushed and closed before the next model is written
        with open(model_path, 'wb') as model_file:
            pickle.dump(randForestModel, model_file)

In [19]:
# Building Random Forest models for LINEID Direction 2
# One model per LINEID is fitted on the direction-2 trips and pickled into the "In" folder.
for key in LineList:
    df_in = merged_data[merged_data['LINEID'] == key]
    df_in = df_in[df_in['DIRECTION'] == 2]
    if not df_in.empty:
        X = df_in[["MONTH", "DAY", "HOUR", "rain", "temp"]]
        y = df_in[["JOURNEY_TIME"]]
        # Only the 70% training split is used for the saved model; the held-out 30% is not used here
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=20)
        randForestModel = RandomForestRegressor(n_estimators = 100, random_state=10)
        randForestModel.fit(X_train, y_train.values.ravel())
        model_path = f'C:\\Users\\jason\\OneDrive - University College Dublin\\Documents\\MSc Computer Science\\Summer Semester\\Data\\Notebooks\\RouteModels\\In\\RandForest_{key}.pkl'
        # Use a context manager so each file is flushed and closed before the next model is written
        with open(model_path, 'wb') as model_file:
            pickle.dump(randForestModel, model_file)