In [1]:
# import required packages
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### 1.0 Functions to clean the dfs

In [2]:
# Function to clean the trips df
# Based off of the CleaningNotebook cleaning methods
def clean_trips(x):
    """Clean the trips DataFrame and add the features used for modelling.

    The frame is modified IN PLACE (columns are converted/added and the rows
    are re-ordered) and the same object is returned.

    Parameters
    ----------
    x : pandas.DataFrame
        Trips data containing at least the columns DAYOFSERVICE, LASTUPDATE,
        TRIPID, LINEID, ROUTEID, DIRECTION, PLANNEDTIME_ARR, PLANNEDTIME_DEP,
        ACTUALTIME_ARR, ACTUALTIME_DEP and NOTE.

    Returns
    -------
    pandas.DataFrame
        ``x`` itself, with DAYOFSERVICE/LASTUPDATE parsed as datetimes, the
        new columns MONTH, DAY, HOUR and JOURNEY_TIME added, the id/time/
        feature columns cast to ``category``, and rows sorted by LASTUPDATE.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Converting dates to datetime64.
    # `infer_datetime_format` is deprecated since pandas 2.0 (strict format
    # inference is now the default), so it is no longer passed.
    x['DAYOFSERVICE'] = pd.to_datetime(x['DAYOFSERVICE'])
    x['LASTUPDATE'] = pd.to_datetime(x['LASTUPDATE'])

    # Adding new features.
    x['MONTH'] = x['DAYOFSERVICE'].dt.month
    x['DAY'] = x['DAYOFSERVICE'].dt.dayofweek  # Monday == 0
    # NOTE(review): HOUR is taken from LASTUPDATE, not from the service/
    # departure time - confirm this is the intended feature.
    x['HOUR'] = x['LASTUPDATE'].dt.hour

    # Journey time = actual arrival - actual departure (same unit as inputs).
    x['JOURNEY_TIME'] = x['ACTUALTIME_ARR'] - x['ACTUALTIME_DEP']

    # Convert data type to category for these columns. The names are listed
    # directly rather than slicing a sub-frame just to read `.columns`.
    categorical_columns = [
        "TRIPID", "LINEID", "ROUTEID", "DIRECTION",
        "PLANNEDTIME_ARR", "PLANNEDTIME_DEP",
        "ACTUALTIME_ARR", "ACTUALTIME_DEP",
        "NOTE", "JOURNEY_TIME", "MONTH", "DAY", "HOUR",
    ]
    for column in categorical_columns:
        x[column] = x[column].astype('category')

    # Sorting df by LASTUPDATE (in place, so the caller's frame is re-ordered);
    # pd.merge_asof later needs the left key sorted.
    x.sort_values('LASTUPDATE', inplace=True)

    return x

In [3]:
# Function to clean the weather df
# Based off of the CleaningNotebook cleaning methods

def cleaning_weather(x):
    """Clean the weather DataFrame and return it sorted by date.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    x : pandas.DataFrame
        Weather data containing at least the columns ``msl``, ``date`` and
        ``rain``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``x`` without the ``msl`` column, with ``date`` parsed as
        datetime, blank/whitespace-only cells replaced by NaN, ``rain`` cast
        to float64, and rows sorted ascending by ``date``.

    Raises
    ------
    KeyError
        If ``msl``, ``date`` or ``rain`` is missing.
    """
    x = x.drop(columns=["msl"])

    # Converting date to datetime.
    # `infer_datetime_format` is deprecated since pandas 2.0 (strict format
    # inference is now the default), so it is no longer passed.
    x['date'] = pd.to_datetime(x['date'])

    # Convert empty / whitespace-only strings to NaN so `rain` can become
    # float64. `np.nan` is used because the `np.NaN` alias was removed in
    # NumPy 2.0.
    x = x.replace(r'^\s*$', np.nan, regex=True)

    # Converting rain to float64
    x['rain'] = x['rain'].astype('float64')

    # Sorting df by date. sort_values returns a new frame, so the result must
    # be assigned; previously it was discarded and the data stayed unsorted,
    # which pd.merge_asof does not accept for its right-hand key.
    x = x.sort_values(by=['date'])
    return x

### 2.0 Read In Trips Data for merging

In [4]:
# Read in the cleaned trips data; the first CSV column becomes the index.
# NOTE: the path is absolute and machine-specific, so this cell only runs
# as-is where that folder exists.
df_trips = pd.read_csv(r'C:\Users\jason\OneDrive - University College Dublin\Documents\MSc Computer Science\Summer Semester\Data\Notebooks\CleanedCSVs\rt_trips_DB_2018_cleaned.csv', index_col=[0])

In [5]:
# Pass the trips df through the trips cleaning function
# (parses dates, adds MONTH/DAY/HOUR/JOURNEY_TIME, sorts by LASTUPDATE).
df_trips = clean_trips(df_trips)

In [6]:
# Check the column dtypes produced by clean_trips.
df_trips.dtypes

DAYOFSERVICE       datetime64[ns]
TRIPID                   category
LINEID                   category
ROUTEID                  category
DIRECTION                category
PLANNEDTIME_ARR          category
PLANNEDTIME_DEP          category
ACTUALTIME_ARR           category
ACTUALTIME_DEP           category
LASTUPDATE         datetime64[ns]
NOTE                     category
MONTH                    category
DAY                      category
HOUR                     category
JOURNEY_TIME             category
dtype: object

In [7]:
# Stops = df_trips.groupby(['ROUTEID']).JOURNEY_TIME.agg(std_dev=(lambda x:np.abs(x.mean()-2*x.std())))
# Stops

In [8]:
# indices, values = list(Stops.index), list(Stops['std_dev']) 

# cutoffs_Upper = {}

# for ix, value in zip(indices, values):
#     cutoffs_Upper[ix] = value
# print(cutoffs_Upper)

In [9]:
# df_trips['CutOff'] = df_trips[['ROUTEID','JOURNEY_TIME']].apply(lambda x:x[1] < cutoffs_Upper[x[0]], axis = 1)

In [10]:
# df_trips.head(50)

In [11]:
# dftripstrue = df_trips[df_trips['CutOff'] == True]

In [12]:
# dftripstrue.shape

In [13]:
# dftripsfalse = df_trips[df_trips['CutOff'] == False]

In [14]:
# dftripsfalse.shape

In [15]:
# z = np.abs(stats.zscore(boston_df))

In [16]:
# sns.boxplot(x=df_trips['JOURNEY_TIME'])

In [17]:
# fig, ax = plt.subplots(figsize=(16,8))
# ax.scatter(df_trips['JOURNEY_TIME'], df_trips['TRIPID'])
# ax.set_xlabel('Journey Time')
# ax.set_ylabel('TripID')
# plt.show()

### 3.0 Read in weather CSV

In [18]:
# Read in the cleaned weather CSV; the first CSV column becomes the index.
# NOTE: the path is absolute and machine-specific, so this cell only runs
# as-is where that folder exists.
df_weather = pd.read_csv(r'C:\Users\jason\OneDrive - University College Dublin\Documents\MSc Computer Science\Summer Semester\Data\Notebooks\CleanedCSVs\weather2018_cleaned.csv', index_col=[0])

In [19]:
# Pass the weather df through the weather cleaning function
# (drops msl, parses date, converts rain to float64).
df_weather = cleaning_weather(df_weather)

In [20]:
# Check the column dtypes produced by cleaning_weather.
df_weather.dtypes

date    datetime64[ns]
rain           float64
temp           float64
dtype: object

In [21]:
# Preview the cleaned trips rows (sorted by LASTUPDATE) before merging.
df_trips.head(20)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,NOTE,MONTH,DAY,HOUR,JOURNEY_TIME
2076901,2018-01-01,5955732,83A,83A_26,2,84031,81000,83236.0,80960.0,2018-01-08 17:21:10,",2023740,",1,0,17,2276.0
2072856,2018-01-01,5967193,39,39_21,2,84594,81900,84738.0,81965.0,2018-01-08 17:21:10,",2422352,",1,0,17,2773.0
2072857,2018-01-01,5962309,27,27_17,2,36719,32400,36023.0,32413.0,2018-01-08 17:21:10,",2432996,",1,0,17,3610.0
2072858,2018-01-01,5962310,27,27_19,1,43300,37800,42255.0,37844.0,2018-01-08 17:21:10,",2432997,",1,0,17,4411.0
2072859,2018-01-01,5962363,31,31_15,1,54978,51600,53845.0,51591.0,2018-01-08 17:21:10,",2665261,",1,0,17,2254.0
2072860,2018-01-01,5962383,122,122_14,1,67649,64200,67393.0,64210.0,2018-01-08 17:21:10,",1749163,",1,0,17,3183.0
2072861,2018-01-01,5962437,150,150_8,1,75629,73800,75299.0,74103.0,2018-01-08 17:21:10,",2961678,2961678,",1,0,17,1196.0
2072862,2018-01-01,5968358,17A,17A_11,1,50770,46800,50360.0,46811.0,2018-01-08 17:21:10,",1757043,",1,0,17,3549.0
2072864,2018-01-01,5961968,25,25_272,1,73437,70800,73545.0,70806.0,2018-01-08 17:21:10,",2414716,",1,0,17,2739.0
2072865,2018-01-01,5961980,25A,25A_270,2,63271,60120,63163.0,60141.0,2018-01-08 17:21:10,",2282934,",1,0,17,3022.0


In [22]:
# Preview the cleaned weather rows before merging.
df_weather.head(20)

Unnamed: 0,date,rain,temp
0,2018-01-01 00:00:00,0.0,4.6
1,2018-01-01 01:00:00,0.1,4.7
2,2018-01-01 02:00:00,0.0,4.8
3,2018-01-01 03:00:00,0.0,4.9
4,2018-01-01 04:00:00,0.0,5.3
5,2018-01-01 05:00:00,0.0,5.1
6,2018-01-01 06:00:00,0.0,5.0
7,2018-01-01 07:00:00,0.0,4.6
8,2018-01-01 08:00:00,0.0,4.3
9,2018-01-01 09:00:00,0.0,4.5


### 4.0 Merging Trips and Weather CSV

In [23]:
# Merging weather and trips.
# merge_asof with its default (backward) direction attaches to each trip the
# latest weather row whose `date` is <= the trip's LASTUPDATE.
# Both frames must already be sorted on their key columns.
merged_data = pd.merge_asof(df_trips, df_weather, left_on="LASTUPDATE", right_on="date")

In [24]:
# Re-order the merged rows by LINEID (in place), then display the frame.
merged_data.sort_values('LINEID', inplace=True)
merged_data

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,NOTE,MONTH,DAY,HOUR,JOURNEY_TIME,date,rain,temp
80446,2018-01-15,6116275,1,1_40,2,81765,79200,81484.0,79292.0,2018-01-23 10:24:03,",1947800,",1,0,10,2192.0,2018-01-23 10:00:00,0.0,12.6
1640379,2018-10-12,8019885,1,1_40,2,83613,81000,83638.0,81070.0,2018-11-20 15:30:07,",2852818,",10,4,15,2568.0,2018-11-20 15:00:00,1.0,6.1
353193,2018-03-11,6400979,1,1_37,1,59526,57000,59847.0,57097.0,2018-03-19 15:02:59,",2856406,",3,6,15,2750.0,2018-03-19 15:00:00,0.0,5.9
353184,2018-03-11,6398500,1,1_37,1,55926,53400,56874.0,53432.0,2018-03-19 15:02:59,",2862089,",3,6,15,3442.0,2018-03-19 15:00:00,0.0,5.9
353173,2018-03-11,6394475,1,1_37,1,58326,55800,59178.0,56004.0,2018-03-19 15:02:59,",1942279,",3,6,15,3174.0,2018-03-19 15:00:00,0.0,5.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251012,2018-02-12,6255244,9,9_7,2,56720,52200,56869.0,52178.0,2018-02-28 13:18:29,",3089594,",2,0,13,4691.0,2018-02-28 13:00:00,0.4,-0.2
1565290,2018-10-26,8055866,9,9_7,2,34090,28500,33607.0,28648.0,2018-11-05 14:27:37,",3094132,",10,4,14,4959.0,2018-11-05 14:00:00,0.0,11.3
1565289,2018-10-26,8064365,9,9_7,2,33490,27900,33129.0,27906.0,2018-11-05 14:27:37,",3092288,",10,4,14,5223.0,2018-11-05 14:00:00,0.0,11.3
195092,2018-01-29,6237945,9,9_7,2,64821,60000,65963.0,60015.0,2018-02-28 10:24:30,",3100200,",1,0,10,5948.0,2018-02-28 10:00:00,0.2,-1.5


In [25]:
# Remove outliers (see henry's comment)

In [26]:
# For each line id:
    # store it in a df
    # pass it into the model
    # save it as a pickle file
    

In [27]:
# Example subset: all merged rows whose LINEID is '1' (both directions).
Route1 = merged_data[merged_data['LINEID'] == '1']

In [28]:
# Display the LINEID '1' subset.
Route1

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,NOTE,MONTH,DAY,HOUR,JOURNEY_TIME,date,rain,temp
80446,2018-01-15,6116275,1,1_40,2,81765,79200,81484.0,79292.0,2018-01-23 10:24:03,",1947800,",1,0,10,2192.0,2018-01-23 10:00:00,0.0,12.6
1640379,2018-10-12,8019885,1,1_40,2,83613,81000,83638.0,81070.0,2018-11-20 15:30:07,",2852818,",10,4,15,2568.0,2018-11-20 15:00:00,1.0,6.1
353193,2018-03-11,6400979,1,1_37,1,59526,57000,59847.0,57097.0,2018-03-19 15:02:59,",2856406,",3,6,15,2750.0,2018-03-19 15:00:00,0.0,5.9
353184,2018-03-11,6398500,1,1_37,1,55926,53400,56874.0,53432.0,2018-03-19 15:02:59,",2862089,",3,6,15,3442.0,2018-03-19 15:00:00,0.0,5.9
353173,2018-03-11,6394475,1,1_37,1,58326,55800,59178.0,56004.0,2018-03-19 15:02:59,",1942279,",3,6,15,3174.0,2018-03-19 15:00:00,0.0,5.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600316,2018-05-22,6782424,1,1_37,1,77777,75600,77698.0,75666.0,2018-06-13 18:52:05,",1947798,",5,1,18,2032.0,2018-06-13 18:00:00,0.1,15.6
11313,2018-01-03,5965220,1,1_40,2,36781,33600,37542.0,33632.0,2018-01-11 18:46:25,",2857182,",1,2,18,3910.0,2018-01-11 18:00:00,0.0,1.8
223747,2018-02-07,6255009,1,1_37,1,43566,40800,43907.0,40817.0,2018-02-28 12:05:11,",2852799,",2,2,12,3090.0,2018-02-28 12:00:00,0.0,-2.1
1499607,2018-09-10,8096017,1,1_40,2,68185,64800,68163.0,64781.0,2018-11-02 10:47:06,",2852810,",9,0,10,3382.0,2018-11-02 10:00:00,0.0,6.4


In [30]:
# Create a one-column df holding each distinct LINEID found in merged_data,
# then display it.
unique_lineid = pd.DataFrame({'LINEID':merged_data.LINEID.unique()})
unique_lineid

Unnamed: 0,LINEID
0,1
1,102
2,104
3,11
4,111
...,...
125,83A
126,84
127,84A
128,84X


In [31]:
# Plain Python list of the distinct LINEIDs; the model-building loops iterate over it.
LineList = list(unique_lineid['LINEID'])

In [35]:
# Preview the merged trips + weather rows.
merged_data.head(5)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,LASTUPDATE,NOTE,MONTH,DAY,HOUR,JOURNEY_TIME,date,rain,temp
80446,2018-01-15,6116275,1,1_40,2,81765,79200,81484.0,79292.0,2018-01-23 10:24:03,",1947800,",1,0,10,2192.0,2018-01-23 10:00:00,0.0,12.6
1640379,2018-10-12,8019885,1,1_40,2,83613,81000,83638.0,81070.0,2018-11-20 15:30:07,",2852818,",10,4,15,2568.0,2018-11-20 15:00:00,1.0,6.1
353193,2018-03-11,6400979,1,1_37,1,59526,57000,59847.0,57097.0,2018-03-19 15:02:59,",2856406,",3,6,15,2750.0,2018-03-19 15:00:00,0.0,5.9
353184,2018-03-11,6398500,1,1_37,1,55926,53400,56874.0,53432.0,2018-03-19 15:02:59,",2862089,",3,6,15,3442.0,2018-03-19 15:00:00,0.0,5.9
353173,2018-03-11,6394475,1,1_37,1,58326,55800,59178.0,56004.0,2018-03-19 15:02:59,",1942279,",3,6,15,3174.0,2018-03-19 15:00:00,0.0,5.9


In [37]:
# Check dtypes of the merged frame before selecting model features.
merged_data.dtypes

DAYOFSERVICE       datetime64[ns]
TRIPID                   category
LINEID                   category
ROUTEID                  category
DIRECTION                category
PLANNEDTIME_ARR          category
PLANNEDTIME_DEP          category
ACTUALTIME_ARR           category
ACTUALTIME_DEP           category
LASTUPDATE         datetime64[ns]
NOTE                     category
MONTH                    category
DAY                      category
HOUR                     category
JOURNEY_TIME             category
date               datetime64[ns]
rain                      float64
temp                      float64
dtype: object

In [84]:
# Building linear models for LINEID Direction 1
# For every LINEID with at least one DIRECTION == 1 trip, fit a
# LinearRegression of JOURNEY_TIME on MONTH, DAY, HOUR, rain and temp using a
# 70% training split (fixed random_state), then pickle the fitted model to the
# RouteModels\Out folder as linearReg_<LINEID>_Out.pkl.
# NOTE(review): MONTH/DAY/HOUR are category dtype; presumably they reach the
# regression as plain numbers rather than one-hot encoded - TODO confirm.
for key in LineList:
    df_out = merged_data[merged_data['LINEID'] == key]
    df_out = df_out[df_out['DIRECTION'] == 1]
    if not df_out.empty:
        X = df_out[["MONTH", "DAY", "HOUR", "rain", "temp"]]
        y = df_out[["JOURNEY_TIME"]]
        # The test split is created but not evaluated in this cell.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=20)
        linearModel = LinearRegression().fit(X_train, y_train.values.ravel())
        # Write through a context manager so the file handle is flushed and
        # closed on every iteration (the bare open() call leaked it).
        model_path = f'C:\\Users\\jason\\OneDrive - University College Dublin\\Documents\\MSc Computer Science\\Summer Semester\\Data\\Notebooks\\RouteModels\\Out\\linearReg_{key}_Out.pkl'
        with open(model_path, 'wb') as model_file:
            pickle.dump(linearModel, model_file)

In [85]:
# Building linear models for LINEID Direction 2
# For every LINEID with at least one DIRECTION == 2 trip, fit a
# LinearRegression of JOURNEY_TIME on MONTH, DAY, HOUR, rain and temp using a
# 70% training split (fixed random_state), then pickle the fitted model to the
# RouteModels\In folder as linearReg_<LINEID>_In.pkl.
# NOTE(review): MONTH/DAY/HOUR are category dtype; presumably they reach the
# regression as plain numbers rather than one-hot encoded - TODO confirm.
for key in LineList:
    df_in = merged_data[merged_data['LINEID'] == key]
    df_in = df_in[df_in['DIRECTION'] == 2]
    if not df_in.empty:
        X = df_in[["MONTH", "DAY", "HOUR", "rain", "temp"]]
        y = df_in[["JOURNEY_TIME"]]
        # The test split is created but not evaluated in this cell.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=20)
        linearModel = LinearRegression().fit(X_train, y_train.values.ravel())
        # Write through a context manager so the file handle is flushed and
        # closed on every iteration (the bare open() call leaked it).
        model_path = f'C:\\Users\\jason\\OneDrive - University College Dublin\\Documents\\MSc Computer Science\\Summer Semester\\Data\\Notebooks\\RouteModels\\In\\linearReg_{key}_In.pkl'
        with open(model_path, 'wb') as model_file:
            pickle.dump(linearModel, model_file)