In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Data viz

In [3]:
file = "../raw_data/mta_1706.csv"
data = pd.read_csv(file, on_bad_lines="skip", nrows=1000000)
data.head(10)

Unnamed: 0,RecordedAtTime,DirectionRef,PublishedLineName,OriginName,OriginLat,OriginLong,DestinationName,DestinationLat,DestinationLong,VehicleRef,VehicleLocation.Latitude,VehicleLocation.Longitude,NextStopPointName,ArrivalProximityText,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime
0,2017-06-01 00:03:34,0,B8,4 AV/95 ST,40.616104,-74.031143,BROWNSVILLE ROCKAWAY AV,40.656048,-73.907379,NYCT_430,40.63517,-73.960803,FOSTER AV/E 18 ST,approaching,76.0,2017-06-01 00:03:59,24:06:14
1,2017-06-01 00:03:43,1,S61,ST GEORGE FERRY/S61 & S91,40.643169,-74.073494,S I MALL YUKON AV,40.575935,-74.167686,NYCT_8263,40.590802,-74.15834,MERRYMOUNT ST/TRAVIS AV,approaching,62.0,2017-06-01 00:03:56,23:58:02
2,2017-06-01 00:03:49,0,Bx10,E 206 ST/BAINBRIDGE AV,40.875008,-73.880142,RIVERDALE 263 ST,40.912376,-73.902534,NYCT_4223,40.88601,-73.912647,HENRY HUDSON PKY E/W 235 ST,at stop,5.0,2017-06-01 00:03:56,24:00:53
3,2017-06-01 00:03:31,0,Q5,TEARDROP/LAYOVER,40.701748,-73.802399,ROSEDALE LIRR STA via MERRICK,40.666012,-73.735939,NYCT_8422,40.668002,-73.729348,HOOK CREEK BL/SUNRISE HY,< 1 stop away,267.0,2017-06-01 00:04:03,24:03:00
4,2017-06-01 00:03:22,1,Bx1,RIVERDALE AV/W 231 ST,40.881187,-73.90934,MOTT HAVEN 136 ST via CONCOURSE,40.809654,-73.92836,NYCT_4710,40.868134,-73.893032,GRAND CONCOURSE/E 196 ST,at stop,11.0,2017-06-01 00:03:56,23:59:38
5,2017-06-01 00:03:40,0,M1,4 AV/E 10 ST,40.731342,-73.990288,HARLEM 147 ST via MADISON,40.82111,-73.935898,NYCT_3831,40.792897,-73.950023,MADISON AV/E 106 ST,approaching,73.0,2017-06-01 00:03:56,24:02:35
6,2017-06-01 00:03:24,0,B31,GERRITSEN AV/GERRITSEN BEACH,40.587101,-73.918503,MIDWOOD KINGS HWY STA,40.608433,-73.9571,NYCT_4611,40.587024,-73.918623,GERRITSEN AV/GERRITSEN BEACH,at stop,0.0,,24:08:00
7,2017-06-01 00:03:29,0,B83,GATEWAY CTR TERM/GATEWAY DR,40.652649,-73.877029,BWAY JCT VN SNDRN AV,40.678139,-73.903572,NYCT_4841,40.648801,-73.882682,PENNSYLVANIA AV/DELMAR LOOP N,< 1 stop away,196.0,2017-06-01 00:04:13,23:58:47
8,2017-06-01 00:03:27,0,B82,STILLWELL TERMINAL BUS LOOP,40.57708,-73.981293,SPRING CRK TWRS SEAVIEW AV via KINGS HWY,40.64299,-73.878326,NYCT_6592,40.632258,-73.918318,FLATLANDS AV/RALPH AV,approaching,35.0,2017-06-01 00:03:56,24:00:00
9,2017-06-01 00:03:51,1,S59,RICHMOND TER/PARK AV #3,40.640167,-74.130966,HYLAN BL,40.53426,-74.154213,NYCT_8279,40.590689,-74.165811,RICHMOND AV/NOME AV,approaching,31.0,2017-06-01 00:03:56,24:01:14


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 17 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   RecordedAtTime             1000000 non-null  object 
 1   DirectionRef               1000000 non-null  int64  
 2   PublishedLineName          1000000 non-null  object 
 3   OriginName                 991537 non-null   object 
 4   OriginLat                  991537 non-null   float64
 5   OriginLong                 991537 non-null   float64
 6   DestinationName            1000000 non-null  object 
 7   DestinationLat             998754 non-null   float64
 8   DestinationLong            998754 non-null   float64
 9   VehicleRef                 1000000 non-null  object 
 10  VehicleLocation.Latitude   1000000 non-null  float64
 11  VehicleLocation.Longitude  1000000 non-null  float64
 12  NextStopPointName          998951 non-null   object 
 13  ArrivalProxim

In [5]:
data.dtypes

RecordedAtTime                object
DirectionRef                   int64
PublishedLineName             object
OriginName                    object
OriginLat                    float64
OriginLong                   float64
DestinationName               object
DestinationLat               float64
DestinationLong              float64
VehicleRef                    object
VehicleLocation.Latitude     float64
VehicleLocation.Longitude    float64
NextStopPointName             object
ArrivalProximityText          object
DistanceFromStop             float64
ExpectedArrivalTime           object
ScheduledArrivalTime          object
dtype: object

## Convert Time Columns to Datetime Format

In [6]:
data['RecordedAtTime'] = pd.to_datetime(data['RecordedAtTime'])
data['ExpectedArrivalTime'] = pd.to_datetime(data['ExpectedArrivalTime'])

In [7]:
# Using Vectorized Operations
# since the db is very very large and classical python iterations is too slow.

times = data['ScheduledArrivalTime'].str.split(':', expand=True)
times = times.fillna('0').astype(float)
hours, minutes, seconds = times[0], times[1], times[2]

new_hours = np.where(hours >= 24, hours - 24, hours)
days_to_add = np.where(hours >= 24, 1, 0)

valid_rows = ~hours.isna() & ~minutes.isna() & ~seconds.isna()

data.loc[valid_rows, 'ScheduledArrivalTime'] = (
    pd.to_datetime(
        new_hours.astype(int).astype(str) + ':' +
        minutes.astype(int).astype(str) + ':' +
        seconds.astype(int).astype(str),
        format='%H:%M:%S'
    ) + pd.to_timedelta(days_to_add[valid_rows], unit='days')
)

data.loc[~valid_rows, 'ScheduledArrivalTime'] = pd.NaT

In [8]:
data['ScheduledArrivalTime'] = pd.to_datetime(data['ScheduledArrivalTime'])

In [9]:
data['ScheduledArrivalTime'] = (
    pd.to_datetime(data['RecordedAtTime'].dt.date.astype(str) + ' ' + data['ScheduledArrivalTime'].dt.time.astype(str))
)
data['ScheduledArrivalTime']

0        2017-06-01 00:06:14
1        2017-06-01 23:58:02
2        2017-06-01 00:00:53
3        2017-06-01 00:03:00
4        2017-06-01 23:59:38
                 ...        
999995   2017-06-05 17:20:00
999996   2017-06-05 17:32:22
999997   2017-06-05 17:29:06
999998   2017-06-05 17:27:50
999999   2017-06-05 17:25:00
Name: ScheduledArrivalTime, Length: 1000000, dtype: datetime64[ns]

In [10]:
print(data['RecordedAtTime'].isna().sum())

0


In [11]:
print(data['ExpectedArrivalTime'].isna().sum())

110897


In [12]:
print(data['ScheduledArrivalTime'].isna().sum())

0


## Data Cleaning

In [13]:
# List of columns to drop
columns_to_drop = ['PublishedLineName', 'OriginName', 'DestinationName', 'VehicleRef', 'NextStopPointName', 'ArrivalProximityText']

# Drop the columns
data_cleaned = data.drop(columns=columns_to_drop)

# Display the first few rows to confirm the columns are dropped
data_cleaned.head(10)

Unnamed: 0,RecordedAtTime,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime
0,2017-06-01 00:03:34,0,40.616104,-74.031143,40.656048,-73.907379,40.63517,-73.960803,76.0,2017-06-01 00:03:59,2017-06-01 00:06:14
1,2017-06-01 00:03:43,1,40.643169,-74.073494,40.575935,-74.167686,40.590802,-74.15834,62.0,2017-06-01 00:03:56,2017-06-01 23:58:02
2,2017-06-01 00:03:49,0,40.875008,-73.880142,40.912376,-73.902534,40.88601,-73.912647,5.0,2017-06-01 00:03:56,2017-06-01 00:00:53
3,2017-06-01 00:03:31,0,40.701748,-73.802399,40.666012,-73.735939,40.668002,-73.729348,267.0,2017-06-01 00:04:03,2017-06-01 00:03:00
4,2017-06-01 00:03:22,1,40.881187,-73.90934,40.809654,-73.92836,40.868134,-73.893032,11.0,2017-06-01 00:03:56,2017-06-01 23:59:38
5,2017-06-01 00:03:40,0,40.731342,-73.990288,40.82111,-73.935898,40.792897,-73.950023,73.0,2017-06-01 00:03:56,2017-06-01 00:02:35
6,2017-06-01 00:03:24,0,40.587101,-73.918503,40.608433,-73.9571,40.587024,-73.918623,0.0,NaT,2017-06-01 00:08:00
7,2017-06-01 00:03:29,0,40.652649,-73.877029,40.678139,-73.903572,40.648801,-73.882682,196.0,2017-06-01 00:04:13,2017-06-01 23:58:47
8,2017-06-01 00:03:27,0,40.57708,-73.981293,40.64299,-73.878326,40.632258,-73.918318,35.0,2017-06-01 00:03:56,2017-06-01 00:00:00
9,2017-06-01 00:03:51,1,40.640167,-74.130966,40.53426,-74.154213,40.590689,-74.165811,31.0,2017-06-01 00:03:56,2017-06-01 00:01:14


In [14]:
missing_values = data_cleaned.isnull().sum()

# Display the missing values per column
print(missing_values)

RecordedAtTime                    0
DirectionRef                      0
OriginLat                      8463
OriginLong                     8463
DestinationLat                 1246
DestinationLong                1246
VehicleLocation.Latitude          0
VehicleLocation.Longitude         0
DistanceFromStop               1049
ExpectedArrivalTime          110897
ScheduledArrivalTime              0
dtype: int64


In [15]:
# Drop rows where 'ExpectedArrivalTime' is missing (since it's needed for target)
data_cleaned = data_cleaned.dropna(subset=['ExpectedArrivalTime'])
data_cleaned

Unnamed: 0,RecordedAtTime,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime
0,2017-06-01 00:03:34,0,40.616104,-74.031143,40.656048,-73.907379,40.635170,-73.960803,76.0,2017-06-01 00:03:59,2017-06-01 00:06:14
1,2017-06-01 00:03:43,1,40.643169,-74.073494,40.575935,-74.167686,40.590802,-74.158340,62.0,2017-06-01 00:03:56,2017-06-01 23:58:02
2,2017-06-01 00:03:49,0,40.875008,-73.880142,40.912376,-73.902534,40.886010,-73.912647,5.0,2017-06-01 00:03:56,2017-06-01 00:00:53
3,2017-06-01 00:03:31,0,40.701748,-73.802399,40.666012,-73.735939,40.668002,-73.729348,267.0,2017-06-01 00:04:03,2017-06-01 00:03:00
4,2017-06-01 00:03:22,1,40.881187,-73.909340,40.809654,-73.928360,40.868134,-73.893032,11.0,2017-06-01 00:03:56,2017-06-01 23:59:38
...,...,...,...,...,...,...,...,...,...,...,...
999994,2017-06-05 17:31:22,0,40.577080,-73.981293,40.642990,-73.878326,40.607248,-73.962166,70.0,2017-06-05 17:31:54,2017-06-05 17:26:29
999995,2017-06-05 17:31:33,0,40.578247,-73.939743,40.621841,-74.028366,40.621746,-74.028266,13.0,2017-06-05 17:31:41,2017-06-05 17:20:00
999996,2017-06-05 17:31:24,0,40.836311,-73.948433,40.810085,-73.876396,40.828490,-73.935679,570.0,2017-06-05 17:37:34,2017-06-05 17:32:22
999997,2017-06-05 17:31:20,0,40.849327,-73.936508,40.885086,-73.900436,40.845467,-73.925123,490.0,2017-06-05 17:35:34,2017-06-05 17:29:06


In [18]:
# Fill missing values in 'DistanceFromStop' with the column mean
data_cleaned = data_cleaned.dropna(subset=['DistanceFromStop'])
data_cleaned

Unnamed: 0,RecordedAtTime,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime
0,2017-06-01 00:03:34,0,40.616104,-74.031143,40.656048,-73.907379,40.635170,-73.960803,76.0,2017-06-01 00:03:59,2017-06-01 00:06:14
1,2017-06-01 00:03:43,1,40.643169,-74.073494,40.575935,-74.167686,40.590802,-74.158340,62.0,2017-06-01 00:03:56,2017-06-01 23:58:02
2,2017-06-01 00:03:49,0,40.875008,-73.880142,40.912376,-73.902534,40.886010,-73.912647,5.0,2017-06-01 00:03:56,2017-06-01 00:00:53
3,2017-06-01 00:03:31,0,40.701748,-73.802399,40.666012,-73.735939,40.668002,-73.729348,267.0,2017-06-01 00:04:03,2017-06-01 00:03:00
4,2017-06-01 00:03:22,1,40.881187,-73.909340,40.809654,-73.928360,40.868134,-73.893032,11.0,2017-06-01 00:03:56,2017-06-01 23:59:38
...,...,...,...,...,...,...,...,...,...,...,...
999994,2017-06-05 17:31:22,0,40.577080,-73.981293,40.642990,-73.878326,40.607248,-73.962166,70.0,2017-06-05 17:31:54,2017-06-05 17:26:29
999995,2017-06-05 17:31:33,0,40.578247,-73.939743,40.621841,-74.028366,40.621746,-74.028266,13.0,2017-06-05 17:31:41,2017-06-05 17:20:00
999996,2017-06-05 17:31:24,0,40.836311,-73.948433,40.810085,-73.876396,40.828490,-73.935679,570.0,2017-06-05 17:37:34,2017-06-05 17:32:22
999997,2017-06-05 17:31:20,0,40.849327,-73.936508,40.885086,-73.900436,40.845467,-73.925123,490.0,2017-06-05 17:35:34,2017-06-05 17:29:06


In [19]:
# Inspect the data after handling missing values
data_cleaned.isnull().sum()

RecordedAtTime                  0
DirectionRef                    0
OriginLat                    7339
OriginLong                   7339
DestinationLat               1016
DestinationLong              1016
VehicleLocation.Latitude        0
VehicleLocation.Longitude       0
DistanceFromStop                0
ExpectedArrivalTime             0
ScheduledArrivalTime            0
dtype: int64

In [20]:
# Drop rows with missing geographical data
data_cleaned = data_cleaned.dropna(subset=['OriginLat', 'OriginLong', 'DestinationLat', 'DestinationLong'])

# Verify if any geographical data is missing after dropping rows
data_cleaned[['OriginLat', 'OriginLong', 'DestinationLat', 'DestinationLong']].isnull().sum()

OriginLat          0
OriginLong         0
DestinationLat     0
DestinationLong    0
dtype: int64

In [21]:
# Inspect the data after handling missing values
data_cleaned.isnull().sum()

RecordedAtTime               0
DirectionRef                 0
OriginLat                    0
OriginLong                   0
DestinationLat               0
DestinationLong              0
VehicleLocation.Latitude     0
VehicleLocation.Longitude    0
DistanceFromStop             0
ExpectedArrivalTime          0
ScheduledArrivalTime         0
dtype: int64

In [22]:
# Remove duplicate rows if any
data_cleaned = data_cleaned.drop_duplicates()

# Inspect the data after removing duplicates
data_cleaned.head(10)

Unnamed: 0,RecordedAtTime,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime
0,2017-06-01 00:03:34,0,40.616104,-74.031143,40.656048,-73.907379,40.63517,-73.960803,76.0,2017-06-01 00:03:59,2017-06-01 00:06:14
1,2017-06-01 00:03:43,1,40.643169,-74.073494,40.575935,-74.167686,40.590802,-74.15834,62.0,2017-06-01 00:03:56,2017-06-01 23:58:02
2,2017-06-01 00:03:49,0,40.875008,-73.880142,40.912376,-73.902534,40.88601,-73.912647,5.0,2017-06-01 00:03:56,2017-06-01 00:00:53
3,2017-06-01 00:03:31,0,40.701748,-73.802399,40.666012,-73.735939,40.668002,-73.729348,267.0,2017-06-01 00:04:03,2017-06-01 00:03:00
4,2017-06-01 00:03:22,1,40.881187,-73.90934,40.809654,-73.92836,40.868134,-73.893032,11.0,2017-06-01 00:03:56,2017-06-01 23:59:38
5,2017-06-01 00:03:40,0,40.731342,-73.990288,40.82111,-73.935898,40.792897,-73.950023,73.0,2017-06-01 00:03:56,2017-06-01 00:02:35
7,2017-06-01 00:03:29,0,40.652649,-73.877029,40.678139,-73.903572,40.648801,-73.882682,196.0,2017-06-01 00:04:13,2017-06-01 23:58:47
8,2017-06-01 00:03:27,0,40.57708,-73.981293,40.64299,-73.878326,40.632258,-73.918318,35.0,2017-06-01 00:03:56,2017-06-01 00:00:00
9,2017-06-01 00:03:51,1,40.640167,-74.130966,40.53426,-74.154213,40.590689,-74.165811,31.0,2017-06-01 00:03:56,2017-06-01 00:01:14
10,2017-06-01 00:03:48,0,40.864079,-73.894615,40.860828,-73.82267,40.876032,-73.829543,207.0,2017-06-01 00:04:29,2017-06-01 23:48:35


In [23]:
# Inspect the data after converting time columns
data_cleaned[['ExpectedArrivalTime', 'ScheduledArrivalTime']].head(10)

Unnamed: 0,ExpectedArrivalTime,ScheduledArrivalTime
0,2017-06-01 00:03:59,2017-06-01 00:06:14
1,2017-06-01 00:03:56,2017-06-01 23:58:02
2,2017-06-01 00:03:56,2017-06-01 00:00:53
3,2017-06-01 00:04:03,2017-06-01 00:03:00
4,2017-06-01 00:03:56,2017-06-01 23:59:38
5,2017-06-01 00:03:56,2017-06-01 00:02:35
7,2017-06-01 00:04:13,2017-06-01 23:58:47
8,2017-06-01 00:03:56,2017-06-01 00:00:00
9,2017-06-01 00:03:56,2017-06-01 00:01:14
10,2017-06-01 00:04:29,2017-06-01 23:48:35


In [24]:
# Calculate the time difference between ExpectedArrivalTime and ScheduledArrivalTime
data_cleaned['TimeDifference'] = data_cleaned['ExpectedArrivalTime'] - data_cleaned['ScheduledArrivalTime']

# Convert TimeDifference to minutes (or seconds)
data_cleaned['TimeDifference_minutes'] = data_cleaned['TimeDifference'].dt.total_seconds() / 60

# Inspect the data with the new target column
data_cleaned[['ExpectedArrivalTime', 'ScheduledArrivalTime', 'TimeDifference_minutes']].head(10)

Unnamed: 0,ExpectedArrivalTime,ScheduledArrivalTime,TimeDifference_minutes
0,2017-06-01 00:03:59,2017-06-01 00:06:14,-2.25
1,2017-06-01 00:03:56,2017-06-01 23:58:02,-1434.1
2,2017-06-01 00:03:56,2017-06-01 00:00:53,3.05
3,2017-06-01 00:04:03,2017-06-01 00:03:00,1.05
4,2017-06-01 00:03:56,2017-06-01 23:59:38,-1435.7
5,2017-06-01 00:03:56,2017-06-01 00:02:35,1.35
7,2017-06-01 00:04:13,2017-06-01 23:58:47,-1434.566667
8,2017-06-01 00:03:56,2017-06-01 00:00:00,3.933333
9,2017-06-01 00:03:56,2017-06-01 00:01:14,2.7
10,2017-06-01 00:04:29,2017-06-01 23:48:35,-1424.1


In [25]:
# Drop rows with missing 'TimeDifference_minutes'
data_cleaned = data_cleaned.dropna(subset=['TimeDifference_minutes'])

# Inspect the data after dropping missing values in 'TimeDifference_minutes'
data_cleaned.head(10)

Unnamed: 0,RecordedAtTime,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime,TimeDifference,TimeDifference_minutes
0,2017-06-01 00:03:34,0,40.616104,-74.031143,40.656048,-73.907379,40.63517,-73.960803,76.0,2017-06-01 00:03:59,2017-06-01 00:06:14,-1 days +23:57:45,-2.25
1,2017-06-01 00:03:43,1,40.643169,-74.073494,40.575935,-74.167686,40.590802,-74.15834,62.0,2017-06-01 00:03:56,2017-06-01 23:58:02,-1 days +00:05:54,-1434.1
2,2017-06-01 00:03:49,0,40.875008,-73.880142,40.912376,-73.902534,40.88601,-73.912647,5.0,2017-06-01 00:03:56,2017-06-01 00:00:53,0 days 00:03:03,3.05
3,2017-06-01 00:03:31,0,40.701748,-73.802399,40.666012,-73.735939,40.668002,-73.729348,267.0,2017-06-01 00:04:03,2017-06-01 00:03:00,0 days 00:01:03,1.05
4,2017-06-01 00:03:22,1,40.881187,-73.90934,40.809654,-73.92836,40.868134,-73.893032,11.0,2017-06-01 00:03:56,2017-06-01 23:59:38,-1 days +00:04:18,-1435.7
5,2017-06-01 00:03:40,0,40.731342,-73.990288,40.82111,-73.935898,40.792897,-73.950023,73.0,2017-06-01 00:03:56,2017-06-01 00:02:35,0 days 00:01:21,1.35
7,2017-06-01 00:03:29,0,40.652649,-73.877029,40.678139,-73.903572,40.648801,-73.882682,196.0,2017-06-01 00:04:13,2017-06-01 23:58:47,-1 days +00:05:26,-1434.566667
8,2017-06-01 00:03:27,0,40.57708,-73.981293,40.64299,-73.878326,40.632258,-73.918318,35.0,2017-06-01 00:03:56,2017-06-01 00:00:00,0 days 00:03:56,3.933333
9,2017-06-01 00:03:51,1,40.640167,-74.130966,40.53426,-74.154213,40.590689,-74.165811,31.0,2017-06-01 00:03:56,2017-06-01 00:01:14,0 days 00:02:42,2.7
10,2017-06-01 00:03:48,0,40.864079,-73.894615,40.860828,-73.82267,40.876032,-73.829543,207.0,2017-06-01 00:04:29,2017-06-01 23:48:35,-1 days +00:15:54,-1424.1


In [33]:
data_cleaned = data_cleaned [data_cleaned["TimeDifference_minutes"].between(-100, +100,inclusive="both")]
data_cleaned

Unnamed: 0,RecordedAtTime,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime,TimeDifference,TimeDifference_minutes
0,2017-06-01 00:03:34,0,40.616104,-74.031143,40.656048,-73.907379,40.635170,-73.960803,76.0,2017-06-01 00:03:59,2017-06-01 00:06:14,-1 days +23:57:45,-2.250000
2,2017-06-01 00:03:49,0,40.875008,-73.880142,40.912376,-73.902534,40.886010,-73.912647,5.0,2017-06-01 00:03:56,2017-06-01 00:00:53,0 days 00:03:03,3.050000
3,2017-06-01 00:03:31,0,40.701748,-73.802399,40.666012,-73.735939,40.668002,-73.729348,267.0,2017-06-01 00:04:03,2017-06-01 00:03:00,0 days 00:01:03,1.050000
5,2017-06-01 00:03:40,0,40.731342,-73.990288,40.821110,-73.935898,40.792897,-73.950023,73.0,2017-06-01 00:03:56,2017-06-01 00:02:35,0 days 00:01:21,1.350000
7,2017-06-01 00:03:27,0,40.577080,-73.981293,40.642990,-73.878326,40.632258,-73.918318,35.0,2017-06-01 00:03:56,2017-06-01 00:00:00,0 days 00:03:56,3.933333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
881755,2017-06-05 17:31:22,0,40.577080,-73.981293,40.642990,-73.878326,40.607248,-73.962166,70.0,2017-06-05 17:31:54,2017-06-05 17:26:29,0 days 00:05:25,5.416667
881756,2017-06-05 17:31:33,0,40.578247,-73.939743,40.621841,-74.028366,40.621746,-74.028266,13.0,2017-06-05 17:31:41,2017-06-05 17:20:00,0 days 00:11:41,11.683333
881757,2017-06-05 17:31:24,0,40.836311,-73.948433,40.810085,-73.876396,40.828490,-73.935679,570.0,2017-06-05 17:37:34,2017-06-05 17:32:22,0 days 00:05:12,5.200000
881758,2017-06-05 17:31:20,0,40.849327,-73.936508,40.885086,-73.900436,40.845467,-73.925123,490.0,2017-06-05 17:35:34,2017-06-05 17:29:06,0 days 00:06:28,6.466667


In [30]:
# Reset index after cleaning
data_cleaned = data_cleaned.reset_index(drop=True)
data_cleaned

Unnamed: 0,RecordedAtTime,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime,TimeDifference,TimeDifference_minutes
0,2017-06-01 00:03:34,0,40.616104,-74.031143,40.656048,-73.907379,40.635170,-73.960803,76.0,2017-06-01 00:03:59,2017-06-01 00:06:14,-1 days +23:57:45,-2.250000
1,2017-06-01 00:03:43,1,40.643169,-74.073494,40.575935,-74.167686,40.590802,-74.158340,62.0,2017-06-01 00:03:56,2017-06-01 23:58:02,-1 days +00:05:54,-1434.100000
2,2017-06-01 00:03:49,0,40.875008,-73.880142,40.912376,-73.902534,40.886010,-73.912647,5.0,2017-06-01 00:03:56,2017-06-01 00:00:53,0 days 00:03:03,3.050000
3,2017-06-01 00:03:31,0,40.701748,-73.802399,40.666012,-73.735939,40.668002,-73.729348,267.0,2017-06-01 00:04:03,2017-06-01 00:03:00,0 days 00:01:03,1.050000
4,2017-06-01 00:03:22,1,40.881187,-73.909340,40.809654,-73.928360,40.868134,-73.893032,11.0,2017-06-01 00:03:56,2017-06-01 23:59:38,-1 days +00:04:18,-1435.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
881755,2017-06-05 17:31:22,0,40.577080,-73.981293,40.642990,-73.878326,40.607248,-73.962166,70.0,2017-06-05 17:31:54,2017-06-05 17:26:29,0 days 00:05:25,5.416667
881756,2017-06-05 17:31:33,0,40.578247,-73.939743,40.621841,-74.028366,40.621746,-74.028266,13.0,2017-06-05 17:31:41,2017-06-05 17:20:00,0 days 00:11:41,11.683333
881757,2017-06-05 17:31:24,0,40.836311,-73.948433,40.810085,-73.876396,40.828490,-73.935679,570.0,2017-06-05 17:37:34,2017-06-05 17:32:22,0 days 00:05:12,5.200000
881758,2017-06-05 17:31:20,0,40.849327,-73.936508,40.885086,-73.900436,40.845467,-73.925123,490.0,2017-06-05 17:35:34,2017-06-05 17:29:06,0 days 00:06:28,6.466667


## Feature engineering

## Set target and features

In [26]:
# Set the target variable (y) as the time difference in minutes
y = data_cleaned['TimeDifference_minutes']

# Set the features (X) by dropping the target column and time columns
X = data_cleaned.drop(columns=['TimeDifference_minutes', 'ExpectedArrivalTime', 'ScheduledArrivalTime', 'RecordedAtTime'])

# Inspect the feature set (X) and target (y)
X.head(10), y.head(10)

(    DirectionRef  OriginLat  OriginLong  DestinationLat  DestinationLong  \
 0              0  40.616104  -74.031143       40.656048       -73.907379   
 1              1  40.643169  -74.073494       40.575935       -74.167686   
 2              0  40.875008  -73.880142       40.912376       -73.902534   
 3              0  40.701748  -73.802399       40.666012       -73.735939   
 4              1  40.881187  -73.909340       40.809654       -73.928360   
 5              0  40.731342  -73.990288       40.821110       -73.935898   
 7              0  40.652649  -73.877029       40.678139       -73.903572   
 8              0  40.577080  -73.981293       40.642990       -73.878326   
 9              1  40.640167  -74.130966       40.534260       -74.154213   
 10             0  40.864079  -73.894615       40.860828       -73.822670   
 
     VehicleLocation.Latitude  VehicleLocation.Longitude  DistanceFromStop  \
 0                  40.635170                 -73.960803              76.

##Feature scaling

In [27]:
# Scale numerical features (e.g., latitude, longitude, distance)
scaler = StandardScaler()
X[['OriginLat', 'OriginLong', 'DestinationLat', 'DestinationLong',
   'VehicleLocation.Latitude', 'VehicleLocation.Longitude', 'DistanceFromStop']] = scaler.fit_transform(
    X[['OriginLat', 'OriginLong', 'DestinationLat', 'DestinationLong',
        'VehicleLocation.Latitude', 'VehicleLocation.Longitude', 'DistanceFromStop']])

# Inspect the scaled feature set
X.head(10)

Unnamed: 0,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,TimeDifference
0,0,-1.252322,-1.061153,-0.812281,0.2568,-1.077873,-0.339389,-0.1614,-1 days +23:57:45
1,1,-0.953419,-1.510986,-1.702737,-2.522572,-1.588648,-2.578276,-0.176105,-1 days +00:05:54
2,0,1.606985,0.542712,2.036807,0.308531,1.809859,0.206412,-0.235974,0 days 00:03:03
3,0,-0.306479,1.368463,-0.701531,2.087314,-0.699903,2.283925,0.039212,0 days 00:01:03
4,1,1.675225,0.232584,0.895051,0.03278,1.604066,0.428728,-0.229672,-1 days +00:04:18
5,0,0.020353,-0.627209,1.022384,-0.047705,0.737919,-0.217208,-0.164551,0 days 00:01:21
7,0,-0.848723,0.575777,-0.566739,0.297448,-0.920949,0.546035,-0.035361,-1 days +00:05:26
8,0,-1.683299,-0.531668,-0.957421,0.567007,-1.111397,0.142136,-0.204464,0 days 00:03:56
9,1,-0.986573,-2.121427,-2.165955,-2.378717,-1.589949,-2.662953,-0.208665,0 days 00:02:42
10,0,1.486286,0.388986,1.46385,1.161262,1.69499,1.148314,-0.023808,-1 days +00:15:54


In [28]:
# Inspect the shapes of X and y
print(X.shape, y.shape)

(881760, 9) (881760,)
