In [64]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [65]:
# Step 1: load the dataset
df = pd.read_csv("Dataset_Uber Traffic.csv")
df.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,01/11/15 0:00,1,15,20151101001
1,01/11/15 1:00,1,13,20151101011
2,01/11/15 2:00,1,10,20151101021
3,01/11/15 3:00,1,7,20151101031
4,01/11/15 4:00,1,9,20151101041


In [66]:
# Step 2: Data Cleaning and Pre-processing
df['DateTime'] = pd.to_datetime(df['DateTime'], format='%d/%m/%y %H:%M')
df = df.dropna().drop_duplicates()
df = df.astype({'Junction': int, 'Vehicles': int, 'ID': int})
df.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,2015-11-01 00:00:00,1,15,-1323735479
1,2015-11-01 01:00:00,1,13,-1323735469
2,2015-11-01 02:00:00,1,10,-1323735459
3,2015-11-01 03:00:00,1,7,-1323735449
4,2015-11-01 04:00:00,1,9,-1323735439


In [67]:
# Step 3: Aggregate traffic data into hourly intervals for each junction
df_hourly = df.groupby([pd.Grouper(key='DateTime', freq='h'), 'Junction'])['Vehicles'].sum().reset_index()
df_hourly.head()

Unnamed: 0,DateTime,Junction,Vehicles
0,2015-11-01 00:00:00,1,15
1,2015-11-01 00:00:00,2,6
2,2015-11-01 00:00:00,3,9
3,2015-11-01 01:00:00,1,13
4,2015-11-01 01:00:00,2,6


In [68]:
# Step 4: Normalize vehicle counts
scaler = StandardScaler()
df_hourly['Vehicles_scaled'] = scaler.fit_transform(df_hourly[['Vehicles']])
df_hourly['Vehicles_scaled'].head()

0   -0.375489
1   -0.809227
2   -0.664648
3   -0.471875
4   -0.809227
Name: Vehicles_scaled, dtype: float64

In [69]:
# Step 5: Feature Engineering
df_hourly['Hour'] = df_hourly['DateTime'].dt.hour
df_hourly['DayOfWeek'] = df_hourly['DateTime'].dt.dayofweek
df_hourly['Month'] = df_hourly['DateTime'].dt.month
df_hourly['Lag_1'] = df_hourly.groupby('Junction')['Vehicles'].shift(1)
df_hourly['Lag_24'] = df_hourly.groupby('Junction')['Vehicles'].shift(24)
df_hourly['IsWeekend'] = df_hourly['DayOfWeek'].isin([5, 6]).astype(int)


In [70]:
# Finding the NaN values in each columns
df_hourly.isnull().sum()

DateTime            0
Junction            0
Vehicles            0
Vehicles_scaled     0
Hour                0
DayOfWeek           0
Month               0
Lag_1               4
Lag_24             96
IsWeekend           0
dtype: int64

In [71]:
# Drop rows with any NaN values before calculating correlation
df_hourly_cleaned = df_hourly.dropna(subset=['Lag_1', 'Lag_24'])
df_hourly_cleaned.isnull().sum()


DateTime           0
Junction           0
Vehicles           0
Vehicles_scaled    0
Hour               0
DayOfWeek          0
Month              0
Lag_1              0
Lag_24             0
IsWeekend          0
dtype: int64

In [72]:
# Calculate the correlation matrix
correlation_matrix_cleaned = df_hourly_cleaned[['Vehicles', 'Hour', 'DayOfWeek', 'Month', 'Lag_1', 'Lag_24', 'IsWeekend']].corr()

correlation_matrix_cleaned


Unnamed: 0,Vehicles,Hour,DayOfWeek,Month,Lag_1,Lag_24,IsWeekend
Vehicles,1.0,0.2201674,-0.1241237,-0.021901,0.970122,0.905146,-0.1486294
Hour,0.220167,1.0,1.651634e-17,1.483313e-15,0.200565,0.219917,2.164045e-17
DayOfWeek,-0.124124,1.651634e-17,1.0,-0.005431359,-0.120821,0.00941,0.7891759
Month,-0.021901,1.483313e-15,-0.005431359,1.0,-0.021698,-0.018188,-0.007949083
Lag_1,0.970122,0.2005647,-0.1208206,-0.02169809,1.0,0.896069,-0.147177
Lag_24,0.905146,0.2199172,0.009410208,-0.01818813,0.896069,1.0,-0.05069604
IsWeekend,-0.148629,2.164045e-17,0.7891759,-0.007949083,-0.147177,-0.050696,1.0
