In [2]:
import pandas as pd

In [12]:
import pandas as pd

# Load the dataset
# NOTE(review): this is an absolute, machine-specific location; anyone else
# running the notebook has to point `file_path` at their own copy of the CSV.
file_path = "C://Users//shaik//Mentor_Ship - Upgrad//Dataset_Uber Traffic.csv"
df = pd.read_csv(file_path)

# Display basic info and preview of the data.
# `df.info()` prints its report and returns None, so it is called on its own
# instead of being packed into a tuple with `df.head()`; that tuple rendered as
# `(None, <plain-text frame>)`. Leaving `df.head()` as the last expression lets
# the notebook show the preview as a regular table.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  48120 non-null  object
 1   Junction  48120 non-null  int64 
 2   Vehicles  48120 non-null  int64 
 3   ID        48120 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


(None,
         DateTime  Junction  Vehicles           ID
 0  01/11/15 0:00         1        15  20151101001
 1  01/11/15 1:00         1        13  20151101011
 2  01/11/15 2:00         1        10  20151101021
 3  01/11/15 3:00         1         7  20151101031
 4  01/11/15 4:00         1         9  20151101041)

In [16]:
from sklearn.preprocessing import StandardScaler
import numpy as np


In [14]:
# Step 1 & 2: Data Cleaning & Pre-processing
# Parse the day-first text timestamps (e.g. "01/11/15 0:00") into datetime values.
df['DateTime'] = pd.to_datetime(df['DateTime'], format='%d/%m/%y %H:%M')

# Drop rows with missing values (if any), then drop exact duplicate rows.
df = df.dropna().drop_duplicates()

# Ensure correct data types.
# The width is pinned to 64 bits on purpose: the builtin `int` resolves to
# NumPy's platform default integer, which is 32-bit on Windows with
# NumPy < 2.0, and ID values such as 20151101001 cannot be represented in
# 32 bits. 'int64' matches the dtypes pandas reported when the file was loaded.
df = df.astype({'Junction': 'int64', 'Vehicles': 'int64', 'ID': 'int64'})


In [17]:
# Step 3: Aggregate traffic data into hourly intervals for each junction
# Bucket timestamps by hour, then total the vehicle counts per (hour, junction)
# pair and flatten the grouped result back into ordinary columns.
hourly_bucket = pd.Grouper(key='DateTime', freq='h')
df_hourly = (
    df.groupby([hourly_bucket, 'Junction'])['Vehicles']
    .sum()
    .reset_index()
)


In [18]:
# Step 4: Normalize vehicle counts
# One scaler is fitted on the hourly totals of every row in df_hourly (all
# junctions together) and the standardised values are stored alongside the
# raw counts.
scaler = StandardScaler()
vehicle_counts = df_hourly[['Vehicles']]  # double brackets keep a 2-D, single-column frame
df_hourly['Vehicles_scaled'] = scaler.fit_transform(vehicle_counts)


In [19]:
# Step 5: Feature Engineering
# Calendar features derived from each row's timestamp.
stamp = df_hourly['DateTime'].dt
df_hourly['Hour'] = stamp.hour
df_hourly['DayOfWeek'] = stamp.dayofweek
df_hourly['Month'] = stamp.month

# Lag features: the vehicle count 1 and 24 rows earlier within the same
# junction. The first rows of each junction have no predecessor and stay NaN.
vehicles_by_junction = df_hourly.groupby('Junction')['Vehicles']
for lag_rows in (1, 24):
    df_hourly[f'Lag_{lag_rows}'] = vehicles_by_junction.shift(lag_rows)

# Weekend flag: 1 when DayOfWeek is 5 or 6 (Saturday/Sunday in pandas'
# Monday=0 numbering), otherwise 0.
is_weekend = df_hourly['DayOfWeek'].isin([5, 6])
df_hourly['IsWeekend'] = is_weekend.astype(int)

# (Optional) Binary indicator for special events (if known, otherwise skipped)


In [20]:
# Step 6: Evaluate feature importance using correlation
# Pairwise correlations (DataFrame.corr with its default settings) between the
# raw vehicle count and every engineered feature.
corr_columns = ['Vehicles', 'Hour', 'DayOfWeek', 'Month', 'Lag_1', 'Lag_24', 'IsWeekend']
correlation_matrix = df_hourly.loc[:, corr_columns].corr()

# Cell output: a tuple holding the processed-data preview and the matrix.
df_hourly.head(), correlation_matrix


(             DateTime  Junction  Vehicles  Vehicles_scaled  Hour  DayOfWeek  \
 0 2015-11-01 00:00:00         1        15        -0.375489     0          6   
 1 2015-11-01 00:00:00         2         6        -0.809227     0          6   
 2 2015-11-01 00:00:00         3         9        -0.664648     0          6   
 3 2015-11-01 01:00:00         1        13        -0.471875     1          6   
 4 2015-11-01 01:00:00         2         6        -0.809227     1          6   
 
    Month  Lag_1  Lag_24  IsWeekend  
 0     11    NaN     NaN          1  
 1     11    NaN     NaN          1  
 2     11    NaN     NaN          1  
 3     11   15.0     NaN          1  
 4     11    6.0     NaN          1  ,
            Vehicles          Hour     DayOfWeek         Month     Lag_1  \
 Vehicles   1.000000  2.199377e-01 -1.260265e-01 -2.272345e-02  0.970143   
 Hour       0.219938  1.000000e+00  4.629477e-17  1.496685e-15  0.200284   
 DayOfWeek -0.126027  4.629477e-17  1.000000e+00 -3.208219e-0