In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pltm
import seaborn as sns

LOAD DATA

In [2]:
# Raw string literal: in a normal string the "\u" in "D:\upgrad" is parsed as a
# (truncated) unicode escape and the cell fails with a SyntaxError.
# NOTE(review): this is a machine-specific absolute path; consider a configurable
# data directory so the notebook can run elsewhere.
df = pd.read_csv(r"D:\upgrad\internship\Dataset_Uber Traffic - Dataset_Uber Traffic.csv")
df.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,01/11/15 0:00,1,15,20151101001
1,01/11/15 1:00,1,13,20151101011
2,01/11/15 2:00,1,10,20151101021
3,01/11/15 3:00,1,7,20151101031
4,01/11/15 4:00,1,9,20151101041


Cleaning Data

In [5]:
# Parse the day-first "dd/mm/yy H:MM" strings into real timestamps.
timestamp_format = '%d/%m/%y %H:%M'
df['DateTime'] = pd.to_datetime(df['DateTime'], format=timestamp_format)
df.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,2015-11-01 00:00:00,1,15,20151101001
1,2015-11-01 01:00:00,1,13,20151101011
2,2015-11-01 02:00:00,1,10,20151101021
3,2015-11-01 03:00:00,1,7,20151101031
4,2015-11-01 04:00:00,1,9,20151101041


1. Checking for missing values and removing duplicates

In [6]:
# Remove exact duplicate rows, then count the nulls remaining in each column.
df = df.drop_duplicates()
missing_values = df.isnull().sum()
# Show the per-column counts; without this the check is computed but never inspected.
missing_values

In [8]:
# Hour of day taken from the parsed timestamp.
hour_of_day = df['DateTime'].dt.hour
df['Hour'] = hour_of_day

# Total vehicle count for every (junction, hour-of-day) pair, kept as regular columns.
hourly_traffic = (
    df.groupby(by=['Junction', 'Hour'], as_index=False)['Vehicles']
      .agg('sum')
)



Normalizing Data

In [9]:
# Min-max scale the hourly totals using the range of the whole 'Vehicles' column
# (i.e. one common scale for all junction/hour rows).
vehicle_totals = hourly_traffic['Vehicles']
lowest, highest = vehicle_totals.min(), vehicle_totals.max()
hourly_traffic['Normalized_Vehicles'] = (vehicle_totals - lowest) / (highest - lowest)

In [10]:
# df.info() prints its summary and returns None, so wrapping both calls in a tuple
# displayed "(None, <plain-text frame>)". Call info() as a statement and leave the
# preview as the cell's last expression so it gets the normal rich display.
df.info()
hourly_traffic.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  48120 non-null  datetime64[ns]
 1   Junction  48120 non-null  int64         
 2   Vehicles  48120 non-null  int64         
 3   ID        48120 non-null  int64         
 4   Hour      48120 non-null  int32         
dtypes: datetime64[ns](1), int32(1), int64(3)
memory usage: 1.7 MB


(None,
    Junction  Hour  Vehicles  Normalized_Vehicles
 0         1     0     27809             0.773385
 1         1     1     23807             0.659221
 2         1     2     20616             0.568193
 3         1     3     17894             0.490543
 4         1     4     15598             0.425046)

Feature Engineering and Selection

1. Extracting time-based features

In [11]:
# Calendar features read off the timestamp column's datetime accessor.
timestamps = df['DateTime'].dt
df['DayOfWeek'] = timestamps.dayofweek
df['Month'] = timestamps.month

2. Creating a binary indicator for weekends to account for their impact on traffic (special-event indicators are not created in this notebook).

In [12]:
# Weekend flag: pandas numbers days Monday=0 ... Sunday=6, so values >= 5 are
# Saturday/Sunday. A vectorized comparison replaces the row-by-row apply/lambda;
# the explicit int64 cast keeps the same 0/1 integer dtype the apply produced.
df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype('int64')

Creating lag features (traffic data from previous hours)

In [13]:
# shift() is positional, so the lags are only "previous hours" if each junction's
# rows are in chronological order. Sort a view by timestamp first (stable, so ties
# keep their existing order); the shifted values are aligned back to df by index,
# which leaves df's own row order untouched.
# NOTE(review): this still assumes consecutive rows per junction are exactly one
# hour apart -- confirm there are no gaps in the series.
vehicles_by_junction = (
    df.sort_values('DateTime', kind='stable')
      .groupby('Junction')['Vehicles']
)
df['Lag_1Hour'] = vehicles_by_junction.shift(1)   # value one row (hour) earlier
df['Lag_2Hours'] = vehicles_by_junction.shift(2)  # value two rows (hours) earlier



Correlation matrix

In [15]:
# Pairwise correlations between the target and the engineered features.
corr_columns = [
    'Vehicles',
    'Hour',
    'DayOfWeek',
    'Month',
    'IsWeekend',
    'Lag_1Hour',
    'Lag_2Hours',
]
correlation_matrix = df[corr_columns].corr()
correlation_matrix

Unnamed: 0,Vehicles,Hour,DayOfWeek,Month,IsWeekend,Lag_1Hour,Lag_2Hours
Vehicles,1.0,0.2199377,-0.1260265,-0.02272345,-0.1505503,0.970143,0.937142
Hour,0.219938,1.0,6.4421240000000004e-18,4.967201e-16,1.909347e-18,0.200284,0.169536
DayOfWeek,-0.126027,6.4421240000000004e-18,1.0,-0.003208219,0.7901746,-0.122658,-0.118394
Month,-0.022723,4.967201e-16,-0.003208219,1.0,-0.005589111,-0.02248,-0.022213
IsWeekend,-0.15055,1.909347e-18,0.7901746,-0.005589111,1.0,-0.149027,-0.14716
Lag_1Hour,0.970143,0.2002844,-0.1226579,-0.02247991,-0.1490272,1.0,0.970145
Lag_2Hours,0.937142,0.1695357,-0.1183936,-0.02221253,-0.1471595,0.970145,1.0


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


Dropping rows with NaN values from lag features

In [21]:
# Keep only fully populated rows: any row containing a NaN in any column is removed
# (this includes the leading rows per junction whose lag values are undefined).
df_clean = df.dropna(axis=0, how='any')

Feature selection

In [22]:
# Model inputs (time-based and lag columns) and the target column.
features = [
    'Hour',
    'DayOfWeek',
    'Month',
    'IsWeekend',
    'Lag_1Hour',
    'Lag_2Hours',
]
X = df_clean.loc[:, features]
y = df_clean.loc[:, 'Vehicles']

Splitting the data

In [23]:
# Hold out 20% of the rows; the fixed seed makes the shuffle reproducible.
# NOTE(review): this is a random split of time-ordered data with lag features --
# confirm that is intended before using the test set to judge forecasting accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Training a Random Forest model

In [25]:
# Fit a random forest with default hyperparameters and a fixed seed
# (fit() returns the estimator itself, so it can be chained).
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Pair each feature name with its fitted importance, most important first.
feature_importance = pd.DataFrame(
    list(zip(features, rf_model.feature_importances_)),
    columns=['Feature', 'Importance'],
).sort_values('Importance', ascending=False)

feature_importance

Unnamed: 0,Feature,Importance
4,Lag_1Hour,0.946777
0,Hour,0.022545
5,Lag_2Hours,0.01418
2,Month,0.009096
1,DayOfWeek,0.006258
3,IsWeekend,0.001144
