In [12]:
import pandas as pd
import numpy as np

# Loading Integrated Dataset

df = pd.read_csv("Integrated_Dataset_Harini_Mukesh.csv")
df.head()


Unnamed: 0,DateTime,Junction,Vehicles,ID,timestamp,temperature,humidity,precipitation,windspeed,event_flag,event_type,date,hour,dayofweek
0,11/1/2015 0:00,1,15,20151101001,11/1/2015 0:00,24.4,96,0.2,10.1,0,none,11/1/2015,0,6
1,11/1/2015 1:00,1,13,20151101011,11/1/2015 1:00,24.4,95,0.3,10.6,0,none,11/1/2015,1,6
2,11/1/2015 2:00,1,10,20151101021,11/1/2015 2:00,25.1,92,0.2,10.4,0,none,11/1/2015,2,6
3,11/1/2015 3:00,1,7,20151101031,11/1/2015 3:00,25.9,88,0.5,11.0,0,none,11/1/2015,3,6
4,11/1/2015 4:00,1,9,20151101041,11/1/2015 4:00,27.0,82,0.2,10.3,0,none,11/1/2015,4,6


In [2]:
df.shape
df.info()
df.isna().sum()
df.duplicated().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48220 entries, 0 to 48219
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   DateTime       48220 non-null  object 
 1   Junction       48220 non-null  int64  
 2   Vehicles       48220 non-null  int64  
 3   ID             48220 non-null  int64  
 4   timestamp      48220 non-null  object 
 5   temperature    48220 non-null  float64
 6   humidity       48220 non-null  int64  
 7   precipitation  48220 non-null  float64
 8   windspeed      48220 non-null  float64
 9   event_flag     48220 non-null  int64  
 10  event_type     48220 non-null  object 
 11  date           48220 non-null  object 
 12  hour           48220 non-null  int64  
 13  dayofweek      48220 non-null  int64  
dtypes: float64(3), int64(7), object(4)
memory usage: 5.2+ MB


0

In [3]:
agg_df = (
    df.groupby(["timestamp", "Junction"])
      .agg({
          "Vehicles": "sum",
          "temperature": "mean",
          "humidity": "mean",
          "precipitation": "mean",
          "windspeed": "mean",
          "event_flag": "max"
      })
      .reset_index()
)

agg_df.head()


Unnamed: 0,timestamp,Junction,Vehicles,temperature,humidity,precipitation,windspeed,event_flag
0,1/1/2016 0:00,1,13,23.0,80.0,0.0,11.1,1
1,1/1/2016 0:00,2,8,23.0,80.0,0.0,11.1,1
2,1/1/2016 0:00,3,3,23.0,80.0,0.0,11.1,1
3,1/1/2016 10:00,1,11,27.5,58.0,0.0,21.0,0
4,1/1/2016 10:00,2,7,27.5,58.0,0.0,21.0,0


Explanation:

Traffic data was aggregated at an hourly level per junction to ensure consistent temporal granularity.

In [4]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

num_cols = ["Vehicles", "temperature", "humidity", "precipitation", "windspeed"]

agg_df[num_cols] = scaler.fit_transform(agg_df[num_cols])

agg_df.head()


Unnamed: 0,timestamp,Junction,Vehicles,temperature,humidity,precipitation,windspeed,event_flag
0,1/1/2016 0:00,1,0.067039,0.258197,0.726027,0.0,0.170245,1
1,1/1/2016 0:00,2,0.039106,0.258197,0.726027,0.0,0.170245,1
2,1/1/2016 0:00,3,0.011173,0.258197,0.726027,0.0,0.170245,1
3,1/1/2016 10:00,1,0.055866,0.442623,0.424658,0.0,0.322086,0
4,1/1/2016 10:00,2,0.03352,0.442623,0.424658,0.0,0.322086,0


Explanation:

Normalization was applied to ensure comparability across variables.

In [5]:
agg_df["timestamp"] = pd.to_datetime(agg_df["timestamp"])

agg_df["hour"] = agg_df["timestamp"].dt.hour
agg_df["dayofweek"] = agg_df["timestamp"].dt.dayofweek
agg_df["month"] = agg_df["timestamp"].dt.month
agg_df["is_weekend"] = agg_df["dayofweek"].isin([5,6]).astype(int)


In [6]:
agg_df = agg_df.sort_values(["Junction", "timestamp"])

agg_df["lag_1h"] = agg_df.groupby("Junction")["Vehicles"].shift(1)
agg_df["lag_24h"] = agg_df.groupby("Junction")["Vehicles"].shift(24)

agg_df.fillna(method="bfill", inplace=True)


  agg_df.fillna(method="bfill", inplace=True)


Explanation:

Lag features capture short-term and daily temporal dependencies in traffic flow.

In [10]:
# Correlation Analysis

corr = agg_df.corr()
corr["Vehicles"].sort_values(ascending=False)


Vehicles         1.000000
lag_1h           0.970115
lag_24h          0.899664
timestamp        0.266125
hour             0.218596
humidity         0.100527
windspeed        0.001644
precipitation   -0.025224
month           -0.026439
temperature     -0.044250
event_flag      -0.057485
dayofweek       -0.127840
is_weekend      -0.151193
Junction        -0.610565
Name: Vehicles, dtype: float64

In [11]:
# Random Forest feature importance

from sklearn.ensemble import RandomForestRegressor

X = agg_df.drop(columns=["Vehicles", "timestamp"])
y = agg_df["Vehicles"]

model = RandomForestRegressor(random_state=42)
model.fit(X, y)

importance = pd.Series(model.feature_importances_, index=X.columns)
importance.sort_values(ascending=False)


lag_1h           0.942284
lag_24h          0.014823
hour             0.013390
temperature      0.006465
windspeed        0.005765
humidity         0.005175
dayofweek        0.003736
month            0.003301
Junction         0.002560
precipitation    0.001058
is_weekend       0.000867
event_flag       0.000576
dtype: float64

### Feature Importance Analysis

Two approaches were used to evaluate feature relevance:

**Correlation analysis** showed strong linear relationships between traffic volume and lag features, particularly traffic from the previous hour (`lag_1h`) and the same hour on the previous day (`lag_24h`). This highlights strong temporal dependency and recurring daily patterns in traffic flow.

**Random Forest feature importance** indicated that `lag_1h` is the most dominant predictor of traffic volume. Other features such as time of day, weather variables, and event indicators contributed marginally once recent traffic history was known. This suggests that short-term temporal dynamics explain most of the variance in traffic volume.

Differences between the two methods are expected, as correlation measures standalone relationships, while tree-based models evaluate marginal predictive contribution after considering feature interactions.
