# Loading Data

In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Loading the combined dataset
combined_data = pd.read_csv('combined_data.csv')

print(combined_data.head())

              DateTime  Junction  Vehicles           ID            date_time  \
0  2015-01-11 00:00:00         1        15  20151101001  2009-01-11 00:00:00   
1  2015-01-11 00:00:00         1        15  20151101001  2010-01-11 00:00:00   
2  2015-01-11 00:00:00         1        15  20151101001  2011-01-11 00:00:00   
3  2015-01-11 00:00:00         1        15  20151101001  2012-01-11 00:00:00   
4  2015-01-11 00:00:00         1        15  20151101001  2013-01-11 00:00:00   

   maxtempC  mintempC  totalSnow_cm  sunHour  uvIndex  ...  precipMM  \
0      27.0      15.0           0.0     11.6      6.0  ...       0.0   
1      26.0      17.0           0.0     11.6      5.0  ...       0.0   
2      28.0      14.0           0.0     11.6      5.0  ...       0.0   
3      29.0      17.0           0.0     11.6      5.0  ...       0.0   
4      29.0      16.0           0.0     11.6      6.0  ...       0.0   

   pressure     tempC visibility winddirDegree windspeedKmph  date  day  \
0    1016.0

In [9]:
#Convert date columns to datetime
combined_data['DateTime'] = pd.to_datetime(combined_data['DateTime'])
combined_data['date_time'] = pd.to_datetime(combined_data['date_time'])
print(combined_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92040 entries, 0 to 92039
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   DateTime           92040 non-null  datetime64[ns]
 1   Junction           92040 non-null  int64         
 2   Vehicles           92040 non-null  int64         
 3   ID                 92040 non-null  int64         
 4   date_time          92040 non-null  datetime64[ns]
 5   maxtempC           92040 non-null  float64       
 6   mintempC           92040 non-null  float64       
 7   totalSnow_cm       92040 non-null  float64       
 8   sunHour            92040 non-null  float64       
 9   uvIndex            92040 non-null  float64       
 10  uvIndex.1          92040 non-null  float64       
 11  moon_illumination  92040 non-null  float64       
 12  moonrise           92040 non-null  object        
 13  moonset            92040 non-null  object        
 14  sunris

# Cleaning data
### Missing Values

In [6]:
import pandas as pd

# Load your data (assuming it's a DataFrame named df)
# df = pd.read_csv('your_data.csv')

# Check for missing values
missing_values = combined_data.isnull().sum()
print(missing_values)


DateTime                0
Junction                0
Vehicles                0
ID                      0
date_time               0
maxtempC                0
mintempC                0
totalSnow_cm            0
sunHour                 0
uvIndex                 0
uvIndex.1               0
moon_illumination       0
moonrise                0
moonset                 0
sunrise                 0
sunset                  0
DewPointC               0
FeelsLikeC              0
HeatIndexC              0
WindChillC              0
WindGustKmph            0
cloudcover              0
humidity                0
precipMM                0
pressure                0
tempC                   0
visibility              0
winddirDegree           0
windspeedKmph           0
date                 3168
day                  3168
holiday              3168
holiday_type         3168
dtype: int64


In [10]:
# Drop columns with a high percentage of missing values
combined_data.drop(columns=['date', 'day', 'holiday', 'holiday_type'], inplace=True)

# Check for any remaining missing values and handle them
# (If there are any remaining missing values in important columns, decide on an imputation strategy)
combined_data.fillna(combined_data.mean(), inplace=True)

# Step 3: Drop irrelevant columns
combined_data.drop(columns=['ID'], inplace=True)

  combined_data.fillna(combined_data.mean(), inplace=True)
  combined_data.fillna(combined_data.mean(), inplace=True)


In [12]:
print(combined_data.head())

    DateTime  Junction  Vehicles  date_time  maxtempC  mintempC  totalSnow_cm  \
0 2015-01-11         1        15 2009-01-11      27.0      15.0           0.0   
1 2015-01-11         1        15 2010-01-11      26.0      17.0           0.0   
2 2015-01-11         1        15 2011-01-11      28.0      14.0           0.0   
3 2015-01-11         1        15 2012-01-11      29.0      17.0           0.0   
4 2015-01-11         1        15 2013-01-11      29.0      16.0           0.0   

   sunHour  uvIndex  uvIndex.1  ...  WindChillC WindGustKmph cloudcover  \
0     11.6      6.0        1.0  ...        17.0         19.0        8.0   
1     11.6      5.0        1.0  ...        19.0         15.0       52.0   
2     11.6      5.0        1.0  ...        17.0         22.0        0.0   
3     11.6      5.0        1.0  ...        19.0         10.0       65.0   
4     11.6      6.0        1.0  ...        17.0         22.0        9.0   

   humidity precipMM  pressure     tempC  visibility  winddirD

In [13]:
# Normalize numerical features
numerical_features = ['Vehicles', 'maxtempC', 'mintempC', 'totalSnow_cm', 'sunHour', 'uvIndex', 'moon_illumination', 
                      'DewPointC', 'FeelsLikeC', 'HeatIndexC', 'WindChillC', 'WindGustKmph', 'cloudcover', 'humidity', 
                      'precipMM', 'pressure', 'tempC', 'visibility', 'winddirDegree', 'windspeedKmph']
scaler = StandardScaler()
combined_data[numerical_features] = scaler.fit_transform(combined_data[numerical_features])


In [16]:
# Verifying data cleaning
print(combined_data.head())
print(combined_data.info())
print(combined_data.describe())

    DateTime  Junction  Vehicles  date_time  maxtempC  mintempC  totalSnow_cm  \
0 2015-01-11         1  -0.14097 2009-01-11  0.219491 -1.564123           0.0   
1 2015-01-11         1  -0.14097 2010-01-11 -0.177560 -0.616582           0.0   
2 2015-01-11         1  -0.14097 2011-01-11  0.616543 -2.037894           0.0   
3 2015-01-11         1  -0.14097 2012-01-11  1.013594 -0.616582           0.0   
4 2015-01-11         1  -0.14097 2013-01-11  1.013594 -1.090352           0.0   

    sunHour   uvIndex  uvIndex.1  ...  WindChillC WindGustKmph cloudcover  \
0  0.586658  1.165652        1.0  ...   -1.304933     0.030593  -1.529921   
1  0.586658 -0.417307        1.0  ...   -0.702150    -0.749465   0.367961   
2  0.586658 -0.417307        1.0  ...   -1.304933     0.615638  -1.874991   
3  0.586658 -0.417307        1.0  ...   -0.702150    -1.724539   0.928699   
4  0.586658  1.165652        1.0  ...   -1.304933     0.615638  -1.486788   

   humidity  precipMM  pressure     tempC  visibil

# Feature Engineering

In [17]:
import pandas as pd
import numpy as np

# Create time-based features
combined_data['hour'] = combined_data['DateTime'].dt.hour
combined_data['day'] = combined_data['DateTime'].dt.day
combined_data['month'] = combined_data['DateTime'].dt.month
combined_data['day_of_week'] = combined_data['DateTime'].dt.dayofweek
combined_data['is_weekend'] = combined_data['day_of_week'].isin([5, 6]).astype(int)  # 5: Saturday, 6: Sunday

# Create lag features
combined_data['lag_1'] = combined_data['Vehicles'].shift(1)
combined_data['lag_2'] = combined_data['Vehicles'].shift(2)
combined_data['lag_3'] = combined_data['Vehicles'].shift(3)

# Drop rows with NaN values created by lag features
combined_data.dropna(inplace=True)

# Check the resulting DataFrame
print(combined_data.head())
print(combined_data.info())


    DateTime  Junction  Vehicles  date_time  maxtempC  mintempC  totalSnow_cm  \
3 2015-01-11         1  -0.14097 2012-01-11  1.013594 -0.616582           0.0   
4 2015-01-11         1  -0.14097 2013-01-11  1.013594 -1.090352           0.0   
5 2015-01-11         1  -0.14097 2014-01-11  0.616543 -1.564123           0.0   
6 2015-01-11         1  -0.14097 2015-01-11 -0.177560 -2.985436           0.0   
7 2015-01-11         1  -0.14097 2016-01-11  0.219491 -2.037894           0.0   

    sunHour   uvIndex  uvIndex.1  ...  winddirDegree windspeedKmph hour day  \
3  0.586658 -0.417307        1.0  ...       1.286170     -1.859814    0  11   
4  0.586658  1.165652        1.0  ...      -0.411496     -0.171996    0  11   
5  0.586658  1.165652        1.0  ...      -0.156846     -0.413113    0  11   
6  0.586658 -0.417307        1.0  ...      -1.090562     -0.654230    0  11   
7  0.586658 -0.417307        1.0  ...      -0.377542     -0.413113    0  11   

  month  day_of_week  is_weekend    la

# Splitting the Dataset

In [18]:
from sklearn.model_selection import TimeSeriesSplit

# Define features and target
features = combined_data.drop(columns=['DateTime', 'Vehicles', 'date_time', 'moonrise', 'moonset', 'sunrise', 'sunset'])
target = combined_data['Vehicles']

# Time-based splitting
tscv = TimeSeriesSplit(n_splits=5)

# Loop over splits
for train_index, val_index in tscv.split(features):
    X_train, X_val = features.iloc[train_index], features.iloc[val_index]
    y_train, y_val = target.iloc[train_index], target.iloc[val_index]

    print(f"Training data shape: {X_train.shape}")
    print(f"Validation data shape: {X_val.shape}")

    break  # Remove this break after verifying the split


Training data shape: (15342, 29)
Validation data shape: (15339, 29)


# ARIMA Model:

In [28]:
combined_data = combined_data.sort_values(by='DateTime')


In [29]:
combined_data.set_index('DateTime', inplace=True)
combined_data = combined_data.asfreq('D')  # or 'H' for hourly, as per your data


  combined_data = combined_data.asfreq('D')  # or 'H' for hourly, as per your data


ValueError: cannot reindex on an axis with duplicate labels

In [30]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Ensure data is sorted and set frequency
combined_data = combined_data.sort_values(by='DateTime')
combined_data.set_index('DateTime', inplace=True)
combined_data = combined_data.asfreq('D')  # Assuming daily frequency, adjust if needed

# Define features and target for ARIMA
train = combined_data.loc[:'2021-12-31', 'Vehicles']  # Example split, adjust as needed
test = combined_data.loc['2022-01-01:', 'Vehicles']

# Train ARIMA model
arima_model = ARIMA(train, order=(5, 1, 0))  # Example order, adjust based on model selection
arima_model_fit = arima_model.fit()

# Make predictions
arima_predictions = arima_model_fit.forecast(steps=len(test))
arima_mse = mean_squared_error(test, arima_predictions)
print(f"ARIMA Model MSE: {arima_mse}")



KeyError: "None of ['DateTime'] are in the columns"

# Training the Gradient Boosting Regressor Model

In [19]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Initialize the Gradient Boosting Regressor
gbr = GradientBoostingRegressor()

# Train the model on the training data
gbr.fit(X_train, y_train)

# Make predictions on the validation data
predictions = gbr.predict(X_val)

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_val, predictions)

print(f"Gradient Boosting Regressor MSE: {mse}")


Gradient Boosting Regressor MSE: 1.155235206746355


# Hyperparameter Tuning with Grid Search