 # Loading the Data

In [5]:
import pandas as pd

# Read the merged dataset from disk into a DataFrame
csv_path = 'combined_data.csv'
combined_data = pd.read_csv(csv_path)

# Preview the first rows to sanity-check the load
print(combined_data.head())


              DateTime  Junction  Vehicles           ID            date_time  \
0  2015-01-11 00:00:00         1        15  20151101001  2009-01-11 00:00:00   
1  2015-01-11 00:00:00         1        15  20151101001  2010-01-11 00:00:00   
2  2015-01-11 00:00:00         1        15  20151101001  2011-01-11 00:00:00   
3  2015-01-11 00:00:00         1        15  20151101001  2012-01-11 00:00:00   
4  2015-01-11 00:00:00         1        15  20151101001  2013-01-11 00:00:00   

   maxtempC  mintempC  totalSnow_cm  sunHour  uvIndex  ...  precipMM  \
0      27.0      15.0           0.0     11.6      6.0  ...       0.0   
1      26.0      17.0           0.0     11.6      5.0  ...       0.0   
2      28.0      14.0           0.0     11.6      5.0  ...       0.0   
3      29.0      17.0           0.0     11.6      5.0  ...       0.0   
4      29.0      16.0           0.0     11.6      6.0  ...       0.0   

   pressure     tempC visibility winddirDegree windspeedKmph  date  day  \
0    1016.0

In [6]:
# Convert every timestamp-like column to datetime64, parsing each column
# from its OWN values. Previously 'date_time' and 'date' were both assigned
# the parsed 'DateTime' column, which silently replaced their contents with
# a copy of 'DateTime'.
for datetime_column in ('DateTime', 'date_time', 'date'):
    combined_data[datetime_column] = pd.to_datetime(combined_data[datetime_column])

# info() prints the column summary itself (and returns None)
print(combined_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92040 entries, 0 to 92039
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   DateTime           92040 non-null  datetime64[ns]
 1   Junction           92040 non-null  int64         
 2   Vehicles           92040 non-null  int64         
 3   ID                 92040 non-null  int64         
 4   date_time          92040 non-null  datetime64[ns]
 5   maxtempC           92040 non-null  float64       
 6   mintempC           92040 non-null  float64       
 7   totalSnow_cm       92040 non-null  float64       
 8   sunHour            92040 non-null  float64       
 9   uvIndex            92040 non-null  float64       
 10  uvIndex.1          92040 non-null  float64       
 11  moon_illumination  92040 non-null  float64       
 12  moonrise           92040 non-null  object        
 13  moonset            92040 non-null  object        
 14  sunris

# Splitting the Dataset

### training and validation sets based on the DateTime column

In [7]:
# Find the earliest and latest timestamps present in the data
timestamps = combined_data['DateTime']
start_date = timestamps.min()
end_date = timestamps.max()

# Report the covered time span
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")

Start Date: 2015-01-11 00:00:00
End Date: 2017-12-06 23:00:00


In [8]:
# Split boundaries: training covers [start_date, end_date); everything from
# end_date onwards (the last few days of the data) is held out for validation.
start_date = pd.Timestamp('2015-01-11 00:00:00')
end_date = pd.Timestamp('2017-12-01 00:00:00')

# Boolean masks over the DateTime column, one per subset
timestamps = combined_data['DateTime']
in_validation = timestamps >= end_date
in_training = (timestamps >= start_date) & (timestamps < end_date)

train_data = combined_data[in_training]
validation_data = combined_data[in_validation]

# Report the size of each subset
print(f"Training data shape: {train_data.shape}")
print(f"Validation data shape: {validation_data.shape}")

Training data shape: (91464, 33)
Validation data shape: (576, 33)


# ARIMA Model Training

In [9]:
pip install pmdarima


Note: you may need to restart the kernel to use updated packages.


In [None]:
from pmdarima import auto_arima

# Search for a non-seasonal ARIMA order on the training vehicle counts;
# trace=True logs every candidate model that is tried.
training_counts = train_data['Vehicles']
auto_model = auto_arima(training_counts, seasonal=False, trace=True)

# Unpack the selected (p, d, q) order and report it
(p, d, q) = auto_model.order
print(f'Optimal ARIMA parameters: p={p}, d={d}, q={q}')


In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Define the ARIMA model with the (p, d, q) order selected by auto_arima in
# the previous cell, instead of a hard-coded order that can silently
# disagree with the search result.
model = ARIMA(train_data['Vehicles'], order=(p, d, q))

# Fit the model
model_fit = model.fit()

# Print the summary of the model
print(model_fit.summary())


# Making Predictions

In [None]:
# Forecast one value per validation row, starting at the first position
# after the training observations.
n_train, n_val = len(train_data), len(validation_data)
predictions = model_fit.predict(start=n_train, end=n_train + n_val - 1, dynamic=False)

# Build the frame from the raw forecast values so they are placed
# positionally against the validation rows. Passing a Series together with
# index=/columns= makes pandas select by label instead, which produces NaN
# when the forecast's own index or name differ from the requested labels.
predictions_df = pd.DataFrame(
    {'Predicted': pd.Series(predictions).to_numpy()},
    index=validation_data.index,
)

# Evaluation metrics 

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Column holding the quantity being forecast
target_column = 'Vehicles'

# Score the ARIMA forecasts against the observed validation values
observed_counts = validation_data[target_column]
mae = mean_absolute_error(observed_counts, predictions)
rmse = np.sqrt(mean_squared_error(observed_counts, predictions))

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


### The ARIMA model is now trained and evaluated

# Training LSTM (Long Short-Term Memory) Model

In [None]:
pip install tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
import numpy as np


In [None]:
# Min-max scale the target column; the scaler learns its range from the
# training rows only and is then applied unchanged to both subsets.
scaler = MinMaxScaler()
scaler.fit(train_data[[target_column]])
train_scaled = scaler.transform(train_data[[target_column]])
val_scaled = scaler.transform(validation_data[[target_column]])

# Sliding-window helper for the LSTM
def create_sequences(data, seq_length):
    """Cut `data` into overlapping windows and the value that follows each.

    Args:
        data: Indexable sequence (e.g. a NumPy array) of observations.
        seq_length: Number of consecutive observations per input window.

    Returns:
        Tuple ``(windows, targets)`` of NumPy arrays: ``windows[i]`` is
        ``data[i:i + seq_length]`` and ``targets[i]`` is
        ``data[i + seq_length]``. Both are empty when ``data`` has no more
        than ``seq_length`` entries.
    """
    n_windows = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(n_windows)]
    targets = [data[start + seq_length] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

# Number of past observations fed to the LSTM for each prediction
seq_length = 10

# Window the scaled series for each subset
X_train, y_train = create_sequences(train_scaled, seq_length)
X_val, y_val = create_sequences(val_scaled, seq_length)

# Make the (samples, time steps, 1 feature) layout explicit for the LSTM
X_train = X_train.reshape((len(X_train), X_train.shape[1], 1))
X_val = X_val.reshape((len(X_val), X_val.shape[1], 1))


### training LSTM Model


In [None]:
# Single LSTM layer followed by a one-unit dense output, trained with MSE loss
model = Sequential([
    LSTM(50, activation='relu', input_shape=(seq_length, 1)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

# Fit for 10 epochs, reporting loss on the validation windows each epoch
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
)



### Making Predictions with LSTM

In [None]:
# Predict on the validation windows (values are still in the scaled space)
lstm_predictions_scaled = model.predict(X_val)

# Map predictions and targets back to vehicle counts. The results go into
# new names rather than overwriting y_val, so re-running this cell does not
# apply the inverse transform a second time.
lstm_predictions = scaler.inverse_transform(lstm_predictions_scaled)
y_val_actual = scaler.inverse_transform(y_val.reshape(-1, 1))

# Calculate evaluation metrics in the original units
lstm_mae = mean_absolute_error(y_val_actual, lstm_predictions)
lstm_rmse = np.sqrt(mean_squared_error(y_val_actual, lstm_predictions))

print(f'LSTM Mean Absolute Error (MAE): {lstm_mae}')
print(f'LSTM Root Mean Squared Error (RMSE): {lstm_rmse}')


# Training Gradient Boosting Trees

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Tree-based models take the tabular data as-is: the target column becomes y
# and every remaining column becomes a feature.
y_train, y_val = train_data[target_column], validation_data[target_column]
X_train, X_val = (
    subset.drop(columns=[target_column])
    for subset in (train_data, validation_data)
)


### Checking Data

In [None]:
# Print feature dtypes followed by the target dtype, first for the training
# set and then for the validation set
for feature_frame, target_series in ((X_train, y_train), (X_val, y_val)):
    print(feature_frame.dtypes)
    print(target_series.dtypes)


###  Convert DateTime Columns

In [None]:
# Replace the DateTime column with numeric calendar parts.
# Each tuple is (new column name, attribute of the .dt accessor).
calendar_parts = (('Year', 'year'), ('Month', 'month'), ('Day', 'day'), ('Hour', 'hour'))

for feature_frame in (X_train, X_val):
    feature_frame['DateTime'] = pd.to_datetime(feature_frame['DateTime'])
    for new_column, dt_attribute in calendar_parts:
        feature_frame[new_column] = getattr(feature_frame['DateTime'].dt, dt_attribute)

# The raw timestamp is no longer needed once its parts are extracted
X_train = X_train.drop(columns=['DateTime'])
X_val = X_val.drop(columns=['DateTime'])


### Convert Time Strings to Numeric Features

In [None]:
from datetime import datetime

def time_to_numeric(time_str):
    """Convert a 12-hour clock string such as '06:45 AM' to decimal hours.

    Args:
        time_str: Time formatted as ``'%I:%M %p'``. Any other value,
            including non-string missing markers such as NaN or None, is
            treated as unparseable.

    Returns:
        The time as a float number of hours (e.g. 6.75), or ``np.nan`` when
        the value cannot be parsed.
    """
    try:
        time_obj = datetime.strptime(time_str, '%I:%M %p')
    except (ValueError, TypeError):
        # ValueError: text that is not a clock time.
        # TypeError: a non-string value (e.g. float NaN for a missing cell),
        # which strptime rejects before attempting to parse.
        return np.nan
    return time_obj.hour + time_obj.minute / 60.0  # decimal hours

# Clock-time text columns to convert into decimal hours
time_columns = ['moonrise', 'moonset', 'sunrise', 'sunset']

# Add a '<name>_numeric' column for each, in both feature tables
for feature_frame in (X_train, X_val):
    for time_column in time_columns:
        feature_frame[f'{time_column}_numeric'] = feature_frame[time_column].apply(time_to_numeric)

# The original text columns are replaced by their numeric versions
X_train = X_train.drop(columns=time_columns)
X_val = X_val.drop(columns=time_columns)


### Apply One-Hot Encoding

In [None]:
# One-hot encode the categorical columns in each feature table
categorical_columns = ['day', 'holiday', 'holiday_type']
X_train = pd.get_dummies(X_train, columns=categorical_columns)
X_val = pd.get_dummies(X_val, columns=categorical_columns)

# Give the validation table exactly the training columns: categories missing
# from validation are filled with 0, and columns absent from training are
# dropped by the left join.
X_train, X_val = X_train.align(X_val, join='left', axis=1, fill_value=0)

# Define and train the model

### Identify Missing Values

In [None]:
# Per-column count of missing values, training set first, then validation
for feature_frame in (X_train, X_val):
    print(feature_frame.isna().sum())


###  Handle Missing Values

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: the column means are learned from the training features only
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)

# Fill the gaps in both sets using those training means
X_train_imputed = imputer.transform(X_train)
X_val_imputed = imputer.transform(X_val)


### Dropping Missing Values

In [None]:
# Alternative to imputation: discard any row containing a missing value and
# keep only the targets whose index labels survive.
X_train_dropped = X_train.dropna()
y_train_dropped = y_train.loc[X_train_dropped.index]

X_val_dropped = X_val.dropna()
y_val_dropped = y_val.loc[X_val_dropped.index]


### Running model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Define the model. A fixed random_state makes the fitted ensemble, and the
# metrics below, reproducible from one run to the next.
gbr = GradientBoostingRegressor(random_state=42)

# Train the model with imputed data
gbr.fit(X_train_imputed, y_train)

# Make predictions
gbr_predictions = gbr.predict(X_val_imputed)

# Calculate evaluation metrics
gbr_mae = mean_absolute_error(y_val, gbr_predictions)
gbr_rmse = np.sqrt(mean_squared_error(y_val, gbr_predictions))

print(f'Gradient Boosting Trees Mean Absolute Error (MAE): {gbr_mae}')
print(f'Gradient Boosting Trees Root Mean Squared Error (RMSE): {gbr_rmse}')


# ARIMA Model:

### MAE: 17.75
### RMSE: 29.20

# LSTM Model:

### MAE: 0.01
### RMSE: 0.03

# Gradient Boosting Trees Model:

### MAE: 6.76
### RMSE: 9.94

# Comparison
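### LSTM Model: Shows the lowest MAE and RMSE, indicating the best performance among the models tested. Note that the scores are not strictly like-for-like: the LSTM predicts one step ahead from the 10 preceding observed values, whereas ARIMA forecasts the whole validation period from the end of the training data.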
### Gradient Boosting Trees: Offers better performance compared to ARIMA but is not as accurate as LSTM.
### ARIMA Model: Has the highest MAE and RMSE, suggesting it’s less effective compared to the other models for this dataset.

# Hyperparameter Tuning for LSTM
### Using Keras Tuner's random search to find a good configuration

In [None]:
import tensorflow as tf
import sklearn

# Record the library versions this notebook was run with
version_report = (
    ("TensorFlow version:", tf.__version__),
    ("scikit-learn version:", sklearn.__version__),
)
for label, version in version_report:
    print(label, version)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from kerastuner import HyperModel

class LSTMHyperModel(HyperModel):
    """Keras Tuner hypermodel that builds a two-layer stacked LSTM regressor.

    NOTE(review): a class with this same name is defined again in later
    cells of this notebook; whichever definition ran last is the one in
    effect.
    """

    def build(self, hp):
        """Build and compile the model for one hyperparameter set.

        Args:
            hp: Hyperparameter container; queried for 'units' (50 to 200 in
                steps of 50) and 'optimizer' ('adam' or 'rmsprop').

        Returns:
            A compiled Sequential model using mean-squared-error loss.
        """
        model = Sequential()
        # NOTE(review): X_train is last assigned as a 2-D feature table in the
        # Gradient Boosting section, so X_train.shape[2] would raise IndexError
        # when build() runs — confirm which array was intended here.
        # Both LSTM layers request the hyperparameter named 'units';
        # presumably they therefore share one value — verify against Keras Tuner.
        model.add(LSTM(units=hp.Int('units', min_value=50, max_value=200, step=50), 
                       return_sequences=True, 
                       input_shape=(X_train.shape[1], X_train.shape[2])))
        model.add(LSTM(units=hp.Int('units', min_value=50, max_value=200, step=50)))
        # Single output unit
        model.add(Dense(1))
        model.compile(optimizer=hp.Choice('optimizer', values=['adam', 'rmsprop']),
                      loss='mean_squared_error')
        return model


In [None]:
import tensorflow as tf
from keras_tuner import HyperModel

class LSTMHyperModel(HyperModel):
    """Keras Tuner hypermodel that builds a two-layer stacked LSTM regressor.

    Same architecture as the definition in the previous cell, written with
    fully qualified tf.keras names.

    NOTE(review): this class name is defined again in later cells; whichever
    definition ran last is the one in effect.
    """

    def build(self, hp):
        """Build and compile the model for one hyperparameter set.

        Args:
            hp: Hyperparameter container; queried for 'units' (50 to 200 in
                steps of 50) and 'optimizer' ('adam' or 'rmsprop').

        Returns:
            A compiled tf.keras Sequential model using mean-squared-error loss.
        """
        model = tf.keras.Sequential()
        # NOTE(review): X_train is last assigned as a 2-D feature table in the
        # Gradient Boosting section, so X_train.shape[2] would raise IndexError
        # when build() runs — confirm which array was intended here.
        model.add(tf.keras.layers.LSTM(units=hp.Int('units', min_value=50, max_value=200, step=50), 
                                       return_sequences=True, 
                                       input_shape=(X_train.shape[1], X_train.shape[2])))
        model.add(tf.keras.layers.LSTM(units=hp.Int('units', min_value=50, max_value=200, step=50)))
        # Single output unit
        model.add(tf.keras.layers.Dense(1))
        model.compile(optimizer=hp.Choice('optimizer', values=['adam', 'rmsprop']),
                      loss='mean_squared_error')
        return model


### Checking data shapes

In [None]:
# Shapes of the feature tables and target vectors, in the order
# X_train, X_val, y_train, y_val
for dataset in (X_train, X_val, y_train, y_val):
    print(dataset.shape)


### Reshaping data

In [None]:
import numpy as np

def reshape_for_lstm(data, time_steps):
    """Group consecutive rows of a 2-D array into non-overlapping windows.

    Args:
        data: 2-D NumPy array of shape (rows, features).
        time_steps: Number of consecutive rows per window.

    Returns:
        Array of shape (rows // time_steps, time_steps, features). Trailing
        rows that do not fill a complete window are discarded.
    """
    n_rows, n_features = data.shape[0], data.shape[1]
    n_windows = n_rows // time_steps
    complete_rows = data[:n_windows * time_steps]
    return complete_rows.reshape((n_windows, time_steps, n_features))

# Rows per LSTM input window
time_steps = 10

# Window the training and validation feature tables
X_train_reshaped, X_val_reshaped = (
    reshape_for_lstm(feature_frame.values, time_steps)
    for feature_frame in (X_train, X_val)
)

# Confirm the resulting (samples, time steps, features) shapes
for windowed in (X_train_reshaped, X_val_reshaped):
    print(windowed.shape)


### Adjusting Model Input Shape

In [None]:
class LSTMHyperModel(HyperModel):
    """Keras Tuner hypermodel that builds a two-layer stacked LSTM regressor.

    The input shape is taken from the windowed feature array:
    (time_steps, number of features in X_train_reshaped).

    NOTE(review): the next cell defines this class again; whichever
    definition ran last is the one in effect.
    """

    def build(self, hp):
        """Build and compile the model for one hyperparameter set.

        Args:
            hp: Hyperparameter container; queried for 'units' (50 to 200 in
                steps of 50) and 'optimizer' ('adam' or 'rmsprop').

        Returns:
            A compiled tf.keras Sequential model using mean-squared-error loss.
        """
        model = tf.keras.Sequential()
        # Both LSTM layers request the hyperparameter named 'units';
        # presumably they therefore share one value — verify against Keras Tuner.
        model.add(tf.keras.layers.LSTM(units=hp.Int('units', min_value=50, max_value=200, step=50), 
                                       return_sequences=True, 
                                       input_shape=(time_steps, X_train_reshaped.shape[2])))
        model.add(tf.keras.layers.LSTM(units=hp.Int('units', min_value=50, max_value=200, step=50)))
        # Single output unit
        model.add(tf.keras.layers.Dense(1))
        model.compile(optimizer=hp.Choice('optimizer', values=['adam', 'rmsprop']),
                      loss='mean_squared_error')
        return model


### Defining the HyperModel

In [None]:
import tensorflow as tf
from kerastuner import HyperModel

class LSTMHyperModel(HyperModel):
    """Keras Tuner hypermodel for a two-layer stacked LSTM regressor.

    Inputs are windows of shape (time_steps, features), with the feature
    count read from X_train_reshaped.
    """

    def build(self, hp):
        """Assemble and compile the network for one hyperparameter set.

        Args:
            hp: Hyperparameter container; queried for 'units' (50 to 200 in
                steps of 50) and 'optimizer' ('adam' or 'rmsprop').

        Returns:
            A compiled tf.keras Sequential model using mean-squared-error loss.
        """
        # First recurrent layer returns the full sequence so a second LSTM
        # can be stacked on top of it.
        sequence_layer = tf.keras.layers.LSTM(
            units=hp.Int('units', min_value=50, max_value=200, step=50),
            return_sequences=True,
            input_shape=(time_steps, X_train_reshaped.shape[2]),
        )
        summary_layer = tf.keras.layers.LSTM(
            units=hp.Int('units', min_value=50, max_value=200, step=50),
        )
        output_layer = tf.keras.layers.Dense(1)

        network = tf.keras.Sequential([sequence_layer, summary_layer, output_layer])
        network.compile(
            optimizer=hp.Choice('optimizer', values=['adam', 'rmsprop']),
            loss='mean_squared_error',
        )
        return network


### Creating the Tuner

In [None]:
from kerastuner import RandomSearch

hypermodel = LSTMHyperModel()

# Random search over the hypermodel's space: 5 trials, one training run each,
# ranked by validation loss; results are stored under my_dir/intro_to_kt.
search_settings = {
    'objective': 'val_loss',
    'max_trials': 5,
    'executions_per_trial': 1,
    'directory': 'my_dir',
    'project_name': 'intro_to_kt',
}
tuner = RandomSearch(hypermodel, **search_settings)


### Handling NaN values

In [None]:
# Report whether any NaN (first four lines) or Inf (last four lines) values
# are present, in the order: training features, training targets,
# validation features, validation targets.
for detector in (np.isnan, np.isinf):
    for values in (
        X_train_reshaped,
        y_train[:X_train_reshaped.shape[0]],
        X_val_reshaped,
        y_val[:X_val_reshaped.shape[0]],
    ):
        print(np.any(detector(values)))



## Reshaping Data
### Flattening the 3D Arrays

In [None]:
from sklearn.impute import SimpleImputer

# Remember the windowed shapes so they can be restored after imputation
train_window_shape = X_train_reshaped.shape
val_window_shape = X_val_reshaped.shape

# SimpleImputer works on 2-D input, so collapse (samples, time steps) into rows
X_train_flattened = X_train_reshaped.reshape(-1, train_window_shape[-1])
X_val_flattened = X_val_reshaped.reshape(-1, val_window_shape[-1])

# Learn the column means from the training rows only
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train_flattened)

# Fill the gaps in both sets with those training means
X_train_imputed = imputer.transform(X_train_flattened)
X_val_imputed = imputer.transform(X_val_flattened)

# Restore the original 3-D layout
X_train_reshaped = X_train_imputed.reshape(train_window_shape)
X_val_reshaped = X_val_imputed.reshape(val_window_shape)

# Both lines should print False once imputation has succeeded
for windowed in (X_train_reshaped, X_val_reshaped):
    print(np.any(np.isnan(windowed)))


### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# The windowed data is 3-D: (samples, time_steps, features)
n_samples, n_time_steps, n_features = X_train_reshaped.shape
n_val_samples = X_val_reshaped.shape[0]

# StandardScaler works on 2-D input, so collapse the windows into rows
X_train_flattened = X_train_reshaped.reshape(-1, n_features)
X_val_flattened = X_val_reshaped.reshape(-1, n_features)

# Learn per-feature mean and variance from the training rows only,
# then apply the same transform to both sets
scaler = StandardScaler()
scaler.fit(X_train_flattened)
X_train_scaled = scaler.transform(X_train_flattened)
X_val_scaled = scaler.transform(X_val_flattened)

# Restore the 3-D layout
X_train_reshaped = X_train_scaled.reshape(n_samples, n_time_steps, n_features)
X_val_reshaped = X_val_scaled.reshape(n_val_samples, n_time_steps, n_features)


### Model Definition

In [None]:
from tensorflow.keras.layers import Input

def create_lstm_model(units=50, optimizer='adam'):
    """Build and compile a single-layer LSTM regressor.

    The input shape (time steps, features) is read from X_train_reshaped.

    Args:
        units: Number of units in the LSTM layer.
        optimizer: Optimizer passed to ``compile``.

    Returns:
        A compiled Sequential model using mean-squared-error loss.
    """
    window_shape = (X_train_reshaped.shape[1], X_train_reshaped.shape[2])
    network = Sequential([
        Input(shape=window_shape),
        LSTM(units),
        Dense(1),
    ])
    network.compile(optimizer=optimizer, loss='mean_squared_error')
    return network


###  Dropout Layers
### Adding dropout layers can help prevent overfitting

In [None]:
from tensorflow.keras.layers import Dropout

def create_lstm_model_with_dropout(units=50, optimizer='adam'):
    """Build and compile a two-layer LSTM regressor with dropout.

    Each LSTM layer is followed by a Dropout(0.2) layer. The input shape
    (time steps, features) is read from X_train_reshaped.

    Args:
        units: Number of units in each LSTM layer.
        optimizer: Optimizer passed to ``compile``.

    Returns:
        A compiled Sequential model using mean-squared-error loss.
    """
    window_shape = (X_train_reshaped.shape[1], X_train_reshaped.shape[2])

    network = Sequential()
    network.add(Input(shape=window_shape))
    # First LSTM returns the full sequence so the second LSTM can consume it
    network.add(LSTM(units, return_sequences=True))
    network.add(Dropout(0.2))
    network.add(LSTM(units))
    network.add(Dropout(0.2))
    network.add(Dense(1))

    network.compile(optimizer=optimizer, loss='mean_squared_error')
    return network

### Training a Baseline LSTM on the Reshaped Features

In [None]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Sequential

def create_simple_lstm_model():
    """Build and compile a fixed baseline: LSTM(50) followed by Dense(1).

    The input shape (time steps, features) is read from X_train_reshaped.

    Returns:
        A compiled Sequential model using the 'adam' optimizer and
        mean-squared-error loss.
    """
    window_shape = (X_train_reshaped.shape[1], X_train_reshaped.shape[2])

    network = Sequential()
    network.add(Input(shape=window_shape))
    network.add(LSTM(50))
    network.add(Dense(1))

    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

# reshape_for_lstm packs rows i*time_steps .. (i+1)*time_steps - 1 into
# window i, so each window is paired with the target of its final row.
# Slicing the first N targets instead would pair window i with row i, which
# belongs to a different window for every i > 0.
y_train_windows = y_train.to_numpy()[time_steps - 1::time_steps][:X_train_reshaped.shape[0]]
y_val_windows = y_val.to_numpy()[time_steps - 1::time_steps][:X_val_reshaped.shape[0]]

model = create_simple_lstm_model()
history = model.fit(X_train_reshaped, y_train_windows,
                    epochs=10, validation_data=(X_val_reshaped, y_val_windows))


# Model development and training are complete

# Model Evaluation and Cross validation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


### Verifying and Adjusting Data:

In [None]:
# X_val_reshaped already holds the validation windows in the layout the
# model was trained on — (samples, time steps, features) — after imputation
# and scaling with the training statistics. Reuse it rather than reshaping
# the raw X_val table: a DataFrame has no .reshape method, the raw values
# are neither imputed nor scaled, and the window length and feature count
# would otherwise have to be hard-coded.
num_complete_samples = X_val_reshaped.shape[0]
X_val_correct = X_val_reshaped

# Predict using the corrected validation data
y_pred = model.predict(X_val_correct).flatten()


### Ensuring y_val Matches:

In [None]:
# One target per complete validation window: window i covers rows
# i*time_steps .. (i+1)*time_steps - 1, so take the target of each window's
# final row. Taking the first num_complete_samples targets would compare the
# prediction for window i with row i instead.
y_val_correct = y_val.to_numpy()[time_steps - 1::time_steps][:num_complete_samples]


# Calculate Metrics:

In [None]:
# Check lengths before calculating metrics
assert len(y_pred) == len(y_val_correct), "Mismatch in prediction and validation labels lengths"

# Calculate metrics. RMSE is the square root of the MSE, matching how it is
# computed elsewhere in this notebook; the `squared=False` argument of
# mean_squared_error is deprecated since scikit-learn 1.4 and rejected by
# later releases.
mae = mean_absolute_error(y_val_correct, y_pred)
rmse = np.sqrt(mean_squared_error(y_val_correct, y_pred))
r2 = r2_score(y_val_correct, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")


# Visualization for Model Evaluation