In [121]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import cos, sin, arcsin, sqrt
from math import radians
from datetime import date
import holidays
from sklearn.cluster import KMeans
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

## Train, test and validation split

In [132]:
def _load_split(csv_path):
    """Read one pre-processed split and separate its ``start_date`` column.

    Returns a ``(frame, dates)`` pair: the CSV contents (first CSV column used
    as the index) without ``start_date``, and the ``start_date`` column itself.
    """
    frame = pd.read_csv(csv_path, index_col=0)
    return frame.drop(columns=["start_date"]), frame["start_date"]


# Per the file names: 2014-2018 is the training split, 2019 the validation
# split and 2022 the test split.
df_train, date_train = _load_split("data/task_2/pre_task2_2014_2018.csv")
df_valid, date_valid = _load_split("data/task_2/pre_task2_2019.csv")
df_test, date_test = _load_split("data/task_2/pre_task2_2022.csv")

def df_split(train, valid, test):
    """Separate features from the ``count`` target for each of the three splits.

    For every input frame the ``count`` column becomes the target and both
    ``count`` and ``duration_sec`` are removed from the features. The input
    frames themselves are not modified.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
    """
    def _features_and_target(frame):
        # Copy the target so it never shares data with the caller's frame.
        target = frame['count'].copy()
        features = frame.drop(columns=["count", 'duration_sec'])
        return features, target

    X_train, y_train = _features_and_target(train)
    X_valid, y_valid = _features_and_target(valid)
    X_test, y_test = _features_and_target(test)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

## Modelling

The following block defines one function for each of four models:
- RandomForestRegressor
- GradientBoostingRegressor
- TensorFlow (Keras `Sequential` feed-forward network)
- CatBoost (`CatBoostRegressor`)

In [141]:
def random_forrest_regressor(train, valid, test):
    """Fit a default ``RandomForestRegressor`` and predict the test split.

    The validation split is not used for model selection here; its rows are
    appended to the training rows and the forest is fitted on both.

    Returns
    -------
    tuple
        ``(model, res)`` where ``res`` is the test feature frame extended with
        an ``actual`` column (test targets) and a ``pred`` column (predictions).
    """
    X_fit, y_fit, X_val, y_val, X_eval, y_eval = df_split(train, valid, test)

    # Stack training and validation rows into one fitting set.
    fit_features = pd.concat([X_fit, X_val])
    fit_targets = pd.concat([y_fit, y_val])

    model = RandomForestRegressor()
    model.fit(fit_features, fit_targets)

    # `actual` is a Series (index-aligned); `pred` is an array (positional).
    res = X_eval.assign(actual=y_eval, pred=model.predict(X_eval))

    # To persist: res.to_csv("results/pred_random_forrest_regressor_all.csv")
    return model, res

def gradient_boosting_regression(train, valid, test):
    """Fit a default ``GradientBoostingRegressor`` and predict the test split.

    The validation split is not used for model selection here; its rows are
    appended to the training rows and the model is fitted on both.

    Returns
    -------
    tuple
        ``(model, res)`` where ``res`` is the test feature frame extended with
        an ``actual`` column (test targets) and a ``pred`` column (predictions).
    """
    X_fit, y_fit, X_val, y_val, X_eval, y_eval = df_split(train, valid, test)

    # Stack training and validation rows into one fitting set.
    fit_features = pd.concat([X_fit, X_val])
    fit_targets = pd.concat([y_fit, y_val])

    model = GradientBoostingRegressor()
    model.fit(fit_features, fit_targets)

    # `actual` is a Series (index-aligned); `pred` is an array (positional).
    res = X_eval.assign(actual=y_eval, pred=model.predict(X_eval))

    # To persist: res.to_csv("results/pred_gradient_boosting_regressor_all.csv")
    return model, res

def tensor_flow(train, valid, test):
    """Train a small feed-forward Keras network and predict the test split.

    Pre-processing (all transformers are fitted on ``train`` only and then
    applied to ``valid`` and ``test``):

    * ``is_holiday`` / ``is_weekend`` are one-hot encoded,
    * ``duration_sec``, ``mean_temperature`` and ``total_precipitation`` are
      standardised,
    * ``start_station_cluster`` / ``end_station_cluster`` are min-max scaled.

    Unlike the tree-based model functions in this notebook, the validation
    split is not merged into the training data; it is passed to Keras as
    ``validation_data`` so the per-epoch validation loss is reported.

    Parameters
    ----------
    train, valid, test : pandas.DataFrame
        Frames containing the columns listed above plus the ``count`` target.
        The inputs are not modified.

    Returns
    -------
    tuple
        ``(model, res)`` where ``res`` is the pre-processed test feature frame
        extended with an ``actual`` column (test targets) and a ``pred`` column
        (network predictions).
    """
    bool_cols = ['is_holiday', 'is_weekend']
    continuous_cols = ['duration_sec', 'mean_temperature', 'total_precipitation']
    cluster_cols = ['start_station_cluster', 'end_station_cluster']

    def model_prep(train, valid, test):
        """Return encoded and scaled copies of the three splits."""
        frames = [train.copy(), valid.copy(), test.copy()]

        # One hot encoding of Boolean variables
        encoder = OneHotEncoder()
        encoder.fit(frames[0][bool_cols])
        encoded_names = encoder.get_feature_names_out(bool_cols)
        for pos, frame in enumerate(frames):
            encoded = encoder.transform(frame[bool_cols]).toarray()
            frame = frame.drop(columns=bool_cols)
            # Attach the indicator columns by position. Joining a frame that
            # carries a fresh 0..n-1 index (as was done before) silently
            # misaligns rows whenever the split's own index is anything else.
            for col_pos, col_name in enumerate(encoded_names):
                frame[col_name] = encoded[:, col_pos]
            frames[pos] = frame

        # Standard scaler for continuous variables, minmax scaler for station
        # cluster ids; both are fitted on the training split only.
        for cols, scaler in ((continuous_cols, StandardScaler()),
                             (cluster_cols, MinMaxScaler())):
            scaler.fit(frames[0][cols])
            for frame in frames:
                frame[cols] = scaler.transform(frame[cols])

        return tuple(frames)

    train, valid, test = model_prep(train, valid, test)
    X_train, y_train, X_valid, y_valid, X_test, y_test = df_split(train, valid, test)

    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_valid, y_valid))

    # The single-unit output layer yields one column per row; flatten it so it
    # is stored as an ordinary 1-D column.
    y_pred = model.predict(X_test).ravel()

    res = X_test.copy()
    res["actual"] = y_test
    res["pred"] = y_pred

    #res.to_csv("results/pred_tensorflow_all.csv")
    return model, res

def cat_boost(train, valid, test):
    """Fit a default ``CatBoostRegressor`` and predict the test split.

    The validation split is not used for model selection here; its rows are
    appended to the training rows and the model is fitted on both.

    Returns
    -------
    tuple
        ``(model, res)`` where ``res`` is the test feature frame extended with
        an ``actual`` column (test targets) and a ``pred`` column (predictions).
    """
    X_fit, y_fit, X_val, y_val, X_eval, y_eval = df_split(train, valid, test)

    # Stack training and validation rows into one fitting set.
    fit_features = pd.concat([X_fit, X_val])
    fit_targets = pd.concat([y_fit, y_val])

    model = CatBoostRegressor()
    model.fit(fit_features, fit_targets)

    # `actual` is a Series (index-aligned); `pred` is an array (positional).
    res = X_eval.assign(actual=y_eval, pred=model.predict(X_eval))

    # To persist: res.to_csv("results/pred_cat_boost_all.csv")
    return model, res

# Every model function receives the same (train, valid, test) frames and
# returns the fitted model together with its test-set result frame.
model_inputs = (df_train, df_valid, df_test)

rfr_mod, rfr_res = random_forrest_regressor(*model_inputs)
gbr_mod, gbr_res = gradient_boosting_regression(*model_inputs)
tsf_mod, tsf_res = tensor_flow(*model_inputs)
cbt_mod, cbt_res = cat_boost(*model_inputs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Learning rate set to 0.121413
0:	learn: 62.6754924	total: 95ms	remaining: 1m 34s
1:	learn: 62.1563048	total: 125ms	remaining: 1m 2s
2:	learn: 61.5915647	total: 156ms	remaining: 51.9s
3:	learn: 60.8648934	total: 187ms	remaining: 46.5s
4:	learn: 60.4278915	total: 213ms	remaining: 42.3s
5:	learn: 59.9358057	total: 243ms	remaining: 40.3s
6:	learn: 59.5217106	total: 268ms	remaining: 38s
7:	learn: 58.9365875	total: 295ms	remaining: 36.6s
8:	learn: 58.4883733	total: 323ms	remaining: 35.5s
9:	learn: 58.2273121	total: 350ms	remaining: 34.7s
10:	learn: 57.9856311	total: 379ms	remaining: 34.1s
11:	learn: 57.7709592	total: 407ms	remaining: 33.5s
12:	learn: 57.5562120	total: 437ms	remaining: 33.2s
13:	learn: 57.0106052	total: 465ms	remaining: 32.7s
14:	learn: 56.8133586	t

## Evaluating the models

In [153]:
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score

def evaluate(df):
    """Score the predictions held in a result frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``actual`` and a ``pred`` column.

    Returns
    -------
    tuple
        ``(rmse, mse, correlation_coefficient, p_value, r2)`` where the
        correlation coefficient and p-value come from ``pearsonr``.
    """
    y_true, y_hat = df['actual'], df['pred']

    mse = mean_squared_error(y_true, y_hat)
    correlation_coefficient, p_value = pearsonr(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)

    # RMSE is derived from the MSE rather than computed separately.
    return np.sqrt(mse), mse, correlation_coefficient, p_value, r2

def compare_results(all_results):
    """Build a comparison table with one row of metrics per model.

    Parameters
    ----------
    all_results : dict
        Maps a display name to a result frame accepted by ``evaluate``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model``, ``RMSE``, ``MSE``, ``Correlation Coefficient``,
        ``P-Value`` and ``R-squared``; rows follow the order of ``all_results``.
    """
    # Labels are listed in the same order as the tuple returned by evaluate().
    metric_labels = ('RMSE', 'MSE', 'Correlation Coefficient', 'P-Value', 'R-squared')

    rows = [
        # The dictionary key is used directly as the model's display name.
        {'Model': name, **dict(zip(metric_labels, evaluate(df)))}
        for name, df in all_results.items()
    ]

    # To persist: pd.DataFrame(rows).to_csv("results/evaluations.csv")
    return pd.DataFrame(rows)

# Result frames keyed by the label that appears in the comparison table.
all_results = dict([
    ("Random Forrest Regressor", rfr_res),
    ("GradientBoosting Regressor", gbr_res),
    ("TensorFlow", tsf_res),
    ("CatBoost", cbt_res),
])

comparison = compare_results(all_results)
print(comparison)

                        Model       RMSE          MSE   
0    Random Forrest Regressor  51.173811  2618.758890  \
1  GradientBoosting Regressor  85.247879  7267.200816   
2                  TensorFlow  85.339111  7282.763826   
3                    CatBoost  60.481705  3658.036639   

   Correlation Coefficient  P-Value  R-squared  
0                 0.927707      0.0   0.694800  
1                 0.506357      0.0   0.153054  
2                 0.446606      0.0   0.151240  
3                 0.879918      0.0   0.573679  


## Hyperparameter tuning (RandomForestRegressor)

In [125]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

def hpt_random_forrest(train, test, X_test=None):
    """Tune a ``RandomForestRegressor`` with a randomized search.

    Five parameter combinations are sampled from the grid below and each is
    scored with 5-fold cross-validation on negated mean squared error.

    Parameters
    ----------
    train : array-like
        Feature matrix used for the search. Despite the name, this is passed
        to ``fit`` as ``X``.
    test : array-like
        Target values matching ``train``. Despite the name, this is passed to
        ``fit`` as ``y``; it is not a held-out test set.
    X_test : array-like, optional
        Features to predict with the best model. If omitted, a notebook-level
        variable named ``X_test`` is used when one exists (this is what the
        function relied on before the parameter was added).

    Returns
    -------
    tuple
        ``(best_model, predictions)``. ``predictions`` is ``None`` when no
        ``X_test`` is available, so the tuned model is still returned instead
        of the call failing after the search has finished.
    """
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # 1.0 means "consider all features", which is what 'auto' meant for
        # forest regressors; 'auto' itself is no longer accepted by
        # scikit-learn and would make every sampled candidate using it fail.
        'max_features': [1.0, 'sqrt']
    }

    rf = RandomForestRegressor(random_state=42)

    # greater_is_better=False negates the MSE so that higher is better.
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    random_search = RandomizedSearchCV(
        rf, param_distributions=param_grid, n_iter=5, scoring=scorer, cv=5, random_state=42
    )

    random_search.fit(train, test)

    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_

    print("Best Hyperparameters:", best_params)
    # Negated MSE: values closer to zero are better.
    print("Best Score:", best_score)

    if X_test is None:
        # Backward compatibility: fall back to a notebook-level X_test.
        X_test = globals().get("X_test")

    predictions = None if X_test is None else best_model.predict(X_test)

    return best_model, predictions


### Baseline

In [138]:
def baseline(X_train, y_train, X_test, y_test):
    """Predict each test row with the training mean of its station-cluster pair.

    The mean ``count`` per ``(start_station_cluster, end_station_cluster)``
    pair is computed on the training data and merged onto the test rows.
    ``pd.merge`` is used with its default join type, so test rows whose pair
    does not occur in the training data are absent from the result.

    Returns
    -------
    tuple
        ``(merged_df, evaluation)`` where ``merged_df`` holds the two cluster
        columns plus ``actual`` and ``pred``, and ``evaluation`` is the value
        returned by ``evaluate(merged_df)``.
    """
    pair_cols = ["start_station_cluster", "end_station_cluster"]

    # Observed test counts and training counts, each reduced to the pair
    # columns plus the target.
    observed = X_test[pair_cols].assign(count=y_test)
    history = X_train[pair_cols].assign(count=y_train)

    pair_means = history.groupby(pair_cols)[["count"]].mean()

    # Both sides carry a `count` column, so the merge suffixes them with
    # _x (test side) and _y (training mean).
    merged_df = pd.merge(observed, pair_means, left_on=pair_cols, right_on=pair_cols)
    merged_df = merged_df.rename(columns={'count_x': 'actual', 'count_y': 'pred'})

    return merged_df, evaluate(merged_df)