In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import cos, sin, arcsin, sqrt
from math import radians
from datetime import date
import holidays
from sklearn.cluster import KMeans
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

## Pre-processing for task 2

In [3]:
# List with files for stations 2022
def prepare_stations():
    lst = []
    for year in range(2014, 2023):
        df = pd.read_csv(f'data/stations/Stations_{year}.csv')
        lst.append(df)
    
    # Remove Duplicates
    df_stations = pd.concat(lst, ignore_index=True)
    df_stations.drop_duplicates(subset=['code'], inplace=True, keep="first")
    df_stations.to_csv("data/task_2/all_stations.csv")
    
    def distanceToCenter(row):
        lon1 = -73.554167
        lat1 = 45.508888
        lon2 = row['longitude']
        lat2 = row['latitude']
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
        dlon = lon2 - lon1 
        dlat = lat2 - lat1 
        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        c = 2 * arcsin(sqrt(a)) 
        km = 6367 * c
        return km
    
    df_stations['distance_to_center'] = df_stations_2022.apply(lambda row: distanceToCenter(row), axis=1)
    coordinates = df_stations[['latitude', 'longitude']]
    kmeans = KMeans(n_clusters=50, random_state=0).fit(coordinates)
    df_stations['stations_cluster'] =  kmeans.labels_
    
    df_stations.to_csv("data/task_2/all_stations_clustered.csv")
    return df_stations

In [4]:
holidaysCanada = holidays.country_holidays('CA', subdiv='QC')

def all_rides(start_year, end_year):
    all_rides = pd.DataFrame()
    for year in range(start_year, end_year + 1):
        for month in range(4, 11):
            rides_file_path = f'data/bike_rides/OD_{year}-{month:02d}.csv'
            df = pd.read_csv(rides_file_path, parse_dates=["start_date", "end_date"])
            all_rides = pd.concat([all_rides, df])
    all_rides['year'] = all_rides['start_date'].dt.year
    all_rides['month'] = all_rides['start_date'].dt.month
    all_rides['weekday'] = all_rides['start_date'].dt.weekday
    all_rides['is_holiday'] = all_rides['start_date'].apply(lambda x: x.date() in holidaysCanada)
    all_rides['is_weekend'] = all_rides['start_date'].dt.weekday > 4
    all_rides['start_date'] = pd.to_datetime(all_rides['start_date']).dt.date
    return all_rides

def get_weather():
    df_weather = pd.read_csv('data/preprocessed_data/weather.csv', parse_dates=[4])
    df_weather.columns = df_weather.columns.str.lower()
    df_weather = df_weather[["date/time", "mean temp (°c)", "total precip (mm)"]]
    df_weather = df_weather.rename(columns={"date/time": "tmp_date",'mean temp (°c)': 'mean_temperature','total precip (mm)': 'total_precipitation'})
    df_weather['tmp_date'] = pd.to_datetime(df_weather['tmp_date']).dt.date
    
    # interpolate missing data
    df_weather[['mean_temperature','total_precipitation']] = df_weather[['mean_temperature','total_precipitation']].interpolate()
    return df_weather

def include_clusters(rides_df, stations_df):
    merged_df = pd.merge(rides_df, stations_df, left_on='start_station_code', right_on='code', how='left')
    merged_df = merged_df.rename(columns={'stations_cluster': 'start_station_cluster'})
    merged_df.drop('code', axis=1, inplace=True)
    
    merged_df = pd.merge(merged_df, stations_df, left_on='end_station_code', right_on='code', how='left')
    merged_df = merged_df.rename(columns={'stations_cluster': 'end_station_cluster'})
    merged_df.drop('code', axis=1, inplace=True)
    
    grouped_df = merged_df.groupby(['start_date', 'start_station_cluster', 'end_station_cluster']).agg(count=('start_date', 'size'), duration_sec=('duration_sec', 'mean'), is_holiday=("is_holiday", "first"), is_weekend=("is_weekend", "first")).reset_index()
    return grouped_df
    
def include_weather(grouped, weather):
    w_weather = grouped.merge(weather, left_on='start_date', right_on='tmp_date', how='left')
    w_weather.drop('tmp_date', axis=1, inplace=True)
    return w_weather

# run this function to complete the pre-processing for task 2
def complete_pre_task2():
    rides = all_rides(2014, 2018)
    stations = prepare_stations()[["code", "stations_cluster"]]
    grouped = include_clusters(rides, stations)
    weather = get_weather()
    include_weather(grouped, weather).to_csv("data/task_2/pre_task2_2014_2018.csv")


## Modelling task 2

In [51]:
df = pd.read_csv("data/task_2/pre_task2_2014_2018.csv", index_col=0)
df = df.drop(["start_date"], axis=1)

In [53]:
def random_forrest_regressor(df):
    X = df
    y = df['count']
    X = X.drop("count", axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    res = X_test.copy()
    res["actual"] = y_test
    res["pred"] = y_pred

    res.to_csv("results/pred_random_forrest_regressor_all.csv")

def gradient_boosting_regression(df):
    X = df
    y = df['count']
    X = X.drop("count", axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
   
    model = GradientBoostingRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    res = X_test.copy()
    res["actual"] = y_test
    res["pred"] = y_pred
    res.to_csv("results/pred_gradient_boosting_regressor_all.csv")

def tensor_flow(df):
    X = df
    y = df['count']
    X = X.drop("count", axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)

    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(X_scaled.shape[1],)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_scaled, y_train, epochs=10, batch_size=32)

    new_data_scaled = scaler.transform(X_test)
    y_pred = model.predict(new_data_scaled)
    
    res = X_test.copy()
    res["actual"] = y_test
    res["pred"] = y_pred
    res.to_csv("results/pred_tensorflow_all.csv")

def cat_boost(df):
    X = df
    y = df['count']
    X = X.drop("count", axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = CatBoostRegressor()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print(y_pred)
    
    res = X_test.copy()
    res["actual"] = y_test
    res["pred"] = y_pred
    res.to_csv("results/pred_cat_boost_all.csv")

## Model evaluation

In [55]:
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

rfr_res = "results/pred_random_forrest_regressor_all.csv"
gbr_res = "results/pred_gradient_boosting_regressor_all.csv"
tfl_res = "results/pred_tensorflow_all.csv"
cat_res = "results/pred_cat_boost_all.csv"

def evaluate(filepath):
    res = pd.read_csv(filepath)
    actual_values = res['actual']
    predicted_values = res['pred']
    mse = mean_squared_error(actual_values, predicted_values)
    rmse = np.sqrt(mse)
    correlation_coefficient, p_value = pearsonr(actual_values, predicted_values)
    return rmse, mse, correlation_coefficient, p_value

def compare_results(all_result_files):    
    results = []
    for filepath in all_result_files:
        rmse, mse, correlation_coefficient, p_value = evaluate(filepath)
        result = {
            'Model': filepath[12:].split(".")[0].replace("_", " ").replace(" all", ""),  # Extract the model name from the filepath
            'RMSE': rmse,
            'MSE': mse,
            'Correlation Coefficient': correlation_coefficient,
            'P-Value': p_value
        }
        results.append(result)
    
    results_df = pd.DataFrame(results)
    results_df.to_csv("results/evaluations.csv")
    print(results_df)

all_result_files = [rfr_res, gbr_res, tfl_res, cat_res]