In [58]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Input
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.regularizers import L1, L2, L1L2

import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

import random
import numpy as np
import tensorflow as tf
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)



import absl.logging
import logging
tf.get_logger().setLevel(logging.ERROR)
absl.logging.set_verbosity(absl.logging.ERROR)

In [59]:
data_fpath = os.path.join('hour.csv')
raw_df = pd.read_csv(data_fpath,index_col=0)
raw_df.drop(columns=['dteday'], inplace=True)

In [60]:
print(raw_df.head(10))
raw_df.describe()

         season  yr  mnth  hr  holiday  weekday  workingday  weathersit  temp  \
instant                                                                         
1             1   0     1   0        0        6           0           1  0.24   
2             1   0     1   1        0        6           0           1  0.22   
3             1   0     1   2        0        6           0           1  0.22   
4             1   0     1   3        0        6           0           1  0.24   
5             1   0     1   4        0        6           0           1  0.24   
6             1   0     1   5        0        6           0           2  0.24   
7             1   0     1   6        0        6           0           1  0.22   
8             1   0     1   7        0        6           0           1  0.20   
9             1   0     1   8        0        6           0           1  0.24   
10            1   0     1   9        0        6           0           1  0.32   

          atemp   hum  wind

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


In [61]:
raw_df.isna().any()

season        False
yr            False
mnth          False
hr            False
holiday       False
weekday       False
workingday    False
weathersit    False
temp          False
atemp         False
hum           False
windspeed     False
casual        False
registered    False
cnt           False
dtype: bool

In [62]:
def eda(raw_df):
    raw_df.describe()
    corr_matrix = raw_df.corr(numeric_only=True).round(2)
    plt.figure(figsize=(10,10))
    sns.heatmap(corr_matrix,annot=True)
    plt.show()
    
    plt.figure(figsize=(8,6))
    sns.histplot(raw_df['temp'])
    plt.show()
    
    plt.figure(figsize=(8,6))
    sns.scatterplot(x='temp',y='cnt',data=raw_df)
    plt.show()
    
    plt.figure(figsize=(8,6))
    sns.boxenplot(data=raw_df,x='weathersit',y='cnt')
    plt.show()
    
    return raw_df


In [63]:
# atemp and temp have 0.99 corr.
raw_df.drop(columns=['atemp'], inplace=True)

In [64]:
print(raw_df.shape)
raw_df.head(5)

(17379, 14)


Unnamed: 0_level_0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed,casual,registered,cnt
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,0,1,0,0,6,0,1,0.24,0.81,0.0,3,13,16
2,1,0,1,1,0,6,0,1,0.22,0.8,0.0,8,32,40
3,1,0,1,2,0,6,0,1,0.22,0.8,0.0,5,27,32
4,1,0,1,3,0,6,0,1,0.24,0.75,0.0,3,10,13
5,1,0,1,4,0,6,0,1,0.24,0.75,0.0,0,1,1


In [65]:
raw_df.dtypes

season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [66]:
raw_df.memory_usage()

Index         139032
season        139032
yr            139032
mnth          139032
hr            139032
holiday       139032
weekday       139032
workingday    139032
weathersit    139032
temp          139032
hum           139032
windspeed     139032
casual        139032
registered    139032
cnt           139032
dtype: int64

In [67]:
dtype_df = raw_df.dtypes
cols_to_convert = dtype_df.loc[dtype_df==np.int64].index
for c in cols_to_convert:
    if c in ('casual', 'registered', 'cnt'):
        raw_df[c] = raw_df[c].astype(np.int16)
    else:
        raw_df[c] = raw_df[c].astype(np.int8)

In [68]:
raw_df.memory_usage()

Index         139032
season         17379
yr             17379
mnth           17379
hr             17379
holiday        17379
weekday        17379
workingday     17379
weathersit     17379
temp          139032
hum           139032
windspeed     139032
casual         34758
registered     34758
cnt            34758
dtype: int64

In [69]:
raw_df.dtypes

season           int8
yr               int8
mnth             int8
hr               int8
holiday          int8
weekday          int8
workingday       int8
weathersit       int8
temp          float64
hum           float64
windspeed     float64
casual          int16
registered      int16
cnt             int16
dtype: object

In [70]:
def split_data(df:pd.DataFrame, which_target: str|list):
    X = df.iloc[:, :-3].values # remove all 3 targets (casual, registered, cnt)
    Y = df.loc[:, which_target].values
    X_train, X_vt, Y_train, Y_vt = train_test_split(X, Y, test_size=0.3, shuffle=True, random_state=SEED)
    assert X_train.shape[0] == Y_train.shape[0]
    assert X_vt.shape[0] == Y_vt.shape[0]
    assert X_train.shape[1] == X_vt.shape[1]

    X_val, X_test, Y_val, Y_test = train_test_split(X_vt, Y_vt, test_size=0.5, shuffle=True, random_state=SEED)
    assert X_val.shape[0] == Y_val.shape[0]
    assert X_test.shape[0] == Y_test.shape[0]
    assert X_val.shape[1] == X_test.shape[1]
    return X_train, Y_train, X_val, Y_val, X_test, Y_test

In [71]:
X_train, Y_train, X_val, Y_val, X_test, Y_test = split_data(raw_df, 'casual')

# Defining architectures

In [81]:
hyper_dict = {
    'h_layers': [64, 32, 16],
    'batch_norm_layers': [False, False, True],
    'activations': ['relu', 'tanh', 'elu'],
    'dropout_layers': [0.1, 0.2, 0.1],
    'regularizers': [L1L2(l1=1e-5, l2=1e-4), L1L2(l1=1e-5, l2=1e-4), L1L2(l1=1e-5, l2=1e-4)],
    'output_activation': 'sigmoid',
    'optimizer': 'adam',
    'loss': 'mae',
    'metrics': ['mae', 'mse', 'accuracy']
}


In [88]:
def model_loop(hyper_dict, raw_df: pd.DataFrame, model_name: str, which_source: str|list):
    # Get data
    print("Getting data...", end='')
    data_parts = split_data(raw_df, which_source)
    print("Done.")
    assert len(data_parts) == 6

    hyper_dict['input_size'] = data_parts[0].shape[1]
    assert data_parts[0].shape[1] == 11 # REMOVE AFTER TEST
    
    hyper_dict['output_size'] = data_parts[1].shape[1]
    assert data_parts[1].shape[1] == len(which_source) # REMOVE AFTER TEST
    
    # Make model
    print("Making model...", end='')
    model = Sequential()
    model.add(Input(hyper_dict['input_size']))
    for hu, ac, rg, bn, dp in zip(hyper_dict['h_layers'], hyper_dict['activations'], hyper_dict['regularizers'], hyper_dict['batch_norm_layers'], hyper_dict['dropout_layers']):
        if rg is None:                    
            model.add(Dense(hu, activation=ac))
        else:
            model.add(Dense(hu, activation=ac, kernel_regularizer=rg))
        if bn:
            model.add(BatchNormalization())
        if dp > 0.0:
            model.add(Dropout(dp))
    model.add(Dense(hyper_dict['output_size'], activation=hyper_dict['output_activation']))
    model.compile(optimizer=hyper_dict['optimizer'], loss=hyper_dict['loss'], metrics=hyper_dict['metrics'])
    print("Done.")

    # Train model
    print("Training model...", end='')
    es_cbk = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    chp_cbk = ModelCheckpoint(monitor='val_loss', filepath=os.path.join("models", model_name), save_best_only=True)
    model_history = model.fit(data_parts[0], data_parts[1], epochs=200, batch_size=16, validation_data=(data_parts[2], data_parts[3]), callbacks=[es_cbk, chp_cbk], verbose=0)
    print("Done.")
    
    # Evaluate model
    print("Evaluating model...", end='')
    Y_pred = model.predict(data_parts[-2], verbose=0)
    mae = mean_absolute_error(data_parts[-1], Y_pred)
    rmse = np.sqrt(mean_squared_error(data_parts[-1], Y_pred))
    r2 = r2_score(data_parts[-1], Y_pred)
    print(f"MAE:{mae}")
    print(f"RMSE:{rmse}")
    print(f"R2:{r2}")
    print("Done")
    return mae, rmse, r2


In [83]:
model_loop(hyper_dict=hyper_dict ,raw_df=raw_df, model_name='casual_single', which_source=['casual'])

Getting data...Done.
Making model...Done.
Training model...Done.
Evaluating model...MAE:34.59532165527344
RMSE:59.784080505371094
R2:-0.49538491877289315
Done


(34.59532, 59.78408, -0.49538491877289315)

In [85]:
model_loop(hyper_dict=hyper_dict, raw_df=raw_df, model_name='registered_single', which_source=['registered'])


Getting data...Done.
Making model...Done.
Training model...Done.
Evaluating model...MAE:150.47755432128906
RMSE:209.51992797851562
R2:-1.0652531481083285
Done


(150.47755, 209.51993, -1.0652531481083285)

In [None]:
model_loop(hyper_dict=hyper_dict, raw_df=raw_df, model_name='both', which_source=['casual','registered'])

Getting data...Done.
Making model...Done.
Training model...Done.
Evaluating model...MAE:26.94536781311035
RMSE:51.045265197753906
R2:0.7726866684665508
Done


(26.945368, 51.045265, 0.7726866684665508)