# Various Imports

In [260]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [261]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score,
)
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [262]:
from lightgbm import LGBMRegressor

In [263]:
# Lift the column-count limit so rendered frames show every column.
pd.options.display.max_columns = None

# Read train.csv; the bare `df` on the last line renders it as the cell output.
df = pd.read_csv("../../data/train.csv")
df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129


In [264]:
# Column names of the frame loaded from the CSV.
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [265]:
# Target is `count`. `registered` and `casual` are removed from the features
# together with it; "temp" and "atemp" are kept.
y = df["count"]
X = df.drop(columns=["count", "registered", "casual"])

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline

## Custom Transformers

In [266]:
class DateParser(BaseEstimator, TransformerMixin):
    """Stateless transformer that expands a timestamp column into calendar parts.

    The output is a DataFrame with the columns ``weekday`` (pandas convention,
    Monday=0), ``hour``, ``month`` and ``year``, indexed like the input.

    Parameters
    ----------
    column : str, default="datetime"
        Name of the column in ``X`` that holds the timestamps. Values are
        passed through ``pd.to_datetime``, so strings are accepted.
    """

    # Generated column names, in output order. Each one is also the name of
    # the ``Series.dt`` accessor attribute it is read from.
    _PARTS = ("weekday", "hour", "month", "year")

    def __init__(self, column="datetime"):
        # Stored under the same name as the argument so that BaseEstimator's
        # get_params / set_params / clone keep working.
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return the calendar parts of ``X[self.column]`` as a DataFrame.

        ``y`` is accepted for API compatibility and ignored.
        """
        timestamps = pd.to_datetime(X[self.column])
        return pd.DataFrame(
            {part: getattr(timestamps.dt, part) for part in self._PARTS}
        )

    def get_feature_names_out(self, input_features=None):
        """Names of the generated columns (``input_features`` is ignored).

        Lets an enclosing ColumnTransformer / Pipeline report feature names.
        """
        return np.asarray(self._PARTS, dtype=object)


DateParser().fit_transform(X_train[["datetime"]])

Unnamed: 0,weekday,hour,month,year
2815,2,5,7,2011
8695,5,16,8,2012
8406,2,15,7,2012
1543,6,4,4,2011
4952,5,10,11,2011
...,...,...,...,...
5734,5,2,1,2012
5191,5,9,12,2011
5390,6,16,12,2011
860,5,7,2,2011


In [267]:
# Column names of `df` once more, shown for reference before the preprocessor
# is defined.
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

## Preprocessor

In [268]:
# Variant that one-hot encodes the parsed date parts. It is defined here but
# is not referenced by `preprocessor` below.
date_pipeline = Pipeline([("parser", DateParser()), ("onehot", OneHotEncoder())])

# Preprocessor used by the model:
#   - "datetime" is expanded by DateParser,
#   - "season" is one-hot encoded,
#   - every other column is passed through untouched.
preprocessor = make_column_transformer(
    (DateParser(), ["datetime"]),
    (OneHotEncoder(), ["season"]),
    remainder="passthrough",
)

# Model

In [269]:
def eval_metrics(actual, pred):
    """Score predictions against ground truth.

    Parameters
    ----------
    actual : array-like
        True target values.
    pred : array-like
        Predicted target values.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        and R^2 score, in that order.
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    return mean_absolute_error(actual, pred), rmse, r2_score(actual, pred)

In [270]:
# Full pipeline: preprocessing followed by a LightGBM regressor with default
# hyper-parameters. `fit` returns the pipeline, which is rendered as the cell
# output.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LGBMRegressor()),
    ]
)
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dateparser', DateParser(),
                                                  ['datetime']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['season'])])),
                ('model', LGBMRegressor())])

In [271]:
# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)
mae, rmse, r2 = eval_metrics(y_test, y_pred)
print(f"MAE: {mae}\nRMSE: {rmse}\nR2: {r2}")

# Root mean squared log error. `mean_squared_log_error` (imported with the
# other sklearn metrics in the import cell) rejects negative values, and the
# regressor can output negative counts, so predictions go through np.abs first.
# NOTE(review): np.abs scores a prediction of -15 as if it were +15; clipping
# at 0 (np.clip(y_pred, 0, None)) may be the more faithful choice. Left as-is
# so the reported figure stays comparable with earlier runs.
rmsle = np.sqrt(mean_squared_log_error(y_test, np.abs(y_pred)))
print(rmsle)

MAE: 24.318836467131643
RMSE: 37.20844024554987
R2: 0.9580552393240116
0.401338700199911


## No onehot but date parse
MAE: 42.43831200011359
RMSE: 61.947172486264684
R2: 0.8837380060138225

## No onehot, date parse and year, no temp and atemp
MAE: 26.844148005775768
RMSE: 41.27493563595885
R2: 0.9483859901834405

## No onehot, date parse and year
MAE: 24.318836467131643
RMSE: 37.20844024554987
R2: 0.9580552393240116

## Date parse and onehot but no temp and atemp
MAE: 47.04598656840119
RMSE: 67.92056836649193
R2: 0.8602353267176805

## Date parse, onehot, temp and atemp
MAE: 48.0921046082554
RMSE: 68.55929768569396
R2: 0.8575942547840498

In [273]:
# Tests with other years => weird results
#
# Three probe rows that differ only in the year of the timestamp (2011, 2012,
# 2022) are added after the test features and the whole frame is predicted.
# The probes are the last three predictions in the output.
probe_values = {'season': 4, 'holiday': 0, 'workingday': 1, 'weather': 1, 'temp': 14.76, 'atemp': 17.425, 'humidity': 57, 'windspeed': 15.0013}
year_probes = pd.DataFrame(
    [
        {'datetime': f"{year}-12-19 20:00:00", **probe_values}
        for year in (2011, 2012, 2022)
    ]
)

# pd.concat replaces DataFrame.append, which is deprecated and no longer exists
# in pandas 2.x. The result gets its own name instead of rebinding X_test, so
# re-running this cell does not keep adding rows and X_test stays aligned with
# y_test.
X_test_with_probes = pd.concat([X_test, year_probes])
model.predict(X_test_with_probes) # => changes are due to the weekday being different

  X_test = X_test.append(


array([132.31782745, -15.19364089, 162.02276985, ..., 187.91388462,
       265.61232242, 258.79637453])

In [None]:
# Columns of the feature frame X (df without count, registered and casual).
X.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed'],
      dtype='object')

In [None]:
# TODO: try training without the `workingday` feature
# TODO: plot a learning curve

In [None]:
# The learning curve refits the pipeline for 20 training-set sizes with 3-fold
# cross-validation, so it is opt-in: set the flag to True to compute and plot
# it. This replaces a bare `raise` that was used to stop execution here and
# made "Run All" end with an exception.
RUN_LEARNING_CURVE = False

if RUN_LEARNING_CURVE:
    N, train_score, val_score = learning_curve(
        model,
        X_train,
        y_train,  # was `Y_train`, which is not defined anywhere in the notebook
        train_sizes=np.linspace(0.1, 1, 20),
        cv=3,
        scoring='r2',
        verbose=3,
        n_jobs=-1,
    )

    # Mean score across the CV folds for each training-set size.
    fig, ax = plt.subplots(figsize=(22, 14))
    ax.plot(N, val_score.mean(axis=1), label='validation')
    ax.plot(N, train_score.mean(axis=1), label='train')
    ax.set_xlabel('train_sizes')
    ax.set_ylabel('r2')
    # The estimator is LGBMRegressor, not XGBoost.
    ax.set_title('LightGBM learning curve')
    ax.legend()
    plt.show()