# Various imports

In [181]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [182]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import StackingClassifier

In [183]:
from lightgbm import LGBMRegressor

In [184]:
import pickle # => Might want to use cPickle

In [185]:
df = pd.read_csv("data/train.csv")
pd.set_option('display.max_columns', None)

# Outliers removing

In [186]:
df = df[(np.abs(df["temp"] - df["atemp"]) < 10)] # Removes values where the difference between real and felt temperature is too high

# Split

In [187]:
# X = df.drop(["count", "registered", "casual", "season", "holiday", "workingday"], axis=1) # , "temp", "atemp"
# y = df["count"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [188]:
date_mask = ((df["datetime"] >= "2012-09-01") & (df["datetime"] <= "2012-12-31"))
X = df.drop(["registered", "casual", "season", "holiday", "workingday"], axis=1)

X_train = X[~date_mask].drop(["count"], axis=1)
y_train = X[~date_mask]["count"]

X_test = X[date_mask].drop(["count"], axis=1)
y_test = X[date_mask]["count"]

# Pipeline

## Custom Transformers

In [189]:
%load_ext autoreload
%autoreload 2
from model_api.app.custom_transformers import DateParser

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Preprocessor

In [190]:
preprocessor = make_column_transformer(
    (DateParser(), ["datetime"]),
    # (OneHotEncoder(), ["season"]),
    remainder="passthrough"
)

# Model

In [191]:
def eval_metrics(actual, pred):
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    r2 = r2_score(actual, pred)
    return mae, np.sqrt(mse), r2

In [192]:
model = Pipeline([("preprocessor", preprocessor), ("model", LGBMRegressor())])
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dateparser', DateParser(),
                                                  ['datetime'])])),
                ('model', LGBMRegressor())])

In [193]:
y_pred = model.predict(X_test)
mae, rmse, r2 = eval_metrics(y_test, y_pred)
print(f"MAE: {mae}\nRMSE: {rmse}\nR2: {r2}")

MAE: 44.02253031206944
RMSE: 68.13303562399096
R2: 0.90004843965176


## Save as pickle file

In [194]:
pickle.dump(model, open("data/lightgbm.pkl", "wb"))

# Double Model 

## Registered model

In [195]:
X = df.drop(["casual", "season", "holiday", "workingday", "count"], axis=1)

X_train = X[~date_mask].drop(["registered"], axis=1)
y_train = X[~date_mask]["registered"]

X_test = X[date_mask].drop(["registered"], axis=1)
y_test = X[date_mask]["registered"]

In [196]:
model_re = Pipeline([("preprocessor", preprocessor), ("model", LGBMRegressor())])
model_re.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dateparser', DateParser(),
                                                  ['datetime'])])),
                ('model', LGBMRegressor())])

In [197]:
y_pred = model_re.predict(X_test)
mae, rmse, r2 = eval_metrics(y_test, y_pred)
print(f"MAE: {mae}\nRMSE: {rmse}\nR2: {r2}")

MAE: 40.913355042278546
RMSE: 62.54841451417858
R2: 0.8865507044100843


## Casual model

In [198]:
X = df.drop(["registered", "season", "holiday", "workingday", "count"], axis=1)

X_train = X[~date_mask].drop(["casual"], axis=1)
y_train = X[~date_mask]["casual"]

X_test = X[date_mask].drop(["casual"], axis=1)
y_test = X[date_mask]["casual"]

In [199]:
model_ca = Pipeline([("preprocessor", preprocessor), ("model", LGBMRegressor())])
model_ca.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dateparser', DateParser(),
                                                  ['datetime'])])),
                ('model', LGBMRegressor())])

In [200]:
y_pred = model_ca.predict(X_test)
mae, rmse, r2 = eval_metrics(y_test, y_pred)
print(f"MAE: {mae}\nRMSE: {rmse}\nR2: {r2}")

MAE: 11.89284631614057
RMSE: 20.410499100928064
R2: 0.8616637994138227


## Combining both

In [201]:
clf = make_column_transformer

In [202]:
y_pred = clf.predict(X_test)
mae, rmse, r2 = eval_metrics(y_test, y_pred)
print(f"MAE: {mae}\nRMSE: {rmse}\nR2: {r2}")

AttributeError: 'function' object has no attribute 'predict'