# Various imports

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [26]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import BaseEstimator,TransformerMixin

In [27]:
from lightgbm import LGBMRegressor

In [28]:
import pickle  # Python 3's pickle already uses the C implementation (formerly cPickle) when available

In [29]:
# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Read the training CSV into memory.
df = pd.read_csv("data/train.csv")

In [30]:
# The target is the "count" column. The features are everything left after
# removing the target and the other columns listed below
# ("temp" and "atemp" are intentionally not in the drop list).
y = df["count"]
X = df.drop(
    columns=["count", "registered", "casual", "season", "holiday", "workingday"]
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Pipeline

## Custom Transformers

In [31]:
# class DateParser(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         super().__init__()

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X, y=None):
#         X = pd.to_datetime(X["datetime"])
#         return_X = pd.DataFrame(
#             {
#                 "weekday": X.dt.weekday,
#                 "hour": X.dt.hour,
#                 "month": X.dt.month,
#                 "year": X.dt.year,
#             }
#         )
#         return return_X
%load_ext autoreload
%autoreload 2
from model_api.app.custom_transformers import DateParser


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Preprocessor

In [32]:
# Only the "datetime" column is routed through DateParser; every other column
# is forwarded unchanged (remainder="passthrough"). No one-hot encoding is
# applied here.
preprocessor = make_column_transformer(
    (DateParser(), ["datetime"]), remainder="passthrough"
)

# Model

In [33]:
def eval_metrics(actual, pred):
    """Score predictions against ground truth.

    Parameters
    ----------
    actual : array-like
        True target values.
    pred : array-like
        Predicted target values, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and the R^2 score.
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    return mean_absolute_error(actual, pred), rmse, r2_score(actual, pred)

In [34]:
# Chain the preprocessing step and a default-configured LightGBM regressor so
# both are fitted (and later persisted) as a single estimator.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LGBMRegressor()),
    ]
)
# Kept as the cell's last expression so the fitted pipeline is displayed.
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dateparser', DateParser(),
                                                  ['datetime'])])),
                ('model', LGBMRegressor())])

In [35]:
# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X_test)
mae, rmse, r2 = eval_metrics(actual=y_test, pred=y_pred)
print(f"MAE: {mae}\nRMSE: {rmse}\nR2: {r2}")

MAE: 25.892471238736977
RMSE: 41.242403747010655
R2: 0.9484673199055353


## Save as pickle file

In [36]:
# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling raises; the previous bare open() call
# left the handle to be closed whenever the garbage collector got to it.
# Note: pickle stores classes by reference, so loading this file requires
# model_api.app.custom_transformers (which provides DateParser) to be
# importable under the same module path.
with open("data/lightgbm.pkl", "wb") as model_file:
    pickle.dump(model, model_file)