# Various imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import BaseEstimator,TransformerMixin

In [3]:
from lightgbm import LGBMRegressor

In [4]:
import pickle # => Might want to use cPickle

In [5]:
df = pd.read_csv("/home/michelle/Documents/DEV_IA/Projet/Brief_machine_learning_Vlille_210322/Rendu/data/df.csv")
pd.set_option('display.max_columns', None)

In [23]:
df.sample(2)

Unnamed: 0.1,Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,label_kmeans
2321,2321,2011-06-04 15:00:00,2,0,0,2,30.34,32.575,27,27.9993,180,224,404,1
10746,10746,2012-12-14 04:00:00,4,0,1,1,8.2,11.365,75,6.0032,1,9,10,2


In [16]:
X = df.drop(["label_kmeans","count", "registered", "casual"], axis=1) # , "temp", "atemp"
y = df["label_kmeans"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline

## Custom Transformers

In [17]:
class DateParser(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = pd.to_datetime(X["datetime"])
        return_X = pd.DataFrame(
            {
                "weekday": X.dt.weekday,
                "hour": X.dt.hour,
                "month": X.dt.month,
                "year": X.dt.year,
            }
        )
        return return_X

## Preprocessor

In [18]:
preprocessor = make_column_transformer(
    (DateParser(), ["datetime"]),
    (OneHotEncoder(), ["season"]),
    remainder="passthrough"
)

# Model

In [19]:
def eval_metrics(actual, pred):
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    r2 = r2_score(actual, pred)
    return mae, np.sqrt(mse), r2

In [20]:
model = Pipeline([("preprocessor", preprocessor), ("model", LGBMRegressor())])
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dateparser', DateParser(),
                                                  ['datetime']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['season'])])),
                ('model', LGBMRegressor())])

In [21]:
y_pred = model.predict(X_test)
mae, rmse, r2 = eval_metrics(y_test, y_pred)
print(f"MAE: {mae}\nRMSE: {rmse}\nR2: {r2}")

MAE: 0.06240782180912808
RMSE: 0.20097733080070496
R2: 0.9668912863305092


In [26]:
abs(np.round(y_pred))

array([0., 3., 3., ..., 1., 2., 0.])

## Save as pickle file

In [72]:
#pickle.dump(model, open("data/lightgbm.pkl", "wb"))