# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [139]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import numpy as np
import joblib
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

## 1. Preprocessing pipeline

Create three custom transformers, the first two out of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [119]:
class FeatureExtractor(object):
    def __init__(self,df:pd.DataFrame):
        self.df = df
    def extract(self):
        self.df['hour'] = self.df['timestamp'].dt.hour
        self.df['weekday'] = self.df['timestamp'].dt.weekday
        self.df.drop(columns=['timestamp'], inplace=True)
        return self.df

df = FeatureExtractor(pd.read_csv('../data/checker_submits.csv',parse_dates=['timestamp']))
df = df.extract()
df

Unnamed: 0,uid,labname,numTrials,hour,weekday
0,user_4,project1,1,5,4
1,user_4,project1,2,5,4
2,user_4,project1,3,5,4
3,user_4,project1,4,5,4
4,user_4,project1,5,5,4
...,...,...,...,...,...
1681,user_19,laba06s,9,20,3
1682,user_1,laba06s,6,20,3
1683,user_1,laba06s,7,20,3
1684,user_1,laba06s,8,20,3


In [120]:
df['uid'] = df['uid'].astype('category')
df['labname'] = df['labname'].astype('category')
df

Unnamed: 0,uid,labname,numTrials,hour,weekday
0,user_4,project1,1,5,4
1,user_4,project1,2,5,4
2,user_4,project1,3,5,4
3,user_4,project1,4,5,4
4,user_4,project1,5,5,4
...,...,...,...,...,...
1681,user_19,laba06s,9,20,3
1682,user_1,laba06s,6,20,3
1683,user_1,laba06s,7,20,3
1684,user_1,laba06s,8,20,3


In [121]:
class MyOneHotEncoder(object):
    def __init__(self,df:pd.DataFrame,target):
        self.df = df
        self.target = target
    def tranfrom(self):
        cat_col = [col for col in self.df.columns if self.df[col].dtype == 'category' and col != self.target]
        enc = OneHotEncoder(sparse_output=False)
        data = enc.fit_transform(self.df[cat_col])
        temp_df = pd.DataFrame(data, columns=enc.get_feature_names_out())
        self.df=pd.concat([self.df, temp_df], axis=1).drop(columns=['uid', 'labname'])
        return self.df
df = MyOneHotEncoder(df,'weekday').tranfrom()
df
    


Unnamed: 0,numTrials,hour,weekday,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [122]:
X=df.drop(columns=['weekday'])
y=df['weekday']

In [123]:
class TrainValidationTest():
    def __init__(self,X,Y):
        self.X = X
        self.Y = Y
    def train(self):
        X_train,X_test,y_train,y_test = train_test_split(self.X,self.Y,test_size=0.2,random_state=21,stratify=self.Y)
        X_train,X_valid,y_train,y_valid = train_test_split(X_train,y_train,test_size=0.2,random_state=21,stratify=y_train)
        return X_train,X_test,y_train,y_test,X_valid,y_valid

In [124]:
X_train,X_test,y_train,y_test,X_valid,y_valid = TrainValidationTest(X,y).train()

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.877778
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.866667
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.907407
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [125]:
model_svm = SVC(probability=True,random_state=21)
param_grid_svm = [{
    'kernel' : ('linear','rbf','sigmoid'),
    'C' : [0.01,0.1,1,1.5,5,10],
    'gamma' : ['scale','auto'],
    'class_weight' : ('balanced','None'),
    'random_state':[21], 
    'probability':[True]
}]
grid_svm = GridSearchCV(model_svm,param_grid=param_grid_svm,cv=2,scoring='accuracy',n_jobs=-1,verbose=1)


In [126]:
param_grid_tree = [{
    'max_depth' : range(1,50,5),
    'criterion' : ['entropy','gini'],
    'class_weight' : ['balanced','None']
}]
model_tree = DecisionTreeClassifier(random_state=21)
grid_tree = GridSearchCV(model_tree,param_grid=param_grid_tree,n_jobs=-1,verbose=1,cv=2,scoring='accuracy')

In [127]:
param_grid_random = {
    'n_estimators' : [5,10,50,100],
    'max_depth' : range(1,50),
    'class_weight' : ['balanced','None'],
    'criterion' : ['entropy','gini']
}
model_random = RandomForestClassifier(random_state=21)
grid_random = GridSearchCV(model_random,param_grid_random,scoring='accuracy',n_jobs=-1,verbose=1,cv=2)

In [128]:
grids = [grid_svm,grid_tree,grid_random]
grid_dict = {0 : 'SVC', 1: 'DecisionTreeClassifier', 2 : 'RandomForestClassifier'}


In [129]:
class ModelSelection(object):
    def __init__(self,grids:list,grid_dict:dict):
        self.grids = grids
        self.grid_dict = grid_dict
    def choose(self, X_train, y_train, X_valid, y_valid):
        best_model_name = None
        best_score = 0
        for i, grid in enumerate(self.grids):
            model_name = self.grid_dict[i]
            print(f"Estimator {model_name}")

            total_fits = len(grid.param_grid) * 2

            with tqdm(total=total_fits, desc=f"{model_name}", ncols=100, unit="it/s") as pbar:
                grid.fit(X_train, y_train)

                valid_score = grid.score(X_valid, y_valid)
                if valid_score > best_score:
                    best_score = valid_score
                    best_model_name = model_name

                pbar.update(total_fits)
                print(f"Best params: {grid.best_params_}")
                print(f"Best training accuracy: {grid.best_score_:.3f}")
                print(f"Validation set accuracy score for best params: {valid_score:.3f}")

        print(f"\nClassifier with best validation set accuracy: {best_model_name}")
        return best_model_name
    def best_results(self, X_train, y_train, X_valid, y_valid):
        models = []
        params =  []
        scores = []
        for i, grid in enumerate(self.grids):
            model_name = self.grid_dict[i]
            models.append(model_name)
            grid.fit(X_train, y_train)
            params.append(grid.best_params_)
            scores.append(grid.score(X_valid, y_valid))
        return pd.DataFrame({'model': models, 'param' : params, 'valid_score': scores})




                
            
    

In [130]:

ModelSelection(grids,grid_dict).choose(X_train,y_train,X_valid,y_valid)

Estimator SVC


SVC:   0%|                                                                  | 0/2 [00:00<?, ?it/s/s]

Fitting 2 folds for each of 72 candidates, totalling 144 fits


72 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/

Best params: {'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.759
Validation set accuracy score for best params: 0.859
Estimator DecisionTreeClassifier


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/V

Fitting 2 folds for each of 40 candidates, totalling 80 fits
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21}
Best training accuracy: 0.807
Validation set accuracy score for best params: 0.867
Estimator RandomForestClassifier


RandomForestClassifier:   0%|                                               | 0/8 [00:00<?, ?it/s/s]

Fitting 2 folds for each of 784 candidates, totalling 1568 fits


784 fits failed out of a total of 1568.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
393 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framewo

Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22, 'n_estimators': 100}
Best training accuracy: 0.853
Validation set accuracy score for best params: 0.900

Classifier with best validation set accuracy: RandomForestClassifier





'RandomForestClassifier'

In [131]:
ModelSelection(grids,grid_dict).best_results(X_train,y_train,X_valid,y_valid)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


72 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/

Fitting 2 folds for each of 40 candidates, totalling 80 fits
Fitting 2 folds for each of 784 candidates, totalling 1568 fits


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/V

Unnamed: 0,model,param,valid_score
0,SVC,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.859259
1,DecisionTreeClassifier,"{'class_weight': 'balanced', 'criterion': 'gin...",0.866667
2,RandomForestClassifier,"{'class_weight': 'balanced', 'criterion': 'gin...",0.9


## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [136]:
class Finalize(object):
    def __init__(self,estimators):
        self.estinators = estimators
    def final_score(self,X_train,y_train,X_test,y_test):
        self.estinators.fit(X_train,y_train)
        y_pr = self.estinators.predict(X_test)
        return f'Accurancy of the final model is {accuracy_score(y_test,y_pr)}'
    def save_model(self,path):
        joblib.dump(self.estinators,path)
        print("Модель успешно сохранена")
    
Finalize(model_random).final_score(X_train,y_train,X_test,y_test)


'Accurancy of the final model is 0.908284023668639'

In [137]:
Finalize(model_random).save_model('best_model')

Модель успешно сохранена


## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. create an instance of `Finalize()` with your best model, use method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [149]:
data = pd.read_csv('../data/checker_submits.csv')
data['uid'] = data["uid"].astype('category')
data['labname'] = data["labname"].astype('category')
data

Unnamed: 0,uid,labname,numTrials,timestamp
0,user_4,project1,1,2020-04-17 05:19:02.744528
1,user_4,project1,2,2020-04-17 05:22:45.549397
2,user_4,project1,3,2020-04-17 05:34:24.422370
3,user_4,project1,4,2020-04-17 05:43:27.773992
4,user_4,project1,5,2020-04-17 05:46:32.275104
...,...,...,...,...
1681,user_19,laba06s,9,2020-05-21 20:01:48.959966
1682,user_1,laba06s,6,2020-05-21 20:18:54.487900
1683,user_1,laba06s,7,2020-05-21 20:19:06.872761
1684,user_1,laba06s,8,2020-05-21 20:22:41.877806


In [153]:
preprocessing_pipeline = Pipeline([
    ('feature_extractor', FeatureExtractor()),
    ('onehot_encoder', MyOneHotEncoder(target='dayofweek'))
])


processed_data = preprocessing_pipeline.fit_transform(data)


X = processed_data.drop(columns=['dayofweek']) 
Y = processed_data['dayofweek']

train_val_test = TrainValidationTest(X, Y)
X_train, X_test, y_train, y_test, X_valid, y_valid = train_val_test.train()

TypeError: FeatureExtractor.__init__() missing 1 required positional argument: 'df'