# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [10]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from joblib import dump

## 1. Preprocessing pipeline

Create three custom transformers, the first two out of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [11]:
class FeatureExtractor:
    def __init__(self, filepath):
        self.filepath = filepath
        self.df = None

    def read(self):
        self.df = pd.read_csv(self.filepath, parse_dates=["timestamp"])
        self.df["hour"] = self.df["timestamp"].dt.hour
        self.df["weekday"] = self.df["timestamp"].dt.weekday
        self.df.drop(columns="timestamp", inplace=True)
        
        return self.df


In [12]:
fe = FeatureExtractor("../data/checker_submits.csv")
df = fe.read()

In [13]:
class MyOneHotEncoder:
    def __init__(self, df):
        self.df = df

    def identifies(self):
        self.x = self.df.drop(columns="weekday")
        self.y = self.df["weekday"]
        encoder = OneHotEncoder(sparse=False)
        encoded = encoder.fit_transform(self.x[["uid", "labname"]])
        feature_names = encoder.get_feature_names(["uid", "labname"])
        encoded_df = pd.DataFrame(encoded, columns=feature_names, index=self.x.index)
        self.x = pd.concat([self.x, encoded_df], axis=1)
        self.x.drop(columns=["uid", "labname"], inplace=True)
        return self.x, self.y

In [14]:
me_hot = MyOneHotEncoder(df)
x, y = me_hot.identifies()

In [15]:
x

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
class TrainValidationTest:
    def __init__(self, x, y):
        self.x = x
        self.y = y
    
    def transform(self):
        x_train, x_test, y_train, y_test = train_test_split(self.x, self.y, test_size=0.2, random_state=21, stratify=self.y)
        return x_train, x_test, y_train, y_test
        

In [17]:
train_val = TrainValidationTest(x, y)
x_train, x_test, y_train, y_test = train_val.transform()

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.877778
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.866667
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.907407
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [18]:
class ModelSelection:
    def __init__(self, grids, grid_dict):
        self.grids, self.grid_dict = grids, grid_dict
        self.results = []

    def choose(self, x_train, x_test, y_train, y_test):
        best_model_name = None
        best_score = 0
        for i, grid in enumerate(tqdm(self.grids)):
            print(f"Estimator {self.grid_dict[i]}")
            grid.fit(x_train, y_train)

            valid_score = grid.score(x_test, y_test)

            if valid_score > best_score:
                best_score = valid_score
                best_model_name = self.grid_dict[i]

            self.results.append({
                "model": self.grid_dict[i],
                "params": grid.best_params_,
                "valid_score": valid_score
            })

            print(f"Best params: {grid.best_params_}")
            print(f"Best training accuracy: {grid.best_score_:.3f}")
            print(f"Validation set accuracy score for best params: {grid.score(x_test, y_test):.3f}")

        print(f"Classifier with best validation set accuracy: {best_model_name}")
        return best_model_name
    
    def best_results(self):
        return pd.DataFrame(self.results)
        


In [19]:
svm = SVC()
tree = DecisionTreeClassifier()
rf = RandomForestClassifier()

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 
               'C':[0.01, 0.1, 1, 1.5, 5, 10], 
               'gamma': ['scale', 'auto'], 
               'class_weight':('balanced', None), 
               'random_state':[21], 
               'probability':[True]}]

tree_params = [{
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20],
    'class_weight': ['balanced', None],
    'random_state': [21]
}]

rf_params = [{
    'n_estimators': [10, 50],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30],
    'class_weight': [None, 'balanced'],
    'random_state': [21]
}]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=-1)
gs_tree = GridSearchCV(estimator=tree, param_grid=tree_params, scoring='accuracy', cv=2, n_jobs=-1)
gs_rf = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=2, n_jobs=-1)

grids = [gs_svm, gs_tree, gs_rf]
grid_dict = {0: "SVM", 1: "Decision Tree", 2: "Random Forest"}
model_selection = ModelSelection(grids, grid_dict)
model_selection.choose(x_train, x_test, y_train, y_test)

  0%|          | 0/3 [00:00<?, ?it/s]

Estimator SVM
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.809
Validation set accuracy score for best params: 0.888
Estimator Decision Tree
Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'random_state': 21}
Best training accuracy: 0.827
Validation set accuracy score for best params: 0.861
Estimator Random Forest
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 30, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.867
Validation set accuracy score for best params: 0.929
Classifier with best validation set accuracy: Random Forest


'Random Forest'

In [20]:
df_best_model = model_selection.best_results()
df_best_model

Unnamed: 0,model,params,valid_score
0,SVM,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.887574
1,Decision Tree,"{'class_weight': None, 'criterion': 'gini', 'm...",0.860947
2,Random Forest,"{'class_weight': None, 'criterion': 'entropy',...",0.928994


## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [21]:
class Finalize():
    def __init__(self, estimator):
        self.estimator = estimator

    def final_score(self, x_train, x_test, y_train, y_test):
        self.estimator.fit(x_train, y_train)
        predict = self.estimator.predict(x_test)
        print(f"Accuracy of the final model is {accuracy_score(y_test, predict)}")

    def save_model(self, path):
        try:
            dump(self.estimator, path)
            print(f"Model was successfuly saved on path: {path}")
        except Exception as e:
            raise Exception(f"Error while saving model: {e}")
        

In [22]:
final = Finalize(rf)
final.final_score(x_train, x_test, y_train, y_test)
final.save_model("rf_model.joblib")

Accuracy of the final model is 0.9319526627218935
Model was successfuly saved on path: rf_model.joblib


## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. create an instance of `Finalize()` with your best model, use method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [23]:
class Main:
    def __init__(self, name_of_file):
        self.name_of_file = name_of_file

    def main(self):
        feature_extractor = FeatureExtractor(self.name_of_file)
        df = feature_extractor.read()

        one_hot_encoder = MyOneHotEncoder(df)
        x, y = one_hot_encoder.identifies()

        train_validation_test = TrainValidationTest(x, y)
        x_train, x_test, y_train, y_test = train_validation_test.transform()

        svm = SVC()
        tree = DecisionTreeClassifier()
        rf = RandomForestClassifier()
        svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 
                    'C':[0.01, 0.1, 1, 1.5, 5, 10], 
                    'gamma': ['scale', 'auto'], 
                    'class_weight':('balanced', None), 
                    'random_state':[21], 
                    'probability':[True]}]
        tree_params = [{
            'criterion': ['gini', 'entropy'],
            'max_depth': [5, 10, 15, 20],
            'class_weight': ['balanced', None],
            'random_state': [21]
        }]
        rf_params = [{
            'n_estimators': [10, 50],
            'criterion': ['gini', 'entropy'],
            'max_depth': [10, 20, 30],
            'class_weight': [None, 'balanced'],
            'random_state': [21]
        }]
        gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=-1)
        gs_tree = GridSearchCV(estimator=tree, param_grid=tree_params, scoring='accuracy', cv=2, n_jobs=-1)
        gs_rf = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=2, n_jobs=-1)
        grids = [gs_svm, gs_tree, gs_rf]
        grid_dict = {0: "SVM", 1: "Decision Tree", 2: "Random Forest"}        
    
        self.model_selection = ModelSelection(grids, grid_dict)
        model_selection.choose(x_train, x_test, y_train, y_test)
        df_best = model_selection.best_results()

        param = df_best.iloc[2][1]
        model_forest = RandomForestClassifier(**param)

        self.finalize = Finalize(model_forest)
        final.final_score(x_train, x_test, y_train, y_test)
        final.save_model("rf_model.joblib")

In [24]:
main_obj = Main("../data/checker_submits.csv")
main_obj.main()

  0%|          | 0/3 [00:00<?, ?it/s]

Estimator SVM
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.809
Validation set accuracy score for best params: 0.888
Estimator Decision Tree
Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'random_state': 21}
Best training accuracy: 0.827
Validation set accuracy score for best params: 0.861
Estimator Random Forest
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 30, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.867
Validation set accuracy score for best params: 0.929
Classifier with best validation set accuracy: Random Forest
Accuracy of the final model is 0.9349112426035503
Model was successfuly saved on path: rf_model.joblib
