# Day 08. Exercise 02
# Multiclass classification. One-hot encoding. Random forest

## 0. Imports

In [27]:
import pandas as pd
import numpy as np

In [28]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
output_notebook()

In [29]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing

1. Прочитайте файл [`checker-submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
2. Создайте кадр данных `df` со столбцами: `uid`, `labname`, `numTrials`, `hour`, `dayofweek`, где `hour` извлекается из `timestamp`, а также `dayofweek` (`0` - понедельник, `6` - воскресенье). Мы попытаемся предсказать день недели, имея данные о том, какой пользователь сделал коммит для какой лаборатории в какой час и какая это была попытка.
3. Используя `OneHotEncoder()` преобразование категориальных признаков, удалите из датафрейма исходные столбцы.
4. Используйте `StandardScaler()` и масштабируйте непрерывные признаки.
5. Сохраните кадр данных в формате `dayofweek.csv`.
6. Прежде чем пробовать различные алгоритмы, выясните точность наивного алгоритма - того, который предсказывает все как наиболее популярный класс.

In [30]:
df = pd.read_csv('../data/checker_submits.csv', parse_dates=['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['dayofweek'] = df['timestamp'].dt.dayofweek
df.head()

Unnamed: 0,uid,labname,numTrials,timestamp,hour,dayofweek
0,user_4,project1,1,2020-04-17 05:19:02.744528,5,4
1,user_4,project1,2,2020-04-17 05:22:45.549397,5,4
2,user_4,project1,3,2020-04-17 05:34:24.422370,5,4
3,user_4,project1,4,2020-04-17 05:43:27.773992,5,4
4,user_4,project1,5,2020-04-17 05:46:32.275104,5,4


In [31]:
onehotencoder = OneHotEncoder(handle_unknown='ignore')
uid = onehotencoder.fit_transform(df[['uid']]).toarray()
uid = pd.DataFrame(uid, columns=df['uid'].unique())
uid.head()

Unnamed: 0,user_4,user_17,user_30,user_2,user_14,user_12,user_8,user_29,user_22,user_16,...,user_15,user_13,user_20,user_6,user_1,user_27,user_18,user_11,user_7,user_23
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [32]:
labname = onehotencoder.fit_transform(df[['labname']]).toarray()
labname = pd.DataFrame(labname, columns=df['labname'].unique())
labname.head()

Unnamed: 0,project1,laba04,laba04s,lab03,lab03s,lab02,code_rvw,laba05,lab05s,laba06,laba06s
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [33]:
df.drop(['uid', 'labname', 'timestamp'], axis=1, inplace=True)
df = pd.concat([df, uid, labname], axis=1)
df.head()

Unnamed: 0,numTrials,hour,dayofweek,user_4,user_17,user_30,user_2,user_14,user_12,user_8,...,laba04,laba04s,lab03,lab03s,lab02,code_rvw,laba05,lab05s,laba06,laba06s
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [34]:
standartscaler = StandardScaler()
df['numTrials'] = standartscaler.fit_transform(df[['numTrials']])
df['hour'] = standartscaler.fit_transform(df[['hour']])
df.head()

Unnamed: 0,numTrials,hour,dayofweek,user_4,user_17,user_30,user_2,user_14,user_12,user_8,...,laba04,laba04s,lab03,lab03s,lab02,code_rvw,laba05,lab05s,laba06,laba06s
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [35]:
# df.to_csv('../data/dayofweek.csv', index=False)

In [36]:
print(f'accuracy = {accuracy_score([df['dayofweek'].mode()] * len(df), df['dayofweek'])}')

accuracy = 0.23487544483985764


## 2. Algorithms

### a. Logreg

1. Обучите логистическую регрессию, для базовой модели используйте `random_state=21`, `fit_intercept=False`. 
2. Рассчитайте точность.
3. Напишите функцию, которая строит график (`barh`) с коэффициентами всех обученных моделей, именами признаков и количеством `top-n` наиболее важных признаков для отображения.
4. Постройте график (`barh`) для базовой модели с топ-10 наиболее важных признаков (абсолютное значение) для обученной модели.
5. Помните, что это многоклассовая классификация и `coef_` возвращает матрицу, чтобы вычислить важность признака, нужно просуммировать все индивидуальные важности признаков для всех целевых значений.

In [37]:
x = df.drop(['dayofweek'], axis=1)
y = df['dayofweek']

In [38]:
model = LogisticRegression(random_state=21, fit_intercept=False)
model.fit(x, y)
predict = model.predict(x)
print(f'accuracy = {accuracy_score(y, predict)}')

accuracy = 0.6405693950177936


In [39]:
coef = pd.DataFrame(model.coef_, columns=x.columns)
coef

Unnamed: 0,numTrials,hour,user_4,user_17,user_30,user_2,user_14,user_12,user_8,user_29,...,laba04,laba04s,lab03,lab03s,lab02,code_rvw,laba05,lab05s,laba06,laba06s
0,-0.029706,0.060742,0.887855,-0.722041,-1.209618,-0.135506,0.118553,-0.242924,0.993158,-0.57437,...,0.887855,0.55905,0.560158,-0.343194,0.050205,-0.523393,-1.747894,-0.422551,-0.803889,0.227731
1,0.520633,-0.322071,-0.092215,-0.754495,0.478224,-0.071531,0.045304,1.872677,1.070649,-0.348605,...,-0.092215,-0.0676,-0.060694,-0.008029,0.00508,-0.754884,-1.235137,1.754113,1.382146,0.492415
2,0.64472,0.543219,-0.102669,-0.553443,1.531979,-0.179436,-0.437954,-0.789167,-1.999897,-0.422269,...,-0.102669,-0.026437,-0.023188,-0.512791,-0.187417,0.076019,-0.765157,-0.923292,1.65027,-0.089549
3,0.847089,-0.148051,-0.077951,2.079764,0.978925,-0.26365,0.122552,-1.066014,-1.895669,0.836701,...,-0.077951,-0.109452,-0.093234,-0.874854,-0.628207,0.140535,0.702804,0.579607,1.395373,1.677414
4,-0.808451,0.040166,-0.220369,-0.587152,-0.347626,-0.52512,-1.192528,-1.019025,-0.982025,-0.450843,...,-0.220369,-0.063324,-0.071402,-0.897463,-1.231017,-0.879798,0.822831,-1.679946,-1.985332,-0.784483
5,-0.795752,-0.019169,-0.219953,-0.648729,0.075317,-0.936193,-0.91736,0.513349,2.461771,-0.045063,...,-0.219953,-0.038542,-0.043516,0.395595,1.182763,0.067202,1.903114,-0.879109,-1.470127,-1.268375
6,-0.378532,-0.154836,-0.174698,1.186094,-1.5072,2.111436,2.261434,0.731104,0.352012,1.004449,...,-0.174698,-0.253694,-0.268123,2.240736,0.808592,1.874319,0.319439,1.571178,-0.168442,-0.255154


In [40]:
def plot_features_bokeh(coef, features, n):
    coef = coef.mean(axis=0)
    coef = coef / coef.sum()

    indices = coef.argsort()[::-1][:n]
    sorted_coef = coef[indices]
    sorted_features = features[indices].tolist()

    source = ColumnDataSource(data=dict(
        features=sorted_features,
        coef=sorted_coef
    ))

    p = figure(
        y_range=sorted_features,
        title=f'Top {n} Features',
        x_axis_label='Feature Importance',
        y_axis_label='Features',
        toolbar_location=None,
        width=800,
        height=400
    )

    p.hbar(
        y='features',
        right='coef',
        height=0.5,
        source=source,
    )

    p.x_range.start = 0

    show(p)

In [41]:
plot_features_bokeh(model.coef_, x.columns, 10)

### b. SVC

1. Обучите модель `SVC`, для базовой модели используйте параметры `kernel='linear'`, `probability=True`, `random_state=21`. 
2. Попробуйте разные ядра, подсчитайте точность.
3. Постройте график (`barh`) для базовой модели с топ-10 наиболее важных признаков (абсолютное значение) для обученной модели для линейного ядра.

* По умолчанию SVC использует стратегию классификации «один против одного», поэтому в `coef_` возвращается матрица. Для расчета важности признака необходимо использовать [OneVsRestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) над SVC и просуммировать все импорты отдельных признаков для всех целевых значений.

In [42]:
model = SVC(kernel='linear', probability=True, random_state=21)
model.fit(x,y)
predict = model.predict(x)
print(f'accuracy = {accuracy_score(y, predict)}')
plot_features_bokeh(model.coef_, x.columns, 10)

accuracy = 0.702846975088968


In [43]:
model = SVC(probability=True, random_state=21)
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
param = GridSearchCV(model, param_grid, scoring='accuracy')
param.fit(x, y)
param.best_params_

{'kernel': 'poly'}

In [44]:
model = SVC(kernel='poly', probability=True, random_state=21)
model.fit(x,y)
predict = model.predict(x)
print(f'accuracy = {accuracy_score(y, predict)}')

accuracy = 0.8594306049822064


### c. Decision tree

1. Обучите `DecisionTreeClassifier`, используя для базовой модели `max_depth=4`, `random_state=21`. 
2. Попробуйте различные значения `max_depth`, рассчитайте точность.
3. Постройте график (`barh`) для базовой модели с топ-10 наиболее важных признаков (абсолютное значение) для обученной модели с помощью написанной функции.

In [45]:
model = DecisionTreeClassifier(max_depth=4, random_state=21)
model.fit(x, y)
predict = model.predict(x)
print(f'accuracy = {accuracy_score(y, predict)}')

accuracy = 0.5516014234875445


In [46]:
for i in range(2, 22, 2):
    model = DecisionTreeClassifier(max_depth=i, random_state=42)
    model.fit(x, y)
    print(f'max_depth = {i} accuracy = {accuracy_score(y, model.predict(x))}')

max_depth = 2 accuracy = 0.4389086595492289
max_depth = 4 accuracy = 0.5516014234875445
max_depth = 6 accuracy = 0.6637010676156584
max_depth = 8 accuracy = 0.7502965599051008
max_depth = 10 accuracy = 0.8196915776986952
max_depth = 12 accuracy = 0.8831553973902728
max_depth = 14 accuracy = 0.9317912218268091
max_depth = 16 accuracy = 0.9608540925266904
max_depth = 18 accuracy = 0.9798339264531435
max_depth = 20 accuracy = 0.9869513641755635


In [47]:
model = DecisionTreeClassifier(max_depth=20, random_state=21)
model.fit(x, y)
predict = model.predict(x)
print(f'accuracy = {accuracy_score(y, predict)}')
coef = pd.DataFrame(model.feature_importances_).T
plot_features_bokeh(coef, x.columns, 10)

accuracy = 0.9869513641755635


### d. Random forest

В реальной жизни лес - это набор деревьев. То же самое происходит и с машинным обучением. Случайный лес - это набор отдельных деревьев решений (более подробную информацию можно найти в документации).

1. Обучите `RandomForestClassifier`, используя для базовой модели параметры `n_estimators=100`, `max_depth=25`, `random_state=21`. 
2. Попробуйте различные значения `max_depth` и `n_estimators`, рассчитайте точность.
3. Постройте график (`barh`) для базовой модели с топ-10 наиболее важных признаков (абсолютное значение) для обученной модели с помощью написанной функции.

In [48]:
model = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=21)
model.fit(x, y)
predict = model.predict(x)
print(f'accuracy = {accuracy_score(y, predict)}')
coef = pd.DataFrame(model.feature_importances_).T
plot_features_bokeh(coef, x.columns, 10)

accuracy = 1.0


In [49]:
for i in range(20, 120, 20):
    for j in range(5, 30, 5):
        model = RandomForestClassifier(n_estimators=i, max_depth=j, random_state=21)
        model.fit(x, y)
        print(f'n_estimators = {i} max_depth = {j} accuracy = {accuracy_score(y, model.predict(x))}')

n_estimators = 20 max_depth = 5 accuracy = 0.5830367734282325
n_estimators = 20 max_depth = 10 accuracy = 0.891459074733096
n_estimators = 20 max_depth = 15 accuracy = 0.9644128113879004
n_estimators = 20 max_depth = 20 accuracy = 0.994661921708185
n_estimators = 20 max_depth = 25 accuracy = 0.998220640569395
n_estimators = 40 max_depth = 5 accuracy = 0.5925266903914591
n_estimators = 40 max_depth = 10 accuracy = 0.8807829181494662
n_estimators = 40 max_depth = 15 accuracy = 0.9768683274021353
n_estimators = 40 max_depth = 20 accuracy = 0.9970344009489917
n_estimators = 40 max_depth = 25 accuracy = 1.0
n_estimators = 60 max_depth = 5 accuracy = 0.5747330960854092
n_estimators = 60 max_depth = 10 accuracy = 0.8766310794780545
n_estimators = 60 max_depth = 15 accuracy = 0.9792408066429419
n_estimators = 60 max_depth = 20 accuracy = 0.9958481613285883
n_estimators = 60 max_depth = 25 accuracy = 1.0
n_estimators = 80 max_depth = 5 accuracy = 0.5830367734282325
n_estimators = 80 max_depth =