# Day 08. Exercise 04
# Regression

## 0. Imports

In [18]:
import pandas as pd
import numpy as np

In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.models import HoverTool
output_notebook()

In [20]:
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

## 1. Preprocessing

1. Прочитайте файл [`checker_regression.csv`](https://drive.google.com/file/d/1L8auBzJEghWFewznhhFpcrqxW4GqcfRY/view?usp=sharing).
2. Заполните пропущенные значения в нем значением `0`.
3. Сделайте разбиение вашего набора данных на train и test с параметрами `test_size=0.2`.

In [21]:
df = pd.read_csv('../data/checker_regression.csv')
df.fillna(0, inplace=True)
df

Unnamed: 0,uid,num_commits,pageviews,AVG(diff)
0,user_1,62,28.0,0.00
1,user_1,62,28.0,0.00
2,user_1,62,28.0,0.00
3,user_1,62,28.0,0.00
4,user_1,62,28.0,0.00
...,...,...,...,...
72,user_31,128,0.0,-104.75
73,user_4,40,0.0,-175.60
74,user_6,15,0.0,-62.60
75,user_7,8,0.0,-115.50


In [22]:
x = df.drop('AVG(diff)', axis=1)
x['uid'] = x['uid'].str.replace('user_', '').astype(int)
standardscaler = StandardScaler()
x['uid'] = standardscaler.fit_transform(x[['uid']])
y = df['AVG(diff)']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21,)

## 2. Crossvalidation

1. Напишите функцию `crossval`, которая принимает в качестве аргументов: `n_splits` для `KFold()`, `X`, `y`, инстанцированный класс модели с параметрами модели (помните: `random_state=21`) и возвращает для заданного класса модели результат, подобный этому:

```
train -  2696.4496895424836   |   test -  1589.9979527104958
train -  2660.957874001452   |   test -  2075.102636027137
train -  2847.315529246795   |   test -  320.911928168403
train -  2500.7691099659237   |   test -  4132.461382030178
train -  2643.927917295123   |   test -  2237.8140952197878
train -  2396.295678819444   |   test -  4509.650064742476
train -  2003.402267924976   |   test -  8403.491474908551
train -  2531.876094212613   |   test -  3135.944102735099
train -  2683.1795186023123   |   test -  1796.01426292594
train -  2537.1192483996338   |   test -  3439.29824116941
Average RMSE on crossval is 3164.0686140637476
```

2. Запустите функцию для `Линейной регрессии`, `Деревянного регрессора`, `Регрессора случайного леса`. Вы можете выбрать параметры самостоятельно, но найдите достаточно хорошие для вас.

In [23]:
def crossval(n_splits, x, y, model):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=21)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=21)

    scores = cross_val_score(model, x_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    scores_test = cross_val_score(model, x_test, y_test, cv=kf, scoring='neg_mean_squared_error')
    predict = cross_val_predict(model, x_test, y_test, cv=kf)
    mse = mean_squared_error(y_test, predict)

    for i in range(len(scores)):
        print(f'train - {scores[i]} | test - {scores_test[i]}')
    rmse = np.sqrt(mse)
    print(f'Average RMSE on crossval is {rmse}')

In [24]:
model_lr = LinearRegression()
crossval(4, x, y, model_lr)

train - -2293.5389211071406 | test - -3838.412756854301
train - -2508.5736671968334 | test - -5206.773318826869
train - -2029.651581850907 | test - -11868.730916838367
train - -1341.8767920770042 | test - -3524.6807871338297
Average RMSE on crossval is 78.16424658955872


In [25]:
model_dtr = DecisionTreeRegressor(random_state=21)
crossval(4, x, y, model_dtr)

train - -2009.5231250000002 | test - -0.0
train - -1218.1840000000002 | test - -828.3125
train - -1551.0886666666668 | test - -11029.365
train - -2455.9486851851852 | test - -410.0625
Average RMSE on crossval is 55.37991513175151


In [26]:
model_rfr = RandomForestRegressor(random_state = 21)
crossval(4, x, y, model_rfr)

train - -1205.509517520833 | test - -831.7798032500001
train - -1208.6727549870368 | test - -3862.6422820000034
train - -1082.4036316666673 | test - -9470.8552125
train - -1950.2907741037018 | test - -525.2713087499999
Average RMSE on crossval is 60.60228668643619


## 3. Predictions and evaluation

1. Сделайте прогнозы для тестового набора данных, используя каждую из трех моделей с окончательно определенными параметрами.
2. Постройте график для каждой из моделей, где ось `x` - фактическая средняя разница, а ось `y` - предсказание, сделанное моделью.
3. Как выглядел бы график в идеальном случае? Поместите ответ в ячейку markdown в конце раздела.

In [27]:
model_lr = LinearRegression()

param_grid = {
    'fit_intercept': [True, False],
    'copy_X': [True, False]
}

grid_search = GridSearchCV(estimator=model_lr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(x_train, y_train)

print("Best params:", grid_search.best_params_)
y_pred_lr = grid_search.predict(x_test)
print(f'mean_squared_error rfr = {mean_squared_error(y_test, y_pred_lr)}')

Best params: {'copy_X': True, 'fit_intercept': True}
mean_squared_error rfr = 3031.927706672527


In [28]:
model_dtr = DecisionTreeRegressor(random_state=21)

param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error']
}

grid_search = GridSearchCV(estimator=model_dtr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(x_train, y_train)


print("Best params:", grid_search.best_params_)
y_pred_dtr = grid_search.predict(x_test)
print(f'mean_squared_error dtr = {mean_squared_error(y_test, y_pred_dtr)}')

Best params: {'criterion': 'squared_error', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5}
mean_squared_error dtr = 2621.3257638888886


In [29]:
model_rfr = RandomForestRegressor(random_state = 21)

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error']
}

grid_search = GridSearchCV(estimator=model_rfr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(x_train, y_train)

print("Best params:", grid_search.best_params_)
y_pred_rfr = grid_search.predict(x_test)
print(f'mean_squared_error dtr = {mean_squared_error(y_test, y_pred_rfr)}')

Best params: {'bootstrap': False, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
mean_squared_error dtr = 2016.7385590182291


In [30]:
def scatter_plot(x1, y1, title):
    fig = figure(title=title,
                 x_axis_label='y_true',
                 y_axis_label='y_pred',
                 width=600, height=400)

    fig.scatter(x1, y1, legend_label='y_true_pred', color='blue', size=10, marker='circle')

    fig.legend.location = 'top_left'
    fig.legend.click_policy = 'hide'
    hover = HoverTool()
    hover.tooltips = [
        ("X", "@x"),
        ("Y", "@y"),
    ]
    fig.add_tools(hover)
    return fig

In [31]:
fig_lr = scatter_plot(y_test, y_pred_lr, 'Linear Regression')
fig_dtr = scatter_plot(y_test, y_pred_dtr, 'Decision Tree Regressor')
fig_rfr = scatter_plot(y_test, y_pred_rfr, 'Random Forest Regressor')
fig_rfr_t = scatter_plot(y_test, y_test, 'Ideal Case')

xx = [0, -300]
yy = [0, -300]

fig_lr.line(xx, yy, legend_label='line', line_width=2, line_color='black')
fig_dtr.line(xx, yy, legend_label='line', line_width=2, line_color='black')
fig_rfr.line(xx, yy, legend_label='line', line_width=2, line_color='black')
fig_rfr_t.line(xx, yy, legend_label='line', line_width=2, line_color='black')

grid = gridplot([[fig_lr, fig_dtr], [fig_rfr, fig_rfr_t]])
show(grid)

В идеальном случае y_pred должен быть точно такойже как y_test и график такой предствален на рисунке Ideal Case.