# Лабораторная работа 1. Введение в машинное обучение. Обучение с учителем. Задача регрессии

<b>Традиционное предупреждение для всех лабораторных работ:</b> перед обучением моделей необходимо выполнить предварительную обработку данных, которая <b>обязательно</b> включает в себя:
- заполнение пропущенных значений (рекомедуется логика заполнения пропусков на основе типа данных, которая использовалась в РГР по Практикуму);
- преобразование категориальных признаков в числовые (используйте one-hot кодирование или map; используйте знания с Практикума).

Предобработка может включать в себя другие действия, но выполнение описанных выше действий обязательно.

Сделайте это один раз и сохраните в отдельный csv файл, а потом его используйте.

<b>Выполните следующие задания:</b>
- загрузите датасет для регрессии, выделите целевой признак и предикторы, разбейте данные на обучающую и тестовую выборку;
- решите задачу регрессии на ваших данных с использованием моделей sklearn (линейная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр;
- решите задачу регрессии на ваших данных с использованием моделей sklearn (полиномиальная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр;
- вычислите значения метрик $R^2$, MAE, MSE, RMSE, MAPE для всех обученных моделей; выберите лучшую модель;
- самостоятельно реализуйте (желательно в виде класса) модель линейной регрессии с регуляризацией (можете выбрать L1 или L2);
- самостоятельно реализуйте вычисление всех используемых метрик (в виде функций, принимающих два аргумента);
- обучите вашу модель линейной регрессии на ваших данных; оцените качество с помощью реализованных вами метрик.

In [1]:
from pathlib import Path
import os
import sys

sys.path.append(str(Path(os.getcwd()).parent))

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from math import sqrt
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

Функция для вычисления метрик

In [3]:
def metrics(test, predict):
    print(f'MAE: {mean_absolute_error(test, predict)}')
    print(f'MSE: {mean_squared_error(test, predict)}')
    print(f'RMSE: {sqrt(mean_squared_error(test, predict))}')
    print(f'MAPE: {mean_absolute_percentage_error(test, predict)}')
    print(f'R^2: {r2_score(test, predict)}')

Функция для заполнения пропусков в выборке

In [4]:
def fill_empty_cell(column_name, df):
    if df.dtypes[column_name] == "float64":
        df[column_name] = df[column_name].fillna(df[column_name].mean())
    elif df.dtypes[column_name] == "int64":
        df[column_name] = df[column_name].fillna(df[column_name].median())
    else:
        df[column_name] = df[column_name].fillna(df[column_name].mode())

Ridge

In [5]:
def create_ridge_with_hyperparameter(X_train, y_train) -> Ridge:
    alpha = {'alpha': np.arange(0, 1, 0.1)}
    ridge = GridSearchCV(Ridge(), alpha).fit(X_train, y_train)
    return ridge

Lasso

In [6]:
def create_lasso_with_hyperparameter(X_train, y_train) -> Lasso:
    alpha = {'alpha': np.arange(0, 1, 0.1)}
    lasso = RandomizedSearchCV(Lasso(), alpha).fit(X_train, y_train)
    return lasso

In [7]:
table = pd.read_csv("../data/energy_task.csv", parse_dates=['date'])
table

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint
0,2016-11-01 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,18.2000,48.900000,17.033333,45.5300,6.60,733.5,92.000000,7.000000,63.000000,5.3
1,2016-11-01 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,18.2000,48.863333,17.066667,45.5600,6.48,733.6,92.000000,6.666667,59.166667,5.2
2,2016-11-01 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,18.2000,48.730000,17.000000,45.5000,6.37,733.7,92.000000,6.333333,55.333333,5.1
3,2016-11-01 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,18.1000,48.590000,17.000000,45.4000,6.25,733.8,92.000000,6.000000,51.500000,5.0
4,2016-11-01 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,18.1000,48.590000,17.000000,45.4000,6.13,733.9,92.000000,5.666667,47.666667,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,24.7000,50.074000,23.200000,46.7900,22.70,755.2,55.666667,3.333333,23.666667,13.3
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,24.7000,49.790000,23.200000,46.7900,22.60,755.2,56.000000,3.500000,24.500000,13.3
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,24.7000,49.660000,23.200000,46.7900,22.50,755.2,56.333333,3.666667,25.333333,13.3
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,24.6625,49.518750,23.200000,46.8175,22.30,755.2,56.666667,3.833333,26.166667,13.2


In [8]:
table['month'] = pd.DatetimeIndex(table['date']).month
table['day'] = pd.DatetimeIndex(table['date']).day
table.drop(['date'], axis=1, inplace=True)
table

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,month,day
0,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,45.566667,...,17.033333,45.5300,6.60,733.5,92.000000,7.000000,63.000000,5.3,11.0,1.0
1,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,45.992500,...,17.066667,45.5600,6.48,733.6,92.000000,6.666667,59.166667,5.2,11.0,1.0
2,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,...,17.000000,45.5000,6.37,733.7,92.000000,6.333333,55.333333,5.1,11.0,1.0
3,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,45.723333,...,17.000000,45.4000,6.25,733.8,92.000000,6.000000,51.500000,5.0,11.0,1.0
4,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,45.530000,...,17.000000,45.4000,6.13,733.9,92.000000,5.666667,47.666667,4.9,11.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,45.590000,...,23.200000,46.7900,22.70,755.2,55.666667,3.333333,23.666667,13.3,5.0,27.0
19731,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,45.590000,...,23.200000,46.7900,22.60,755.2,56.000000,3.500000,24.500000,13.3,5.0,27.0
19732,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,45.730000,...,23.200000,46.7900,22.50,755.2,56.333333,3.666667,25.333333,13.3,5.0,27.0
19733,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,45.790000,...,23.200000,46.8175,22.30,755.2,56.666667,3.833333,26.166667,13.2,5.0,27.0


In [9]:
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 28 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Appliances   19735 non-null  int64  
 1   lights       19735 non-null  int64  
 2   T1           19735 non-null  float64
 3   RH_1         19735 non-null  float64
 4   T2           19735 non-null  float64
 5   RH_2         19569 non-null  float64
 6   T3           19735 non-null  float64
 7   RH_3         19735 non-null  float64
 8   T4           19735 non-null  float64
 9   RH_4         19654 non-null  float64
 10  T5           19735 non-null  float64
 11  RH_5         19644 non-null  float64
 12  T6           19735 non-null  float64
 13  RH_6         19651 non-null  float64
 14  T7           19681 non-null  float64
 15  RH_7         19735 non-null  float64
 16  T8           19613 non-null  float64
 17  RH_8         19735 non-null  float64
 18  T9           19651 non-null  float64
 19  RH_9

Проверка датасета на наличие пустых данных

In [10]:
null_columns = []
for i in table.columns:
    if len(table[table[i].isnull()]) > 0:
        null_columns.append(i)
print(null_columns)

['RH_2', 'RH_4', 'RH_5', 'RH_6', 'T7', 'T8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Visibility', 'month', 'day']


In [11]:
for name in null_columns:
    fill_empty_cell(name, table)

In [12]:
for i in table.columns:
    if len(table[table[i].isnull()]) > 0:
        print(f"В столбце {i} есть пустые элементы")

In [13]:
table.to_csv('../data/energy_task_compleated.csv') 

Создание обучающей выборки

In [14]:
y = table["Appliances"]
X = table.drop(["Appliances"], axis=1)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

1. Простая линейная регрессия

In [16]:
lr = LinearRegression().fit(X_train, y_train)
y1_predict = lr.predict(X_test)
metrics(y_test, y1_predict)

MAE: 53.1019443956302
MSE: 8590.321257679258
RMSE: 92.6839859829046
MAPE: 0.625739332131595
R^2: 0.1662004552090619


In [17]:
from mylib.regression.LinearRegressionClass import MyLinearRegression_L2


linear = MyLinearRegression_L2(np.array(X_train), np.array(y_train), alpha=0.6, eps=0.001)
linear.fit()
y_my = linear.predict(X_test)
print(y_my)
linear.coef


1) 20150.8
2) 1161468.2
3) 28707449.8
4) 257538210.8
5) 1026783081.6
6) 2005032757.9
7) 1996372403.3
8) 1013284328.0
9) 251688670.9
10) 27733517.5
11) 1105177.4
12) 19411.3
13) 10597.9
14) 10597.5
15) 10597.0
16) 10596.6
17) 10596.3
18) 10595.9
19) 10595.6
20) 10595.3
21) 10595.0
22) 10594.8
23) 10594.5
24) 10594.2
25) 10594.0
26) 10593.8
27) 10593.6
28) 10593.3
29) 10593.1
30) 10592.9
31) 10592.8
32) 10592.6
33) 10592.4
34) 10592.2
35) 10592.1
36) 10591.9
37) 10591.7
38) 10591.6
39) 10591.4
40) 10591.3
41) 10591.1
42) 10591.0
43) 10590.9
44) 10590.7
45) 10590.6
46) 10590.5
47) 10590.4
48) 10590.2
49) 10590.1
50) 10590.0
51) 10589.9
52) 10589.8
53) 10589.7
54) 10589.6
55) 10589.5
56) 10589.4
57) 10589.3
58) 10589.2
59) 10589.1
60) 10589.0
61) 10588.9
62) 10588.8
63) 10588.7
64) 10588.6
65) 10588.5
66) 10588.4
67) 10588.4
68) 10588.3
69) 10588.2
70) 10588.1
71) 10588.0
72) 10588.0
73) 10587.9
74) 10587.8
75) 10587.7
76) 10587.7
77) 10587.6
78) 10587.5
79) 10587.4
80) 10587.4
81) 10587.3

array([ 0.00017222,  0.01741951,  0.00461633,  0.01074292,  0.00626807,
        0.00453758,  0.00556758,  0.008077  ,  0.00440687,  0.00755034,
        0.00364403,  0.00949886,  0.00879609, -0.0161816 ,  0.00400981,
        0.003236  ,  0.004547  ,  0.00229717,  0.00352265,  0.00490494,
        0.00668515,  0.12694461, -0.00976833,  0.00299602,  0.00696547,
        0.00130819,  0.00054102,  0.00186807])

In [18]:
from mylib.regression.Metrics import all_metrics


all_metrics(y_test, y_my)

MAE: 60.34991312357836
MSE: 10288.414570155166
RMSE: 101.43182227563086
MAPE: 0.7717604487719028
R^2: 0.0013789789820533027


In [19]:
from mylib.regression.Metrics import all_metrics


all_metrics(y_test, y1_predict)

MAE: 53.1019443956302
MSE: 8590.321257679258
RMSE: 92.6839859829046
MAPE: 0.625739332131595
R^2: 0.1662004552090619


2. L1 - Lasso (подбор гиперпараметра + модель)

In [20]:
l = create_lasso_with_hyperparameter(X_train, y_train)
y2_predict = l.predict(X_test)
metrics(y_test, y2_predict)

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


MAE: 53.0708462585439
MSE: 8590.638032142686
RMSE: 92.68569486249044
MAPE: 0.6250942314984458
R^2: 0.16616970823284472


3. L2 - Ridge (подбор гиперпараметра + модель)

In [21]:
r = create_ridge_with_hyperparameter(X_train, y_train)
y3_predict = r.predict(X_test)
metrics(y_test, y3_predict)

MAE: 53.10180844294638
MSE: 8590.335535977742
RMSE: 92.68406300965523
MAPE: 0.6257365888985242
R^2: 0.16619906931924255


4. Полиномиальная регрессия

In [22]:
table_p = table.drop(columns=["T1", "RH_4", "RH_6", "T7"])
y_p = table_p["Appliances"]
X_p = table_p.drop(["Appliances"], axis=1)
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p, y_p, test_size=0.5, random_state=42)

In [23]:
p = PolynomialFeatures(2)
X_PolynomialFeatures_train = p.fit_transform(X_p_train)
X_PolynomialFeatures_test = p.fit_transform(X_p_test)

In [24]:
lr2 = LinearRegression().fit(X_PolynomialFeatures_train, y_p_train)
y_PolynomialFeatures_predict = lr2.predict(X_PolynomialFeatures_test)
metrics(y_p_test, y_PolynomialFeatures_predict)

MAE: 51.41672123946578
MSE: 8319.423805925595
RMSE: 91.21087548053464
MAPE: 0.5778102318319449
R^2: 0.22703334144107457


5. Полиномиальная регрессия + Ridge

In [25]:
r_p = create_ridge_with_hyperparameter(X_PolynomialFeatures_train, y_p_train)
y_p_ridge_predict = r_p.predict(X_PolynomialFeatures_test)
metrics(y_p_test, y_p_ridge_predict)

MAE: 51.29899615713576
MSE: 8308.243184240684
RMSE: 91.14956491525719
MAPE: 0.5753872632660575
R^2: 0.22807214508733697


6. Полиномиальная регрессия + Lasso

In [26]:
l_p = create_lasso_with_hyperparameter(X_PolynomialFeatures_train, y_p_train)
y_p_lasso_predict = l_p.predict(X_PolynomialFeatures_test)
metrics(y_p_test, y_p_ridge_predict)

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordi

MAE: 51.29899615713576
MSE: 8308.243184240684
RMSE: 91.14956491525719
MAPE: 0.5753872632660575
R^2: 0.22807214508733697


  model = cd_fast.enet_coordinate_descent(


In [27]:
print(np.array(table))

[[ 60.    30.    19.89 ...   5.3   11.     1.  ]
 [ 60.    30.    19.89 ...   5.2   11.     1.  ]
 [ 50.    30.    19.89 ...   5.1   11.     1.  ]
 ...
 [270.    10.    25.5  ...  13.3    5.    27.  ]
 [420.    10.    25.5  ...  13.2    5.    27.  ]
 [430.    10.    25.5  ...  13.2    5.    27.  ]]
