## ЛР2 - Линейная регрессия

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.3, palette='Set2')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
# Load the training table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,date_of_registration
0,6622,ecom,6488536.0,,0,82221,0,0,18.450527,0.46175,2020-04-21 16:04:41.817367072
1,2047,information_source,874840.0,21100.0,0,12872,0,0,10.721619,-0.022317,2024-07-19 23:50:07.268931816
2,1118,information_source,571210.0,94707.0,0,0,7420,0,1.922243,0.046396,2024-07-13 16:35:54.794883135
3,4992,news,89534.0,924.0,0,834,0,0,2.149243,-0.09336,2024-09-10 21:29:14.006315095
4,9970,information_source,1043953.0,289288.0,0,58375,20260,3948,3.764965,0.027303,2024-05-26 11:07:15.950527838


### Подготавливаем данные

In [3]:
# The first column, 'Unnamed: 0', corresponds to the ID column.
# Rename it and use it as the DataFrame index.
df = df.rename(columns={'Unnamed: 0': 'ID'}).set_index('ID')
df.head()

Unnamed: 0_level_0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,date_of_registration
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6622,ecom,6488536.0,,0,82221,0,0,18.450527,0.46175,2020-04-21 16:04:41.817367072
2047,information_source,874840.0,21100.0,0,12872,0,0,10.721619,-0.022317,2024-07-19 23:50:07.268931816
1118,information_source,571210.0,94707.0,0,0,7420,0,1.922243,0.046396,2024-07-13 16:35:54.794883135
4992,news,89534.0,924.0,0,834,0,0,2.149243,-0.09336,2024-09-10 21:29:14.006315095
9970,information_source,1043953.0,289288.0,0,58375,20260,3948,3.764965,0.027303,2024-05-26 11:07:15.950527838


In [4]:
# Inspect the dtype pandas assigned to every column.
df.dtypes

category                  object
clicks                   float64
likes                    float64
buys                       int64
4xx_errors                 int64
5xx_errors                 int64
complaints_count          object
average_dwelltime        float64
source_attractiveness    float64
date_of_registration      object
dtype: object

In [5]:
# complaints_count has dtype object although it is supposed to hold numbers.
# Convert it to numeric; anything that cannot be parsed becomes NaN.

complaints_numeric = pd.to_numeric(df['complaints_count'], errors='coerce')
df = df.assign(complaints_count=complaints_numeric)

df.dtypes

category                  object
clicks                   float64
likes                    float64
buys                       int64
4xx_errors                 int64
5xx_errors                 int64
complaints_count         float64
average_dwelltime        float64
source_attractiveness    float64
date_of_registration      object
dtype: object

In [6]:
# Add an 'age' column: how many years the domain has existed.

registered_at = pd.to_datetime(df['date_of_registration'])
reference_date = pd.Timestamp('2024-10-15')  # fixed "today" used to measure age
one_year = pd.Timedelta(days=365)            # a year is taken as 365 days

df = df.assign(
    date_of_registration=registered_at,
    age=(reference_date - registered_at) / one_year,
)

df.head()

Unnamed: 0_level_0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,date_of_registration,age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6622,ecom,6488536.0,,0,82221,0,0.0,18.450527,0.46175,2020-04-21 16:04:41.817367072,4.485836
2047,information_source,874840.0,21100.0,0,12872,0,0.0,10.721619,-0.022317,2024-07-19 23:50:07.268931816,0.238375
1118,information_source,571210.0,94707.0,0,0,7420,0.0,1.922243,0.046396,2024-07-13 16:35:54.794883135,0.255639
4992,news,89534.0,924.0,0,834,0,0.0,2.149243,-0.09336,2024-09-10 21:29:14.006315095,0.093438
9970,information_source,1043953.0,289288.0,0,58375,20260,3948.0,3.764965,0.027303,2024-05-26 11:07:15.950527838,0.387772


In [7]:
# Drop 'date_of_registration': it is no longer needed.
df = df.drop(columns='date_of_registration')
df.head()

Unnamed: 0_level_0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6622,ecom,6488536.0,,0,82221,0,0.0,18.450527,0.46175,4.485836
2047,information_source,874840.0,21100.0,0,12872,0,0.0,10.721619,-0.022317,0.238375
1118,information_source,571210.0,94707.0,0,0,7420,0.0,1.922243,0.046396,0.255639
4992,news,89534.0,924.0,0,834,0,0.0,2.149243,-0.09336,0.093438
9970,information_source,1043953.0,289288.0,0,58375,20260,3948.0,3.764965,0.027303,0.387772


### Заполним пропуски простым способом
т.е. заменим все NaN-значения средним значением соответствующего признака внутри категории

In [8]:
# Per-row mean of each column within the row's category.
category_means = df.groupby(['category']).transform('mean')
# Replace every NaN with the mean of its column for the same category.
# NOTE(review): the means cover all non-key columns (the target
# 'source_attractiveness' included) and are computed before the train/test
# split that follows -- confirm this is acceptable for the evaluation.
df = df.fillna(category_means)
# Work on an independent copy from here on.
without_nan_df = df.copy(deep=True)
without_nan_df





Unnamed: 0_level_0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6622,ecom,6488536.0,1.633087e+06,0,82221,0,0.000000,18.450527,0.461750,4.485836
2047,information_source,874840.0,2.110000e+04,0,12872,0,0.000000,10.721619,-0.022317,0.238375
1118,information_source,571210.0,9.470700e+04,0,0,7420,0.000000,1.922243,0.046396,0.255639
4992,news,89534.0,9.240000e+02,0,834,0,0.000000,2.149243,-0.093360,0.093438
9970,information_source,1043953.0,2.892880e+05,0,58375,20260,3948.000000,3.764965,0.027303,0.387772
...,...,...,...,...,...,...,...,...,...,...
361,information_source,1468601.0,4.745240e+05,0,61307,83928,16841.000000,4.191481,0.017470,1.321027
2621,information_source,93172.0,6.418000e+03,0,3418,705,0.000000,3.745192,-0.019069,0.231877
2605,information_source,82916.0,0.000000e+00,0,4302,1718,1007.000000,5.837475,-0.061523,0.328112
1231,social,823896.0,1.056660e+05,0,0,2892,267.000000,11.994037,0.131620,1.347616


In [9]:
# Split the data into train and test parts (75% / 25%), stratified by category
# so that every category is represented proportionally in both parts.
# A fixed seed makes the shuffle -- and therefore every metric and the
# submission produced below -- reproducible on Restart & Run All.
RANDOM_STATE = 42

X_train, X_test, Y_train, Y_test = train_test_split(
    without_nan_df.drop('source_attractiveness', axis='columns'),
    without_nan_df['source_attractiveness'],
    test_size=0.25,
    shuffle=True,
    stratify=without_nan_df['category'],
    random_state=RANDOM_STATE,
)

In [10]:
# One-hot encode 'category', dropping the first level.
# The encoder is fitted on the training part only and reused for the test part.
encoder = OneHotEncoder(drop='first', sparse_output=False)
train_category_dummies = encoder.fit_transform(X_train[['category']])
test_category_dummies = encoder.transform(X_test[['category']])

# Replace the 'category' column with its dummy columns; the results are
# plain NumPy arrays (numeric features first, dummies last).
X_train = np.hstack([X_train.drop(columns='category'), train_category_dummies])
X_test = np.hstack([X_test.drop(columns='category'), test_category_dummies])

In [11]:
# Ordinary least-squares regression with an intercept term,
# fitted on the encoded training features.
without_nan_model = LinearRegression(fit_intercept=True)
without_nan_model.fit(X=X_train, y=Y_train)

In [12]:
# Show the coefficients learned by the fitted model.
without_nan_model.coef_

array([-1.44486477e-08,  2.13176508e-08,  4.70067298e-08,  5.16535400e-08,
       -5.13204681e-07, -1.52286339e-07,  1.86771993e-03,  5.26928460e-02,
       -1.97037791e-01, -2.25981485e-01, -2.71486243e-01, -2.47266443e-01])

### Оценим качество

In [13]:
# Score the model on the held-out test part.
without_nan_predicts_test = without_nan_model.predict(X_test)
print("Test predicts:")
test_mse = metrics.mean_squared_error(Y_test, without_nan_predicts_test)
test_mae = metrics.mean_absolute_error(Y_test, without_nan_predicts_test)
test_mape_percent = metrics.mean_absolute_percentage_error(Y_test, without_nan_predicts_test) * 100
# Displayed as (MSE, MAE, MAPE in percent).
(test_mse, test_mae, test_mape_percent)

Test predicts:


(0.017019454616619675, 0.08869253774794542, 220.85021404267528)

In [14]:
# Score the model on the data it was fitted on, for comparison with the test scores.
without_nan_predicts_train = without_nan_model.predict(X_train)
print("Train predicts:")
train_mse = metrics.mean_squared_error(Y_train, without_nan_predicts_train)
train_mae = metrics.mean_absolute_error(Y_train, without_nan_predicts_train)
train_mape_percent = metrics.mean_absolute_percentage_error(Y_train, without_nan_predicts_train) * 100
# Displayed as (MSE, MAE, MAPE in percent).
(train_mse, train_mae, train_mape_percent)

Train predicts:


(0.01583549986808764, 0.08734291915848343, 239.57926680182848)

In [15]:
# Combine the test and train predictions into one Series keyed by record ID.
# The feature matrices were turned into plain arrays during encoding, but
# Y_test / Y_train still carry the 'ID' index of the rows that were predicted
# (in the same row order), so they are used to label each prediction.
# Without this, the predictions are an unlabeled array in shuffled
# test-then-train order and the submission's "ID" column would be a
# positional 0..n-1 range instead of the real IDs.
predictions = pd.concat([
    pd.Series(without_nan_predicts_test, index=Y_test.index),
    pd.Series(without_nan_predicts_train, index=Y_train.index),
]).sort_index()

### Итог

Результат можно и нужно сделать лучше, но метрика MSE <= 0.02, поэтому ок.


## Submit

In [16]:
# Wrap the predictions in a single-column frame and write it to disk.
# The frame's index is written under the "ID" header, so `predictions`
# must carry the real record IDs for that column to be meaningful.
submit = pd.DataFrame(dict(source_attractiveness=predictions))
submit.to_csv('submission.csv', index_label="ID")