In [183]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.3, palette='Set2')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [184]:
df = pd.read_csv("train.csv")

In [185]:
df.groupby(['category']).count()

Unnamed: 0_level_0,Unnamed: 0,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,date_of_registration
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ecom,2452,2360,2336,2452,2452,2452,2284,2452,2452,2452
information_source,3934,3815,3759,3934,3934,3934,3722,3934,3934,3934
news,816,779,774,816,816,816,767,816,816,816
porn,383,370,362,383,383,383,360,383,383,383
social,415,403,393,415,415,415,394,415,415,415


### Подготовим данные

In [186]:
# Заметим, что первая колонка 'Unnamed: 0' соответствует колонке ID. 
# Переименуем её сопоставим индексы DataFrame этой колонке
df.rename(columns={'Unnamed: 0' : 'ID'}, inplace=True)
df.set_index('ID',inplace=True)

In [187]:
df['complaints_count'] = pd.to_numeric(df['complaints_count'], errors='coerce')

df['date_of_registration'] = pd.to_datetime(df['date_of_registration'])
df['age'] = (pd.Timestamp('2024-10-15') - df['date_of_registration']) / pd.Timedelta(days=365)

df = df.drop(['date_of_registration'], axis = 1)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 6622 to 9289
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   category               8000 non-null   object 
 1   clicks                 7727 non-null   float64
 2   likes                  7624 non-null   float64
 3   buys                   8000 non-null   int64  
 4   4xx_errors             8000 non-null   int64  
 5   5xx_errors             8000 non-null   int64  
 6   complaints_count       7116 non-null   float64
 7   average_dwelltime      8000 non-null   float64
 8   source_attractiveness  8000 non-null   float64
 9   age                    8000 non-null   float64
dtypes: float64(6), int64(3), object(1)
memory usage: 687.5+ KB


In [188]:
cleared_df = df.copy(deep = True)
cleared_df.fillna(cleared_df.groupby(['category']).transform('median'), inplace=True)
#cleared_df.dropna(inplace=True)
cleared_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 6622 to 9289
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   category               8000 non-null   object 
 1   clicks                 8000 non-null   float64
 2   likes                  8000 non-null   float64
 3   buys                   8000 non-null   int64  
 4   4xx_errors             8000 non-null   int64  
 5   5xx_errors             8000 non-null   int64  
 6   complaints_count       8000 non-null   float64
 7   average_dwelltime      8000 non-null   float64
 8   source_attractiveness  8000 non-null   float64
 9   age                    8000 non-null   float64
dtypes: float64(6), int64(3), object(1)
memory usage: 687.5+ KB


In [189]:
cleared_df['clicks'] = cleared_df['clicks'].astype('int64')
cleared_df['likes'] = cleared_df['likes'].astype('int64')
cleared_df['buys'] = cleared_df['buys'].astype('int64')
cleared_df['complaints_count'] = cleared_df['complaints_count'].astype('int64')

cleared_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 6622 to 9289
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   category               8000 non-null   object 
 1   clicks                 8000 non-null   int64  
 2   likes                  8000 non-null   int64  
 3   buys                   8000 non-null   int64  
 4   4xx_errors             8000 non-null   int64  
 5   5xx_errors             8000 non-null   int64  
 6   complaints_count       8000 non-null   int64  
 7   average_dwelltime      8000 non-null   float64
 8   source_attractiveness  8000 non-null   float64
 9   age                    8000 non-null   float64
dtypes: float64(3), int64(6), object(1)
memory usage: 687.5+ KB


In [190]:
# def clear_emissions(df, categories, real_columns):
#     temp_df = df.copy(deep=True)
#     cleared_df = pd.DataFrame()
#     for category in categories:
#         category_df = temp_df[temp_df['category'] == category]
#         for column in real_columns:
#             q1 = category_df[column].quantile(0.25)
#             q3 = category_df[column].quantile(0.75)
#             iqr = q3 - q1
            
#             without_emission_df_mean = category_df[column][(category_df[column] < q3 + 1.5 * iqr) & (category_df[column] > q1 - 1.5 * iqr)].mean()
#             category_df.loc[(category_df[column] >= q3 + 1.5 * iqr) & (category_df[column] <= q1 - 1.5 * iqr), column] = without_emission_df_mean
#         cleared_df = pd.concat([cleared_df, category_df])
#     return cleared_df

# cleared_df = clear_emissions(
#     cleared_df,
#     df['category'].unique(),
#     ['clicks', 'likes', 'average_dwelltime', 'age']
# )

### Начинаем тренировать модель

In [191]:
def make_fit_model(data, target):
    X_train, X_test, Y_train, Y_test = train_test_split(
        data.drop(target, axis='columns'),
        data[target],
        test_size = 0.2,
        shuffle=True
    )
    model = LinearRegression(fit_intercept=True)
    model.fit(X_train, Y_train)
    return (model, X_test, Y_test)
    

In [192]:
ecom_data = cleared_df[cleared_df['category'] == 'ecom'].drop(['category'], axis=1)
information_source_data = cleared_df[cleared_df['category'] == 'information_source'].drop(['category'], axis=1)
news_data = cleared_df[cleared_df['category'] == 'news'].drop(['category'], axis=1)
porn_data = cleared_df[cleared_df['category'] == 'porn'].drop(['category'], axis=1)
social_data = cleared_df[cleared_df['category'] == 'social'].drop(['category'], axis=1)
cleared_df[cleared_df['category'] == 'ecom']

Unnamed: 0_level_0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6622,ecom,6488536,277836,0,82221,0,0,18.450527,0.461750,4.485836
5505,ecom,3060369,0,1945683,51293,0,1790,8.586621,0.640110,2.225457
1176,ecom,31677060,8760379,0,924098,0,8519,8.066347,0.080541,0.653280
9772,ecom,1695855,0,1601683,78271,0,1171,9.834370,0.757402,1.265828
3492,ecom,1726730,1022561,548138,14895,18118,3668,5.436714,0.476799,1.321715
...,...,...,...,...,...,...,...,...,...,...
6181,ecom,4146772,4666283,6127488,460792,0,4022,20.870676,0.467366,1.060710
4348,ecom,5042924,1274898,5042924,174661,133724,28645,1.525583,0.904862,2.636767
4858,ecom,9206957,0,4496093,447953,43524,11756,1.872118,0.501125,2.198716
699,ecom,1465768,158540,247116,12447,560,0,5.109422,0.082981,0.157970


In [193]:
model, X_test, Y_test = make_fit_model(cleared_df.drop(['category'], axis=1), 'source_attractiveness')
ecom_model, ecom_X_test, ecom_Y_test = make_fit_model(ecom_data, 'source_attractiveness')
information_source_model, information_source_X_test, information_source_Y_test = make_fit_model(information_source_data, 'source_attractiveness')
news_model, news_X_test, news_Y_test = make_fit_model(news_data, 'source_attractiveness')
porn_model, porn_X_test, porn_Y_test = make_fit_model(porn_data, 'source_attractiveness')
social_model, social_X_test, social_Y_test = make_fit_model(social_data, 'source_attractiveness')

In [194]:
predicts = model.predict(X_test)
ecom_predicts = ecom_model.predict(ecom_X_test)
information_source_predicts = information_source_model.predict(information_source_X_test)
news_predicts = news_model.predict(news_X_test)
porn_predicts = porn_model.predict(porn_X_test)
social_predicts = social_model.predict(social_X_test)

In [195]:
print("Test predicts:")
metrics.mean_squared_error(Y_test, predicts), \
metrics.mean_absolute_error(Y_test, predicts), \
metrics.mean_absolute_percentage_error(Y_test, predicts) * 100

Test predicts:


(0.023326130710716066, 0.10043106542220055, 268.6301954163956)

In [196]:
print("Test ecom predicts:")
metrics.mean_squared_error(ecom_Y_test, ecom_predicts), \
metrics.mean_absolute_error(ecom_Y_test, ecom_predicts), \
metrics.mean_absolute_percentage_error(ecom_Y_test, ecom_predicts) * 100

Test ecom predicts:


(0.0365969265169483, 0.14838909592348473, 312.8373653464516)

In [197]:
print("Test information_source predicts:")
metrics.mean_squared_error(information_source_Y_test, information_source_predicts), \
metrics.mean_absolute_error(information_source_Y_test, information_source_predicts), \
metrics.mean_absolute_percentage_error(information_source_Y_test, information_source_predicts) * 100

Test information_source predicts:


(0.004126453901882393, 0.050523092778184515, 123.7948085965747)

In [198]:
print("Test news predicts:")
metrics.mean_squared_error(news_Y_test, news_predicts), \
metrics.mean_absolute_error(news_Y_test, news_predicts), \
metrics.mean_absolute_percentage_error(news_Y_test, news_predicts) * 100

Test news predicts:


(0.004469208105009104, 0.051205183420112876, 78.10716814812298)

In [199]:
print("Test porn predicts:")
metrics.mean_squared_error(porn_Y_test, porn_predicts), \
metrics.mean_absolute_error(porn_Y_test, porn_predicts), \
metrics.mean_absolute_percentage_error(porn_Y_test, porn_predicts) * 100

Test porn predicts:


(0.004509661709325633, 0.053433316740871414, 84.56194632720415)

In [200]:
print("Test social predicts:")
metrics.mean_squared_error(social_Y_test, social_predicts), \
metrics.mean_absolute_error(social_Y_test, social_predicts), \
metrics.mean_absolute_percentage_error(social_Y_test, social_predicts) * 100

Test social predicts:


(0.003108799931318434, 0.0412302904064251, 137.845766593558)