In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.3, palette='Set2')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
df = pd.read_csv("train.csv")

In [3]:
# Заметим, что первая колонка 'Unnamed: 0' соответствует колонке ID. 
# Переименуем её сопоставим индексы DataFrame этой колонке
df.rename(columns={'Unnamed: 0' : 'ID'}, inplace=True)
df.set_index('ID',inplace=True)
df

Unnamed: 0_level_0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,date_of_registration
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6622,ecom,6488536.0,,0,82221,0,0,18.450527,0.461750,2020-04-21 16:04:41.817367072
2047,information_source,874840.0,21100.0,0,12872,0,0,10.721619,-0.022317,2024-07-19 23:50:07.268931816
1118,information_source,571210.0,94707.0,0,0,7420,0,1.922243,0.046396,2024-07-13 16:35:54.794883135
4992,news,89534.0,924.0,0,834,0,0,2.149243,-0.093360,2024-09-10 21:29:14.006315095
9970,information_source,1043953.0,289288.0,0,58375,20260,3948,3.764965,0.027303,2024-05-26 11:07:15.950527838
...,...,...,...,...,...,...,...,...,...,...
361,information_source,1468601.0,474524.0,0,61307,83928,16841,4.191481,0.017470,2023-06-20 19:48:01.783634440
2621,information_source,93172.0,6418.0,0,3418,705,0,3.745192,-0.019069,2024-07-22 08:45:41.700070757
2605,information_source,82916.0,0.0,0,4302,1718,1007,5.837475,-0.061523,2024-06-17 05:44:08.100200436
1231,social,823896.0,105666.0,0,0,2892,267,11.994037,0.131620,2023-06-11 02:53:14.377302808


In [4]:
df['complaints_count'] = pd.to_numeric(df['complaints_count'], errors='coerce')

In [5]:
df['date_of_registration'] = pd.to_datetime(df['date_of_registration'])
df['age'] = (pd.Timestamp('2024-10-15') - df['date_of_registration']) / pd.Timedelta(days=365)

In [6]:
df = df.drop(['date_of_registration'], axis = 1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 6622 to 9289
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   category               8000 non-null   object 
 1   clicks                 7727 non-null   float64
 2   likes                  7624 non-null   float64
 3   buys                   8000 non-null   int64  
 4   4xx_errors             8000 non-null   int64  
 5   5xx_errors             8000 non-null   int64  
 6   complaints_count       7116 non-null   float64
 7   average_dwelltime      8000 non-null   float64
 8   source_attractiveness  8000 non-null   float64
 9   age                    8000 non-null   float64
dtypes: float64(6), int64(3), object(1)
memory usage: 687.5+ KB


In [8]:
# Очистка выбросов и замена NaN на срееднее значение

df_ecom = df[df['category'] == 'ecom']

categories_list = df['category'].unique()

real_columns_list = ['clicks', 'likes', 'average_dwelltime', 'age']


temp_df = df.copy(deep=True)
temp_df.fillna(temp_df.groupby(['category']).transform('mean'), inplace=True)

cleared_df = pd.DataFrame()

for category in categories_list:
    category_df = temp_df[temp_df['category'] == category]
    for column in real_columns_list:
        q1 = category_df[column].quantile(0.25)
        q3 = category_df[column].quantile(0.75)
        iqr = q3 - q1
        
        without_emission_df_mean = category_df[column][(category_df[column] < q3 + 1.5 * iqr) & (category_df[column] > q1 - 1.5 * iqr)].mean()
        category_df.loc[(category_df[column] >= q3 + 1.5 * iqr) & (category_df[column] <= q1 - 1.5 * iqr), column] = without_emission_df_mean
    cleared_df = pd.concat([cleared_df, category_df])


cleared_df



Unnamed: 0_level_0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6622,ecom,6488536.0,1.633087e+06,0,82221,0,0.000000,18.450527,0.461750,4.485836
5505,ecom,3060369.0,0.000000e+00,1945683,51293,0,8704.745353,8.586621,0.640110,2.225457
1176,ecom,31677060.0,8.760379e+06,0,924098,0,8519.000000,8.066347,0.080541,0.653280
9772,ecom,1695855.0,0.000000e+00,1601683,78271,0,1171.000000,9.834370,0.757402,1.265828
3492,ecom,1726730.0,1.022561e+06,548138,14895,18118,3668.000000,5.436714,0.476799,1.321715
...,...,...,...,...,...,...,...,...,...,...
1415,social,209606.0,3.843200e+04,0,907,2368,0.000000,0.107157,0.432431,11.398507
565,social,413203.0,9.267700e+04,0,1448,0,1186.000000,9.806318,0.251863,2.599806
4330,social,1658795.0,0.000000e+00,0,0,14479,2058.000000,59.962054,0.041906,1.283778
6538,social,283855.0,6.589100e+04,0,0,0,174.000000,0.594842,0.421322,9.577732


In [9]:
#cleared_df.fillna(cleared_df.groupby(['category']).transform('mean'), inplace=True)
without_nan_df = cleared_df.copy(deep=True)
without_nan_df

Unnamed: 0_level_0,category,clicks,likes,buys,4xx_errors,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6622,ecom,6488536.0,1.633087e+06,0,82221,0,0.000000,18.450527,0.461750,4.485836
5505,ecom,3060369.0,0.000000e+00,1945683,51293,0,8704.745353,8.586621,0.640110,2.225457
1176,ecom,31677060.0,8.760379e+06,0,924098,0,8519.000000,8.066347,0.080541,0.653280
9772,ecom,1695855.0,0.000000e+00,1601683,78271,0,1171.000000,9.834370,0.757402,1.265828
3492,ecom,1726730.0,1.022561e+06,548138,14895,18118,3668.000000,5.436714,0.476799,1.321715
...,...,...,...,...,...,...,...,...,...,...
1415,social,209606.0,3.843200e+04,0,907,2368,0.000000,0.107157,0.432431,11.398507
565,social,413203.0,9.267700e+04,0,1448,0,1186.000000,9.806318,0.251863,2.599806
4330,social,1658795.0,0.000000e+00,0,0,14479,2058.000000,59.962054,0.041906,1.283778
6538,social,283855.0,6.589100e+04,0,0,0,174.000000,0.594842,0.421322,9.577732


In [10]:
without_nan_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 6622 to 1231
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   category               8000 non-null   object 
 1   clicks                 8000 non-null   float64
 2   likes                  8000 non-null   float64
 3   buys                   8000 non-null   int64  
 4   4xx_errors             8000 non-null   int64  
 5   5xx_errors             8000 non-null   int64  
 6   complaints_count       8000 non-null   float64
 7   average_dwelltime      8000 non-null   float64
 8   source_attractiveness  8000 non-null   float64
 9   age                    8000 non-null   float64
dtypes: float64(6), int64(3), object(1)
memory usage: 687.5+ KB


In [11]:
without_nan_df['clicks'] = without_nan_df['clicks'].astype('int64')
without_nan_df['likes'] = without_nan_df['likes'].astype('int64')
without_nan_df['buys'] = without_nan_df['buys'].astype('int64')
without_nan_df['complaints_count'] = without_nan_df['complaints_count'].astype('int64')

without_nan_df.dtypes

category                  object
clicks                     int64
likes                      int64
buys                       int64
4xx_errors                 int64
5xx_errors                 int64
complaints_count           int64
average_dwelltime        float64
source_attractiveness    float64
age                      float64
dtype: object

In [12]:
# Разбиваем выборку
X_train, X_test, Y_train, Y_test = train_test_split(
    without_nan_df.drop('source_attractiveness', axis='columns'),
    without_nan_df['source_attractiveness'],
    test_size=0.2,
    shuffle=True,
    stratify=without_nan_df['category']
)

In [13]:
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_train = np.hstack([X_train.drop(['category'], axis='columns'), encoder.fit_transform(X_train['category'].to_frame())])
X_test = np.hstack([X_test.drop(['category'], axis='columns'), encoder.transform(X_test['category'].to_frame())])

In [14]:
without_nan_model = LinearRegression(fit_intercept=True)
without_nan_model.fit(X_train, Y_train)

In [15]:
without_nan_model.coef_

array([-1.46206772e-08,  2.23904020e-08,  4.62930256e-08,  4.33616757e-08,
       -5.38161823e-07,  1.14210756e-07,  2.00955918e-03,  5.23087291e-02,
       -2.01394220e-01, -2.30644013e-01, -2.80935109e-01, -2.54098704e-01])

In [16]:
without_nan_predicts_test = without_nan_model.predict(X_test)
print("Test predicts:")
metrics.mean_squared_error(Y_test, without_nan_predicts_test), \
metrics.mean_absolute_error(Y_test, without_nan_predicts_test), \
metrics.mean_absolute_percentage_error(Y_test, without_nan_predicts_test) * 100

Test predicts:


(0.017221581485055537, 0.09049798604628201, 270.9829534984509)

In [17]:
without_nan_predicts_train = without_nan_model.predict(X_train)
print("Train predicts:")
metrics.mean_squared_error(Y_train, without_nan_predicts_train), \
metrics.mean_absolute_error(Y_train, without_nan_predicts_train), \
metrics.mean_absolute_percentage_error(Y_train, without_nan_predicts_train) * 100

Train predicts:


(0.015853773108663084, 0.08735599075557773, 230.03693680632105)

In [18]:
# Попробуем сделать модели для разных категорий, а потом объединить их
categories_list

array(['ecom', 'information_source', 'news', 'porn', 'social'],
      dtype=object)

In [19]:
def make_fit_model(data, target, stratify_by):
    X_train, X_test, Y_train, Y_test = train_test_split(
        data.drop(target, axis='columns'),
        data[target],
        test_size = 0.2,
        shuffle=True,
        stratify=data[stratify_by]
    )
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    X_train = np.hstack([X_train.drop([stratify_by], axis='columns'), encoder.fit_transform(X_train[stratify_by].to_frame())])
    X_test = np.hstack([X_test.drop([stratify_by], axis='columns'), encoder.transform(X_test[stratify_by].to_frame())])
    
    model = LinearRegression(fit_intercept=True)
    model.fit(X_train, Y_train)
    return model

ecom_model = make_fit_model(without_nan_df[without_nan_df['category'] == 'ecom'], 'source_attractiveness', 'category')
information_source_model = make_fit_model(without_nan_df[without_nan_df['category'] == 'information_source'], 'source_attractiveness', 'category')
news_model = make_fit_model(without_nan_df[without_nan_df['category'] == 'news'], 'source_attractiveness', 'category')
porn_model = make_fit_model(without_nan_df[without_nan_df['category'] == 'porn'], 'source_attractiveness', 'category')
social_model = make_fit_model(without_nan_df[without_nan_df['category'] == 'social'], 'source_attractiveness', 'category')

In [22]:
# Делаем предикты
def make_predictions(X_test):
    predictions = pd.concat([
        ecom_model.predict(X_test),
        information_source_model.predict(X_test),
        news_model.predict(X_test),
        porn_model.predict(X_test),
        social_model.predict(X_test)
    ])
    return predictions


X_train.shape

predicts = make_predictions(X_train)
predicts



ValueError: X has 12 features, but LinearRegression is expecting 8 features as input.