## Загрузим нужные библиотеки

In [1]:
import pandas as pd
import xgboost
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


Выполним загрузку датасета

In [2]:
# Read the train and test CSV files; the first column of each becomes the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_dataset_train.csv", "test_dataset_test.csv")
)

## Проанализируем датасет

In [3]:
# Print a summary of the training frame: index, columns, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ to 627f1c089a794743b070ff73hVvdVmFxS2SlZ2_lECDEow
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               7000 non-null   object 
 1   publish_date        7000 non-null   object 
 2   session             7000 non-null   object 
 3   authors             7000 non-null   object 
 4   ctr                 7000 non-null   float64
 5   category            7000 non-null   object 
 6   tags                7000 non-null   object 
 7   views               7000 non-null   int64  
 8   depth               7000 non-null   float64
 9   full_reads_percent  7000 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 601.6+ KB


Заменим категорию и автора числовыми кодами

In [4]:
# Replace the category labels with integer codes: cast to the pandas
# categorical dtype, take the codes, then store them with an int dtype.
df_train["category"] = (
    df_train["category"]
    .astype("category")
    .cat.codes
    .astype("int")
)

In [5]:
# Same encoding for the authors column: categorical dtype -> codes -> int.
author_codes = df_train["authors"].astype("category").cat.codes
df_train["authors"] = author_codes.astype("int")

In [6]:
# Parse publish_date once and derive both calendar features from the parsed
# timestamps, instead of parsing twice and round-tripping through strings.
publish_ts = pd.to_datetime(df_train['publish_date'])

# Day of month as an integer.
df_train['day'] = publish_ts.dt.day.astype(int)
# Month number as an integer. The column keeps its existing (misspelled)
# name 'mounth' so that anything referring to it keeps working.
df_train['mounth'] = publish_ts.dt.month.astype(int)

In [7]:
# Show the first three rows, including the newly added columns.
df_train.head(n=3)

Unnamed: 0_level_0,title,publish_date,session,authors,ctr,category,tags,views,depth,full_reads_percent,day,mounth
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ,Европейский банк развития приостановил доступ ...,2022-04-04 10:29:44,IDE7mtH4RBqGn-8MXfGffQ,560,1.58,2,"['55928d339a794751dc8303d6', '542d1e28cbb20f86...",20460,1.134,35.85,4,4
620f6b899a7947701cf489e1KtVJsteHStO5oditt3Uvzw,Кремль назвал регулярным процессом учебные зап...,2022-02-18 10:00:39,KtVJsteHStO5oditt3Uvzw,38,1.853,0,"['549d25df9a794775979561d2', '58abcf539a7947f1...",19038,1.142,38.355,18,2
620730cf9a7947ab96a44e27hk7puWJwSziw0m3sfTkKWA,Госсекретарь Швеции заявила о нежелании вступа...,2022-02-12 04:24:02,hk7puWJwSziw0m3sfTkKWA,560,0.0,0,"['5430f451cbb20f73931ecd05', '5409f15de063daa0...",51151,1.185,36.424,12,2


In [12]:
# Number of entries in the category column.
len(df_train["category"])

7000

In [11]:
# Every column except the raw publish_date string. The list is built from
# df_train.columns directly (not via a set difference) so the column order,
# and with it the subplot order, stays the same between kernel sessions.
features = [column for column in df_train.columns if column != 'publish_date']

# DataFrame.hist draws one subplot per numeric column of the selection;
# the returned axes array is bound to `_` to keep it out of the cell output.
_ = df_train[features].hist(figsize=(20,12))

Всего 9 категорий статей

In [12]:
# Count how many rows fall into each encoded category.
df_train["category"].value_counts()

0    3988
5    1456
3     667
1     338
4     283
2     265
8       1
6       1
7       1
Name: category, dtype: int64

## Выделим выборки

In [17]:
# The three columns to predict, in the order used for y.
target_columns = ["views", "depth", "full_reads_percent"]
# Columns left out of the feature matrix alongside the targets.
excluded_columns = ["title", "publish_date", "session", "tags"]

# Features: everything that is neither a target nor an excluded column.
X = df_train.drop(columns=target_columns + excluded_columns)
# Targets: a three-column frame.
y = df_train[target_columns]

In [18]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0_level_0,authors,ctr,category,day,mounth
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
624ac09c9a7947db3d80c98eIDE7mtH4RBqGn-8MXfGffQ,560,1.58,2,4,4
620f6b899a7947701cf489e1KtVJsteHStO5oditt3Uvzw,38,1.853,0,18,2
620730cf9a7947ab96a44e27hk7puWJwSziw0m3sfTkKWA,560,0.0,0,12,2
6262a5889a79470b78c9ca307UKY2SSZTjCcjhwBzxw37w,560,0.0,0,22,4
626678929a79477ca0101568wuMYES90REuV5YhrN75IXg,560,0.0,5,25,4


In [19]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Подбор модели

In [100]:
# Candidate 1: random forest with a fixed seed.
regr = RandomForestRegressor(random_state=0)
# Candidate 2: gradient-boosted trees, 900 estimators with a small learning rate.
my_model_2 = xgboost.XGBRegressor(
    learning_rate=0.005,
    n_estimators=900,
)

In [24]:
# Search for the best n_estimators value.
# GridSearchCV is imported in the notebook's top import cell.
# Candidate tree counts: 10, 110, ..., 1410 (15 values).
parametrs = {'n_estimators': range(10, 1500, 100)}

In [25]:
# 5-fold cross-validated grid search over `parametrs` for the xgboost model.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt,
# so it appears to be slow - consider a smaller grid.
grid = GridSearchCV(
    estimator=my_model_2,
    param_grid=parametrs,
    cv=5,
)
grid.fit(X_train, y_train)

KeyboardInterrupt: 

Обучим модель

In [94]:
# Fit the xgboost regressor on the training split (all three targets at once).
my_model_2.fit(X=X_train, y=y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.0005, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=900, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [101]:
# Fit the random-forest regressor. my_model_2 is already fitted by the
# preceding training cell; fitting it again here left `regr` untrained,
# while this cell's recorded output shows a RandomForestRegressor.
regr.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

Предскажем значения

In [102]:
# Predict the three targets for the held-out rows.
# NOTE(review): this uses the same model as `pred2` below; it was presumably
# meant to hold the random-forest (`regr`) predictions - confirm.
pred = my_model_2.predict(X=X_test)

In [103]:
# xgboost predictions for the held-out rows; my_model_2 must be fitted first
# (the recorded NotFittedError comes from running cells out of order).
pred2 = my_model_2.predict(X=X_test)

NotFittedError: need to call fit or load_model beforehand

## Оценка точности

In [None]:
#xgboost

In [98]:
# R^2 for each target: column i of pred2 is scored against the i-th target
# column of y_test (views, depth, full_reads_percent).
score_views2, score_depth2, score_frp2 = (
    r2_score(y_test[target_name], pred2[:, column_idx])
    for column_idx, target_name in enumerate(["views", "depth", "full_reads_percent"])
)

In [99]:
# Weighted sum of the three R^2 scores:
# 0.4 for views, 0.3 each for depth and full_reads_percent.
views_part = 0.4 * score_views2
depth_part = 0.3 * score_depth2
frp_part = 0.3 * score_frp2
score = views_part + depth_part + frp_part

score

-12.281728240527062