## Загрузим нужные библиотеки

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import re
# import scipy.sparse
import torch
from torch.utils.data import Dataset
# from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim

from bs4 import BeautifulSoup
import json
import socket
from urllib3.connection import HTTPConnection


In [2]:
!pip install navec
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar

from navec import Navec
navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')

In [3]:
TOKEN_RE = re.compile(r'[\w\d]+')


def tokenize_text_simple_regex(txt, min_token_size=4):
    """Lower-case ``txt`` and split it into tokens matched by ``TOKEN_RE``.

    Args:
        txt: Input string.
        min_token_size: Tokens with fewer characters than this are discarded.

    Returns:
        List of the remaining tokens, in order of appearance.
    """
    kept = []
    for candidate in TOKEN_RE.findall(txt.lower()):
        if len(candidate) < min_token_size:
            continue
        kept.append(candidate)
    return kept


def character_tokenize(txt):
    """Return the characters of ``txt`` as a list, one item per character."""
    return [symbol for symbol in txt]


def tokenize_corpus(texts, tokenizer=tokenize_text_simple_regex, **tokenizer_kwargs):
    """Tokenize every item of ``texts`` with ``tokenizer``.

    Args:
        texts: Iterable of texts.
        tokenizer: Callable applied to each text.
        **tokenizer_kwargs: Forwarded unchanged to every ``tokenizer`` call.

    Returns:
        List with one tokenizer result per input text.
    """
    tokenized = []
    for text in texts:
        tokenized.append(tokenizer(text, **tokenizer_kwargs))
    return tokenized
# Lemma-based tokenization could be tried here as well.

def add_fake_token(word2id, token='<PAD>'):
    """Return a new mapping with every id shifted up by one and ``token`` mapped to 0.

    The input dictionary is left untouched.
    """
    shifted = {}
    for word, idx in word2id.items():
        shifted[word] = idx + 1
    shifted[token] = 0
    return shifted


def texts_to_token_ids(tokenized_texts, word2id):
    """Convert token lists to id lists, skipping tokens that are not keys of ``word2id``."""
    encoded = []
    for text in tokenized_texts:
        known = (tok for tok in text if tok in word2id)
        encoded.append([word2id[tok] for tok in known])
    return encoded


def build_vocabulary(tokenized_texts, max_size=1000000, max_doc_freq=0.8, min_count=5, pad_word=None):
    """Build a vocabulary ordered by document frequency.

    Args:
        tokenized_texts: Iterable of token sequences, one per document.
        max_size: Upper bound on the number of vocabulary entries
            (the padding entry, if any, counts towards it).
        max_doc_freq: Tokens occurring in a larger share of documents are dropped.
        min_count: Tokens occurring in fewer documents are dropped.
        pad_word: If not None, this token is placed first (id 0) with frequency 0.

    Returns:
        ``(word2id, word2freq)``: a dict token -> id (ids follow descending
        document frequency) and a float32 array with the share of documents
        containing each token, indexed by id.
    """
    word_counts = collections.Counter()
    doc_n = 0

    # Count, for every token, the number of documents it occurs in,
    # and the total number of documents.
    for txt in tokenized_texts:
        doc_n += 1
        word_counts.update(set(txt))

    # Drop tokens that are too rare or too frequent.
    word_counts = {word: cnt for word, cnt in word_counts.items()
                   if cnt >= min_count and cnt / doc_n <= max_doc_freq}

    # Sort by descending document frequency. Ties are broken by the token's
    # string form so the ids do not depend on set iteration order, which is
    # randomised between interpreter runs for string tokens.
    sorted_word_counts = sorted(word_counts.items(),
                                key=lambda pair: (-pair[1], str(pair[0])))

    # Prepend an artificial token with id 0 for convenient batch padding.
    if pad_word is not None:
        sorted_word_counts = [(pad_word, 0)] + sorted_word_counts

    # Keep only the max_size most frequent entries. The check uses the final
    # list so that the padding entry is counted as well.
    if len(sorted_word_counts) > max_size:
        sorted_word_counts = sorted_word_counts[:max_size]

    # Enumerate the tokens.
    word2id = {word: i for i, (word, _) in enumerate(sorted_word_counts)}

    # Normalise the counts; with no documents only the padding entry can be
    # present and its frequency is defined as 0.
    word2freq = np.array([cnt / doc_n if doc_n else 0.0 for _, cnt in sorted_word_counts],
                         dtype='float32')

    return word2id, word2freq


PAD_TOKEN = '__PAD__'
NUMERIC_TOKEN = '__NUMBER__'
NUMERIC_RE = re.compile(r'^([0-9.,e+\-]+|[mcxvi]+)$', re.I)


def replace_number_nokens(tokenized_texts):
    """Replace every token matched by ``NUMERIC_RE`` with ``NUMERIC_TOKEN``.

    The pattern accepts tokens made only of digits and ``. , e + -`` or only of
    the letters ``m c x v i`` (case-insensitive); all other tokens are kept.

    Returns:
        A new list of token lists with the same structure as the input.
    """
    replaced = []
    for text in tokenized_texts:
        row = []
        for token in text:
            if NUMERIC_RE.match(token):
                row.append(NUMERIC_TOKEN)
            else:
                row.append(token)
        replaced.append(row)
    return replaced

Выполним загрузку датасета

In [5]:
# Read the train and test tables; the first CSV column is used as the index.
TRAIN_CSV_URL = "https://lodmedia.hb.bizmrg.com/case_files/768812/train_dataset_train.csv"
TEST_CSV_URL = "https://lodmedia.hb.bizmrg.com/case_files/768812/test_dataset_test.csv"
df_train = pd.read_csv(TRAIN_CSV_URL, index_col=0)
df_test = pd.read_csv(TEST_CSV_URL, index_col=0)

## Проанализируем датасет

In [6]:
# Derive calendar features from `publish_date`. The column is parsed once per
# frame (instead of once per feature) and both frames go through the same code.
for frame in (df_train, df_test):
    published = pd.to_datetime(frame['publish_date'])
    frame['hour'] = published.dt.hour
    frame['day'] = published.dt.day.astype(int)
    frame['mounth'] = published.dt.month.astype(int)  # column name kept as is (sic)
    frame['day_of_week'] = published.dt.dayofweek
df_test.head()

In [7]:
# Every test-set column except the raw timestamp, kept in the frame's own
# column order (a set difference would return them in arbitrary order).
features = [column for column in df_test.columns if column != 'publish_date']

# Histograms of the selected columns on the training data.
_ = df_train[features].hist(figsize=(20,12))

In [8]:
# Stack train and test into one frame so that categories and authors
# are encoded identically for both parts.
df_all = pd.concat((df_train, df_test), sort=False)
df_all


In [9]:
df_all = pd.concat((df_train, df_test), sort=False)
# Store the first 24 characters of every index value in a `document` column
# (presumably the article id used in https://www.rbc.ru/rbcfreenews/<id> — confirm).
df_all['document'] = [doc_key[0:24] for doc_key in df_all.index]
df_all.document

In [10]:
# Alternative link to the same kind of file, kept for reference:
#df_texts = pd.read_csv("https://vk.com/doc3811482_637303495?hash=UZcVDOKLKBeZZe3ImKuT8c9ZKdo4mjkBAPAnWSKuB2H&dl=oImVKIFQ82nQTyKhh3554942cMz05qGo286hCFuS8dw")
df_texts = pd.read_csv('https://vk.com/doc3811482_637303495?hash=DyaC3SymdU09hS0OLrYTFcJIZ8EEfElYoHSM3qcpIew&dl=TfOTFr6tMnvq1mdQnseZHQkmPHle6W73WA7n2AupHVX')
df_texts['length'] = df_texts.text.apply(lambda x: len(str(x)))

# df_texts is read without an index column while df_all keeps the index of the
# source CSVs, so assigning a df_texts column directly would align on labels and
# fill df_all with NaN. Copy the values by position instead.
# NOTE(review): this assumes row i of df_texts belongs to row i of df_all,
# exactly as the previous element-by-element copy did — confirm.
n_rows = len(df_all)
if len(df_texts) < n_rows:
    raise ValueError(
        'df_texts has %d rows but df_all has %d' % (len(df_texts), n_rows))
df_all['length'] = df_texts['length'].to_numpy()[:n_rows]
df_all['text'] = df_texts['text'].apply(str).to_numpy()[:n_rows]

In [11]:
# Binary title features: 1 when the title contains at least one of the listed
# (case-sensitive) substrings, otherwise 0. The dict order is both the column
# creation order and the order in which the shares are printed.
title_keywords = {
    'Putin': (r'Путин',),
    'Ukraine': (r'Украин',),
    'War': (r'воен', r'Воен'),
    'Gas': (r'газ', r'Газ'),
    'COVID': (r'COVID',),
}
for flag_name, flag_patterns in title_keywords.items():
    df_all[flag_name] = df_all.title.apply(
        lambda x, flag_patterns=flag_patterns:
            1 if any(re.search(p, x) for p in flag_patterns) else 0)
    print(df_all[flag_name].mean())
# Title flags that were tried and are currently disabled:
# Medvedev (Медведев), Biden (Байден), Musk (Маск), bank (банк/Банк),
# oil (нефт/Нефт), krypto (рипто), stock (акци), Bitcoin (иткоин),
# virus (ирус), ospa (осп/Осп), sanctions (санкци/Санкци).


In [12]:
# Binary text features: 1 when the article text contains at least one of the
# listed (case-sensitive) patterns, otherwise 0. The dict order is both the
# column creation order and the order in which the shares are printed.
text_keywords = {
    'Putin_text': (r'Путин',),
    'Ukraine_text': (r'Украин',),
    'War_text': (r'воен', r'Воен'),
    'Gas_text': (r'газ', r'Газ'),
    'bank_text': (r'банк', r'Банк'),
    'oil_text': (r'нефт', r'Нефт'),
    # \u0410 is the Cyrillic capital letter A; a Latin "A" here would never
    # match the capitalised Russian word.
    'stock_text': (r'акци', r'\u0410кци'),
    # [ао] matches the regular spelling "коронавирус" as well as the
    # previously used stem "короно".
    'COVID_text': (r'COVID', r'корон[ао]', r'ковид'),
    'Rus': (r'Росси', r'росси'),
    'USA': (r'США', r'Американ'),
    'sanctions_text': (r'санкци', r'Санкци'),
    'Sport_text': (r'спорт', r'Спорт'),
}
for flag_name, flag_patterns in text_keywords.items():
    # str(x) is applied for every feature so a non-string cell cannot raise.
    df_all[flag_name] = df_all.text.apply(
        lambda x, flag_patterns=flag_patterns:
            1 if any(re.search(p, str(x)) for p in flag_patterns) else 0)
    print(df_all[flag_name].mean())

# NOTE(review): unlike the features above, this one is written to df_texts and
# therefore never reaches df_all — confirm whether that is intended.
df_texts['rubl_text'] = df_texts.text.apply(lambda x: 1 if re.search(r'Рубл', str(x)) or re.search(r'рубл', str(x)) else 0)
print(df_texts.rubl_text.mean())
# Flags that were tried and are currently disabled:
# Medvedev (Медведев), Biden (Байден), Musk (Маск), krypto (рипто),
# Bitcoin (иткоин/крипто), virus (ирус), ospa (осп/Осп).

In [13]:
# Show rows 3..9 (by position) as a quick sanity check.
df_all.iloc[3:10]

In [14]:
# Replace the labels in `category` and `authors` with integer category codes.
for label_column in ("category", "authors"):
    label_codes = df_all[label_column].astype('category').cat.codes
    df_all[label_column] = label_codes.astype('int')


In [15]:
def _parse_tags(raw):
    """Split a stringified tag list on commas and strip brackets, spaces and the outer quotes of each item."""
    items = raw.split(sep=',')
    items = [item.replace('[', '').replace(']', '').replace(' ', '') for item in items]
    return [item[1:-1] for item in items]


df_all.tags = df_all.tags.apply(_parse_tags)

# Feature: number of tags. Not clear yet whether it is useful.
# (An empty list '[]' still yields one empty item, so the value is never 0.)
df_all['lentags'] = df_all.tags.apply(len)

MAX_DF = 0.9
MIN_COUNT = 3
MAX_TAGS = 14  # number of tag columns created below (tag0 .. tag13)

# Reserve id 0 for padding via pad_word; otherwise id 0 would belong to the
# most frequent tag and be indistinguishable from the padding added below.
vocabulary, word_doc_freq = build_vocabulary(df_all.tags, max_doc_freq=MAX_DF,
                                             min_count=MIN_COUNT, pad_word=PAD_TOKEN)
UNIQUE_WORDS_N = len(vocabulary)  # includes the padding entry
print('Количество уникальных токенов', UNIQUE_WORDS_N)
print(list(vocabulary.items())[10])
df_all.tags = texts_to_token_ids(df_all.tags, vocabulary)

# Right-pad every id list with 0 up to MAX_TAGS items (the lists stored in the
# column are extended in place).
for tag_ids in df_all.tags:
    tag_ids.extend([0] * (MAX_TAGS - len(tag_ids)))
    if len(tag_ids) > MAX_TAGS:
        raise ValueError('an article has %d tags, more than MAX_TAGS=%d'
                         % (len(tag_ids), MAX_TAGS))

tags2vec = gensim.models.Word2Vec(sentences=df_all.tags.tolist(), negative=20,
                                  window=5, min_count=3, workers=4,
                                  sg=1)

# Spread the padded id lists over the columns tag0 .. tag13. The labels are an
# ordered list so that column tagK always holds position K of the list.
tag_columns = ['tag' + str(i) for i in range(MAX_TAGS)]
df_all = df_all.join(pd.DataFrame(df_all.tags.tolist(), index=df_all.index, columns=tag_columns))
df_all[3:5]

In [16]:
# Replace every tag id in tag0 .. tag13 with the first component of its
# Word2Vec vector. Looking up a whole column returns one vector per row, which
# cannot be stored in a single column as is; the next cell works with scalars
# and uses tags2vec.wv[0][0] as the padding value, so component 0 is taken
# explicitly here.
for tag_position in range(14):
    tag_column = 'tag' + str(tag_position)
    tag_vectors = tags2vec.wv[df_all[tag_column].tolist()]  # one row per article
    df_all[tag_column] = tag_vectors[:, 0]
df_all.tag3

In [17]:
# First component of the vector stored for id 0, the id used to pad tag lists.
xxx = tags2vec.wv[0][0]
# Add up the 14 tag columns in the order tag1, tag2, ..., tag13, tag0.
tag_total = df_all['tag1']
for tag_no in list(range(2, 14)) + [0]:
    tag_total = tag_total + df_all['tag' + str(tag_no)]
# Subtract the share attributed to padding and average over the tag count.
# NOTE(review): lentags is computed before tags missing from the vocabulary are
# dropped, so such rows hold more padding than (14 - lentags) — confirm.
df_all['tags_mean'] = (tag_total - (14 - df_all.lentags)*xxx)/df_all.lentags
df_all.tags_mean

In [18]:
# Tokenize the titles and replace every token with its navec vector
# (tokens unknown to navec get the '<unk>' vector).
# Note: this overwrites `title`, so the cell cannot be re-run on its own output.
unk_vector = navec['<unk>']
title_tokens = tokenize_corpus(df_all.title)
df_all.title = [[navec[token] if token in navec else unk_vector for token in tokens]
                for tokens in title_tokens]

# Sum of all vector components of all tokens in the title (a single number).
df_all['titleVectSum'] = df_all.title.apply(lambda vectors: np.sum([v for v in vectors]))
# Mean token vector. A title without any token that survives tokenization would
# divide by zero, so it gets an all-zero vector of the same shape instead.
df_all['titleVect'] = df_all.title.apply(
    lambda vectors: sum(vectors)/len(vectors) if len(vectors) else np.zeros_like(unk_vector))
df_all[5:10]

In [19]:
# Collapse every mean title vector to one number: the average of its components.
# No column labels are passed: a set of labels has no defined order, and the
# row-wise mean does not depend on them anyway.
OnlyTitle = pd.DataFrame(df_all.titleVect.tolist(), index=df_all.index).mean(axis=1)
df_all['titleNumber'] = OnlyTitle
df_all.head()



Всего 9 категорий статей

In [20]:
# One-hot encoded variants of df_all, one per target:
#   FP (full_reads_percent) and D (depth): authors + category encoded,
#   V (views): only category encoded.
df_allFP = pd.get_dummies(df_all, columns=["authors"])
df_allFP = pd.get_dummies(df_allFP, columns=["category"])
df_allV = pd.get_dummies(df_all, columns=["category"])
# Same encoding as df_allFP, so it is copied rather than computed a second time.
df_allD = df_allFP.copy()

# df_all is df_train followed by df_test, so the boundary between the two parts
# is the number of training rows (no hard-coded row counts).
n_train = len(df_train)

df_trainFP = df_allFP.iloc[:n_train]
df_testFP = df_allFP.iloc[n_train:]
df_trainD = df_allD.iloc[:n_train]
df_testD = df_allD.iloc[n_train:]
df_trainV = df_allV.iloc[:n_train]
df_testV = df_allV.iloc[n_train:]

In [21]:
# First row of the encoded training frame.
df_trainFP.head(1)

In [41]:
# Correlation heatmap of the three targets and a few candidate features.
corr_columns = [
    "views", "depth", "full_reads_percent",
    'ctr', 'Putin', 'Ukraine', 'titleNumber', 'Gas', 'COVID',
]
X_plot = df_trainFP[corr_columns]
ax = sns.heatmap(X_plot.corr(), vmin=-1, vmax=+1, annot=True, cmap='coolwarm')

## Выделим выборки

In [23]:
# Columns removed from every feature matrix: the targets and the raw /
# intermediate columns.
base_drop = ["views", "depth", "full_reads_percent", 'document', 'titleVectSum',
             'titleVect', 'text', "title", "publish_date", "session", "tags"]
title_flags = ['Putin', 'Ukraine', 'War', 'Gas', 'COVID']

# Per-target drop lists (V = views, D = depth, F = full_reads_percent).
drop_V = base_drop + ["authors", 'lentags', 'titleNumber'] + title_flags
drop_D = base_drop + ['lentags']
drop_F = base_drop + title_flags

XV = df_trainV.drop(drop_V, axis=1)
XD = df_trainD.drop(drop_D, axis=1)
XF = df_trainFP.drop(drop_F, axis=1)

y_views = df_trainV[["views"]]
y_depth = df_trainD[["depth"]]
y_full_reads_percent = df_trainFP[["full_reads_percent"]]
y = df_train[["views", "depth", "full_reads_percent"]]

X_testV = df_testV.drop(drop_V, axis=1)
X_testD = df_testD.drop(drop_D, axis=1)
X_testF = df_testFP.drop(drop_F, axis=1)


# from sklearn.preprocessing import StandardScaler
# ss = StandardScaler()
# XV = ss.fit_transform(XV)
# X_testV = ss.transform(X_testV)

# ss2 = StandardScaler()
# XD = ss2.fit_transform(XD)
# X_testD = ss2.transform(X_testD)

# ss3 = StandardScaler()
# XF = ss2.fit_transform(XF)
# X_testF = ss2.transform(X_testF)

Разделим данные на обучающую и валидационную выборки

In [25]:
# Hold out 5% of the rows for validation, with the same seed for all three targets.
split_options = dict(test_size=0.05, random_state=210)
X_train, X_val, y_train, y_val = train_test_split(XV, y_views, **split_options)
X_trainD, X_valD, y_trainD, y_valD = train_test_split(XD, y_depth, **split_options)
X_trainF, X_valF, y_trainF, y_valF = train_test_split(XF, y_full_reads_percent, **split_options)

In [26]:
# Shapes of the three training matrices (views, depth, full_reads_percent).
for train_part in (X_train, X_trainD, X_trainF):
    print(train_part.shape)

## Подбор модели RandomForest

In [27]:
# Model for `views`.
# Notes from earlier tuning (values in brackets appear to be alternatives that were tried):
#   RandomForestRegressor(max_depth=25 [12,15,50], max_features=12 [5,8,10,15,20], min_samples_leaf=3,
#                         n_estimators=500, n_jobs=-1, random_state=5); criterion='absolute_error' worked great.
#   0.910535319898284 - together with the `length` feature
regr = RandomForestRegressor(n_jobs=-1, random_state=5)
parametres = {
    'criterion': ['absolute_error'],
    'n_estimators': [500],
    'max_depth': [25],
    'min_samples_leaf': [1],
    'max_features': [14],
}
RegrCV = GridSearchCV(regr, parametres, cv=5)
RegrCV.fit(X_train, y_train.values.ravel())
# Keep the refitted best estimator and score it on the held-out rows.
regr = RegrCV.best_estimator_
print(regr)
RegrCV.score(X_val, y_val.values.ravel())

In [28]:
# Feature importances of the views model, followed by the first rows they refer to.
print(regr.feature_importances_)
X_train.iloc[0:7]

Обучим модель

In [29]:
# Model for `depth`.
# Notes from earlier tuning (values in brackets appear to be alternatives that were tried):
#   RandomForestRegressor(max_depth=25 [15 ok,50], max_features=500 [300,400], min_samples_leaf=3,
#                         n_estimators=1500 [600,1000], n_jobs=-1, random_state=5)
#   0.8376791146716198 with the `length` feature
Degr = RandomForestRegressor(n_jobs=-1, random_state=5)
parametres = {
    'criterion': ['squared_error'],
    'n_estimators': [1200],
    'max_depth': [25],
    'min_samples_leaf': [1],
    'max_features': [500],
}
DegrCV = GridSearchCV(Degr, parametres, cv=5)
DegrCV.fit(X_trainD, y_trainD.values.ravel())
# Keep the refitted best estimator and score it on the held-out rows.
Degr = DegrCV.best_estimator_
print(Degr)
DegrCV.score(X_valD, y_valD.values.ravel())

In [30]:
# print(Degr.feature_importances_)
# X_trainD[0:7]

In [31]:
# Model for `full_reads_percent`.
# Notes from earlier tuning (values in brackets appear to be alternatives that were tried):
#   RandomForestRegressor(max_depth=25, max_features=400 [150,600], min_samples_leaf=3,
#                         n_estimators=1000 [500,1500,2000], n_jobs=-1, random_state=5)
#   0.3894043042874483 with the `length` feature
Fegr = RandomForestRegressor(n_jobs=-1, random_state=5)
parametres = {
    'criterion': ['absolute_error'],
    'n_estimators': [1200],
    'max_depth': [25],
    'min_samples_leaf': [3],
    'max_features': [400],
}
FegrCV = GridSearchCV(Fegr, parametres, cv=5)
FegrCV.fit(X_trainF, y_trainF.values.ravel())
# Keep the refitted best estimator and score it on the held-out rows.
Fegr = FegrCV.best_estimator_
print(Fegr)
FegrCV.score(X_valF, y_valF.values.ravel())

In [32]:
# print(Fegr.feature_importances_)
# X_trainF[0:7]

Предскажем значения

In [33]:
# Validation-set predictions of the three models (views, depth, full_reads_percent).
Vred, Dred, Fred = (
    regr.predict(X_val),
    Degr.predict(X_valD),
    Fegr.predict(X_valF),
)


## Оценка точности

In [34]:
# R^2 per target on the validation split.
score_views, score_depth, score_frp = (
    r2_score(y_true, y_pred)
    for y_true, y_pred in ((y_val, Vred), (y_valD, Dred), (y_valF, Fred))
)
# Combined metric: weights 0.4 / 0.3 / 0.3 for views / depth / full_reads_percent.
score = 0.4 * score_views + 0.3 * score_depth + 0.3 * score_frp
print(score_views, score_depth, score_frp)
print('финальный score на валидации:', score)

In [35]:
# Test-set predictions; these reuse (and overwrite) the names of the validation predictions.
Vred = regr.predict(X_testV)
Dred = Degr.predict(X_testD)
Fred = Fegr.predict(X_testF)

# One column per target, one row per test article.
PRED = pd.DataFrame({
    'views': Vred,
    'depth': Dred,
    'full_reads_percent': Fred,
})
PRED

**Вывод результатов**

In [36]:
# Build the submission table with the column names required for hand-in.
# A copy is used so that PRED is not modified and the cell can be re-run
# (inserting into PRED itself fails the second time because the column exists).
output3 = PRED.copy()
output3.insert(0, 'document_id', X_testV.index)

output3.to_csv('subrbc2.csv', index=False)
output3.head(5)