In [137]:
import nltk
nltk.download('stopwords')

import os
import pickle

from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import  mean_squared_error, mean_absolute_error

import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [138]:
texts = []
likes = []
for filename in tqdm(os.listdir('clean_files')):
    one_file = pickle.load(open(f'clean_files/{filename}', 'rb'))
    like = (-1 * int(one_file['likes'][1:])) if one_file['likes'][0] == '–' else int(one_file['likes'])
    texts.append({})
    texts[-1]['text'] = one_file['text']
    texts[-1]['hubs'] = [i for i in one_file['hubs'].lower().strip(',').split() if i != '']
    texts[-1]['file'] = filename
    likes.append(like)

HBox(children=(FloatProgress(value=0.0, max=8109.0), HTML(value='')))




In [139]:
print(one_file.keys())

dict_keys(['id', 'status', 'title', 'text', 'time', 'hubs', 'likes'])


In [140]:
X_train, X_test, y_train, y_test = train_test_split(texts, likes, test_size=0.33, random_state=42)

In [141]:
vectorizer = TfidfVectorizer(
    max_features=2048, lowercase=True, analyzer='word',
    stop_words=stopwords.words('russian'), ngram_range=(1,3),dtype=np.float32
)
train_vects = vectorizer.fit_transform([text['text'] for text in X_train])
test_vects = vectorizer.transform([text['text'] for text in X_test])

In [142]:
regressor = LinearRegression().fit(train_vects, y_train)

In [143]:
def print_examples(X_test, y_test, predicted_test):
    n = 5
    max_scores, min_scores = [(-999999, -1)] * n, [(999999, -1)] * n
    for i in range (len(X_test)):
        score = (y_test[i] - predicted_test[i]) ** 2
        for j in range(n):
            if score > max_scores[j][0]:
                for k in range(n - 1, j, -1):
                    max_scores[k] = max_scores[k - 1]
                max_scores[j] = (score, i)
                break
        for j in range(n):
            if score < min_scores[j][0]:
                for k in range(n - 1, j, -1):
                    min_scores[k] = min_scores[k - 1]
                min_scores[j] = (score, i)
                break

    print()
    print('Best:')
    for ex in min_scores:
        print('Squere error: {:10.2f}'.format(ex[0]))
        print('Text: ' + X_test[ex[1]]['file'])
        print('Hubs: ' + ', '.join(X_test[ex[1]]['hubs']))
        print('predicted: ' + str(round(predicted_test[ex[1]])) + ', real: ' + str(y_test[ex[1]]))
        print()
    print('---------------------------------')
    print('Worse:')
    for ex in max_scores:
        print('Squere error: {:6.2f}'.format(ex[0]))
        print('Text: ' + X_test[ex[1]]['file'])
        print('Hubs: ' + ', '.join(X_test[ex[1]]['hubs']))
        print('predicted: ' + str(round(predicted_test[ex[1]])) + ', real: ' + str(y_test[ex[1]]))
        print()


def evaluate(predictor, train_vects, test_vects, X_train, X_test, y_train, y_test):
    predicted_train = predictor.predict(train_vects)
    predicted_test = predictor.predict(test_vects)    
    
    print('MSE train', mean_squared_error(predicted_train, y_train))
    print('MAE train', mean_absolute_error(predicted_train, y_train))
    print()
    print('MSE test', mean_squared_error(predicted_test, y_test))
    print('MAE test', mean_absolute_error(predicted_test, y_test))

    print_examples(X_test, y_test, predicted_test)

In [144]:
evaluate(regressor, train_vects, test_vects, X_train, X_test, y_train, y_test)

MSE train 214.3008234895762
MAE train 10.270459248573072

MSE test 688.1320474256046
MAE test 17.440299681480923

Best:
Squere error:       0.00
Text: 295308.pkl
Hubs: исследования, и, прогнозы, в, it
predicted: 9, real: 9

Squere error:       0.00
Text: 349020.pkl
Hubs: блог, компании, google, developers,, разработка, веб-сайтов,, монетизация, веб-сервисов
predicted: 24, real: 24

Squere error:       0.00
Text: 516554.pkl
Hubs: проектирование, и, рефакторинг,, управление, разработкой,, микросервисы
predicted: 0, real: 0

Squere error:       0.01
Text: 408929.pkl
Hubs: криптовалюты
predicted: 14, real: 14

Squere error:       0.01
Text: 403889.pkl
Hubs: блог, компании, medgadgets
predicted: 12, real: 12

---------------------------------
Worse:
Squere error: 169654.46
Text: 505240.pkl
Hubs: управление, персоналом,, карьера, в, it-индустрии,, it-компании
predicted: 54, real: 466

Squere error: 75558.21
Text: 535626.pkl
Hubs: научно-популярное,, биотехнологии
predicted: 29, real: 304

Sq

На примерах выше видно, что модель старается не давать слишком больших ответов и поэтому склонна к ошибкам, например,когда реальный ответ трёхзначен. Это логично, т.к. трёхначные выбросы встречаются редко:

In [145]:
print(len(y_train))
print(len([i for i in y_train if i > 100]))

5433
61


Т.к. выбросы дают самую большую ошибку, сделаем также эксперимент без них:

In [146]:
print(len([i for i in y_train if i > 40]))
print(len([i for i in y_train if i > 30]))

487
822


In [147]:
indicies = [i for i in range(len(y_test)) if 0 < y_test[i] < 40]
X_test_ = [X_test[i] for i in indicies]
y_test_ = [y_test[i] for i in indicies]
test_vects_ = test_vects = vectorizer.transform([text['text'] for text in X_test_])

evaluate(regressor, train_vects, test_vects_, X_train, X_test_, y_train, y_test_)

MSE train 214.3008234895762
MAE train 10.270459248573072

MSE test 326.47417955685313
MAE test 14.06173214284955

Best:
Squere error:       0.00
Text: 295308.pkl
Hubs: исследования, и, прогнозы, в, it
predicted: 9, real: 9

Squere error:       0.00
Text: 349020.pkl
Hubs: блог, компании, google, developers,, разработка, веб-сайтов,, монетизация, веб-сервисов
predicted: 24, real: 24

Squere error:       0.01
Text: 408929.pkl
Hubs: криптовалюты
predicted: 14, real: 14

Squere error:       0.01
Text: 403889.pkl
Hubs: блог, компании, medgadgets
predicted: 12, real: 12

Squere error:       0.01
Text: 287504.pkl
Hubs: управление, медиа
predicted: 6, real: 6

---------------------------------
Worse:
Squere error: 25318.42
Text: 408923.pkl
Hubs: блог, компании, kingston, technology,, компьютерное, железо,, настольные, компьютеры,, diy, или, сделай, сам,, игры, и, игровые, приставки
predicted: 162, real: 3

Squere error: 5236.01
Text: 354956.pkl
Hubs: информационная, безопасность,, разработка, в