# Классификация "фейковых" новостей

Задача:
1. 3 способами получить f1-score > 0.91 через sklearn
2. 3 способами получить f1-score > 0.52 через pytorch

Загрузим данные

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests

In [2]:
# URL для загрузки
url = "https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv"

# Загрузка файла
response = requests.get(url)

# Проверка, что запрос выполнен успешно
if response.status_code == 200:
    # Запись содержимого в файл
    with open("Constraint_Train.csv", "wb") as file:
        file.write(response.content)
    print("Файл успешно загружен и сохранён как 'Constraint_Train.csv'.")
else:
    print(f"Не удалось загрузить файл. Статус код: {response.status_code}")

Файл успешно загружен и сохранён как 'Constraint_Train.csv'.


In [3]:
dataset = pd.read_csv("Constraint_Train.csv")

dataset.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


## Методы sklearn

### TfidfVectorizer + SVC

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer
import re
import string


en_stopwords = stopwords.words("english")
morph = MorphAnalyzer()

In [5]:
def preprocess_text(text):
    text = text.lower()
    # text = re.sub(r"[{}]".format(string.punctuation), "", text)
    text = word_tokenize(text, language="english")
    text = [word for word in text if word not in en_stopwords]
    text = [morph.parse(word)[0].normal_form for word in text]

    return " ".join(text)

In [6]:
dataset.tweet = dataset.tweet.apply(preprocess_text)

In [7]:
vectorizer = TfidfVectorizer(max_features=10000)
bow = vectorizer.fit_transform(dataset.tweet)

In [8]:
bow.shape

(6420, 10000)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [10]:
x_train, x_test, y_train, y_test = train_test_split(bow, dataset.label, test_size=0.33, random_state=42)

In [11]:
from sklearn.svm import SVC

In [12]:
model = SVC(random_state=42)

model.fit(x_train, y_train)

In [13]:
y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.92      0.94      0.93      1004
        real       0.95      0.93      0.94      1115

    accuracy                           0.93      2119
   macro avg       0.93      0.93      0.93      2119
weighted avg       0.93      0.93      0.93      2119



### Word2Vec + RandomForestClassifier

In [14]:
def get_tokens(text):
    text = word_tokenize(text)

    return text

In [15]:
from gensim.models.word2vec import Word2Vec


dataset.tokens = dataset.tweet.apply(get_tokens)

model_tweets = Word2Vec(dataset.tokens, workers=4, vector_size=300, min_count=3, window=5, epochs=15)

  dataset.tokens = dataset.tweet.apply(get_tokens)


In [16]:
model_tweets.init_sims() # Нормировка векторов

  model_tweets.init_sims() # Нормировка векторов


In [17]:
# Токенизация текстов через обученную word2vec
def get_text_embedding(text):
    result = []
    for word in word_tokenize(text.lower()):
        if word in model_tweets.wv:
            result.append(model_tweets.wv[word]) # Добавляем вектор слова, если он был при обучении
    
    if len(result):
        result = np.average(result, axis=0) # Среднее всех векторов
    else:
        result = np.zeros(300) # 300 нулей (размер векторов слов), если не было никаких слов из обученного словаря
    return result

In [18]:
features = dataset.tweet.apply(get_text_embedding)

In [19]:
features = np.array(features.tolist())

In [20]:
x_train, x_test, y_train, y_test = train_test_split(features, dataset.label, test_size=0.33, random_state=42)

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
model = RandomForestClassifier(random_state=42)

model.fit(x_train, y_train)

In [23]:
y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.95      0.91      0.93      1004
        real       0.92      0.95      0.94      1115

    accuracy                           0.93      2119
   macro avg       0.93      0.93      0.93      2119
weighted avg       0.93      0.93      0.93      2119



### FastText и RandomForestClassifier с подобранными параметрами

In [24]:
from gensim.models.fasttext import FastText


model_tweets = FastText(dataset.tokens, workers=4, vector_size=300, min_count=3, window=5, epochs=15)

In [25]:
# Токенизация текстов через обученную fasttext
def get_text_embedding(text):
    result = []
    for word in word_tokenize(text.lower()):
        if word in model_tweets.wv:
            result.append(model_tweets.wv[word]) # Добавляем вектор слова, если он был при обучении
    
    if len(result):
        result = np.average(result, axis=0) # Среднее всех векторов
    else:
        result = np.zeros(300) # 300 нулей (размер векторов слов), если не было никаких слов из обученного словаря
    return result

In [26]:
features = dataset.tweet.apply(get_text_embedding)

In [27]:
features = np.array(features.tolist())

In [28]:
x_train, x_test, y_train, y_test = train_test_split(features, dataset.label, test_size=0.33, random_state=42)

In [29]:
model = RandomForestClassifier(random_state=42)

model.fit(x_train, y_train)

In [30]:
y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.93      0.89      0.91      1004
        real       0.91      0.94      0.92      1115

    accuracy                           0.92      2119
   macro avg       0.92      0.92      0.92      2119
weighted avg       0.92      0.92      0.92      2119



In [32]:
from sklearn.model_selection import GridSearchCV


model = RandomForestClassifier(random_state=42)

grid_space={'max_depth':[3, 5, 10, None],
              'n_estimators':[50, 100, 200, 300],
              'max_features':[1, 3, 5, 7],
              'min_samples_leaf':[1, 2, 3],
              'min_samples_split':[1, 2, 3]
           }


grid_search = GridSearchCV(model, grid_space, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)

# Лучшие параметры
print("Лучшие параметры:", grid_search.best_params_)
# Лучшая оценка
print("Лучшая оценка:", grid_search.best_score_)
# Лучшая модель
best_clf = grid_search.best_estimator_

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


KeyboardInterrupt: 

In [None]:
best_clf.fit(x_train, y_train)

y_pred = best_clf.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        fake       0.93      0.89      0.91      1004
        real       0.90      0.94      0.92      1115

    accuracy                           0.92      2119
   macro avg       0.92      0.92      0.92      2119
weighted avg       0.92      0.92      0.92      2119



## Методы PyTorch

### Полносвязный слой

In [31]:
labels = (dataset.label == "real").astype(int).to_list() # Переводим метки в числа

In [32]:
token_lists = [word_tokenize(text.lower()) for text in dataset.tweet]
max_len = len(max(token_lists, key=len))

In [33]:
max_len

1014

In [33]:
from collections import Counter


fb = Counter([len(tokens) for tokens in token_lists])

In [34]:
fb.most_common(10)

[(17, 261),
 (19, 243),
 (18, 241),
 (12, 225),
 (21, 224),
 (16, 224),
 (14, 223),
 (13, 222),
 (15, 216),
 (20, 216)]

Брать максимальный размер в 1014 неразумно, лучше взять более реальное число в 200.

In [35]:
max_len = 200

In [36]:
dataset.tokens = dataset.tweet.apply(get_tokens)

model_tweets = Word2Vec(dataset.tokens, workers=4, vector_size=300, min_count=3, window=5, epochs=15)
model_tweets.init_sims()

  model_tweets.init_sims()


In [37]:
# Текст -> вектор через обученную word2vec
def get_text_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    
    if len(result):
        result = np.average(result, axis=0) # Среднее всех векторов
    else:
        result = np.zeros(300) # 300 нулей (размер векторов слов), если не было никаких слов из обученного словаря
    return result

In [39]:
features = [get_text_embedding(text, max_len) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:00<00:00, 7394.04it/s]


In [41]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

In [38]:
import torch
import torch.nn as nn
import torch.optim as optim

In [42]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.out = nn.Linear(300, 1)
    
    def forward(self, x):
        prediction = torch.sigmoid(self.out(x))
        return prediction
    
net = Net()
print(net)

Net(
  (out): Linear(in_features=300, out_features=1, bias=True)
)


In [59]:
in_data = torch.tensor(x_train).float()
targets = torch.tensor(y_train).float()

In [60]:
in_data.shape

torch.Size([4301, 300])

In [45]:
optimizer = optim.Adam(net.parameters(), lr=0.01)
criterion = nn.BCELoss()

In [88]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]

        optimizer.zero_grad() # отчистка прошлых градиентов
        output = net(batch_x) # подсчёт предсказанных НС значений
        loss = criterion(output.squeeze(), batch_y) # посчёт функции потерь
        loss.backward() # вычисление градиентов
        optimizer.step() # шаг градиентного спуска
    print(loss)

In [64]:
for _ in range(100):
    train_one_epoch(in_data, targets)

100%|██████████| 269/269 [00:00<00:00, 3501.33it/s]


tensor(0.2180, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3421.24it/s]


tensor(0.1517, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4290.97it/s]


tensor(0.1241, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4533.78it/s]


tensor(0.1089, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4199.89it/s]


tensor(0.0993, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4570.40it/s]


tensor(0.0929, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4479.03it/s]


tensor(0.0883, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4653.09it/s]


tensor(0.0850, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4420.42it/s]


tensor(0.0825, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4677.69it/s]


tensor(0.0806, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4151.28it/s]


tensor(0.0791, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4501.35it/s]


tensor(0.0774, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4411.55it/s]


tensor(0.0757, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4494.14it/s]


tensor(0.0743, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4649.74it/s]


tensor(0.0732, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4480.68it/s]


tensor(0.0722, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4438.29it/s]


tensor(0.0714, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4605.51it/s]


tensor(0.0707, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3848.29it/s]


tensor(0.0700, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4477.95it/s]


tensor(0.0695, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4712.39it/s]


tensor(0.0689, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4750.54it/s]


tensor(0.0684, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4480.99it/s]


tensor(0.0680, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4499.00it/s]


tensor(0.0675, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4341.21it/s]


tensor(0.0671, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4543.26it/s]


tensor(0.0667, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4496.58it/s]


tensor(0.0664, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3963.08it/s]


tensor(0.0660, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4730.40it/s]


tensor(0.0657, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4766.05it/s]


tensor(0.0654, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4654.22it/s]


tensor(0.0651, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4438.96it/s]


tensor(0.0648, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4536.08it/s]


tensor(0.0646, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4615.86it/s]


tensor(0.0643, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4650.10it/s]


tensor(0.0641, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4433.19it/s]


tensor(0.0638, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3944.14it/s]


tensor(0.0636, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4494.57it/s]


tensor(0.0634, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4571.77it/s]


tensor(0.0632, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4552.29it/s]


tensor(0.0630, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4542.18it/s]


tensor(0.0628, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4565.26it/s]


tensor(0.0627, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4625.36it/s]


tensor(0.0625, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4501.71it/s]


tensor(0.0623, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4632.74it/s]


tensor(0.0622, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4427.65it/s]


tensor(0.0620, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4412.14it/s]


tensor(0.0619, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4513.68it/s]


tensor(0.0617, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4732.83it/s]


tensor(0.0616, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4618.05it/s]


tensor(0.0615, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4468.97it/s]

tensor(0.0613, grad_fn=<BinaryCrossEntropyBackward0>)







100%|██████████| 269/269 [00:00<00:00, 4549.69it/s]


tensor(0.0612, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4628.22it/s]


tensor(0.0611, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3913.89it/s]


tensor(0.0610, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4883.11it/s]


tensor(0.0608, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4552.53it/s]


tensor(0.0607, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4673.70it/s]


tensor(0.0606, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4537.61it/s]


tensor(0.0605, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4608.41it/s]


tensor(0.0604, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4452.31it/s]


tensor(0.0603, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4388.71it/s]


tensor(0.0602, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3910.56it/s]


tensor(0.0601, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4697.01it/s]


tensor(0.0600, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4343.35it/s]


tensor(0.0599, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4657.18it/s]


tensor(0.0598, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4607.47it/s]


tensor(0.0597, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4512.15it/s]


tensor(0.0597, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4534.89it/s]


tensor(0.0596, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4666.80it/s]


tensor(0.0595, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4392.08it/s]


tensor(0.0594, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3881.13it/s]


tensor(0.0593, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4439.10it/s]


tensor(0.0593, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3954.41it/s]


tensor(0.0592, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4186.99it/s]


tensor(0.0591, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4237.64it/s]


tensor(0.0590, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4422.24it/s]


tensor(0.0590, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4565.78it/s]


tensor(0.0589, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4496.81it/s]


tensor(0.0588, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3847.79it/s]


tensor(0.0588, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4438.64it/s]


tensor(0.0587, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4201.36it/s]


tensor(0.0586, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4486.74it/s]


tensor(0.0586, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4244.67it/s]


tensor(0.0585, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4250.44it/s]


tensor(0.0584, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4586.03it/s]


tensor(0.0584, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4539.44it/s]


tensor(0.0583, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4419.57it/s]


tensor(0.0583, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 3827.85it/s]


tensor(0.0582, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4582.73it/s]


tensor(0.0582, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4478.66it/s]


tensor(0.0581, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4507.28it/s]


tensor(0.0580, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4543.44it/s]


tensor(0.0580, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4400.11it/s]


tensor(0.0579, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4495.61it/s]


tensor(0.0579, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4430.00it/s]


tensor(0.0578, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4489.10it/s]


tensor(0.0578, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4481.57it/s]


tensor(0.0577, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4401.12it/s]


tensor(0.0577, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4369.34it/s]


tensor(0.0577, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 4607.09it/s]

tensor(0.0576, grad_fn=<BinaryCrossEntropyBackward0>)





In [65]:
in_data_test = torch.tensor(x_test).float()
targets_test = torch.tensor(y_test).float()

In [70]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [77]:
result = (output > 0.5).int()

In [79]:
print(classification_report(targets_test, result))

              precision    recall  f1-score   support

         0.0       0.90      0.90      0.90       987
         1.0       0.92      0.92      0.92      1132

    accuracy                           0.91      2119
   macro avg       0.91      0.91      0.91      2119
weighted avg       0.91      0.91      0.91      2119



### LSTM

In [80]:
# Текст -> вектора токенов через обученную word2vec
def get_word_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    
    return result

In [81]:
features = [get_word_embedding(text, max_len) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:01<00:00, 5125.15it/s]


In [82]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(300, 200, dropout=0.3)
        self.out = nn.Linear(200, 1)
    
    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction
    

net = Net()
print(net)

Net(
  (lstm): LSTM(300, 200, dropout=0.3)
  (out): Linear(in_features=200, out_features=1, bias=True)
)




In [100]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]

        optimizer.zero_grad() # отчистка прошлых градиентов
        output = net(batch_x) # подсчёт предсказанных НС значений
        loss = criterion(output.squeeze(), batch_y) # посчёт функции потерь
        loss.backward() # вычисление градиентов
        optimizer.step() # шаг градиентного спуска
    print(loss)

In [101]:
in_data = torch.tensor(x_train).float()
targets = torch.tensor(y_train).float()

In [102]:
in_data.shape

torch.Size([4301, 200, 300])

In [105]:
optimizer = optim.Adam(net.parameters(), lr=0.005)
criterion = nn.BCELoss()

In [106]:
for _ in range(10):
    train_one_epoch(in_data, targets)

100%|██████████| 269/269 [00:30<00:00,  8.70it/s]


tensor(0.0435, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:30<00:00,  8.69it/s]


tensor(0.0540, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:30<00:00,  8.72it/s]


tensor(0.0331, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:30<00:00,  8.74it/s]


tensor(0.0523, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:30<00:00,  8.75it/s]


tensor(0.0054, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:30<00:00,  8.71it/s]


tensor(0.0173, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:30<00:00,  8.72it/s]


tensor(0.1070, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:30<00:00,  8.71it/s]


tensor(0.0016, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:30<00:00,  8.74it/s]


tensor(0.0022, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:30<00:00,  8.68it/s]

tensor(0.0243, grad_fn=<BinaryCrossEntropyBackward0>)





In [108]:
in_data_test = torch.tensor(x_test).float()
targets_test = torch.tensor(y_test).float()

In [109]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [110]:
result = (output > 0.5).int()

In [None]:
print(classification_report(targets_test, result))

              precision    recall  f1-score   support

         0.0       0.91      0.95      0.93      1009
         1.0       0.96      0.92      0.94      1110

    accuracy                           0.94      2119
   macro avg       0.94      0.94      0.94      2119
weighted avg       0.94      0.94      0.94      2119



### CNN

In [None]:
class Net(nn.Module):
    def __init__(self, embedding_dim, n_filters, filter_sizes, dropout):
        super(Net, self).__init__()
        # Устанавливаем количество входных каналов на 1, так как мы будем работать с 1D векторами
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=1, out_channels=n_filters, kernel_size=fs, padding=(fs//2))
            for fs in filter_sizes
        ])
        # Полносвязный слой для классификации
        self.fc = nn.Linear(len(filter_sizes) * n_filters, 1)  # Для бинарной классификации
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x.unsqueeze(1)  # Добавляем размерность канала: (batch_size, 1, word_vector_len)
        
        conved = [torch.relu(conv(x)) for conv in self.convs]  # Применяем свертки
        pooled = [torch.max(c, dim=2)[0] for c in conved]  # Пулинг по максимальному значению
        cat = self.dropout(torch.cat(pooled, dim=1))  # Объединяем результаты сверток и применяем dropout
        
        prediction = torch.sigmoid(self.fc(cat))  # Получаем предсказание
        return prediction


net = Net(embedding_dim=300, n_filters=100, filter_sizes=[3, 4, 5], dropout=0.3)
print(net)

Net(
  (convs): ModuleList(
    (0): Conv1d(1, 100, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): Conv1d(1, 100, kernel_size=(4,), stride=(1,), padding=(2,))
    (2): Conv1d(1, 100, kernel_size=(5,), stride=(1,), padding=(2,))
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [40]:
features = [get_text_embedding(text, max_len) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:00<00:00, 7381.95it/s]


In [41]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

In [42]:
in_data = torch.tensor(x_train).float()
targets = torch.tensor(y_train).float()

  in_data = torch.tensor(x_train).float()


In [72]:
optimizer = optim.Adam(net.parameters(), lr=0.005)
criterion = nn.BCELoss()

In [None]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]

        optimizer.zero_grad() # отчистка прошлых градиентов
        output = net(batch_x) # подсчёт предсказанных НС значений
        loss = criterion(output.squeeze(), batch_y) # посчёт функции потерь
        loss.backward() # вычисление градиентов
        optimizer.step() # шаг градиентного спуска
    print(loss)

In [None]:
in_data.shape

torch.Size([4301, 300])

In [74]:
for _ in range(100):
    train_one_epoch(in_data, targets)

100%|██████████| 269/269 [00:01<00:00, 266.35it/s]


tensor(0.5390, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 266.35it/s]


tensor(0.4432, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 269.78it/s]


tensor(0.4875, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 274.98it/s]


tensor(0.4101, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 267.14it/s]


tensor(0.4157, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 272.25it/s]


tensor(0.3842, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 269.56it/s]


tensor(0.4002, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 266.97it/s]


tensor(0.3161, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 259.46it/s]


tensor(0.2840, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 271.01it/s]


tensor(0.4612, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 274.46it/s]


tensor(0.2471, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 270.33it/s]


tensor(0.2838, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.33it/s]


tensor(0.4518, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 269.98it/s]


tensor(0.2783, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 274.27it/s]


tensor(0.3289, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 267.82it/s]


tensor(0.2559, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 271.75it/s]


tensor(0.3556, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 264.82it/s]


tensor(0.2252, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 275.04it/s]


tensor(0.2580, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 266.12it/s]


tensor(0.2410, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 274.03it/s]


tensor(0.2638, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 275.71it/s]


tensor(0.4042, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.64it/s]


tensor(0.4390, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 272.66it/s]


tensor(0.2456, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 265.81it/s]


tensor(0.1858, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.91it/s]


tensor(0.3712, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 272.28it/s]


tensor(0.3800, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 270.60it/s]


tensor(0.2896, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 269.00it/s]


tensor(0.3297, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 260.36it/s]


tensor(0.2389, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.43it/s]


tensor(0.2640, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 274.64it/s]


tensor(0.2357, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 266.57it/s]


tensor(0.3065, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 274.30it/s]


tensor(0.4528, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 266.99it/s]


tensor(0.3378, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 256.75it/s]


tensor(0.3070, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 273.12it/s]


tensor(0.4558, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 270.31it/s]


tensor(0.3362, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 272.48it/s]


tensor(0.2915, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 265.97it/s]


tensor(0.3042, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 267.08it/s]


tensor(0.2147, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 259.22it/s]


tensor(0.3342, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 263.69it/s]


tensor(0.2129, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 258.97it/s]


tensor(0.2708, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 269.43it/s]


tensor(0.3532, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 261.90it/s]


tensor(0.2525, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 266.66it/s]


tensor(0.3092, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.93it/s]


tensor(0.2356, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.15it/s]


tensor(0.3877, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.67it/s]


tensor(0.2764, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 269.57it/s]


tensor(0.1829, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.79it/s]


tensor(0.3212, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.84it/s]


tensor(0.2772, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 274.19it/s]


tensor(0.2659, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 264.68it/s]


tensor(0.3434, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 264.53it/s]


tensor(0.2831, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 267.98it/s]


tensor(0.3718, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 267.17it/s]


tensor(0.2044, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.96it/s]


tensor(0.2754, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 271.55it/s]


tensor(0.3873, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 266.50it/s]


tensor(0.2589, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.63it/s]


tensor(0.3491, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 275.44it/s]


tensor(0.3745, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 270.57it/s]


tensor(0.4279, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 261.81it/s]


tensor(0.2389, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 254.77it/s]


tensor(0.1882, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 263.41it/s]


tensor(0.2287, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 255.83it/s]


tensor(0.2574, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 277.02it/s]


tensor(0.2388, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 259.90it/s]


tensor(0.4052, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:23<00:00, 11.57it/s] 


tensor(0.3334, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 279.48it/s]


tensor(0.3700, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 273.57it/s]


tensor(0.1776, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 270.34it/s]


tensor(0.2015, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 273.00it/s]


tensor(0.2845, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 269.04it/s]


tensor(0.2560, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 273.52it/s]


tensor(0.3699, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 262.93it/s]


tensor(0.3831, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 273.40it/s]


tensor(0.2338, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 260.92it/s]


tensor(0.2904, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 274.15it/s]


tensor(0.3410, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 269.68it/s]


tensor(0.3058, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 272.01it/s]


tensor(0.1681, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 269.25it/s]


tensor(0.2367, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 279.16it/s]


tensor(0.2790, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 273.52it/s]


tensor(0.2046, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 270.10it/s]


tensor(0.3393, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 273.65it/s]


tensor(0.2285, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 259.90it/s]


tensor(0.3896, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 272.85it/s]


tensor(0.3157, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 269.50it/s]


tensor(0.2833, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 272.70it/s]


tensor(0.2281, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.57it/s]


tensor(0.3116, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 271.89it/s]


tensor(0.1679, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 268.31it/s]


tensor(0.3387, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 266.63it/s]


tensor(0.3103, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:01<00:00, 267.93it/s]


tensor(0.3240, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 270.61it/s]


tensor(0.2420, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 272.12it/s]


tensor(0.2066, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:00<00:00, 275.58it/s]

tensor(0.3532, grad_fn=<BinaryCrossEntropyBackward0>)





In [63]:
in_data_test = torch.tensor(x_test).float()
targets_test = torch.tensor(y_test).float()

In [75]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [76]:
result = (output > 0.5).int()

In [77]:
print(classification_report(targets_test, result))

              precision    recall  f1-score   support

         0.0       0.84      0.85      0.84      1000
         1.0       0.86      0.85      0.86      1119

    accuracy                           0.85      2119
   macro avg       0.85      0.85      0.85      2119
weighted avg       0.85      0.85      0.85      2119



## Вывод:

Я научился 6 способами классифицировать тексты (3 - sklearn и 3 - PyTorch). С задачей по метрикам я также справился (лучший результат - 93%, худший - 85%). 

Какой вариант я буду выбирать для дальнейшей работы? - однозначно PyTorch сети RNN (скорее всего, с уже предобученными моделями).