# Домашняя работа "Классификация в АОТ"

## Классификация текстов
### Fakenews
1. Мы будем работать с данными fakenews отсюда: https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv   
2. Проведите препроцессинг текста. Разбейте данные на train и test для задачи классификации.
3. Векторизуйте.
4. Обучите на полученных векторах алгоритм классификации.   
Мы уже видели как эта задача выполняется с помощью Word2vec. Давайте вспомним.

3 раза разными способами получить на задаче классификации значение f1 выше 0.91 для методов на sklearn и выше 0.52 для методов на pytorch.

In [1]:
# Загружаем библиотеки
import pandas as pd
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import nltk
nltk.download('punkt')
from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Загружаем данные
df = pd.read_csv('Constraint_Train.csv')
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [3]:
# Получаем предложения/твиты
sentences = [word_tokenize(text.lower()) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:04<00:00, 1301.32it/s]


In [4]:
# Обучаем модель Word2Vec
%time model_tweets = Word2Vec(sentences, workers=4, size=300, min_count=3, window=5, iter=15)

CPU times: user 9.54 s, sys: 102 ms, total: 9.64 s
Wall time: 8.26 s


In [5]:
# Смотрим какие слова оказались близки к слову france
model_tweets.wv.most_similar('france')

[('bags', 0.9364765882492065),
 ('center', 0.9301714301109314),
 ('tower', 0.9244937896728516),
 ('floor', 0.9229177832603455),
 ('front', 0.9215922355651855),
 ('saudi', 0.917776346206665),
 ('buenos', 0.9164356589317322),
 ('2015', 0.9150580763816833),
 ('images', 0.9130328893661499),
 ('jamaat', 0.9103695154190063)]

In [6]:
# Нормируем вектора
model_tweets.init_sims()

In [7]:
# Функция создаёт эмбединг текста
def get_text_embedding(text):
    result = []
    for word in word_tokenize(text.lower()):
        if word in model_tweets.wv:
            result.append(model_tweets.wv[word])

    if len(result):
        result = np.sum(result, axis=0)
    else:
        result = np.zeros(300)
    return result

In [8]:
# Делаем вектора признаков
features = [get_text_embedding(text) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:03<00:00, 1945.70it/s]


In [9]:
# Разделяем данные на train и test 
X_train, X_test, y_train, y_test = train_test_split(features, df.label, test_size=0.33)


### Способ 1 - логистическая регрессия

In [10]:
# Обучаем модель
model_1 = LogisticRegression(solver='liblinear', random_state=0)
model_1.fit(X_train, y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [11]:
# Делаем предсказание
predicted_1 = model_1.predict(X_test)

In [12]:
# Смотрим точность модели
f1_score(y_test, predicted_1, average=None)

array([0.91942659, 0.92641084])

### Способ 2 - метод опорных векторов

In [13]:
# Обучаем модель
model_2 = svm.SVC(kernel='linear', gamma='scale', random_state=0)
model_2.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

In [14]:
# Делаем предсказание
predicted_2 = model_2.predict(X_test)

In [15]:
# Смотрим точность модели
f1_score(y_test, predicted_2, average=None)

array([0.92007612, 0.92134831])

### Способ 3 - k ближайших соседей 

In [16]:
# Обучаем модель
model_3 = KNeighborsClassifier(n_neighbors=3)
model_3.fit(X_train, y_train) 

KNeighborsClassifier(n_neighbors=3)

In [17]:
# Делаем предсказание
predicted_3 = model_3.predict(X_test)

In [18]:
# Смотрим точность модели
f1_score(y_test, predicted_3, average=None)

array([0.91765826, 0.91904539])

### PyTorch + LSTM

#### Вариант 1

In [34]:
labels = (df.label == 'real').astype(int).to_list()

In [35]:
# Определяем максимальную длину предложения
token_lists = [word_tokenize(text.lower()) for text in df.tweet]
max_len = len(max(token_lists, key=len))
max_len

1592

In [36]:
fd = Counter([len(tokens) for tokens in token_lists])

In [37]:
fd.most_common(10)

[(20, 178),
 (25, 174),
 (22, 170),
 (18, 170),
 (19, 168),
 (21, 168),
 (16, 163),
 (17, 162),
 (15, 160),
 (23, 156)]

In [38]:
def get_word_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    return result

In [39]:
# Делаем вектора признаков
features = [get_word_embedding(text, 200) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:02<00:00, 2656.43it/s]


In [40]:
len(features)

6420

In [41]:
# Уменьшаем размер данных (иначе не хватает объёма ОЗУ)
features = features[:3000]
labels = labels[:3000]

In [42]:
len(features)

3000

In [43]:
# Разделяем данные на train и test 
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

In [44]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(300, 100)
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


net = Net()
print(net)

Net(
  (lstm): LSTM(300, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [45]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [52]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCELoss()

In [29]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [54]:
train_one_epoch(in_data, targets)

100%|██████████| 126/126 [01:16<00:00,  1.64it/s]

tensor(0.6943, grad_fn=<BinaryCrossEntropyBackward0>)





In [55]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [56]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [57]:
result = (output > 0.5) == targets_test

In [58]:
result.sum().item() / len(result)

0.5232323232323233

#### Вариант 2

In [15]:
# Делаем вектора признаков
features = [get_word_embedding(text, 200) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:04<00:00, 1465.96it/s]


In [16]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        return torch.sigmoid(self.out(x))


net = Net()
print(net)

Net(
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [18]:
# Уменьшаем размер данных (иначе не хватает объёма ОЗУ)
features = features[:3000]
labels = labels[:3000]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

In [25]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [46]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

In [32]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [47]:
for i in range(10):
    train_one_epoch(in_data, targets)

100%|██████████| 126/126 [01:31<00:00,  1.37it/s]


tensor(0.7257, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 126/126 [01:28<00:00,  1.43it/s]


tensor(0.7231, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 126/126 [01:30<00:00,  1.40it/s]


tensor(0.7207, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 126/126 [01:29<00:00,  1.41it/s]


tensor(0.7185, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 126/126 [01:30<00:00,  1.40it/s]


tensor(0.7165, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 126/126 [01:29<00:00,  1.40it/s]


tensor(0.7147, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 126/126 [01:30<00:00,  1.39it/s]


tensor(0.7130, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 126/126 [01:30<00:00,  1.39it/s]


tensor(0.7115, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 126/126 [01:30<00:00,  1.38it/s]


tensor(0.7101, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 126/126 [01:30<00:00,  1.40it/s]

tensor(0.7088, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)





In [48]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [55]:
result = (output > 0.5) == targets_test

In [56]:
result.sum().item() / len(result)

0.5333333333333333