## Устанавливаем необходимые библиотеки

In [None]:
! pip install pymorphy2

In [None]:
import numpy as np
import pandas as pd
import os
import random

import re
import pymorphy2
import nltk
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import itertools
from gensim import models
from gensim.models import Word2Vec, FastText, KeyedVectors
from gensim.models.fasttext import FastTextKeyedVectors

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

## Загружаем данные

In [None]:
# Global hyperparameters shared by the cells below.
num_words = 10000       # not referenced by any code visible in this notebook
seq_length = 30         # every sentence is padded/truncated to this many word vectors
embedding_size = 300    # presumably the word-vector dimensionality; not read by the visible code
batch_size = 64         # mini-batch size given to the DataLoaders
num_epochs = 5          # number of training passes over the train split per configuration
learning_rate = 0.001   # learning rate given to the Adam optimizer

In [None]:
# Mount Google Drive so that the files under /content/drive used below are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the train and test spreadsheets, keeping only the text and label columns.
X_y_train, X_y_test = (
    pd.read_excel(path, usecols=["Text", "Class"])
    for path in (
        '/content/drive/MyDrive/X_y_train.xlsx',
        '/content/drive/MyDrive/X_y_test.xlsx',
    )
)

In [None]:
# Map the label -1 to 0 so the labels can be used as binary cross-entropy targets.
# The result is assigned back instead of calling ``replace(..., inplace=True)``
# on the selected column: that chained in-place form may act on a temporary
# object (pandas warns about it and, with Copy-on-Write enabled, the DataFrame
# itself is left unchanged).
X_y_train["Class"] = X_y_train["Class"].replace(-1, 0)
X_y_test["Class"] = X_y_test["Class"].replace(-1, 0)

In [None]:
# Show the first five rows of the training table (head() defaults to n=5).
X_y_train.head()

Unnamed: 0,Text,Class
0,RT @Dj__De: @i_sleepwalker_ аахха китайский го...,1
1,"привет, я хочу поиграть с тобой в игру. выучи ...",0
2,Команда #КВН школы Гейдара Алиева г.#Астрахань...,1
3,"#20FactsAboutMe \n16. Не знаю, в какой сфере х...",0
4,этот год я встретил на Родине! Дал себе слово ...,0


In [None]:
morph = pymorphy2.MorphAnalyzer()

# The patterns are compiled once and written as raw strings, so the regex
# escapes for "dot" and "whitespace" reach the regex engine unchanged instead
# of being (invalid) string-literal escape sequences.
_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_MENTION_RE = re.compile(r'@[^\s]+')
_NON_CYRILLIC_RE = re.compile(r'[^а-яА-Я]+')


def preprocess_text(text):
    """Turn a raw text into a list of lemmatized Cyrillic tokens.

    Steps: lowercase, replace "ё" with "е", blank out URLs and @mentions,
    replace every run of characters outside а-я/А-Я with a space, split on
    whitespace and take the normal form of the first pymorphy2 parse of each
    token.

    Args:
        text: the raw text. Anything that is not a ``str`` (for example the
            NaN pandas produces for an empty cell) is treated as empty text.

    Returns:
        A list of lemma strings; empty when no Cyrillic word is left.
    """
    if not isinstance(text, str):
        return []
    text = text.lower().replace("ё", "е")
    text = _URL_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)
    text = _NON_CYRILLIC_RE.sub(' ', text)
    # str.split() without arguments already drops leading/trailing whitespace
    # and collapses repeated spaces, so no extra clean-up pass is needed.
    return [morph.parse(word)[0].normal_form for word in text.split()]

In [None]:
# Run the text preprocessing over every text of the train and test tables.
train_tokens = list(map(preprocess_text, X_y_train["Text"]))
test_tokens = list(map(preprocess_text, X_y_test["Text"]))

## Загружаем word2vec и предобученный fastText

In [None]:
# Load the self-trained word2vec vectors from a text-format (binary=False) word2vec file.
my_w2v_vectors = KeyedVectors.load_word2vec_format('/content/my_w2v.vectors', binary=False)

In [None]:
# Load the pretrained fastText vectors from a file saved with gensim's own save/load.
pretrained_ft_vectors = models.fasttext.KeyedVectors.load('/content/drive/MyDrive/213/model.model')
# Normalize the vectors, replacing the stored originals (replace=True).
# NOTE(review): init_sims() is deprecated in gensim 4.x — confirm the installed version.
pretrained_ft_vectors.init_sims(replace=True)

In [None]:
def padding_sentence(sentence, seq_length, value, embedding_dim=300):
    """Return ``sentence`` truncated or right-padded to exactly ``seq_length`` vectors.

    The input list is never modified; a new list is returned.

    Args:
        sentence: list of word vectors (may be empty).
        seq_length: number of vectors in the result.
        value: scalar every component of a padding vector is filled with.
        embedding_dim: length of a padding vector when ``sentence`` is empty
            and the shape therefore cannot be taken from its first vector.

    Returns:
        A list of ``seq_length`` vectors: the first vectors of ``sentence``
        followed, if needed, by float32 padding vectors filled with ``value``.
    """
    padded = list(sentence[:seq_length])
    missing = seq_length - len(padded)
    if missing > 0:
        # Padding must match the real vectors so the batch stacks into one array.
        pad_shape = np.shape(padded[0]) if padded else (embedding_dim,)
        padded.extend(
            np.full(pad_shape, value, dtype=np.float32) for _ in range(missing)
        )
    return padded

In [None]:
def get_vector(word_vectors, sentence):
    """Look up the vector of every token of ``sentence`` found in ``word_vectors``.

    Tokens for which ``token in word_vectors`` is false are skipped, so the
    returned list can be shorter than ``sentence``; token order is kept.
    """
    return [word_vectors[token] for token in sentence if token in word_vectors]

## Архитектура сетей

In [None]:
class CNN1D_GRU(nn.Module):
    """1-D convolution + max-pooling followed by a GRU and a sigmoid output layer.

    Args:
        input_size: number of features per time step (Conv1d input channels).
        hidden_size: number of convolution filters and GRU hidden units.
        output_size: number of sigmoid outputs.
        kernel_size: width of the convolution kernel.
        dropout_prob: dropout probability applied to the last GRU output
            before the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, kernel_size, dropout_prob):
        super().__init__()

        self.cnn = nn.Conv1d(input_size, hidden_size, kernel_size, stride=1, padding=0)
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        # nn.GRU's own ``dropout`` argument only acts between stacked layers, so
        # with num_layers=1 it does nothing (PyTorch emits a warning).  Dropout
        # is therefore applied explicitly through a separate module.
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a (batch, time, input_size) tensor to (batch, output_size) probabilities."""
        x = x.transpose(1, 2)   # Conv1d wants channels on dim 1
        x = self.cnn(x)
        x = F.relu(x)
        x = self.maxpool(x)
        x = x.transpose(1, 2)   # back to (batch, time, channels) for the batch_first GRU
        x, _ = self.gru(x)
        x = x[:, -1, :]         # keep only the output of the last time step
        x = self.dropout(x)
        x = self.fc(x)
        return torch.sigmoid(x)

In [None]:
class CNN1D_LSTM(nn.Module):
    """1-D convolution + max-pooling followed by an LSTM and a sigmoid output layer.

    Args:
        input_size: number of features per time step (Conv1d input channels).
        hidden_size: number of convolution filters and LSTM hidden units.
        output_size: number of sigmoid outputs.
        kernel_size: width of the convolution kernel.
        dropout_prob: dropout probability applied to the last LSTM output
            before the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, kernel_size, dropout_prob):
        super().__init__()

        self.cnn = nn.Conv1d(input_size, hidden_size, kernel_size, stride=1, padding=0)
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        # nn.LSTM's own ``dropout`` argument only acts between stacked layers, so
        # with num_layers=1 it does nothing (PyTorch emits a warning).  Dropout
        # is therefore applied explicitly through a separate module.
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a (batch, time, input_size) tensor to (batch, output_size) probabilities."""
        x = x.transpose(1, 2)   # Conv1d wants channels on dim 1
        x = self.cnn(x)
        x = F.relu(x)
        x = self.maxpool(x)
        x = x.transpose(1, 2)   # back to (batch, time, channels) for the batch_first LSTM
        x, _ = self.lstm(x)
        x = x[:, -1, :]         # keep only the output of the last time step
        x = self.dropout(x)
        x = self.fc(x)
        return torch.sigmoid(x)

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimization pass over ``dataloader`` and return the accuracy seen.

    Each batch is moved to the module-level ``device``; labels are cast to
    float and given a trailing dimension before being compared with the model
    output.  Accuracy counts the samples whose rounded prediction equals the
    label (measured on the fly, while the weights are still being updated)
    and divides by the dataset size.
    """
    total = len(dataloader.dataset)
    model.train()
    n_correct = 0
    for features, labels in dataloader:
        features = features.to(device)
        labels = labels.to(device).float().unsqueeze(1)

        outputs = model(features)
        batch_loss = loss_fn(outputs, labels)
        n_correct += (outputs.round() == labels).float().sum().item()

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
    return n_correct / total

In [None]:
def evaluate(dataloader, model, loss_fn):
    """Score ``model`` on ``dataloader`` without gradients and return its accuracy.

    The sample-weighted mean loss is accumulated as well, but only the
    accuracy (share of samples whose rounded prediction equals the label) is
    returned.
    """
    total = len(dataloader.dataset)
    model.eval()
    loss_sum = 0
    hits = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            n_samples = inputs.shape[0]
            inputs = inputs.to(device)
            labels = labels.to(device).float().unsqueeze(1)
            outputs = model(inputs)
            # loss_fn yields a batch mean; weight it by the batch length.
            loss_sum += loss_fn(outputs, labels).item() * n_samples
            hits += (outputs.round() == labels).float().sum().item()
    loss_sum /= total  # mean loss over the dataset; computed but not returned
    return hits / total

In [None]:
def _make_loader(inputs, targets, shuffle):
    """Wrap a feature array and a label Series into a DataLoader of ``batch_size`` batches."""
    dataset = torch.utils.data.TensorDataset(
        torch.tensor(inputs), torch.IntTensor(targets.to_numpy())
    )
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def get_result(net, kernel_size, features, hidden_size=128, dropout_prob=0.0):
    """Train one network configuration and report validation and test accuracy.

    Reads the module-level ``train_tokens``, ``test_tokens``, ``X_y_train``,
    ``X_y_test``, ``seq_length``, ``batch_size``, ``num_epochs``,
    ``learning_rate`` and ``device``.

    Args:
        net: model class to instantiate (e.g. ``CNN1D_LSTM`` or ``CNN1D_GRU``).
        kernel_size: convolution kernel width passed to ``net``.
        features: dict with ``"name"`` (label for the result row) and
            ``"vectors"`` (word -> vector lookup used to embed the tokens).
        hidden_size: hidden width passed to ``net``.
        dropout_prob: dropout probability passed to ``net``.

    Returns:
        A dict with the keys "Сеть", "Размер ядра", "Признаки", "val_acc"
        and "test_acc".
    """
    word_vectors = features["vectors"]

    # Embed every sentence and bring it to exactly ``seq_length`` vectors.
    train_matrix = np.array([
        padding_sentence(get_vector(word_vectors, sentence), seq_length, 0)
        for sentence in train_tokens
    ])
    test_matrix = np.array([
        padding_sentence(get_vector(word_vectors, sentence), seq_length, 0)
        for sentence in test_tokens
    ])

    X_train, X_val, y_train, y_val = train_test_split(
        train_matrix, X_y_train["Class"], test_size=0.25, random_state=42
    )

    train_loader = _make_loader(X_train, y_train, shuffle=True)
    val_loader = _make_loader(X_val, y_val, shuffle=False)
    test_loader = _make_loader(test_matrix, X_y_test["Class"], shuffle=False)

    # The network constructors require every argument; passing only
    # ``kernel_size`` raises TypeError.  The input width is taken from the
    # embedded data, and a single output matches the (batch, 1) targets that
    # train()/evaluate() build for binary cross-entropy.
    model = net(
        input_size=train_matrix.shape[-1],
        hidden_size=hidden_size,
        output_size=1,
        kernel_size=kernel_size,
        dropout_prob=dropout_prob,
    ).to(device)
    loss_fn = F.binary_cross_entropy
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

    for _ in range(num_epochs):
        train(train_loader, model, loss_fn, optimizer)

    return {
        "Сеть": net.__name__,
        "Размер ядра": kernel_size,
        "Признаки": features["name"],
        "val_acc": evaluate(val_loader, model, loss_fn),
        "test_acc": evaluate(test_loader, model, loss_fn),
    }

## Обучение

In [None]:
options_net = [CNN1D_LSTM, CNN1D_GRU]
options_kernel_size = [3, 5, 7]
options_features = [
    {"name": "Самообученные word2vec", "vectors": my_w2v_vectors},
    {"name": "Предобученные fasttext", "vectors": pretrained_ft_vectors},
]

# Train and score every (network, kernel size, embedding) combination.
# The rows are collected in a list and the table is built once at the end:
# concatenating onto an initially empty DataFrame inside the loop re-copies
# the whole table on every iteration, and recent pandas versions warn about
# concatenating empty frames.
result_rows = []
for net, kernel_size, features in itertools.product(options_net, options_kernel_size, options_features):
    result_rows.append(get_result(net, kernel_size, features))
result_df = pd.DataFrame(result_rows, columns=['Сеть', "Размер ядра", "Признаки", 'val_acc', 'test_acc'])

In [None]:
# Display the configurations ordered from best to worst test accuracy.
result_df.sort_values('test_acc', ascending=False)

Unnamed: 0,Сеть,Размер ядра,Признаки,val_acc,test_acc
3,CNN1D_LSTM,5,Предобученные fasttext,0.710399,0.82
7,CNN1D_GRU,3,Предобученные fasttext,0.724002,0.78
0,CNN1D_LSTM,3,Самообученные word2vec,0.681224,0.76
1,CNN1D_LSTM,3,Предобученные fasttext,0.720959,0.74
5,CNN1D_LSTM,7,Предобученные fasttext,0.700555,0.74
9,CNN1D_GRU,5,Предобученные fasttext,0.7138,0.74
6,CNN1D_GRU,3,Самообученные word2vec,0.680329,0.7
11,CNN1D_GRU,7,Предобученные fasttext,0.690532,0.7
2,CNN1D_LSTM,5,Самообученные word2vec,0.681224,0.68
4,CNN1D_LSTM,7,Самообученные word2vec,0.682835,0.66
