## Импорт библиотек

In [None]:
! pip install pymorphy2
!pip install datasets

In [None]:
import os
import random
from pprint import pprint
from string import punctuation
import nltk
import re
import time
import datasets

import pymorphy2
from pymorphy2 import MorphAnalyzer
from pymorphy2 import units

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import gensim
from gensim import models
from gensim.models import Word2Vec, FastText, KeyedVectors
from gensim.models.fasttext import FastTextKeyedVectors

from torch.nn.init import kaiming_uniform_, xavier_uniform_

In [None]:
# Mount Google Drive at /content/drive (Colab only); the Excel datasets and the
# pretrained model loaded in later cells are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Загрузка данных

In [None]:
# Read the train and test spreadsheets, keeping only the "Text" and "Class" columns.
X_y_train = pd.read_excel(
    '/content/drive/MyDrive/X_y_train.xlsx',
    usecols=["Text", "Class"],
)
X_y_test = pd.read_excel(
    '/content/drive/MyDrive/X_y_test.xlsx',
    usecols=["Text", "Class"],
)

In [None]:
# Remap the -1 label to 0 so the targets are in {0, 1}, as required by the
# sigmoid output / binary cross-entropy loss used for training.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form operates on a temporary object
# and does not update the DataFrame when pandas Copy-on-Write is active.
X_y_train["Class"] = X_y_train["Class"].replace(-1, 0)
X_y_test["Class"] = X_y_test["Class"].replace(-1, 0)

In [None]:
# Preview the first 10 training rows (labels already remapped by the previous cell).
X_y_train.head(10)

Unnamed: 0,Text,Class
0,RT @Dj__De: @i_sleepwalker_ аахха китайский го...,1
1,"привет, я хочу поиграть с тобой в игру. выучи ...",0
2,Команда #КВН школы Гейдара Алиева г.#Астрахань...,1
3,"#20FactsAboutMe \n16. Не знаю, в какой сфере х...",0
4,этот год я встретил на Родине! Дал себе слово ...,0
5,Че делать?(((((( Я не могу больше это слушать....,0
6,Проснулся. Школа. Школа. Пытка. Голод. Школа. ...,0
7,Работницы регистратуры в поликлинике и сотрудн...,1
8,Да какая разница насколько он ее младше? Главн...,1
9,"@Olga_PrimeTime, Мне не надо денег, мне не над...",1


In [None]:
# Single shared morphological analyzer instance, used by preprocess_text below.
morph = pymorphy2.MorphAnalyzer()
def preprocess_text(text):
    """Normalize a raw text and return its list of lemmas.

    Steps: lowercase, fold "ё" into "е", blank out URLs and @mentions, blank out
    every character that is not a Cyrillic letter in the а-я range, split on
    whitespace and replace each word with the normal form of its first
    pymorphy2 parse (via the module-level ``morph`` analyzer).

    Args:
        text: The raw text. Non-string values (e.g. NaN coming from an empty
            spreadsheet cell) are treated as an empty text.

    Returns:
        list[str]: Lemmas in their original order; empty if nothing is left
        after cleaning.
    """
    # Empty Excel cells arrive as float NaN; without this guard .lower() raises.
    if not isinstance(text, str):
        return []
    text = text.lower().replace("ё", "е")
    # Raw strings: "\." and "\s" in a plain literal are invalid escape sequences
    # (a SyntaxWarning on recent Python versions).
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
    text = re.sub(r'@[^\s]+', ' ', text)
    text = re.sub(r'[^а-яА-Я]+', ' ', text)
    # split() with no argument already collapses runs of spaces and ignores
    # leading/trailing whitespace, so no extra squeeze/strip pass is needed.
    words = text.split()
    return [morph.parse(word)[0].normal_form for word in words]

In [None]:
# Run preprocess_text over every text of both splits; each entry is a list of lemmas.
train_tokens = list(map(preprocess_text, X_y_train["Text"]))
test_tokens = list(map(preprocess_text, X_y_test["Text"]))

In [None]:
# Show the lemmas of the first training text as a sanity check of the preprocessing.
print(train_tokens[0])

['аахха', 'китайский', 'город', 'в', 'центр', 'украина', 'прийтись', 'просить', 'автономность', 'быть', 'автономный', 'республика', 'винница']


## Модели

In [None]:
# Load word vectors stored in the text (non-binary) word2vec format.
# NOTE(review): /content/my_w2v.vectors is not produced by any cell visible here;
# presumably it is created or uploaded beforehand — TODO confirm, otherwise this
# cell fails on a fresh runtime.
my_w2v_vectors = KeyedVectors.load_word2vec_format('/content/my_w2v.vectors', binary=False)

In [None]:
# Load the pretrained vectors saved in gensim's native format from Google Drive.
pretrained_ft_vectors = gensim.models.KeyedVectors.load('/content/drive/MyDrive/213/model.model')
# NOTE(review): init_sims is reported as deprecated in newer gensim releases —
# TODO confirm the installed version still supports replace=True. The vectors
# loaded into my_w2v_vectors do not get this call.
pretrained_ft_vectors.init_sims(replace=True)

In [None]:
def get_vector(word_vectors, sentence):
    """Look up the embedding of every known word of a sentence.

    Args:
        word_vectors: Mapping-like object supporting ``word in word_vectors``
            and ``word_vectors[word]``.
        sentence: Iterable of words.

    Returns:
        list: Embeddings of the words found in ``word_vectors``, in sentence
        order; words that are not found are skipped.
    """
    return [word_vectors[token] for token in sentence if token in word_vectors]

In [None]:
def padding_sentence(sentence, seq_length, value, dim=None):
    """Pad or truncate a list of word vectors to exactly ``seq_length`` items.

    Args:
        sentence: List of 1-D vectors (one per word). It is not modified.
        seq_length: Number of vectors in the returned list.
        value: Fill value of the padding vectors (callers pass 0).
        dim: Length of each padding vector. When omitted it is taken from the
            first vector of ``sentence``, falling back to 300 for an empty
            sentence.

    Returns:
        list: The first ``seq_length`` vectors of ``sentence``, followed by as
        many float32 padding vectors as needed to reach ``seq_length``.
    """
    if dim is None:
        # Follow the embeddings actually supplied; 300 is the historical default.
        dim = len(sentence[0]) if len(sentence) > 0 else 300
    # Copy via slicing so the caller's list is left untouched.
    padded = list(sentence[:seq_length])
    missing = seq_length - len(padded)
    if missing > 0:
        padded.extend(np.full(dim, value, dtype=np.float32) for _ in range(missing))
    return padded

In [None]:
class TextClassifier(nn.Module):
    """1-D convolutional binary text classifier over sequences of word vectors.

    Each entry of ``params["layers"]`` adds the sequence
    Conv1d -> normalization -> ReLU -> MaxPool1d (-> Dropout). The result is
    max-pooled over the sequence axis and fed to a single linear unit followed
    by a sigmoid.

    Args:
        params: Configuration dict with the keys
            ``stride`` (stride of both the convolutions and the pooling),
            ``init_type`` ('kaiming_uniform_' or anything else for Xavier uniform),
            ``layers`` (list of dicts with ``in_channels``, ``out_channels``,
            ``kernel_size``),
            ``norm`` ('BatchNorm1d' or anything else for LayerNorm),
            ``dropout`` (True to add dropout with p=0.25 after every pooling),
            ``seq_length`` (input sequence length; read only for LayerNorm).
    """

    def __init__(self, params):
        super().__init__()
        self.POL = 3  # max-pooling window
        self.STRIDE = params['stride']
        self.dropout = nn.Dropout(0.25)
        if params['init_type'] == 'kaiming_uniform_':
            self.INIT_FUNC = kaiming_uniform_
        else:
            self.INIT_FUNC = xavier_uniform_

        use_batch_norm = params['norm'] == 'BatchNorm1d'
        # LayerNorm normalizes over the sequence axis, so its size must equal
        # the current sequence length, which shrinks after every conv and pool.
        seq_len = None if use_batch_norm else params["seq_length"]

        convs = []
        for layer in params["layers"]:
            convnet = nn.Conv1d(layer['in_channels'], layer["out_channels"], layer["kernel_size"], self.STRIDE)
            self.INIT_FUNC(convnet.weight)
            convs.append(convnet)
            if use_batch_norm:
                convs.append(nn.BatchNorm1d(layer["out_channels"]))
            else:
                # Output length of an unpadded, undilated Conv1d.
                seq_len = (seq_len - layer["kernel_size"]) // self.STRIDE + 1
                convs.append(nn.LayerNorm(seq_len))
                # Length after the MaxPool1d appended below, for the next layer.
                seq_len = (seq_len - self.POL) // self.STRIDE + 1
            convs.append(nn.ReLU())
            convs.append(nn.MaxPool1d(self.POL, self.STRIDE))
            if params["dropout"] == True:
                convs.append(self.dropout)

        self.conv = nn.Sequential(*convs)
        self.fc = nn.Linear(params["layers"][-1]["out_channels"], 1)

        self.INIT_FUNC(self.fc.weight)

    def forward(self, x):
        """Return one sigmoid probability per sample, shape (batch, 1).

        ``x`` is expected as (batch, seq_length, channels); it is transposed to
        (batch, channels, seq_length) for the convolutions.
        """
        x = x.transpose(1, 2).contiguous()
        x = self.conv(x)
        # Global max over the remaining sequence positions.
        x, _ = x.max(dim=-1)
        x = self.fc(x)
        x = torch.sigmoid(x)
        return x

In [None]:
# Empty results table; every runlab call appends one row with these columns.
result_df = pd.DataFrame(
    data=None,
    columns=[
        'feature',
        'init_type',
        'dropout',
        'weight_decay',
        'norm',
        'scheduler',
        'val_F1',
        'test_F1',
    ],
)

In [None]:
def calculate_f1(grand_truth, predictions):
    """Compute the F1 score of thresholded probabilities.

    Args:
        grand_truth: True binary labels.
        predictions: Iterable of predicted scores; a score of 0.5 or more
            counts as class 1, anything lower as class 0.

    Returns:
        The value returned by ``f1_score`` for the thresholded predictions.
    """
    hard_labels = []
    for score in predictions:
        hard_labels.append(1 if score >= 0.5 else 0)
    return f1_score(grand_truth, hard_labels)

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device

'cpu'

In [None]:
def runlab(PARAMETERS, result_df):
    """Train and evaluate one TextClassifier configuration.

    Embeds the module-level ``train_tokens`` / ``test_tokens`` with the word
    vectors selected by ``PARAMETERS["feature"]``, pads every sentence to
    ``PARAMETERS["seq_length"]``, holds out 25% of the training data for
    validation, trains for ``PARAMETERS["num_epochs"]`` epochs and scores the
    final model on the validation and test sets.

    Args:
        PARAMETERS: Experiment configuration. Keys read here: ``feature``
            ('my_w2v' or 'pretrain_ft'), ``seq_length``, ``batch_size``,
            ``optimizer`` ('Adam' in any letter case selects Adam, anything
            else SGD), ``learning_rate``, ``weight_decay``, ``scheduler``
            ('MultiStepLR', anything else ExponentialLR), ``num_epochs``, plus
            ``init_type``, ``dropout`` and ``norm`` for the result row. The
            dict is also passed to ``TextClassifier``.
        result_df: DataFrame of previous results; it is not modified.

    Returns:
        pandas.DataFrame: ``result_df`` with one row appended for this run.

    Raises:
        ValueError: If ``PARAMETERS["feature"]`` is not a known feature name.
    """
    feature = PARAMETERS["feature"]
    if feature == 'my_w2v':
        word_vectors = my_w2v_vectors
    elif feature == 'pretrain_ft':
        word_vectors = pretrained_ft_vectors
    else:
        # Previously an unknown name surfaced later as an unrelated NameError.
        raise ValueError(f"Unknown feature {feature!r}; expected 'my_w2v' or 'pretrain_ft'")

    seq_length = PARAMETERS["seq_length"]
    batch_size = PARAMETERS['batch_size']

    def _embed(token_lists):
        # One (seq_length, dim) block of word vectors per sentence.
        return np.array([
            padding_sentence(get_vector(word_vectors, sentence), seq_length, 0)
            for sentence in token_lists
        ])

    def _make_loader(inputs, labels, shuffle):
        dataset = torch.utils.data.TensorDataset(
            torch.tensor(inputs), torch.IntTensor(labels.to_numpy())
        )
        return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    padded_train_vector = _embed(train_tokens)
    padded_test_vector = _embed(test_tokens)

    X_train, X_val, y_train, y_val = train_test_split(
        padded_train_vector, X_y_train["Class"], test_size=0.25, random_state=42
    )

    train_loader = _make_loader(X_train, y_train, shuffle=True)
    val_loader = _make_loader(X_val, y_val, shuffle=False)
    test_loader = _make_loader(padded_test_vector, X_y_test["Class"], shuffle=False)

    model = TextClassifier(PARAMETERS).to(device)

    # Case-insensitive match: the experiment cells spell the name 'ADAM', which
    # an exact comparison with "Adam" silently routed to SGD.
    if str(PARAMETERS["optimizer"]).lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=PARAMETERS["learning_rate"], weight_decay=PARAMETERS["weight_decay"])
    else:
        optimizer = optim.SGD(model.parameters(), lr=PARAMETERS["learning_rate"], weight_decay=PARAMETERS["weight_decay"])
    if PARAMETERS["scheduler"] == "MultiStepLR":
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10], gamma=0.1)
    else:
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)

    for epoch in range(PARAMETERS['num_epochs']):
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch = x_batch.to(device)
            # Keep the targets on the model's device so the loss is computed there.
            y_batch = y_batch.to(device).float().unsqueeze(1)
            y_pred = model(x_batch)
            loss = F.binary_cross_entropy(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        scheduler.step()

    def _predict(loader):
        # Collect the model's probabilities for every sample of the loader.
        model.eval()
        outputs = []
        with torch.no_grad():
            for x_batch, _ in loader:
                outputs += list(model(x_batch.to(device)).cpu().numpy())
        return outputs

    # Only the scores of the final model are reported, so evaluate once after
    # training instead of after every epoch.
    val_f1 = calculate_f1(y_val, _predict(val_loader))
    test_f1 = calculate_f1(X_y_test["Class"], _predict(test_loader))

    new_row = {
        'feature': PARAMETERS["feature"],
        'init_type': PARAMETERS["init_type"],
        'dropout': PARAMETERS["dropout"],
        'weight_decay': PARAMETERS["weight_decay"],
        'norm': PARAMETERS["norm"],
        'scheduler': PARAMETERS["scheduler"],
        'val_F1': val_f1,
        'test_F1': test_f1,
    }
    final_res_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True)

    print('val F1:', val_f1)
    return final_res_df


## Параметры

### MY W2V

In [None]:
# Baseline configuration (my_w2v vectors)
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'my_w2v',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 0, # alternative: 1e-5
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)


val F1: 0.6706627847234115


In [None]:
# Weight initialization: Xavier uniform instead of Kaiming uniform
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'my_w2v',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'xavier_uniform_', # alternative: kaiming_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 0, # alternative: 1e-5
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)


val F1: 0.6063100137174211


In [None]:
# Dropout enabled
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'my_w2v',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': True, # alternative: False
    'weight_decay': 0, # alternative: 1e-5
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)

val F1: 0.5779048396977741


In [None]:
# L2 regularization (weight decay)
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'my_w2v',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 1e-5, # alternative: 0
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)

val F1: 0.6761936785474107


In [None]:
# Normalization: LayerNorm instead of BatchNorm1d
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'my_w2v',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 0, # alternative: 1e-5
    'norm':'LayerNorm', # alternative: BatchNorm1d
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)

val F1: 0.615494721774915


In [None]:
# Learning-rate schedule: ExponentialLR instead of MultiStepLR
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'my_w2v',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 0, # alternative: 1e-5
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'ExponentialLR' # alternative: MultiStepLR
}
result_df = runlab(PARAMETERS, result_df)


val F1: 0.6259791122715405


### Pretrained FastText

In [None]:
# Baseline configuration (pretrain_ft vectors)
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'pretrain_ft',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 0, # alternative: 1e-5
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)

val F1: 0.659585403460682


In [None]:
# Weight initialization: Xavier uniform instead of Kaiming uniform
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'pretrain_ft',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'xavier_uniform_', # alternative: kaiming_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 0, # alternative: 1e-5
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)

val F1: 0.7073170731707318


In [None]:
# Regularization: dropout enabled
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'pretrain_ft',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': True, # alternative: False
    'weight_decay': 0, # alternative: 1e-5
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)

val F1: 0.7156451161488945


In [None]:
# L2 regularization (weight decay)
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'pretrain_ft',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 1e-5, # alternative: 0
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)

val F1: 0.6154138915318744


In [None]:
# Normalization: LayerNorm instead of BatchNorm1d
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'pretrain_ft',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 0, # alternative: 1e-5
    'norm':'LayerNorm', # alternative: BatchNorm1d
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'MultiStepLR' # alternative: ExponentialLR
}
result_df = runlab(PARAMETERS, result_df)

val F1: 0.6187279151943463


In [None]:
# Learning-rate schedule: ExponentialLR instead of MultiStepLR
PARAMETERS = {
    # - Preprocessing parameters -
    'feature': 'pretrain_ft',
    'seq_length': 20,
    # - Model parameters -
    'init_type': 'kaiming_uniform_', # alternative: xavier_uniform_
    'optimizer': 'Adam', # spelling matched by runlab's optimizer check
    'layers': [{'in_channels': 300,'out_channels': 400, 'kernel_size': 3}],
    'stride': 1,
    'dropout': False, # alternative: True
    'weight_decay': 0, # alternative: 1e-5
    'norm':'BatchNorm1d', # alternative: LayerNorm
    # - Training parameters -
    'num_epochs': 10,
    'batch_size': 16,
    'learning_rate': 0.001,
    'scheduler': 'ExponentialLR' # alternative: MultiStepLR
}
result_df = runlab(PARAMETERS, result_df)

val F1: 0.6074895977808599


## Результаты

In [None]:
# Show every experiment, best test F1 first.
# NOTE(review): ranking configurations by test_F1 uses the test set for model
# selection; consider ranking by val_F1 and reporting test_F1 only for the winner.
result_df.sort_values(by = 'test_F1', ascending = False)

Unnamed: 0,feature,init_type,dropout,weight_decay,norm,scheduler,val_F1,test_F1
7,pretrain_ft,xavier_uniform_,False,0.0,BatchNorm1d,MultiStepLR,0.707317,0.807018
3,my_w2v,kaiming_uniform_,False,1e-05,BatchNorm1d,MultiStepLR,0.676194,0.784314
8,pretrain_ft,kaiming_uniform_,True,0.0,BatchNorm1d,MultiStepLR,0.715645,0.774194
5,my_w2v,kaiming_uniform_,False,0.0,BatchNorm1d,ExponentialLR,0.625979,0.77193
6,pretrain_ft,kaiming_uniform_,False,0.0,BatchNorm1d,MultiStepLR,0.659585,0.745098
0,my_w2v,kaiming_uniform_,False,0.0,BatchNorm1d,MultiStepLR,0.670663,0.727273
10,pretrain_ft,kaiming_uniform_,False,0.0,LayerNorm,MultiStepLR,0.618728,0.716981
1,my_w2v,xavier_uniform_,False,0.0,BatchNorm1d,MultiStepLR,0.60631,0.695652
9,pretrain_ft,kaiming_uniform_,False,1e-05,BatchNorm1d,MultiStepLR,0.615414,0.666667
11,pretrain_ft,kaiming_uniform_,False,0.0,BatchNorm1d,ExponentialLR,0.60749,0.642857
