In [1]:
import pandas as pd
import torch
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
import matplotlib.pyplot as plt
import seaborn as sns
import natasha
from navec import Navec
from slovnet.model.emb import NavecEmbedding
import numpy as np
import nltk
import string

In [2]:
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Эдуард\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Эдуард\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
class DataPreprocessing(torch.utils.data.Dataset):
    
    def __init__(self, path, sep, encoding, subset='train'):
        self.subset = subset
        self.df = pd.read_csv(path, sep=sep, encoding=encoding)
        self.length_of_vec = 0
        for i in range(len(self.df)):
            if len(self.df['Описание анкеты'].values[i].split(' ')) > self.length_of_vec:
                self.length_of_vec = len(self.df['Описание анкеты'].values[i].split(' '))
        self.x_train, self.x_test, self.y_train, self.y_test = model_selection.train_test_split(self.df['Описание анкеты'], self.df[' Рекомендовать'], test_size=0.33, random_state=42)
    
    def vectorize(self, disc):
        tokens = nltk.word_tokenize(disc, language='russian')
        tokens_without_punct = [str(token) for token in tokens if token not in string.punctuation]
        without_stop_words = [str(token) for token in tokens_without_punct if token not in nltk.corpus.stopwords.words('russian')]
        #snowball = nltk.stem.snowball.SnowballStemmer('russian')
        #stemmed_tokens = [str(snowball.stem(i)) for i in without_stop_words] леммизация
        ids = []
        for j in without_stop_words:
            if j.lower() in navec:
                ids.append(navec.vocab[j.lower()])
            else:
                ids.append(navec.vocab['<unk>'])
        if len(ids) < self.length_of_vec:
            addition = [0 for i in range(self.length_of_vec - len(ids))]
            ids += addition
        return torch.tensor(ids)
                
    def __len__(self):
        if self.subset == 'train':
            return len(self.x_train)
        else:
            return len(self.x_test)
    
    def __getitem__(self, index):
        if self.subset == 'train':
            disc = self.x_train.values[index]
            recomend = self.y_train.values[index]
        else:
            disc = self.x_test.values[index]
            recomend = self.y_test.values[index]
        if type(disc) is str:
            return self.vectorize(disc), torch.tensor(recomend)

In [4]:
path_df = 'C:/Users/Эдуард/Desktop/проект/data.txt'
sep = ';'
encoding = 'cp1251'
data = DataPreprocessing(path_df, sep, encoding)

In [5]:
batch_size = len(data)
learning_rate = 0.001
epochs = 100

Input matrix 44x300

In [17]:
class Model(torch.nn.Module):
    
    embedding = NavecEmbedding(navec)
    x, _ = data[0]
    dim = len(x)
    def __init__(self):
        super().__init__()
        self.sequential = torch.nn.Sequential(
            torch.nn.Conv1d(self.dim, self.dim, 10),
            torch.nn.MaxPool1d(2),
            torch.nn.ELU(),
            torch.nn.Conv1d(self.dim, self.dim, 50),
            torch.nn.MaxPool1d(2),
            torch.nn.ELU(),
            torch.nn.Conv1d(self.dim, 12, 20),
            torch.nn.ELU(),
            torch.nn.Flatten(),
            torch.nn.Linear(348, 100),
            torch.nn.ReLU(),
            torch.nn.Linear(100, 20),
            torch.nn.ReLU(),
            torch.nn.Linear(20, 1),
            torch.nn.Sigmoid()
        )
        
    def forward(self, x):
        embedded_x = self.embedding(x)
        y = self.sequential(embedded_x)
        return y
    
    def trainn(self, data, batch_size, epochs):
        dl = torch.utils.data.DataLoader(data, batch_size=batch_size)
        loss_fn = torch.nn.BCELoss()
        optimizer = torch.optim.Adam(self.parameters(), weight_decay=0.009)
        self.train()
        for epoch in range(epochs):
            for batch, (X, Y) in enumerate(dl):
                optimizer.zero_grad()
                #pred = torch.tensor([model(X[i]) for i in range(len(X))], requires_grad=True)
                #target = Y.to(torch.float32)
                pred = self(X)
                target = torch.reshape(Y, (-1, 1)).float()
                loss = loss_fn(pred, target)
                loss.backward()
                optimizer.step()
                if batch % 1 == 0:
                    loss, current = loss.item(), (batch + 1)*len(X)
                    print(f'{epoch+1}/{epochs}')
                    print(f'batch: {batch}; loss = {loss}')

In [18]:
model = Model()

In [19]:
model.trainn(data, batch_size, epochs)

1/100
batch: 0; loss = 0.7507927417755127
2/100
batch: 0; loss = 0.7105888724327087
3/100
batch: 0; loss = 0.6495482921600342
4/100
batch: 0; loss = 0.5817843675613403
5/100
batch: 0; loss = 0.5726590752601624
6/100
batch: 0; loss = 0.5982565879821777
7/100
batch: 0; loss = 0.5709444880485535
8/100
batch: 0; loss = 0.5598000884056091
9/100
batch: 0; loss = 0.5697249174118042
10/100
batch: 0; loss = 0.5742855072021484
11/100
batch: 0; loss = 0.5689115524291992
12/100
batch: 0; loss = 0.5608711242675781
13/100
batch: 0; loss = 0.5569228529930115
14/100
batch: 0; loss = 0.5585339665412903
15/100
batch: 0; loss = 0.5607529878616333
16/100
batch: 0; loss = 0.5592071413993835
17/100
batch: 0; loss = 0.5558499693870544
18/100
batch: 0; loss = 0.5546475648880005
19/100
batch: 0; loss = 0.555702269077301
20/100
batch: 0; loss = 0.5564998984336853
21/100
batch: 0; loss = 0.5558892488479614
22/100
batch: 0; loss = 0.5541115403175354
23/100
batch: 0; loss = 0.5523744225502014
24/100
batch: 0; loss

In [20]:
def predict(model, data, metric=True, i=0,):
    if metric:
        dl = torch.utils.data.DataLoader(data, batch_size=len(data))
        for X, Y in dl:
            Y_pred = []
            arr = torch.reshape(model(X), (len(data), )).detach().numpy()
            for i in arr:
                if i >= 0.5:
                    Y_pred.append(1)
                elif i < 0.5:
                    Y_pred.append(0) 
            rec = metrics.recall_score(Y.detach().numpy(), np.array(Y_pred))
            prec = metrics.precision_score(Y.detach().numpy(), np.array(Y_pred))
        print(f'recall = {rec}; precision = {prec}')
    else:
        x, y = data[i]
        x = torch.reshape(x, (1, 44))
        print(f'predicted = {model(x)}: true = {y}')

In [12]:
torch.save(model, 'model.pth')

In [21]:
predict(model, data)

recall = 0.875; precision = 0.7567567567567568


In [15]:
data_test = DataPreprocessing(path_df, sep, encoding, subset='test')

In [22]:
predict(model, data_test)

recall = 0.25; precision = 0.25
