In [1]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchtext import data
from torchtext import datasets
from torchtext import vocab
from torchtext.vocab import Vectors, GloVe

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
class ConfigExperiment:
    seed = 42
    test_size = 0.3
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embed_dim = 300
    max_vocab_size = 50_000
    batch_size = 64
    num_epochs = 30
    lr = 1e-2
    num_workers = 0
    patience = 3
    early_stopping_delta = 1e-4
    save_dirname = "models"
    
config = ConfigExperiment()

In [5]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    
init_random_seed(config.seed)

### Чтение данных

In [5]:
train = pd.read_csv("../data/train_processed_data.csv", index_col=False)
validate = pd.read_csv("../data/validate_processed_data.csv", index_col=False)
test = pd.read_csv("../data/test_processed_data.csv", index_col=False)

train.shape, validate.shape, test.shape

((136100, 2), (45367, 2), (45367, 2))

### Создание полей 

In [9]:
tokenize = lambda x: str(x).split()

TEXT = data.Field(sequential=True, tokenize=tokenize, batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

fields = [('text',TEXT), ('label', LABEL)]

### Создание датасета из файлов csv

In [10]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path="../data/",
                                        train="train_processed_data.csv",
                                        validation="validate_processed_data.csv",
                                        test="test_processed_data.csv",
                                        format="csv",
                                        fields=fields,
                                        skip_header=True
)

### Посмотрим на данные

In [11]:
print(vars(train_data[0]))

{'text': ['оставаться', 'самый', 'нужный', 'и', 'самый', 'близкие', ')', 'весь', 'остальной', 'уходить', ')', 'и', 'я', 'только', 'рада', ')', 'потому', 'что', 'я', 'никогда', 'сам', 'не', 'понять', 'нужный', 'я', 'человек', 'или', 'нет', ')'], 'label': '1'}


In [12]:
print(vars(valid_data[0]))

{'text': ['at_user', 'at_user', 'я', 'старушка', '(', '(', '('], 'label': '0'}


In [13]:
print(vars(test_data[0]))

{'text': ['at_user', 'привееть', ',', 'хелена', ':', ')', ')', ')', 'мимими', '.', '.', '.', 'пряник', ',', 'конфета', 'и', 'сирец'], 'label': '1'}


In [14]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 136100
Number of validation examples: 45367
Number of testing examples: 45367


### Построить словарь 

In [15]:
TEXT.build_vocab(train_data, min_freq=2)
LABEL.build_vocab(train_data)

In [16]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 33041
Unique tokens in LABEL vocabulary: 2


### Создать объекты итераторов для каждого датасета

In [17]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x: x.text,
    batch_size=config.batch_size,
    device=config.device)

In [18]:
print('Train:')
for batch in train_iterator:
    print(batch)
    break
    
print('Valid:')
for batch in valid_iterator:
    print(batch)
    break
    
print('Test:')
for batch in test_iterator:
    print(batch)
    break

Train:

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 64x32 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]
Valid:

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 64x46 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]
Test:

[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 64x95 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]


### Самые частые слова в текстах

In [19]:
print(TEXT.vocab.freqs.most_common(20))

[('(', 126857), (')', 116605), (',', 112838), ('.', 110646), ('at_user', 89459), (':', 64837), ('я', 64792), ('не', 44975), ('!', 39989), ('и', 36467), ('в', 35466), ('что', 23049), ('на', 22897), ('а', 21843), ('?', 21683), ('url', 20083), ('с', 20052), ('весь', 18848), ('ты', 18175), ('быть', 16970)]


### Словарь для текстов

In [20]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '(', ')', ',', '.', 'at_user', ':', 'я', 'не']


### Словарь меток классов

In [21]:
print(LABEL.vocab.stoi)

defaultdict(None, {'1': 0, '0': 1})
