Задача классификации комментариев

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
%matplotlib inline
from nltk import word_tokenize 
from torch.utils.data import Dataset, DataLoader
import torch

In [None]:
# Load the tab-separated comments file into a DataFrame.
data = pd.read_csv("comments.tsv", sep='\t')

# Plain arrays of the comment text column and of the `should_ban` label column.
texts = data['comment_text'].values
target = data['should_ban'].values

# Random seed handed to train_test_split so the split is reproducible.
SEED = 41

### Split data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test split. Stratifying on `should_ban`
# keeps the label proportions the same in both splits; SEED fixes the shuffle.
train, test = train_test_split(data, test_size=0.3, random_state=SEED, stratify=data['should_ban'])

### Custom dataset (PyTorch format)

In [None]:
class CustomTextDataset(Dataset):
    """Map-style dataset pairing each comment text with its label.

    Both ``text`` and ``labels`` are accessed positionally through ``.iloc``,
    so they are expected to be pandas objects (e.g. ``Series``) of equal
    length. Items come back as ``[label, text]`` lists -- label first.
    """

    def __init__(self, text, labels):
        self.labels = labels
        self.text = text

    def __len__(self):
        # The label container defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Positional lookup, independent of the original DataFrame index.
        return [self.labels.iloc[idx], self.text.iloc[idx]]
    
# Wrap each split's text and label columns in the dataset class above.
train_dataset = CustomTextDataset(train['comment_text'], train['should_ban'])
test_dataset = CustomTextDataset(test['comment_text'], test['should_ban'])

### Tokenize

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
train_iter = train_dataset

def yield_tokens(datasets):
    """Lazily produce the token list of every text in every dataset.

    Args:
        datasets: iterable of datasets whose items are ``(label, text)``
            pairs; the label is ignored.

    Yields:
        The result of ``tokenizer(text)`` for each sample, in order.
    """
    for samples in datasets:
        yield from (tokenizer(comment) for _label, comment in samples)
        
# Build the vocabulary from the training split only. Including the test split
# would leak held-out tokens into the model's vocabulary (and embedding table)
# and make the evaluation optimistic. Tokens that appear only in the test data
# fall back to "<unk>" through the default index set below.
vocab = build_vocab_from_iterator(yield_tokens([train_dataset]), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [None]:
len(vocab)

#### Tokenization example

In [None]:
tokens = tokenizer("Hello how are you?, Welcome to CoderzColumn!!")
indexes = vocab(tokens)

tokens, indexes

In [None]:
train.loc[510, 'comment_text']

In [None]:
for i in train_dataset:
    vocab(tokenizer(i[1]))

In [None]:
max_words = 20

In [None]:
def vectorize_batch(batch):
    """Collate ``(label, text)`` samples into fixed-length index tensors.

    Each text is tokenized and mapped to vocabulary indices, then cut or
    right-padded to exactly ``max_words`` entries.

    Args:
        batch: sequence of ``(label, text)`` samples.

    Returns:
        A pair ``(X, Y)``: ``X`` is an int32 tensor with one row of
        ``max_words`` indices per sample, ``Y`` is the tensor of labels.
    """
    labels, texts = zip(*batch)

    rows = []
    for text in texts:
        ids = vocab(tokenizer(text))[:max_words]  # truncate long comments
        # Right-pad short comments with index 0.
        # NOTE(review): index 0 is presumably the "<unk>" special -- confirm.
        ids.extend([0] * (max_words - len(ids)))
        rows.append(ids)

    return torch.tensor(rows, dtype=torch.int32), torch.tensor(labels)


# Batches of up to 1024 samples, vectorized by `vectorize_batch`.
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=1024, collate_fn=vectorize_batch, shuffle=True)
test_loader  = DataLoader(test_dataset , batch_size=1024, collate_fn=vectorize_batch)

In [None]:
for X, Y in train_loader:
    print(X.shape, Y.shape)
    break

### Model

In [None]:
from torch import nn
from torch.nn import functional as F

embed_len = 50
hidden_dim = 50
n_layers = 2

class RNNClassifier(nn.Module):
    """Embedding -> multi-layer RNN -> linear head producing two class logits.

    Layer sizes come from the module-level ``embed_len``, ``hidden_dim`` and
    ``n_layers`` settings; the embedding table has one row per ``vocab`` entry.
    """

    def __init__(self):
        super().__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_len)
        self.rnn = nn.RNN(input_size=embed_len, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 2)

    def forward(self, X_batch):
        """Return logits of shape ``(batch, 2)`` for a batch of token indices.

        Args:
            X_batch: integer tensor of token indices, one row per sample.
        """
        embeddings = self.embedding_layer(X_batch)
        # Let nn.RNN use its default all-zero initial hidden state. Passing a
        # fresh random state on every call made predictions non-deterministic
        # (the same input could be classified differently between calls) and
        # always allocated that state on the CPU regardless of the input device.
        output, _ = self.rnn(embeddings)
        # Classify from the RNN output at the last time step.
        return self.linear(output[:, -1])

In [None]:
rnn_classifier = RNNClassifier()

rnn_classifier

In [None]:
for layer in rnn_classifier.children():
    print("Layer : {}".format(layer))
    print("Parameters : ")
    for param in layer.parameters():
        print(param.shape)
    print()

In [None]:
out = rnn_classifier(torch.randint(0, len(vocab), (1024, max_words)))

out.shape

### Train

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader):
    """Print the mean batch loss and the accuracy of ``model`` on ``val_loader``.

    Args:
        model: callable mapping an input batch to per-class scores.
        loss_fn: callable ``loss_fn(preds, Y)`` returning a scalar tensor.
        val_loader: iterable of ``(X, Y)`` batches.

    Returns:
        None; the two metrics are printed.
    """
    targets, predicted, batch_losses = [], [], []

    # Evaluation only, so gradient tracking is switched off.
    with torch.no_grad():
        for X, Y in val_loader:
            scores = model(X)
            batch_losses.append(loss_fn(scores, Y).item())

            targets.append(Y)
            predicted.append(scores.argmax(dim=-1))

        all_targets = torch.cat(targets)
        all_predicted = torch.cat(predicted)

        mean_loss = torch.tensor(batch_losses).mean()
        accuracy = accuracy_score(all_targets.detach().numpy(), all_predicted.detach().numpy())

        print("Valid Loss : {:.3f}".format(mean_loss))
        print("Valid Acc  : {:.3f}".format(accuracy))


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    """Train ``model`` for ``epochs`` passes, reporting metrics after each one.

    Args:
        model: callable mapping an input batch to per-class scores.
        loss_fn: callable ``loss_fn(preds, Y)`` returning a scalar tensor.
        optimizer: optimizer holding the model's parameters.
        train_loader: iterable of ``(X, Y)`` training batches.
        val_loader: iterable of ``(X, Y)`` batches passed on to
            ``CalcValLossAndAccuracy`` at the end of every epoch.
        epochs: number of passes over ``train_loader``.
    """
    for _epoch in range(epochs):
        batch_losses = []
        for inputs, targets in tqdm(train_loader):
            loss = loss_fn(model(inputs), targets)
            batch_losses.append(loss.item())

            # Clear stale gradients, backpropagate, then update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        epoch_loss = torch.tensor(batch_losses).mean()
        print("Train Loss : {:.3f}".format(epoch_loss))
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

In [None]:
from torch.optim import AdamW

# Training hyperparameters.
epochs = 150
learning_rate = 1e-4

# A fresh classifier is created here, replacing any earlier `rnn_classifier`.
loss_fn = nn.CrossEntropyLoss()
rnn_classifier = RNNClassifier()
optimizer = AdamW(rnn_classifier.parameters(), lr=learning_rate)

# NOTE(review): the test loader is passed as the validation loader, so the
# per-epoch "Valid" metrics are measured on the test split.
TrainModel(rnn_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)

In [None]:
def MakePredictions(model, loader):
    """Run ``model`` over every batch of ``loader`` and collect class predictions.

    Args:
        model: callable mapping an input batch to per-class scores.
        loader: iterable of ``(X, Y)`` batches.

    Returns:
        A pair of NumPy arrays ``(actual, predicted)``: the labels in the order
        the loader produced them, and the index of the highest-scoring class
        for each sample.
    """
    Y_shuffled, Y_preds = [], []
    # Inference only: without no_grad every batch would keep its autograd
    # graph alive until the results were detached at the very end.
    with torch.no_grad():
        for X, Y in loader:
            # argmax over raw scores; softmax is monotonic, so applying it
            # first cannot change which class wins.
            Y_preds.append(model(X).argmax(dim=-1))
            Y_shuffled.append(Y)
    gc.collect()

    return torch.cat(Y_shuffled).detach().numpy(), torch.cat(Y_preds).numpy()

Y_actual, Y_preds = MakePredictions(rnn_classifier, test_loader)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Summarize test-set quality from the true labels and predicted class indices:
# overall accuracy, per-class metrics, and the confusion matrix.
print("Test Accuracy : {}".format(accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds, target_names=['0', '1']))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds))

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# ROC analysis needs a continuous score per sample. Feeding it hard 0/1
# predictions collapses the curve to a single operating point and the reported
# "AUC" is no longer a ranking metric. Score each test comment with the
# predicted probability of class 1, collecting the labels in the same pass so
# scores and labels are guaranteed to line up.
roc_scores, roc_labels = [], []
with torch.no_grad():
    for X, Y in test_loader:
        roc_scores.append(F.softmax(rnn_classifier(X), dim=-1)[:, 1])
        roc_labels.append(Y)
roc_scores = torch.cat(roc_scores).numpy()
roc_labels = torch.cat(roc_labels).numpy()

auc = roc_auc_score(roc_labels, roc_scores)
fpr, tpr, _ = roc_curve(roc_labels, roc_scores)
plt.plot(fpr, tpr, label='AUC=%.4f' % (auc))

# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], '--', color='black',)
plt.legend(fontsize='large')
plt.grid()