In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator

df = pd.read_excel("toxic_non_toxic.xls")
print(df.head())

                                       Toxic comment  \
0              почитай посты у этого автора, дебил.    
1  мне жаль тебя, гандон, если для тебя оскорблен...   
2               тебе в говне ходить нормально, урод?   
3  блять, я согласен, что энергия от виэ на текущ...   
4  я этим сраным ватсаппом никогда не пользовался...   

                                      Polite comment  
0  попробуйте почитать посты этого автора, может ...  
1  извините, но мне вас очень жаль, если для вас ...  
2  извини, но приятно бы тебе было ходить в грязном?  
3  я согласен с вами, что энергия от виэ на текущ...  
4  просто я, к сожалению, ватсаппом никогда не по...  


In [27]:
df = df.dropna()
df.columns = ['toxic', 'polite']

In [28]:
import nltk
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
def tokenize(text):
    return word_tokenize(text.lower().strip())

src_sentences = [tokenize(t) for t in df['toxic']]
tgt_sentences = [tokenize(p) for p in df['polite']]

specials = ['<pad>', '<sos>', '<eos>', '<unk>']
src_vocab = build_vocab_from_iterator(src_sentences, specials=specials)
tgt_vocab = build_vocab_from_iterator(tgt_sentences, specials=specials)

src_vocab.set_default_index(src_vocab['<unk>'])
tgt_vocab.set_default_index(tgt_vocab['<unk>'])

def encode(tokens, vocab, eos=True):
    ids = [vocab[token] for token in tokens]
    return ids + [vocab['<eos>']] if eos else ids

src_data = [torch.tensor(encode(s, src_vocab)) for s in src_sentences]
tgt_data = [torch.tensor([tgt_vocab['<sos>']] + encode(t, tgt_vocab, eos=True)) for t in tgt_sentences]

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/lidakarpovich/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [38]:
from torch.utils.data import DataLoader, Dataset

class ToxicDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = src
        self.tgt = tgt

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]

def collate_batch(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=src_vocab['<pad>'], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=tgt_vocab['<pad>'], batch_first=True)
    return src_batch, tgt_batch


from sklearn.model_selection import train_test_split

src_data = [torch.tensor(encode(s, src_vocab)) for s in src_sentences]
tgt_data = [torch.tensor([tgt_vocab['<sos>']] + encode(t, tgt_vocab)) for t in tgt_sentences]


src_train, src_val, tgt_train, tgt_val = train_test_split(
    src_data, tgt_data, test_size=0.1, random_state=42
)

train_data = ToxicDataset(src_train, tgt_train)
val_data = ToxicDataset(src_val, tgt_val)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=1, shuffle=False, collate_fn=collate_batch)

val_data_pairs = []

for i in range(len(src_data)):
    for val_tensor in src_val:
        if torch.equal(src_data[i], val_tensor):
            val_data_pairs.append((
                " ".join(src_sentences[i]),
                " ".join(tgt_sentences[i])
            ))
            break 

In [39]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, enc_hid_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return outputs, hidden, cell


class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim, attn_dim):
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim + dec_hid_dim, attn_dim)
        self.v = nn.Linear(attn_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.size(1)
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        return torch.softmax(attention, dim=1) 


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, attention, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(enc_hid_dim + emb_dim, dec_hid_dim, batch_first=True)
        self.fc_out = nn.Linear(enc_hid_dim + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)


    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        attn_weights = self.attention(hidden.squeeze(0), encoder_outputs)
        attn_weights = attn_weights.unsqueeze(1)
        context = torch.bmm(attn_weights, encoder_outputs)
        rnn_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        output = output.squeeze(1)
        context = context.squeeze(1)
        embedded = embedded.squeeze(1)
        output = self.fc_out(torch.cat((output, context, embedded), dim=1))
        return output, hidden, cell, attn_weights


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        tgt_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)

        input = tgt[:, 0]
        for t in range(1, tgt_len):
            output, hidden, cell, _ = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1
        return outputs

In [40]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie = SmoothingFunction().method4

def evaluate_bleu(model, data_pairs):
    total_bleu = 0
    for src_text, tgt_text in data_pairs:
        pred_text = generate_polite(model, src_text)
        reference = tokenize(tgt_text)
        hypothesis = tokenize(pred_text)
        bleu = sentence_bleu([reference], hypothesis, smoothing_function=smoothie)
        total_bleu += bleu

    avg_bleu = total_bleu / len(data_pairs)
    return avg_bleu

In [41]:
def generate_polite(model, src_sentence, max_len=30):
    model.eval()
    tokens = tokenize(src_sentence)
    src_tensor = torch.tensor(encode(tokens, src_vocab), dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src_tensor)

    inputs = torch.tensor([tgt_vocab['<sos>']], dtype=torch.long).to(device)
    outputs = []

    for _ in range(max_len):
        with torch.no_grad():
            output, hidden, cell, _ = model.decoder(inputs, hidden, cell, encoder_outputs)

        pred_token = output.argmax(1).item()
        if pred_token == tgt_vocab['<eos>']:
            break
        outputs.append(pred_token)
        inputs = torch.tensor([pred_token], dtype=torch.long).to(device)

    return " ".join(tgt_vocab.lookup_tokens(outputs))

In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(tgt_vocab)
EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ATTN_DIM = 256
DROPOUT = 0.5
CLIP = 1.0

attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)
enc = Encoder(INPUT_DIM, EMB_DIM, ENC_HID_DIM, DROPOUT)
dec = Decoder(OUTPUT_DIM, EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, attn, DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<pad>'])
optimizer = optim.Adam(model.parameters())

EPOCHS = 50
teacher_forcing_ratio = 0.7
best_bleu = 0
patience = 5
patience_counter = 0

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0

    for src, tgt in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt, teacher_forcing_ratio=teacher_forcing_ratio)

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)

        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        epoch_loss += loss.item()

    val_bleu = evaluate_bleu(model, val_data_pairs)
    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}, BLEU: {val_bleu:.4f}")

Epoch 1: 100%|████████████████████████████████████| 6/6 [00:01<00:00,  3.52it/s]


Epoch 1, Loss: 6.4757, BLEU: 0.0094


Epoch 2: 100%|████████████████████████████████████| 6/6 [00:01<00:00,  3.60it/s]


Epoch 2, Loss: 5.6094, BLEU: 0.0222


Epoch 3: 100%|████████████████████████████████████| 6/6 [00:01<00:00,  3.31it/s]


Epoch 3, Loss: 5.3246, BLEU: 0.0071


Epoch 4: 100%|████████████████████████████████████| 6/6 [00:01<00:00,  3.73it/s]


Epoch 4, Loss: 5.0372, BLEU: 0.0116


Epoch 5: 100%|████████████████████████████████████| 6/6 [00:01<00:00,  3.36it/s]


Epoch 5, Loss: 4.8509, BLEU: 0.0093


Epoch 6: 100%|████████████████████████████████████| 6/6 [00:01<00:00,  3.68it/s]


Epoch 6, Loss: 4.6168, BLEU: 0.0232


Epoch 7: 100%|████████████████████████████████████| 6/6 [00:01<00:00,  3.68it/s]


Epoch 7, Loss: 4.4174, BLEU: 0.0176


Epoch 8: 100%|████████████████████████████████████| 6/6 [00:01<00:00,  3.78it/s]


Epoch 8, Loss: 4.1433, BLEU: 0.0210


Epoch 9: 100%|████████████████████████████████████| 6/6 [00:01<00:00,  3.60it/s]


Epoch 9, Loss: 3.9912, BLEU: 0.0207


Epoch 10: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.56it/s]


Epoch 10, Loss: 3.7096, BLEU: 0.0213


Epoch 11: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.87it/s]


Epoch 11, Loss: 3.5947, BLEU: 0.0226


Epoch 12: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.91it/s]


Epoch 12, Loss: 3.3926, BLEU: 0.0322


Epoch 13: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.69it/s]


Epoch 13, Loss: 3.1185, BLEU: 0.0243


Epoch 14: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.63it/s]


Epoch 14, Loss: 2.9770, BLEU: 0.0257


Epoch 15: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.67it/s]


Epoch 15, Loss: 2.7043, BLEU: 0.0242


Epoch 16: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.47it/s]


Epoch 16, Loss: 2.3775, BLEU: 0.0247


Epoch 17: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.47it/s]


Epoch 17, Loss: 2.1118, BLEU: 0.0342


Epoch 18: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.57it/s]


Epoch 18, Loss: 1.9555, BLEU: 0.0272


Epoch 19: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.39it/s]


Epoch 19, Loss: 1.7120, BLEU: 0.0259


Epoch 20: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.49it/s]


Epoch 20, Loss: 1.5319, BLEU: 0.0310


Epoch 21: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.25it/s]


Epoch 21, Loss: 1.2912, BLEU: 0.0288


Epoch 22: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.22it/s]


Epoch 22, Loss: 1.2225, BLEU: 0.0531


Epoch 23: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.45it/s]


Epoch 23, Loss: 1.1598, BLEU: 0.0564


Epoch 24: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.34it/s]


Epoch 24, Loss: 0.8729, BLEU: 0.0567


Epoch 25: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.40it/s]


Epoch 25, Loss: 0.7528, BLEU: 0.0554


Epoch 26: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.50it/s]


Epoch 26, Loss: 0.6992, BLEU: 0.0552


Epoch 27: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.42it/s]


Epoch 27, Loss: 0.5262, BLEU: 0.0541


Epoch 28: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.37it/s]


Epoch 28, Loss: 0.4882, BLEU: 0.0583


Epoch 29: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.50it/s]


Epoch 29, Loss: 0.5114, BLEU: 0.0553


Epoch 30: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.26it/s]


Epoch 30, Loss: 0.4321, BLEU: 0.0562


Epoch 31: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.62it/s]


Epoch 31, Loss: 0.3699, BLEU: 0.0558


Epoch 32: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.62it/s]


Epoch 32, Loss: 0.3008, BLEU: 0.0650


Epoch 33: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.32it/s]


Epoch 33, Loss: 0.2895, BLEU: 0.0631


Epoch 34: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.52it/s]


Epoch 34, Loss: 0.2102, BLEU: 0.0561


Epoch 35: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.78it/s]


Epoch 35, Loss: 0.2192, BLEU: 0.0612


Epoch 36: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.25it/s]


Epoch 36, Loss: 0.1716, BLEU: 0.0605


Epoch 37: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.49it/s]


Epoch 37, Loss: 0.1465, BLEU: 0.0548


Epoch 38: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.40it/s]


Epoch 38, Loss: 0.1413, BLEU: 0.0609


Epoch 39: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.25it/s]


Epoch 39, Loss: 0.1182, BLEU: 0.0626


Epoch 40: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.48it/s]


Epoch 40, Loss: 0.1149, BLEU: 0.0577


Epoch 41: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.45it/s]


Epoch 41, Loss: 0.0998, BLEU: 0.0662


Epoch 42: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.44it/s]


Epoch 42, Loss: 0.0933, BLEU: 0.0640


Epoch 43: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.55it/s]


Epoch 43, Loss: 0.1031, BLEU: 0.0621


Epoch 44: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.56it/s]


Epoch 44, Loss: 0.0833, BLEU: 0.0635


Epoch 45: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.42it/s]


Epoch 45, Loss: 0.0856, BLEU: 0.0657


Epoch 46: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.51it/s]


Epoch 46, Loss: 0.0648, BLEU: 0.0590


Epoch 47: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.43it/s]


Epoch 47, Loss: 0.0745, BLEU: 0.0649


Epoch 48: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.62it/s]


Epoch 48, Loss: 0.0564, BLEU: 0.0594


Epoch 49: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.37it/s]


Epoch 49, Loss: 0.0715, BLEU: 0.0595


Epoch 50: 100%|███████████████████████████████████| 6/6 [00:01<00:00,  3.75it/s]


Epoch 50, Loss: 0.0558, BLEU: 0.0597


In [43]:
final_bleu = evaluate_bleu(model, val_data_pairs)
print(f"\nFinal BLEU score on validation set: {final_bleu:.4f}")


Final BLEU score on validation set: 0.0597


In [44]:
sample_pairs = list(zip(df['toxic'], df['polite']))[:100]
bleu_score = evaluate_bleu(model, sample_pairs)
print(f"Average BLEU score: {bleu_score:.4f}")

Average BLEU score: 0.8038
