In [1]:
import pandas as pd
from transformers import AutoTokenizer
import torch
import torch.nn as nn
import re


import matplotlib.pyplot as plt
from src.data_utils import preprocessing_data

  from .autonotebook import tqdm as notebook_tqdm


### Raw dataset

In [47]:
# Load every raw tweet into the list `s`: one entry per line of the file,
# with the trailing newline kept as part of each string.
with open('tweets.txt', encoding='utf-8') as f:
    s = list(f)

In [48]:
# Peek at the second raw line to see what an unprocessed tweet looks like.
s[1]

"is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!\n"

In [49]:
# Wrap the raw lines in a single-column frame (`text`) and display it.
data = pd.DataFrame({'text': s})
data

Unnamed: 0,text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire \n
4,"@nationwideclass no, it's not behaving at all...."
...,...
1600493,Ask Programming: LaTeX or InDesign?: submitted...
1600494,"On that note, I hate Word. I hate Pages. I hat..."
1600495,Ahhh... back in a *real* text editing environm...
1600496,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [50]:
# Persist the raw frame (without the index column) so later sections can start from disk.
data.to_csv('data/raw_dataset.csv', index=False)

### Raw Data Analysis

In [2]:
# Reload the raw tweets from the CSV written above; this rebinds `data`.
data = pd.read_csv('data/raw_dataset.csv')
data

Unnamed: 0,text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire \n
4,"@nationwideclass no, it's not behaving at all...."
...,...
1600493,Ask Programming: LaTeX or InDesign?: submitted...
1600494,"On that note, I hate Word. I hate Pages. I hat..."
1600495,Ahhh... back in a *real* text editing environm...
1600496,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


### Классическая очистка

1. Привести текст к нижнему регистру;
2. удалить ссылки, упоминания и эмодзи (по необходимости);
   - Оффтоп: было бы интересно оставить спецсимволы и эмодзи, чтобы измерять галлюцинации модели, — это такой шум в исходных данных. Если же их убирать, то чистить нужно через regex:
     `s = regex.sub(r'[^\p{L}\p{N}\s]', ' ', s)`

3. убрать повторяющиеся пробелы;
4. заменить нестандартные символы;
5. токенизировать текст.

In [3]:
from src.data_utils import preprocessing_data

In [4]:

# Run the project's preprocessing helper over the raw frame and keep its result
# in a `tokens` column.
# NOTE(review): judging by the output shown below, the helper yields one list of
# token ids per tweet — confirm against src.data_utils.preprocessing_data.
datatest = pd.DataFrame()
datatest['tokens'] = preprocessing_data(data)
datatest


Unnamed: 0,tokens
0,"[12, 257, 2503, 11, 326, 338, 257, 275, 31647,..."
1,"[271, 9247, 326, 339, 460, 470, 4296, 465, 239..."
2,"[72, 288, 1572, 867, 1661, 329, 262, 2613, 13,..."
3,"[1820, 2187, 1767, 5300, 340, 29658, 290, 588,..."
4,"[3919, 11, 340, 338, 407, 37722, 379, 477, 13,..."
...,...
1600493,"[2093, 8300, 25, 47038, 393, 773, 274, 570, 27..."
1600494,"[261, 326, 3465, 11, 1312, 5465, 1573, 13, 131..."
1600495,"[993, 12337, 986, 736, 287, 257, 1635, 5305, 9..."
1600496,"[83, 472, 903, 287, 4173, 272, 11, 1312, 766, ..."


In [10]:
# Inspect the token ids of one randomly chosen row (no seed, so it changes per run).
datatest.tokens.sample(1).iloc[0]

[5562,
 338,
 826,
 288,
 7737,
 467,
 2539,
 815,
 307,
 503,
 8326,
 45630,
 272,
 21580,
 220,
 50256]

In [8]:
# Persist the tokenised frame (index column omitted).
datatest.to_csv('data/dataset_processed.csv', index=False)

In [9]:
# Read the processed file back into `dataset` to check the round-trip.
# Note that the split below is made from the in-memory `datatest`, not from `dataset`.
dataset = pd.read_csv("data/dataset_processed.csv")
dataset

Unnamed: 0,tokens
0,"[12, 257, 2503, 11, 326, 338, 257, 275, 31647,..."
1,"[271, 9247, 326, 339, 460, 470, 4296, 465, 239..."
2,"[72, 288, 1572, 867, 1661, 329, 262, 2613, 13,..."
3,"[1820, 2187, 1767, 5300, 340, 29658, 290, 588,..."
4,"[3919, 11, 340, 338, 407, 37722, 379, 477, 13,..."
...,...
1600493,"[2093, 8300, 25, 47038, 393, 773, 274, 570, 27..."
1600494,"[261, 326, 3465, 11, 1312, 5465, 1573, 13, 131..."
1600495,"[993, 12337, 986, 736, 287, 257, 1635, 5305, 9..."
1600496,"[83, 472, 903, 287, 4173, 272, 11, 1312, 766, ..."


In [20]:
from sklearn.model_selection import train_test_split


# Two-stage split with a fixed seed: 70% train, then the held-out 30% is split
# 70/30 again, i.e. roughly 21% validation and 9% test of the full data.
train, val = train_test_split(datatest, test_size=0.3, random_state=42)
val, test = train_test_split(val, test_size=0.3, random_state=42)

In [21]:
# Write each split to its own CSV; index=False drops the shuffled row labels.
train.to_csv('data/train.csv', index=False)
val.to_csv('data/val.csv', index=False)
test.to_csv('data/test.csv', index=False)

In [2]:
# Reload the train/validation splits from disk (the test split is not loaded here).
# The `tokens` column is parsed from its text form in AutoTextDataset.__getitem__.
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/val.csv')

In [3]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence



class AutoTextDataset(Dataset):
    """Next-token prediction dataset over pre-tokenised texts.

    Each item is a pair of sequences shifted by one position: ``x`` holds
    every token but the last and ``y`` holds every token but the first.

    Args:
        data: object with a ``tokens`` attribute (e.g. a pandas DataFrame
            column). Entries may be lists of token ids or the string form of
            such lists, which is what a CSV round-trip produces.
    """

    def __init__(self, data):
        self.tokens = data.tokens

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, index):
        """Return ``{'x': LongTensor, 'y': LongTensor}`` for the row at position ``index``."""
        import ast  # local import: only needed to parse CSV-serialised lists

        # Positional lookup: samplers yield 0..len-1, which does not match the
        # labels of a shuffled or filtered DataFrame index.
        tokens = self.tokens
        token = tokens.iloc[index] if hasattr(tokens, 'iloc') else tokens[index]
        if isinstance(token, str):
            # literal_eval accepts Python literals only, so a crafted CSV cell
            # cannot execute code the way eval() would.
            token = ast.literal_eval(token)
        # Explicit dtype keeps empty slices integer-typed (torch.tensor([]) is float).
        return {
            'x': torch.tensor(token[:-1], dtype=torch.long),
            'y': torch.tensor(token[1:], dtype=torch.long)
        }
def collate_fn(batch, padding_value=50256):
    """Collate variable-length samples into right-padded tensors, longest first.

    Args:
        batch: list of mappings with 1-D tensors under ``'x'`` and ``'y'``.
        padding_value: id used to fill positions past each sequence's end.
            Defaults to 50256, the value this notebook also passes to the
            embedding's ``padding_idx`` and the loss's ``ignore_index``.

    Returns:
        dict with
            ``'lengths'``: 1-D LongTensor of ``x`` lengths in descending order,
            ``'x'``: ``(batch, max_len)`` padded inputs in the same order,
            ``'y'``: ``(batch, max_len)`` padded targets in the same order.
    """
    lengths = torch.tensor([len(item['x']) for item in batch], dtype=torch.long)
    # Plain Python ints index the list directly and the tensor via fancy indexing.
    order = torch.argsort(lengths, descending=True).tolist()

    sort_x = [batch[i]['x'] for i in order]
    sort_y = [batch[i]['y'] for i in order]

    pad_x = pad_sequence(sort_x, batch_first=True, padding_value=padding_value)
    pad_y = pad_sequence(sort_y, batch_first=True, padding_value=padding_value)
    return {
        # One tensor instead of a list of 0-d tensors: the form
        # pack_padded_sequence documents for its `lengths` argument.
        'lengths': lengths[order],
        'x': pad_x,
        'y': pad_y,
    }



In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from transformers import AutoTokenizer
from torch.optim import Adam
import evaluate  


from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence



class AutoTextDataset(Dataset):
    """Next-token prediction dataset over pre-tokenised texts.

    Each item is a pair of sequences shifted by one position: ``x`` holds
    every token but the last and ``y`` holds every token but the first.

    Args:
        data: object with a ``tokens`` attribute (e.g. a pandas DataFrame
            column). Entries may be lists of token ids or the string form of
            such lists, which is what a CSV round-trip produces.
    """

    def __init__(self, data):
        self.tokens = data.tokens

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, index):
        """Return ``{'x': LongTensor, 'y': LongTensor}`` for the row at position ``index``."""
        import ast  # local import: only needed to parse CSV-serialised lists

        # Positional lookup: samplers yield 0..len-1, which does not match the
        # labels of a shuffled or filtered DataFrame index.
        tokens = self.tokens
        token = tokens.iloc[index] if hasattr(tokens, 'iloc') else tokens[index]
        if isinstance(token, str):
            # literal_eval accepts Python literals only, so a crafted CSV cell
            # cannot execute code the way eval() would.
            token = ast.literal_eval(token)
        # Explicit dtype keeps empty slices integer-typed (torch.tensor([]) is float).
        return {
            'x': torch.tensor(token[:-1], dtype=torch.long),
            'y': torch.tensor(token[1:], dtype=torch.long)
        }
def collate_fn(batch, padding_value=50256):
    """Collate variable-length samples into right-padded tensors, longest first.

    Args:
        batch: list of mappings with 1-D tensors under ``'x'`` and ``'y'``.
        padding_value: id used to fill positions past each sequence's end.
            Defaults to 50256, the value this notebook also passes to the
            embedding's ``padding_idx`` and the loss's ``ignore_index``.

    Returns:
        dict with
            ``'lengths'``: 1-D LongTensor of ``x`` lengths in descending order,
            ``'x'``: ``(batch, max_len)`` padded inputs in the same order,
            ``'y'``: ``(batch, max_len)`` padded targets in the same order.
    """
    lengths = torch.tensor([len(item['x']) for item in batch], dtype=torch.long)
    # Plain Python ints index the list directly and the tensor via fancy indexing.
    order = torch.argsort(lengths, descending=True).tolist()

    sort_x = [batch[i]['x'] for i in order]
    sort_y = [batch[i]['y'] for i in order]

    pad_x = pad_sequence(sort_x, batch_first=True, padding_value=padding_value)
    pad_y = pad_sequence(sort_y, batch_first=True, padding_value=padding_value)
    return {
        # One tensor instead of a list of 0-d tensors: the form
        # pack_padded_sequence documents for its `lengths` argument.
        'lengths': lengths[order],
        'x': pad_x,
        'y': pad_y,
    }



class GRUmodel(nn.Module):
    """Single-layer unidirectional GRU language model.

    Pipeline: token embedding -> GRU -> linear projection onto the vocabulary.

    Args:
        vocab_size: number of token ids (embedding rows and output logits).
        embed_dim: embedding width.
        hidden_dim: GRU hidden state width.
        padding_idx: id passed to ``nn.Embedding`` as ``padding_idx``; it is
            also the token on which ``generate`` stops.
    """

    def __init__(self, vocab_size=50257, embed_dim=64, hidden_dim=32, padding_idx=50256):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, lengths=None):
        """Return next-token logits of shape ``(batch, time, vocab_size)``.

        Args:
            x: ``(batch, time)`` tensor of token ids.
            lengths: optional true lengths of the rows of ``x``. When given,
                the GRU runs on a packed sequence and skips padded steps.
        """
        emb = self.emb(x)
        if lengths is not None:
            if torch.is_tensor(lengths):
                # pack_padded_sequence requires tensor lengths to live on the CPU.
                lengths = lengths.cpu()
            packed_emb = pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
            out, _ = self.gru(packed_emb)
            # total_length keeps the time dimension equal to x's, so the logits
            # always line up with targets padded to the same width.
            out, _ = pad_packed_sequence(out, batch_first=True, total_length=x.size(1))
        else:
            out, _ = self.gru(emb)
        logits = self.fc(out)
        return logits

    def generate(self, context_tokens, max_new_tokens=10, temperature=1.0):
        """Autoregressively extend a single sequence.

        Args:
            context_tokens: ``(1, T)`` tensor of token ids; only batch size 1
                is supported because the sampled token is appended as ``(1, 1)``.
            max_new_tokens: upper bound on the number of appended tokens.
            temperature: softmax temperature for sampling; a value ``<= 0``
                switches to greedy (argmax) decoding instead of dividing by it.

        Returns:
            ``(1, T + k)`` tensor with the context followed by ``k`` generated
            tokens; generation stops early once ``padding_idx`` is produced
            (that token is included in the result).
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                generated = context_tokens.clone()

                # The first step encodes the whole context; afterwards only the
                # newest token is fed together with the carried hidden state,
                # so the cost is linear rather than quadratic in length.
                step_input, hidden = generated, None
                for _ in range(max_new_tokens):
                    out, hidden = self.gru(self.emb(step_input), hidden)
                    next_logits = self.fc(out[0, -1, :])
                    if temperature > 0:
                        probs = torch.softmax(next_logits / temperature, dim=-1)
                        next_token = torch.multinomial(probs, num_samples=1)
                    else:
                        next_token = torch.argmax(next_logits, dim=-1, keepdim=True)
                    step_input = next_token.unsqueeze(0)
                    generated = torch.cat([generated, step_input], dim=1)
                    if next_token.item() == self.emb.padding_idx:
                        break

                return generated
        finally:
            # Do not leave a model that was being trained stuck in eval mode.
            self.train(was_training)
        
import torch
import torch.nn as nn
from torch.optim import Adam
import evaluate  


# Module-level ROUGE metric object; val_loop reads it as a global.
rouge = evaluate.load("rouge")

def train_loop(dataloader, model, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        dataloader: iterable of batches, each a mapping with ``'x'`` (inputs)
            and ``'y'`` (targets) tensors. It does not need to support ``len()``.
        model: module called as ``model(x)``; the last dimension of its output
            is treated as the class dimension.
        optimizer: optimizer stepping the model parameters.
        criterion: loss called as ``criterion(flat_logits, flat_targets)``.
        device: device the batch tensors are moved to.

    Returns:
        float: average of the per-batch losses, or ``0.0`` if no batch was seen.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        x = batch['x'].to(device)
        y = batch['y'].to(device)

        optimizer.zero_grad()
        logits = model(x)
        # reshape (not view) so non-contiguous target tensors flatten too.
        loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    # Counting batches avoids len(dataloader), which generators and
    # iterable-style loaders do not provide, and guards the empty case.
    return total_loss / num_batches if num_batches else 0.0

def val_loop(dataloader, model, criterion, tokenizer, device, num_samples=3,
             context_fraction=0.75, max_new_tokens=20, temperature=0.7):
    """Evaluate the loss on ``dataloader`` and score sample completions with ROUGE.

    For up to ``num_samples`` rows of the first batch, the leading
    ``context_fraction`` of the real (non-padding) tokens is given to
    ``model.generate`` as context and the remaining tokens are the reference
    that the generated continuation is compared against.

    Args:
        dataloader: iterable of batches with ``'x'`` and ``'y'`` tensors.
        model: module called as ``model(x)`` and exposing
            ``generate(context, max_new_tokens=..., temperature=...)``.
        criterion: loss called as ``criterion(flat_logits, flat_targets)``; its
            ``ignore_index`` (if any) is treated as a padding id.
        tokenizer: object with ``pad_token_id``, ``eos_token_id`` and ``decode``.
        device: device the batch tensors are moved to.
        num_samples: maximum number of completions to generate.
        context_fraction: share of each sequence used as generation context.
        max_new_tokens: generation budget per sample.
        temperature: sampling temperature forwarded to ``model.generate``.

    Returns:
        tuple ``(avg_loss, rouge_scores, samples)`` where ``samples`` is a list
        of ``(reference_text, generated_text)`` pairs. With no samples the
        ROUGE entries are ``0.0``; with no batches ``avg_loss`` is ``0.0``.
    """
    model.eval()
    # tokenizer.pad_token_id may be None (GPT-2 ships without a pad token), so
    # padding is recognised through every id that can play that role here.
    special_ids = {
        token_id
        for token_id in (
            getattr(criterion, 'ignore_index', None),
            tokenizer.pad_token_id,
            tokenizer.eos_token_id,
        )
        if token_id is not None
    }
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_refs = []

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            x = batch['x'].to(device)
            y = batch['y'].to(device)

            logits = model(x)
            loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            # Collect a few reference/prediction pairs from the first batch only.
            if i != 0:
                continue
            for j in range(min(num_samples, x.size(0))):
                # Real tokens of the row, with padding stripped.
                text_tokens = [t for t in x[j].cpu().tolist() if t not in special_ids]
                if len(text_tokens) < 2:
                    continue  # nothing left to hold out as a reference

                # Keep at least one token on each side of the cut.
                cut = min(max(1, int(len(text_tokens) * context_fraction)), len(text_tokens) - 1)
                context = torch.tensor([text_tokens[:cut]], dtype=torch.long, device=x.device)  # (1, cut)
                ref_text = tokenizer.decode(text_tokens[cut:], skip_special_tokens=True)

                generated = model.generate(context, max_new_tokens=max_new_tokens, temperature=temperature)

                # Decode only the newly generated part (context removed).
                gen_tokens = generated[0, context.size(1):].cpu().tolist()
                gen_tokens = [t for t in gen_tokens if t not in special_ids]
                gen_text = tokenizer.decode(gen_tokens, skip_special_tokens=True)

                all_refs.append(ref_text)
                all_preds.append(gen_text)

    avg_loss = total_loss / num_batches if num_batches else 0.0
    if all_preds:
        rouge_scores = rouge.compute(predictions=all_preds, references=all_refs)
    else:
        # Nothing to score: return the keys callers print instead of calling
        # the metric on empty lists.
        rouge_scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
    return avg_loss, rouge_scores, list(zip(all_refs, all_preds))


# Seed torch so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

# Wrap the splits in datasets and batch them with the padding-aware collate function.
trainds = AutoTextDataset(train)
valds = AutoTextDataset(val)
train_loader = DataLoader(trainds, shuffle=True, batch_size=32, collate_fn=collate_fn)
val_loader = DataLoader(valds, shuffle=False, batch_size=32, collate_fn=collate_fn)
model = GRUmodel()
# Fall back to the CPU instead of crashing on machines without a CUDA device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = Adam(model.parameters(), lr=5e-4)
# 50256 is the id collate_fn pads with, so padded target positions are skipped.
# NOTE(review): the notebook also shows 50256 as tokenizer.eos_token_id, so the
# end-of-text target is excluded from the loss too — confirm this is intended.
criterion = nn.CrossEntropyLoss(ignore_index=50256)
model.to(device)
epochs = 5
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")




# Epoch schedule: one training pass, then validation with ROUGE and a few
# sample completions printed for a qualitative check.
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    train_loss = train_loop(train_loader, model, optimizer, criterion, device)
    print(f"Train Loss: {train_loss:.4f}")

    val_loss, rouge_scores, samples = val_loop(val_loader, model, criterion, tokenizer, device)
    print(f"Val Loss: {val_loss:.4f}")

    for metric_label, metric_key in (("ROUGE-1:", 'rouge1'), ("ROUGE-2:", 'rouge2'), ("ROUGE-L:", 'rougeL')):
        print(metric_label, rouge_scores[metric_key])

    print("\nПримеры автодополнений:")
    for ref, gen in samples[:3]:
        # Expected text, model output and a separator, one per line.
        print(f"Ожидание: {ref}", f"Модель:   {gen}", "-" * 50, sep="\n")

In [None]:
# Small-batch loaders plus a fresh, untrained model for the exploratory cells below.
# This rebinds `trainds`, `valds` and `model`; the new model is not moved to a
# device here, and Adam is created with its default learning rate.
trainds = AutoTextDataset(train)
valds = AutoTextDataset(val)
dl_train = DataLoader(trainds, shuffle=True, batch_size=8, collate_fn=collate_fn)
dl_val = DataLoader(valds, shuffle=False, batch_size=16, collate_fn=collate_fn)
model = GRUmodel()
crit = nn.CrossEntropyLoss(ignore_index=50256)
opt = torch.optim.Adam(model.parameters())


In [8]:
import torch
import torch.nn as nn
from torch.optim import Adam
import evaluate  


# Module-level ROUGE metric object; val_loop reads it as a global.
rouge = evaluate.load("rouge")

def train_loop(dataloader, model, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        dataloader: iterable of batches, each a mapping with ``'x'`` (inputs)
            and ``'y'`` (targets) tensors. It does not need to support ``len()``.
        model: module called as ``model(x)``; the last dimension of its output
            is treated as the class dimension.
        optimizer: optimizer stepping the model parameters.
        criterion: loss called as ``criterion(flat_logits, flat_targets)``.
        device: device the batch tensors are moved to.

    Returns:
        float: average of the per-batch losses, or ``0.0`` if no batch was seen.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        x = batch['x'].to(device)
        y = batch['y'].to(device)

        optimizer.zero_grad()
        logits = model(x)
        # reshape (not view) so non-contiguous target tensors flatten too.
        loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    # Counting batches avoids len(dataloader), which generators and
    # iterable-style loaders do not provide, and guards the empty case.
    return total_loss / num_batches if num_batches else 0.0

def val_loop(dataloader, model, criterion, tokenizer, device, num_samples=3,
             context_fraction=0.75, max_new_tokens=20, temperature=0.7):
    """Evaluate the loss on ``dataloader`` and score sample completions with ROUGE.

    For up to ``num_samples`` rows of the first batch, the leading
    ``context_fraction`` of the real (non-padding) tokens is given to
    ``model.generate`` as context and the remaining tokens are the reference
    that the generated continuation is compared against.

    Args:
        dataloader: iterable of batches with ``'x'`` and ``'y'`` tensors.
        model: module called as ``model(x)`` and exposing
            ``generate(context, max_new_tokens=..., temperature=...)``.
        criterion: loss called as ``criterion(flat_logits, flat_targets)``; its
            ``ignore_index`` (if any) is treated as a padding id.
        tokenizer: object with ``pad_token_id``, ``eos_token_id`` and ``decode``.
        device: device the batch tensors are moved to.
        num_samples: maximum number of completions to generate.
        context_fraction: share of each sequence used as generation context.
        max_new_tokens: generation budget per sample.
        temperature: sampling temperature forwarded to ``model.generate``.

    Returns:
        tuple ``(avg_loss, rouge_scores, samples)`` where ``samples`` is a list
        of ``(reference_text, generated_text)`` pairs. With no samples the
        ROUGE entries are ``0.0``; with no batches ``avg_loss`` is ``0.0``.
    """
    model.eval()
    # tokenizer.pad_token_id may be None (GPT-2 ships without a pad token), so
    # padding is recognised through every id that can play that role here.
    special_ids = {
        token_id
        for token_id in (
            getattr(criterion, 'ignore_index', None),
            tokenizer.pad_token_id,
            tokenizer.eos_token_id,
        )
        if token_id is not None
    }
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_refs = []

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            x = batch['x'].to(device)
            y = batch['y'].to(device)

            logits = model(x)
            loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            # Collect a few reference/prediction pairs from the first batch only.
            if i != 0:
                continue
            for j in range(min(num_samples, x.size(0))):
                # Real tokens of the row, with padding stripped.
                text_tokens = [t for t in x[j].cpu().tolist() if t not in special_ids]
                if len(text_tokens) < 2:
                    continue  # nothing left to hold out as a reference

                # Keep at least one token on each side of the cut.
                cut = min(max(1, int(len(text_tokens) * context_fraction)), len(text_tokens) - 1)
                context = torch.tensor([text_tokens[:cut]], dtype=torch.long, device=x.device)  # (1, cut)
                ref_text = tokenizer.decode(text_tokens[cut:], skip_special_tokens=True)

                generated = model.generate(context, max_new_tokens=max_new_tokens, temperature=temperature)

                # Decode only the newly generated part (context removed).
                gen_tokens = generated[0, context.size(1):].cpu().tolist()
                gen_tokens = [t for t in gen_tokens if t not in special_ids]
                gen_text = tokenizer.decode(gen_tokens, skip_special_tokens=True)

                all_refs.append(ref_text)
                all_preds.append(gen_text)

    avg_loss = total_loss / num_batches if num_batches else 0.0
    if all_preds:
        rouge_scores = rouge.compute(predictions=all_preds, references=all_refs)
    else:
        # Nothing to score: return the keys callers print instead of calling
        # the metric on empty lists.
        rouge_scores = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
    return avg_loss, rouge_scores, list(zip(all_refs, all_preds))


# Seed torch so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

# Wrap the splits in datasets and batch them with the padding-aware collate function.
trainds = AutoTextDataset(train)
valds = AutoTextDataset(val)
train_loader = DataLoader(trainds, shuffle=True, batch_size=32, collate_fn=collate_fn)
val_loader = DataLoader(valds, shuffle=False, batch_size=32, collate_fn=collate_fn)
model = GRUmodel()
# Fall back to the CPU instead of crashing on machines without a CUDA device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = Adam(model.parameters(), lr=5e-4)
# 50256 is the id collate_fn pads with, so padded target positions are skipped.
# NOTE(review): the notebook also shows 50256 as tokenizer.eos_token_id, so the
# end-of-text target is excluded from the loss too — confirm this is intended.
criterion = nn.CrossEntropyLoss(ignore_index=50256)
model.to(device)
epochs = 5
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")




# Epoch schedule: one training pass, then validation with ROUGE and a few
# sample completions printed for a qualitative check.
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    train_loss = train_loop(train_loader, model, optimizer, criterion, device)
    print(f"Train Loss: {train_loss:.4f}")

    val_loss, rouge_scores, samples = val_loop(val_loader, model, criterion, tokenizer, device)
    print(f"Val Loss: {val_loss:.4f}")

    for metric_label, metric_key in (("ROUGE-1:", 'rouge1'), ("ROUGE-2:", 'rouge2'), ("ROUGE-L:", 'rougeL')):
        print(metric_label, rouge_scores[metric_key])

    print("\nПримеры автодополнений:")
    for ref, gen in samples[:3]:
        # Expected text, model output and a separator, one per line.
        print(f"Ожидание: {ref}", f"Модель:   {gen}", "-" * 50, sep="\n")

KeyboardInterrupt: 

In [135]:
# Sanity check: run the model on the first training batch only and print the
# raw logits. `batch` and `pred` stay bound for the cells below.
for batch in dl_train:
    pred = model(batch['x'])
    print(pred)
    break

tensor([[[-0.2349, -0.2167,  0.0902,  ...,  0.0549, -0.1867, -0.0376],
         [-0.3318, -0.4711,  0.0452,  ...,  0.2100, -0.5133, -0.2629],
         [-0.3801, -0.2510, -0.0786,  ...,  0.2460, -0.5038, -0.3385],
         ...,
         [ 0.2215,  0.0978,  0.1000,  ..., -0.3724, -0.1006, -0.0493],
         [-0.2423, -0.1464, -0.0303,  ..., -0.3090, -0.2128,  0.0755],
         [-0.2284, -0.0895, -0.1916,  ..., -0.1869, -0.2602,  0.0476]],

        [[ 0.0170, -0.0060,  0.0589,  ...,  0.0716, -0.3858, -0.1723],
         [-0.4506, -0.4831,  0.0475,  ...,  0.4309, -0.5493,  0.1163],
         [ 0.0202, -0.1068,  0.0999,  ...,  0.4084, -0.3107,  0.1242],
         ...,
         [-0.0456, -0.1237, -0.0253,  ...,  0.1337, -0.2808, -0.1375],
         [-0.0456, -0.1237, -0.0253,  ...,  0.1337, -0.2808, -0.1375],
         [-0.0456, -0.1237, -0.0253,  ...,  0.1337, -0.2808, -0.1375]]],
       grad_fn=<AddBackward0>)


In [128]:
# Logits are laid out as (batch, time, vocab).
pred.shape

torch.Size([2, 31, 50257])

In [136]:
# Flatten batch and time into one axis: one row of vocabulary logits per position.
pred.view(-1, pred.size(-1))

tensor([[-0.2349, -0.2167,  0.0902,  ...,  0.0549, -0.1867, -0.0376],
        [-0.3318, -0.4711,  0.0452,  ...,  0.2100, -0.5133, -0.2629],
        [-0.3801, -0.2510, -0.0786,  ...,  0.2460, -0.5038, -0.3385],
        ...,
        [-0.0456, -0.1237, -0.0253,  ...,  0.1337, -0.2808, -0.1375],
        [-0.0456, -0.1237, -0.0253,  ...,  0.1337, -0.2808, -0.1375],
        [-0.0456, -0.1237, -0.0253,  ...,  0.1337, -0.2808, -0.1375]],
       grad_fn=<ViewBackward0>)

In [144]:
# Padded targets of the batch; the shorter row is filled with 50256.
batch['y']

tensor([[ 3073, 14343,   262,  9082,  2513,   263,   300,    76,    69,  5488,
            11,  2632,   807,   477,   262,   835,    11,   703,  1282,   345,
          9099,   470,  3031,   736,  2063,   262,   640, 50256],
        [ 6621, 42254,   290,   318,  1016,    70, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]])

In [142]:
# Targets flattened the same way as the logits, so positions line up one-to-one.
batch['y'].view(-1)

tensor([ 3073, 14343,   262,  9082,  2513,   263,   300,    76,    69,  5488,
           11,  2632,   807,   477,   262,   835,    11,   703,  1282,   345,
         9099,   470,  3031,   736,  2063,   262,   640, 50256,  6621, 42254,
          290,   318,  1016,    70, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256])

In [139]:
# Cross-entropy that skips every target equal to 50256 (the padding id used by collate_fn).
crit = nn.CrossEntropyLoss(ignore_index=50256)

In [143]:
# Loss of the untrained model on one batch; a uniform guess over 50257 tokens
# would score ln(50257) ≈ 10.82, so a value near that is expected.
crit(pred.view(-1, pred.size(-1)), batch['y'].view(-1))

tensor(10.8438, grad_fn=<NllLossBackward0>)

In [91]:
# Vocabulary size of the tokenizer; it matches GRUmodel's default vocab_size.
tokenizer.vocab_size

50257

In [90]:
# Load the GPT-2 tokenizer from the Hugging Face hub (or the local cache).
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")



In [11]:
# Tokenise a short string; the output below has no end-of-text id (50256) appended.
tokenizer("HI, man", add_special_tokens=True).input_ids

[25374, 11, 582]

In [13]:
# End-of-text id of the tokenizer; the same number is used as the padding id in this notebook.
tokenizer.eos_token_id

50256

In [None]:
# NOTE(review): this cell was saved as the incomplete expression `tokenizer.`,
# which is a syntax error and stops a full re-run. The stored output '"' is
# consistent with decoding token id 1 — presumably that was the intent; confirm.
tokenizer.decode([1])

'"'