<a href="https://colab.research.google.com/github/Kostia9/Data-Analysis-2025/blob/main/Lab3/Lab3_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import string
import bz2
import os
import json

# NLTK tokenization
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
from google.colab import userdata

# Seeds and device
# Seed the global PyTorch and NumPy random number generators with a fixed
# value so that code drawing from them is repeatable across runs.
torch.manual_seed(42)
np.random.seed(42)

# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Device: cuda


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [2]:
import os, json
from google.colab import userdata

# Make sure the ~/.kaggle directory exists (no error if it is already there).
os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)

# Fetch the value stored under the Colab user-data key "KAGGLE_JSON", mirror it
# into the process environment, and parse it as JSON.
os.environ["KAGGLE_JSON"] = userdata.get("KAGGLE_JSON")
kaggle_json = os.environ['KAGGLE_JSON']
kaggle_token = json.loads(kaggle_json)

# Write the parsed token to the kaggle.json file
kaggle_path = os.path.expanduser("~/.kaggle/kaggle.json")
with open(kaggle_path, "w") as f:
    json.dump(kaggle_token, f)
# Restrict the credential file to owner read/write only.
os.chmod(kaggle_path, 0o600)

# Install the Kaggle command-line client (quiet mode).
!pip install -q kaggle

# Download the Amazon reviews dataset archive and extract it; `unzip -n`
# never overwrites files that already exist.
!kaggle datasets download -d bittlingmayer/amazonreviews
!unzip -n amazonreviews.zip

Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /content
100% 491M/493M [00:00<00:00, 1.72GB/s]
100% 493M/493M [00:00<00:00, 1.71GB/s]
Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [3]:
# --- 2. Load, clean, tokenize ---
def extractData(filename, n_samples=None):
    """Read a bz2-compressed, fastText-style labelled text file into a DataFrame.

    Each line is split on the first space into a label token and the review
    text. The first run of digits in the label token is taken as the numeric
    label. Only labels 1 and 2 are kept and they are remapped to 0 and 1.

    Lines that cannot be split into two parts, or whose label token contains
    no digits, are skipped instead of aborting the whole load.

    Args:
        filename: Path to the ``.bz2`` file (opened in text mode as UTF-8).
        n_samples: Maximum number of *lines* to read from the start of the
            file, or ``None`` to read everything. Skipped lines still count
            towards this limit.

    Returns:
        pandas.DataFrame with an int64 ``label`` column (0 or 1) and a
        ``text`` column, indexed 0..n-1.
    """
    labels, texts = [], []
    with bz2.open(filename, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if n_samples is not None and i >= n_samples:
                break
            parts = line.strip().split(' ', 1)
            if len(parts) != 2:
                continue
            raw_label, text = parts
            match = re.search(r'\d+', raw_label)
            if match is None:
                # No numeric label: skip rather than fail later on a NaN cast.
                continue
            labels.append(int(match.group()))
            texts.append(text)

    # Pin the dtype so that an empty result still has an integer label column.
    df = pd.DataFrame({'label': pd.Series(labels, dtype='int64'), 'text': texts})
    # Keep only labels {1,2} then map to {0,1}
    df = df[df['label'].isin([1, 2])].reset_index(drop=True)
    df['label'] = df['label'] - 1
    return df

# Smaller sample for speed; increase if you want better accuracy
# n_samples caps how many lines are read from the start of each archive.
train_df = extractData('train.ft.txt.bz2', n_samples=500_000)
test_df  = extractData('test.ft.txt.bz2',  n_samples=50_000)

# Report how many rows each split contains after loading.
print(f"Loaded {len(train_df)} train and {len(test_df)} test samples.")

def clean_text(text):
    """Normalise raw review text to lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase; drop ``@...`` tokens; drop ``http...`` tokens;
    drop ``<...>`` tag-like spans; replace every character that is not
    ``a-z`` or whitespace with a space; collapse whitespace runs; strip.

    Replacing everything outside ``a-z``/whitespace already removes all
    punctuation, so no separate punctuation pass is needed.

    Args:
        text: Input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    text = re.sub(r"@\S+", " ", text)      # @mentions
    text = re.sub(r"http\S+", " ", text)   # URLs
    text = re.sub(r"<.*?>", " ", text)     # tag-like spans (non-greedy, single line)
    text = re.sub(r"[^a-z\s]", " ", text)  # digits, punctuation, non-ASCII letters
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Add a 'clean_text' column to both frames by applying clean_text to each raw
# review in the 'text' column.
print("Cleaning...")
train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text']  = test_df['text'].apply(clean_text)

# Add a 'tokens' column holding the NLTK word_tokenize output for each cleaned
# review.
print("Tokenizing...")
train_df['tokens'] = train_df['clean_text'].apply(word_tokenize)
test_df['tokens']  = test_df['clean_text'].apply(word_tokenize)

# Show the first three training rows as a quick check of the new columns.
print(train_df.head(3))

Loaded 500000 train and 50000 test samples.
Cleaning...
Tokenizing...
   label                                               text  \
0      1  Stuning even for the non-gamer: This sound tra...   
1      1  The best soundtrack ever to anything.: I'm rea...   
2      1  Amazing!: This soundtrack is my favorite music...   

                                          clean_text  \
0  stuning even for the non gamer this sound trac...   
1  the best soundtrack ever to anything i m readi...   
2  amazing this soundtrack is my favorite music o...   

                                              tokens  
0  [stuning, even, for, the, non, gamer, this, so...  
1  [the, best, soundtrack, ever, to, anything, i,...  
2  [amazing, this, soundtrack, is, my, favorite, ...  


In [4]:
# --- 3. Build vocab ---
from collections import Counter
# Count token occurrences over the training split only.
counter = Counter(tok for tok_list in train_df['tokens'] for tok in tok_list)

MAX_VOCAB = 30_000
specials = ["[PAD]", "[UNK]"]
# Leave room for the special tokens so the vocabulary never exceeds MAX_VOCAB.
most_common = counter.most_common(MAX_VOCAB - len(specials))

# itos: index -> token (specials first); stoi: token -> index.
itos = list(specials)
itos.extend(word for word, _count in most_common)
stoi = dict(zip(itos, range(len(itos))))

PAD_IDX, UNK_IDX = stoi["[PAD]"], stoi["[UNK]"]
vocab_size = len(stoi)
print("Vocab size:", vocab_size)

Vocab size: 20000


In [5]:
# --- 4. Encode and pad ---
def encode(tokens):
    """Map each token to its vocabulary index, using ``UNK_IDX`` for tokens absent from ``stoi``."""
    ids = []
    for tok in tokens:
        ids.append(stoi.get(tok, UNK_IDX))
    return ids

MAX_LEN = 200
def pad_and_encode(tokens):
    """Encode tokens and return a ``torch.long`` tensor of exactly ``MAX_LEN`` ids.

    Longer sequences are truncated from the right; shorter ones are
    right-padded with ``PAD_IDX``.
    """
    ids = encode(tokens)[:MAX_LEN]
    n_missing = MAX_LEN - len(ids)
    if n_missing > 0:
        ids = ids + [PAD_IDX] * n_missing
    return torch.tensor(ids, dtype=torch.long)

# Encode every tokenised review to a fixed-length id tensor and stack them
# into one 2-D tensor per split.
print("Encoding...")
X_train_full = torch.stack([pad_and_encode(seq) for seq in train_df['tokens']])
# Labels are stored as float32; the criterion used later in this notebook is
# nn.BCELoss.
y_train_full = torch.tensor(train_df['label'].values, dtype=torch.float32)

X_test = torch.stack([pad_and_encode(seq) for seq in test_df['tokens']])
y_test = torch.tensor(test_df['label'].values, dtype=torch.float32)

# Print the tensor shapes as a sanity check.
print("X_train_full:", X_train_full.shape)
print("y_train_full:", y_train_full.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

Encoding...
X_train_full: torch.Size([500000, 100])
y_train_full: torch.Size([500000])
X_test: torch.Size([50000, 100])
y_test: torch.Size([50000])


In [6]:
# --- 5. Datasets and loaders ---
from torch.utils.data import TensorDataset, DataLoader, random_split

# Wrap the encoded inputs and labels as (input, label) datasets.
full_train_dataset = TensorDataset(X_train_full, y_train_full)
test_ds = TensorDataset(X_test, y_test)

# Hold out VAL_FRAC of the training data for validation. The split uses its
# own generator seeded with 42, so it does not depend on the global RNG state.
VAL_FRAC = 0.2
val_sz = int(len(full_train_dataset) * VAL_FRAC)
train_sz = len(full_train_dataset) - val_sz
train_ds, val_ds = random_split(full_train_dataset, [train_sz, val_sz],
                                generator=torch.Generator().manual_seed(42))

# Only the training loader shuffles; validation and test keep dataset order.
BATCH_SIZE = 256
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, pin_memory=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, pin_memory=True)

print(f"Train: {len(train_ds)} | Val: {len(val_ds)} | Test: {len(test_ds)}")

Train: 400000 | Val: 100000 | Test: 50000


In [7]:
# --- 6. Model ---
class GRUNet(nn.Module):
    """Unidirectional (optionally stacked) GRU classifier over token-id sequences.

    Pipeline: embedding -> GRU -> dropout on the last layer's final hidden
    state -> linear -> sigmoid. The outputs are therefore probabilities in
    [0, 1], not logits, which is what ``nn.BCELoss`` expects.

    Args:
        vocab_size: Number of rows in the embedding table.
        output_size: Number of output units; when 1 the output is squeezed
            to shape [B].
        embedding_dim: Embedding vector size (GRU input size).
        hidden_dim: GRU hidden state size.
        pad_idx: Index of the padding token, passed to ``nn.Embedding`` as
            ``padding_idx``.
        n_layers: Number of stacked GRU layers.
        drop_prob: Dropout probability used between GRU layers and before
            the final linear layer.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, pad_idx, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # padding_idx makes the embedding layer treat pad_idx as padding.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # nn.GRU applies dropout only between stacked layers and warns when a
        # non-zero dropout is combined with a single layer, so disable it then.
        self.gru = nn.GRU(embedding_dim, hidden_dim, n_layers,
                          batch_first=True,
                          dropout=drop_prob if n_layers > 1 else 0.0,
                          bidirectional=False)  # unidirectional GRU

        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Run the classifier on a batch of token ids.

        Args:
            x: Integer tensor of token ids, shape [B, T].
            hidden: Initial GRU state of shape [n_layers, B, H], or ``None``
                to let ``nn.GRU`` start from zeros.

        Returns:
            Tuple ``(out, hidden)``: ``out`` holds sigmoid probabilities with
            shape [B] when ``output_size == 1`` (otherwise [B, output_size]);
            ``hidden`` is the final GRU state, shape [n_layers, B, H].
        """
        x = x.long()
        embeds = self.embedding(x)  # [B, T, E]

        # gru_out: [B, T, H]; hidden: [n_layers, B, H]
        gru_out, hidden = self.gru(embeds, hidden)

        # Classify from the final hidden state of the top layer.
        # NOTE(review): the sequence is not packed, so this state is taken
        # after the last time step, including any trailing padding positions.
        last_hidden_state = hidden[-1, :, :]  # [B, H]

        out = self.dropout(last_hidden_state)
        out = self.fc(out)       # [B, output_size]
        out = self.sigmoid(out)  # [B, output_size]

        if self.output_size == 1:
            out = out.squeeze(1)  # [B]

        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zero initial hidden state of shape [n_layers, batch_size, H].

        The tensor is created with the dtype and device of the model's first
        parameter, so it always matches wherever the model currently lives
        rather than depending on a module-level ``device`` variable.
        """
        weight = next(self.parameters())
        return torch.zeros(self.n_layers, batch_size, self.hidden_dim,
                           dtype=weight.dtype, device=weight.device)

In [10]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``loader`` and return the mean loss.

    The returned value is the per-sample average: each batch loss is weighted
    by its batch size, so a smaller final batch is not over-weighted. This
    matches how ``evaluate`` aggregates its loss and assumes ``criterion``
    returns the mean loss over the batch.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and
            ``forward(x, hidden) -> (out, hidden)``.
        loader: Iterable of ``(x, y)`` tensor batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss callable ``criterion(out, y)``.
        device: Device the batches are moved to.

    Returns:
        float: mean training loss per sample for this epoch.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.train()
    total_loss, n_seen = 0.0, 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        batch_size = x.size(0)

        # Fresh zero state for every batch: sequences are independent.
        h = model.init_hidden(batch_size)

        optimizer.zero_grad()
        out, _ = model(x, h)

        loss = criterion(out, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * batch_size
        n_seen += batch_size

    return total_loss / n_seen

def evaluate_one_epoch(model, loader, criterion, device):
    """Compute the mean loss of ``model`` over ``loader`` without updating it.

    The model is put in eval mode and gradients are disabled. The returned
    value is the per-sample average: each batch loss is weighted by its batch
    size, matching how ``evaluate`` aggregates its loss. This assumes
    ``criterion`` returns the mean loss over the batch.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and
            ``forward(x, hidden) -> (out, hidden)``.
        loader: Iterable of ``(x, y)`` tensor batches.
        criterion: Loss callable ``criterion(out, y)``.
        device: Device the batches are moved to.

    Returns:
        float: mean loss per sample.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.eval()
    total_loss, n_seen = 0.0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            batch_size = x.size(0)

            h = model.init_hidden(batch_size)
            out, _ = model(x, h)

            loss = criterion(out, y)
            total_loss += loss.item() * batch_size
            n_seen += batch_size

    return total_loss / n_seen

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Return ``(mean_loss, accuracy)`` of ``model`` over ``loader``.

    Both values are per-sample averages. The model output is thresholded at
    0.5 to obtain hard 0/1 predictions, which are compared with the labels
    cast to integers.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)
        batch_n = xb.size(0)

        probs, _ = model(xb, model.init_hidden(batch_n))
        batch_loss = criterion(probs, yb)

        # Hard predictions (0 or 1) from the model outputs.
        hard_preds = (probs >= 0.5).long()

        loss_sum += batch_loss.item() * batch_n
        n_correct += (hard_preds == yb.long()).sum().item()
        n_seen += batch_n

    return loss_sum / n_seen, n_correct / n_seen

In [11]:
# --- 8. Train baseline model (random init embeddings) ---
# Hyperparameters for the baseline, whose embedding layer is created inside
# GRUNet rather than loaded from pretrained vectors.
emb_dim = 100
hidden_dim = 128
pad_idx = PAD_IDX

# Additional parameters for GRUNet
output_size = 1
n_layers = 2
drop_prob = 0.5

model_basic = GRUNet(
    vocab_size, output_size, emb_dim, hidden_dim, pad_idx, n_layers, drop_prob
).to(device)

# GRUNet.forward ends with a sigmoid, so the loss is nn.BCELoss applied to
# probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model_basic.parameters(), lr=1e-3)

n_epochs = 5
print(model_basic)

# One training pass and one validation pass per epoch, printing both losses.
print("\n--- Training: baseline ---")
for epoch in range(n_epochs):
    tr = train_one_epoch(model_basic, train_loader, optimizer, criterion, device)
    va = evaluate_one_epoch(model_basic, val_loader, criterion, device)
    print(f"[{epoch+1:02d}] train_loss={tr:.4f} | val_loss={va:.4f}")

# Test metrics are computed once, with the weights as they are after the
# final epoch.
test_loss, test_acc = evaluate(model_basic, test_loader, criterion, device)
print(f"\n[Baseline] TEST loss={test_loss:.4f}  acc={test_acc*100:.2f}%")

GRUNet(
  (embedding): Embedding(20000, 100, padding_idx=0)
  (gru): GRU(100, 128, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

--- Training: baseline ---
[01] train_loss=0.2949 | val_loss=0.1951
[02] train_loss=0.1770 | val_loss=0.1790
[03] train_loss=0.1494 | val_loss=0.1753
[04] train_loss=0.1275 | val_loss=0.1799
[05] train_loss=0.1060 | val_loss=0.1909

[Baseline] TEST loss=0.1966  acc=93.09%


In [12]:
# Download the fastText wiki-news 300d vectors archive only when it is not
# already present in the working directory.
print("\n--- Завантаження fastText ---")
if not os.path.exists("wiki-news-300d-1M.vec.zip"):
    print("Downloading fastText vectors (1.6GB)...")
    !wget -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
# Extraction runs on every execution; `unzip -n` never overwrites files that
# already exist.
print("Unzipping vectors...")
!unzip -q -n wiki-news-300d-1M.vec.zip

def load_fasttext_vec(path, vocab=None):
    """Load word vectors from a fastText text-format ``.vec`` file.

    The first line is a header ``"<num_words> <dim>"``. Every following line
    is ``"<word> <v1> ... <v_dim>"`` separated by single spaces. Lines that do
    not have exactly ``dim + 1`` fields, or whose values cannot be parsed as
    floats, are skipped.

    Args:
        path: Path to the ``.vec`` file (read as UTF-8).
        vocab: Optional container of words supporting ``in`` (e.g. a set or a
            dict). When given, only vectors for these words are kept, which
            avoids holding the whole file in memory. ``None`` keeps every
            vector.

    Returns:
        Tuple ``(vectors, dim)``: ``vectors`` maps word -> float32
        ``numpy.ndarray`` of length ``dim``; ``dim`` is the dimension read
        from the header.

    Raises:
        ValueError: if the header fields are not integers.
        IndexError: if the header has fewer than two fields.
    """
    vectors = {}
    print("Loading fastText vectors into memory...")
    with open(path, "r", encoding="utf8") as f:
        header = f.readline().rstrip().split(" ")
        int(header[0])  # word count; parsed only to validate the header
        dim = int(header[1])

        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) != dim + 1:
                continue
            word = parts[0]
            if vocab is not None and word not in vocab:
                continue
            try:
                vectors[word] = np.asarray(parts[1:], dtype=np.float32)
            except ValueError:
                # Non-numeric component: skip this entry.
                continue
    return vectors, dim

# Load the extracted .vec file; emb_dim_ft is the vector dimension read from
# the file header.
ft_path = "wiki-news-300d-1M.vec"
fasttext_vectors, emb_dim_ft = load_fasttext_vec(ft_path)
print(f"fastText dim: {emb_dim_ft} | loaded entries: {len(fasttext_vectors):,}")


--- Завантаження fastText ---
Downloading fastText vectors (1.6GB)...
Unzipping vectors...
Loading fastText vectors into memory...
fastText dim: 300 | loaded entries: 999,994


In [13]:
# Build the embedding matrix from the fastText vectors.
print("Створення матриці ембедингів...")
# NOTE: this reassigns the global emb_dim that the baseline cell set to 100.
emb_dim = emb_dim_ft # Now the fastText vector dimension

# Start from small random-normal rows so that vocabulary words missing from
# fastText still get a vector; the padding row is set to all zeros.
emb_matrix = np.random.normal(scale=0.1, size=(vocab_size, emb_dim)).astype(np.float32)
emb_matrix[PAD_IDX] = 0.0

# Overwrite the row of every vocabulary word that has a fastText vector and
# count how many were found.
hit = 0
for w, idx in stoi.items():
    v = fasttext_vectors.get(w)
    if v is not None:
        emb_matrix[idx] = v
        hit += 1
print(f"Coverage: {hit}/{vocab_size} = {hit/vocab_size:.1%}")

# freeze=True: the embedding weights are not updated during training.
pretrained_emb = nn.Embedding.from_pretrained(
    torch.tensor(emb_matrix),
    freeze=True,
    padding_idx=PAD_IDX
)


Створення матриці ембедингів...
Coverage: 19380/20000 = 96.9%


In [15]:
# --- 11. Train model ---
# Same architecture and hyperparameters as the baseline, except emb_dim,
# which must already hold the fastText dimension (set in the embedding-matrix
# cell) so that the GRU input size matches the pretrained vectors.
model_tl = GRUNet(
    vocab_size, output_size, emb_dim, hidden_dim, pad_idx, n_layers, drop_prob
).to(device)

# Replace the freshly initialised embedding layer with the frozen pretrained
# one, then move the model to the device again so the new layer is moved too.
model_tl.embedding = pretrained_emb
model_tl.to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model_tl.parameters(), lr=1e-3)

# One training pass and one validation pass per epoch, printing both losses.
print("\n--- training fastText (GRU) ---")
for epoch in range(n_epochs):
    tr = train_one_epoch(model_tl, train_loader, optimizer, criterion, device)
    va = evaluate_one_epoch(model_tl, val_loader, criterion, device)
    print(f"[{epoch+1:02d}] train_loss={tr:.4f} | val_loss={va:.4f}")

# Test metrics are computed once, with the weights as they are after the
# final epoch.
test_loss, test_acc = evaluate(model_tl, test_loader, criterion, device)
print(f"\n[fastText GRU Model] TEST  loss={test_loss:.4f}  acc={test_acc*100:.2f}%")


--- training fastText (GRU) ---
[01] train_loss=0.2914 | val_loss=0.2019
[02] train_loss=0.1897 | val_loss=0.1773
[03] train_loss=0.1706 | val_loss=0.1678
[04] train_loss=0.1582 | val_loss=0.1661
[05] train_loss=0.1479 | val_loss=0.1612

[fastText GRU Model] TEST  loss=0.1679  acc=93.81%
