<a href="https://colab.research.google.com/github/Kostia9/Data-Analysis-2025/blob/main/Lab3/Lab3_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import string
import bz2
import os
import json
import copy

# NLTK tokenization
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
from google.colab import userdata

# Seeds and device
torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Device: cuda


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [2]:
import os, json
from google.colab import userdata

os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)

os.environ["KAGGLE_JSON"] = userdata.get("KAGGLE_JSON")
kaggle_json = os.environ['KAGGLE_JSON']
kaggle_token = json.loads(kaggle_json)

kaggle_path = os.path.expanduser("~/.kaggle/kaggle.json")
with open(kaggle_path, "w") as f:
    json.dump(kaggle_token, f)
os.chmod(kaggle_path, 0o600)

!pip install -q kaggle

!kaggle datasets download -d bittlingmayer/amazonreviews
!unzip -n amazonreviews.zip

Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /content
 67% 329M/493M [00:00<00:00, 1.73GB/s]
100% 493M/493M [00:00<00:00, 1.73GB/s]
Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [3]:
# --- 2. Load, clean, tokenize ---
def extractData(filename, n_samples=None):
    data = []
    with bz2.open(filename, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if n_samples is not None and i >= n_samples:
                break
            parts = line.strip().split(' ', 1)
            if len(parts) == 2:
                label, text = parts
                data.append((label, text))
    df = pd.DataFrame(data, columns=['label', 'text'])
    df['label'] = df['label'].str.extract(r'(\d+)').astype(int)
    # Keep only labels {1,2} then map to {0,1}
    df = df[df['label'].isin([1, 2])].reset_index(drop=True)
    df['label'] = df['label'] - 1
    return df

train_df = extractData('train.ft.txt.bz2', n_samples=200_000)
test_df  = extractData('test.ft.txt.bz2')

print(f"Loaded {len(train_df)} train and {len(test_df)} test samples.")

def clean_text(text):
    text = text.lower()
    text = re.sub(r"@\S+", " ", text)
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

print("Cleaning...")
train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text']  = test_df['text'].apply(clean_text)

print("Tokenizing...")
train_df['tokens'] = train_df['clean_text'].apply(word_tokenize)
test_df['tokens']  = test_df['clean_text'].apply(word_tokenize)

print(train_df.head(3))

Loaded 200000 train and 400000 test samples.
Cleaning...
Tokenizing...
   label                                               text  \
0      1  Stuning even for the non-gamer: This sound tra...   
1      1  The best soundtrack ever to anything.: I'm rea...   
2      1  Amazing!: This soundtrack is my favorite music...   

                                          clean_text  \
0  stuning even for the non gamer this sound trac...   
1  the best soundtrack ever to anything i m readi...   
2  amazing this soundtrack is my favorite music o...   

                                              tokens  
0  [stuning, even, for, the, non, gamer, this, so...  
1  [the, best, soundtrack, ever, to, anything, i,...  
2  [amazing, this, soundtrack, is, my, favorite, ...  


In [4]:
# --- 3. Build vocab ---
from collections import Counter
counter = Counter()
for tok_list in train_df['tokens']:
    counter.update(tok_list)

MAX_VOCAB = 20_000
specials = ["[PAD]", "[UNK]"]
most_common = counter.most_common(MAX_VOCAB - len(specials))
itos = specials + [w for w, _ in most_common]
stoi = {w: i for i, w in enumerate(itos)}

PAD_IDX = stoi["[PAD]"]
UNK_IDX = stoi["[UNK]"]
vocab_size = len(stoi)
print("Vocab size:", vocab_size)

Vocab size: 20000


In [5]:
lengths = np.array([len(toks) for toks in train_df['tokens']])

print("=== Length Statistics ===")
print("Count:", len(lengths))
print("Min:", lengths.min())
print("Max:", lengths.max())
print("Mean:", lengths.mean())
print("Median:", np.median(lengths))
print("90th percentile:", np.percentile(lengths, 90))
print("95th percentile:", np.percentile(lengths, 95))
print("99th percentile:", np.percentile(lengths, 99))

=== Length Statistics ===
Count: 200000
Min: 3
Max: 242
Mean: 81.982425
Median: 74.0
90th percentile: 150.0
95th percentile: 167.0
99th percentile: 186.0


In [6]:
MAX_LEN = 170

# --- 4. Encode and pad ---
def encode(tokens):
    return [stoi.get(t, UNK_IDX) for t in tokens]

def pad_and_encode(tokens):
    seq = encode(tokens)
    seq = seq[:MAX_LEN] + [PAD_IDX] * max(0, MAX_LEN - len(seq))
    return torch.tensor(seq, dtype=torch.long)

print("Encoding...")
X_train_full = torch.stack([pad_and_encode(seq) for seq in train_df['tokens']])
y_train_full = torch.tensor(train_df['label'].values, dtype=torch.float32)

X_test = torch.stack([pad_and_encode(seq) for seq in test_df['tokens']])
y_test = torch.tensor(test_df['label'].values, dtype=torch.float32)

print("X_train_full:", X_train_full.shape)
print("y_train_full:", y_train_full.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

Encoding...
X_train_full: torch.Size([200000, 170])
y_train_full: torch.Size([200000])
X_test: torch.Size([400000, 170])
y_test: torch.Size([400000])


In [7]:
# --- 5. Datasets and loaders ---
from torch.utils.data import TensorDataset, DataLoader, random_split

full_train_dataset = TensorDataset(X_train_full, y_train_full)
test_ds = TensorDataset(X_test, y_test)

VAL_FRAC = 0.2
val_sz = int(len(full_train_dataset) * VAL_FRAC)
train_sz = len(full_train_dataset) - val_sz
train_ds, val_ds = random_split(full_train_dataset, [train_sz, val_sz],
                                generator=torch.Generator().manual_seed(42))

BATCH_SIZE = 256
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, pin_memory=True)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, pin_memory=True)

print(f"Train: {len(train_ds)} | Val: {len(val_ds)} | Test: {len(test_ds)}")

Train: 160000 | Val: 40000 | Test: 400000


In [8]:
class GRUNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx, n_layers, drop_prob=0.5):
        super(GRUNet, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.gru = nn.GRU(embedding_dim, hidden_dim, n_layers,
                          batch_first=True,
                          dropout=drop_prob if n_layers > 1 else 0)

        self.dropout = nn.Dropout(drop_prob)

        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        embeds = self.embedding(x.long())

        gru_out, _ = self.gru(embeds)
        pooled = torch.max(gru_out, dim=1)[0]

        out = self.dropout(pooled)
        out = self.fc(out)

        return out.squeeze(1)

In [9]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    train_loss = 0.0

    for x, y in loader:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()

        out = model(x)

        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    return train_loss / len(loader)

def evaluate_one_epoch(model, loader, criterion, device):
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
      for x, y in loader:
          x, y = x.to(device), y.to(device)

          out = model(x)

          loss = criterion(out, y)
          valid_loss += loss.item()

    valid_loss /= len(loader)
    return valid_loss

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_acc, n = 0.0, 0, 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)

        logits = model(xb)
        loss = criterion(logits, yb)

        preds = (torch.sigmoid(logits) >= 0.5).long()

        total_loss += loss.item() * xb.size(0)
        total_acc  += (preds == yb.long()).sum().item()
        n += xb.size(0)

    return total_loss/n, total_acc/n

In [10]:
# --- 8. Train baseline model (random init embeddings) ---
emb_dim = 300
hidden_dim = 128
pad_idx = PAD_IDX

n_layers = 2

model_basic = GRUNet(vocab_size, emb_dim, hidden_dim, pad_idx, n_layers).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_basic.parameters(), lr=1e-3, weight_decay=1e-4)

n_epochs = 20
patience = 5
patience_counter = 0

best_val_loss = 1e5

print("\n--- Training: baseline ---")
for epoch in range(n_epochs):
    train_loss = train_one_epoch(model_basic, train_loader, optimizer, criterion, device)
    val_loss = evaluate_one_epoch(model_basic, val_loader, criterion, device)
    print(f"[{epoch+1:02d}] train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")
    # improvement?
    if val_loss < best_val_loss - 1e-6:
        best_val_loss = val_loss
        best_state = copy.deepcopy(model_basic.state_dict())
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch} (no improvement for {patience} epochs).")
            break

# load best weights
model_basic.load_state_dict(best_state)

test_loss, test_acc = evaluate(model_basic, test_loader, criterion, device)
print(f"\n[Baseline] TEST loss={test_loss:.4f}  acc={test_acc*100:.2f}%")


--- Training: baseline ---
[01] train_loss=0.3289 | val_loss=0.2390
[02] train_loss=0.2265 | val_loss=0.2150
[03] train_loss=0.2024 | val_loss=0.2099
[04] train_loss=0.1857 | val_loss=0.1886
[05] train_loss=0.1740 | val_loss=0.1828
[06] train_loss=0.1657 | val_loss=0.1853
[07] train_loss=0.1572 | val_loss=0.1760
[08] train_loss=0.1495 | val_loss=0.1744
[09] train_loss=0.1427 | val_loss=0.1766
[10] train_loss=0.1364 | val_loss=0.1770
[11] train_loss=0.1286 | val_loss=0.1786
[12] train_loss=0.1226 | val_loss=0.1820
[13] train_loss=0.1163 | val_loss=0.1808
Early stopping at epoch 12 (no improvement for 5 epochs).

[Baseline] TEST loss=0.1733  acc=93.33%


In [11]:
print("\n--- Завантаження fastText ---")
if not os.path.exists("wiki-news-300d-1M.vec.zip"):
    print("Downloading fastText vectors (1.6GB)...")
    !wget -q https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
print("Unzipping vectors...")
!unzip -q -n wiki-news-300d-1M.vec.zip


--- Завантаження fastText ---
Downloading fastText vectors (1.6GB)...
Unzipping vectors...


In [12]:
def load_fasttext_vec(path):
    vectors = {}
    dim = 0
    print("Loading fastText vectors into memory...")
    with open(path, "r", encoding="utf8") as f:
        first_line_parts = f.readline().rstrip().split(" ")
        num_words = int(first_line_parts[0])
        dim = int(first_line_parts[1])

        for line in f:
            parts = line.rstrip().split(" ")
            word = parts[0]
            try:
                if len(parts) == dim + 1:
                    vec = np.asarray(parts[1:], dtype=np.float32)
                    vectors[word] = vec
            except ValueError:
                continue
    return vectors, dim

ft_path = "wiki-news-300d-1M.vec"
fasttext_vectors, emb_dim_ft = load_fasttext_vec(ft_path)
print(f"fastText dim: {emb_dim_ft} | loaded entries: {len(fasttext_vectors):,}")

Loading fastText vectors into memory...
fastText dim: 300 | loaded entries: 999,994


In [13]:
print("Створення матриці ембедингів...")
emb_dim = emb_dim_ft

emb_matrix = np.random.normal(scale=0.1, size=(vocab_size, emb_dim)).astype(np.float32)
emb_matrix[PAD_IDX] = 0.0

hit = 0
for w, idx in stoi.items():
    v = fasttext_vectors.get(w)
    if v is not None:
        emb_matrix[idx] = v
        hit += 1
print(f"Coverage: {hit}/{vocab_size} = {hit/vocab_size:.1%}")

pretrained_emb = nn.Embedding.from_pretrained(
    torch.tensor(emb_matrix),
    freeze=True,
    padding_idx=PAD_IDX
)


Створення матриці ембедингів...
Coverage: 19300/20000 = 96.5%


In [14]:
# --- 11. Train model ---
model_tl = GRUNet(vocab_size, emb_dim, hidden_dim, pad_idx, n_layers).to(device)

model_tl.embedding = pretrained_emb
model_tl.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_tl.parameters(), lr=1e-3, weight_decay=1e-4)

n_epochs = 20

print("\n--- training fastText (GRU) ---")

best_val_loss = 1e5
patience_counter = 0
patience = 3
best_state = None

for epoch in range(n_epochs):
    train_loss = train_one_epoch(model_tl, train_loader, optimizer, criterion, device)
    val_loss = evaluate_one_epoch(model_tl, val_loader, criterion, device)
    print(f"[{epoch+1:02d}] train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")
    # improvement?
    if val_loss < best_val_loss - 1e-6:
        best_val_loss = val_loss
        best_state = copy.deepcopy(model_tl.state_dict())
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch} (no improvement for {patience} epochs).")
            break

model_tl.load_state_dict(best_state)
test_loss, test_acc = evaluate(model_tl, test_loader, criterion, device)
print(f"\n[fastText GRU Model] TEST  loss={test_loss:.4f}  acc={test_acc*100:.2f}%")


--- training fastText (GRU) ---
[01] train_loss=0.3226 | val_loss=0.2451
[02] train_loss=0.2385 | val_loss=0.2306
[03] train_loss=0.2202 | val_loss=0.2121
[04] train_loss=0.2092 | val_loss=0.2089
[05] train_loss=0.1987 | val_loss=0.2063
[06] train_loss=0.1940 | val_loss=0.1986
[07] train_loss=0.1867 | val_loss=0.1870
[08] train_loss=0.1814 | val_loss=0.1847
[09] train_loss=0.1769 | val_loss=0.1825
[10] train_loss=0.1738 | val_loss=0.1802
[11] train_loss=0.1689 | val_loss=0.1877
[12] train_loss=0.1667 | val_loss=0.1751
[13] train_loss=0.1627 | val_loss=0.1794
[14] train_loss=0.1612 | val_loss=0.1762
[15] train_loss=0.1579 | val_loss=0.1784
Early stopping at epoch 14 (no improvement for 3 epochs).

[fastText GRU Model] TEST  loss=0.1740  acc=93.27%
