In [None]:
!pip install underthesea

Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.9 MB/s[0m eta [36

In [None]:
import pandas as pd
# Load the training CSV and preview its first rows (the preview shows the
# `title` and `label_numeric` columns).
data=pd.read_csv('train_set.csv')
data.head()

Unnamed: 0,title,label_numeric
0,"100+ STT Né thính, Cap né thính hài hước, NÉT ...",7
1,"Top 111+ stt cuộc sống an nhiên, bình dị tự tạ...",7
2,"Top hạt giống hoa dễ trồng, nở quanh năm cho n...",7
3,Chi tiết 3 cách nấu rau bò khai đơn giản mà th...,1
4,Top 10 quạt cây hơi nước được ưa chuộng nhất h...,4


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from underthesea import word_tokenize
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import re
import math

# Custom GRU cell assembled from plain linear layers
class GRUCell(nn.Module):
    """One time step of a gated recurrent unit.

    Every gate adds a biased projection of the input (``W_*``) to a bias-free
    projection of the previous hidden state (``U_*``).

    Args:
        input_size: Size of the last dimension of the input ``x``.
        hidden_size: Size of the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Update-gate weights
        self.W_z = nn.Linear(input_size, hidden_size)
        self.U_z = nn.Linear(hidden_size, hidden_size, bias=False)

        # Reset-gate weights
        self.W_r = nn.Linear(input_size, hidden_size)
        self.U_r = nn.Linear(hidden_size, hidden_size, bias=False)

        # Candidate-state weights
        self.W_h = nn.Linear(input_size, hidden_size)
        self.U_h = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, x, h_prev):
        """Return the next hidden state given input ``x`` and previous state ``h_prev``."""
        update_gate = torch.sigmoid(self.W_z(x) + self.U_z(h_prev))
        reset_gate = torch.sigmoid(self.W_r(x) + self.U_r(h_prev))

        # The reset gate scales the previous state before it feeds the candidate
        candidate = torch.tanh(self.W_h(x) + self.U_h(reset_gate * h_prev))

        # Blend: the update gate decides how much of the candidate replaces the old state
        return (1 - update_gate) * h_prev + update_gate * candidate

class LayerNorm(nn.Module):
    """Normalize the last dimension, then apply a learned scale and shift.

    ``eps`` is added to the standard deviation (not the variance), and the
    standard deviation comes from ``Tensor.std`` with its default settings.

    Args:
        hidden_size: Length of the normalized (last) dimension.
        eps: Small constant keeping the division finite.
    """

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(hidden_size))   # learned scale
        self.beta = nn.Parameter(torch.zeros(hidden_size))   # learned shift

    def forward(self, x):
        """Return ``gamma * (x - mean) / (std + eps) + beta`` over the last axis."""
        centered = x - x.mean(-1, keepdim=True)
        denom = x.std(-1, keepdim=True) + self.eps
        # Scale first, divide second, so the arithmetic order is unchanged
        scaled = self.gamma * centered
        return scaled / denom + self.beta

class RotaryPositionalEmbedding(nn.Module):
    """Rotary position encoding backed by precomputed sine/cosine tables.

    Args:
        dim: Feature dimension to rotate; must be even.
        max_seq_len: Number of positions for which the tables are precomputed.

    Raises:
        ValueError: If ``dim`` is odd.
    """

    def __init__(self, dim, max_seq_len=200):
        super().__init__()
        # The rotation pairs the two halves of the feature axis, so dim must be even
        if dim % 2 != 0:
            raise ValueError("RotaryPositionalEmbedding dimension must be even")
        exponents = torch.arange(0, dim, 2).float() / dim
        inv_freq = 1. / (10000 ** exponents)
        positions = torch.arange(max_seq_len).float()
        # angles[p, j] = position p times frequency j
        angles = torch.einsum("i,j->ij", positions, inv_freq)
        self.register_buffer("sin", angles.sin())
        self.register_buffer("cos", angles.cos())

    def forward(self, x, seq_len):
        """Rotate the last dimension of ``x`` by position.

        ``x`` is expected to end in ``(seq_len, features)``; any leading
        dimensions are broadcast over.
        """
        half = x.shape[-1] // 2
        first_half, second_half = x.chunk(2, dim=-1)

        # Shape the tables as (1, ..., 1, seq_len, half) so they broadcast over leading dims
        table_shape = (1,) * (x.ndim - 2) + (seq_len, half)
        sin = self.sin[:seq_len, :half].view(*table_shape)
        cos = self.cos[:seq_len, :half].view(*table_shape)

        return torch.cat(
            [first_half * cos - second_half * sin,
             first_half * sin + second_half * cos],
            dim=-1,
        )

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with rotary position encoding on queries and keys.

    Args:
        hidden_size: Size of the input and output feature dimension. Must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads.
        max_seq_len: Number of positions precomputed by the rotary embedding.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``hidden_size``. Without this check the floor division below would
            silently drop features and ``forward`` would fail in ``view()``.
    """

    def __init__(self, hidden_size, num_heads=4, max_seq_len=200):
        super(MultiHeadAttention, self).__init__()
        if num_heads <= 0 or hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.scale = self.head_dim ** -0.5

        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, hidden_size)

        # Rotary embedding works per head, so it is sized by head_dim
        self.rope = RotaryPositionalEmbedding(self.head_dim, max_seq_len)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape ``(batch, seq_len, hidden_size)``.

        Returns a tensor of the same shape. No mask is applied: every position
        attends to every other position.
        """
        batch_size, seq_len, _ = x.size()

        # Project, then split into heads: (batch, heads, seq_len, head_dim)
        q = self.q_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Position information is injected into queries and keys only
        q = self.rope(q, seq_len)
        k = self.rope(k, seq_len)

        # Scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn_weights = torch.softmax(scores, dim=-1)

        # Weighted sum of values, then merge heads back to (batch, seq_len, hidden_size)
        context = torch.matmul(attn_weights, v)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.num_heads * self.head_dim)

        return self.out_proj(context)

class FocalLoss(nn.Module):
    """Focal loss: cross-entropy down-weighted for confidently classified samples.

    Args:
        alpha: Constant multiplier applied to every sample's loss.
        gamma: Exponent of the ``(1 - p_t)`` modulating factor.
    """

    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` and class-index ``targets``."""
        per_sample_ce = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        # exp(-CE) recovers the probability assigned to the true class
        prob_true = torch.exp(-per_sample_ce)
        modulating = (1 - prob_true) ** self.gamma
        weighted = self.alpha * modulating * per_sample_ce
        return weighted.mean()

class GRUModel(nn.Module):
    """Bidirectional stacked-GRU text classifier with self-attention pooling.

    Pipeline: embedding -> dropout -> rotary position encoding -> stacked
    forward and backward GRU cells -> layer norm -> multi-head self-attention
    -> layer norm -> mean pooling over time -> three-layer MLP head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension (must be even for the rotary encoding).
        hidden_size: Hidden size of each GRU direction.
        output_size: Number of output classes (logits).
        num_layers: Number of stacked GRU cells per direction.
        dropout: Dropout probability used after the embedding and in the head.

    Note:
        Backward-direction states are aligned with the token position they
        encode. Checkpoints trained with the earlier, misaligned ordering load
        without error (parameter names and shapes are unchanged) but should be
        retrained to match this forward pass.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=2, dropout=0.5):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout1 = nn.Dropout(dropout)

        # Rotary positional embedding applied to the token embeddings
        self.rope = RotaryPositionalEmbedding(embed_size)

        # Forward-direction GRU stack; only the first layer consumes embeddings
        self.gru_layers = nn.ModuleList([
            GRUCell(embed_size if i == 0 else hidden_size, hidden_size)
            for i in range(num_layers)
        ])

        # Backward-direction GRU stack (separate weights)
        self.gru_layers_reverse = nn.ModuleList([
            GRUCell(embed_size if i == 0 else hidden_size, hidden_size)
            for i in range(num_layers)
        ])

        # Layer normalization over the concatenated (forward + backward) features
        self.layer_norm1 = LayerNorm(hidden_size * 2)
        self.layer_norm2 = LayerNorm(hidden_size * 2)

        # Multi-head attention over the sequence of bidirectional states
        self.attention = MultiHeadAttention(hidden_size * 2)

        self.dropout2 = nn.Dropout(dropout)

        # Classification head; fc1 keeps the width so a residual can be added
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.fc2 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

        # Label smoothing factor; stored here for the training loop to read
        self.label_smoothing = 0.1

    def forward(self, x):
        """Return class logits of shape ``(batch, output_size)``.

        Args:
            x: Integer token indices of shape ``(batch, seq_len)``.
        """
        embedded = self.dropout1(self.embedding(x))
        batch_size = x.size(0)
        seq_len = x.size(1)

        # Apply rotary positional embedding
        embedded = self.rope(embedded, seq_len)

        # One zero-initialized hidden state per layer and direction
        hidden_size = self.gru_layers[0].hidden_size
        h_forward = [torch.zeros(batch_size, hidden_size, device=x.device)
                     for _ in range(len(self.gru_layers))]
        h_backward = [torch.zeros(batch_size, hidden_size, device=x.device)
                      for _ in range(len(self.gru_layers))]

        # Top-layer states per position. The backward list is filled by token
        # position so that index p holds the state that has read tokens p..end.
        forward_states = []
        backward_states = [None] * seq_len

        for t in range(seq_len):
            # Forward direction reads position t
            x_t = embedded[:, t, :]
            for layer_idx, gru in enumerate(self.gru_layers):
                h_forward[layer_idx] = gru(x_t, h_forward[layer_idx])
                x_t = h_forward[layer_idx]
            forward_states.append(h_forward[-1])

            # Backward direction reads the mirrored position
            rev_t = seq_len - 1 - t
            x_t_reverse = embedded[:, rev_t, :]
            for layer_idx, gru in enumerate(self.gru_layers_reverse):
                h_backward[layer_idx] = gru(x_t_reverse, h_backward[layer_idx])
                x_t_reverse = h_backward[layer_idx]
            # Store at rev_t (not t) so both directions describe the same token
            backward_states[rev_t] = h_backward[-1]

        # (batch, seq_len, 2 * hidden_size): forward and backward features per position
        all_hidden_states = torch.cat(
            [torch.stack(forward_states, dim=1), torch.stack(backward_states, dim=1)],
            dim=2,
        )

        # Apply layer normalization
        all_hidden_states = self.layer_norm1(all_hidden_states)

        # Apply multi-head attention
        attended = self.attention(all_hidden_states)
        attended = self.layer_norm2(attended)

        # Global average pooling over the time axis
        context_vector = torch.mean(attended, dim=1)

        # Head: fc1 block with residual connection, then fc2, then logits
        out = self.dropout2(context_vector)
        residual = out

        out = self.fc1(out)
        out = self.relu(out)
        out = self.dropout2(out)
        out = out + residual

        out = self.fc2(out)
        out = self.relu(out)
        out = self.dropout2(out)

        out = self.fc3(out)

        return out

class TextDataset(Dataset):
    """Dataset yielding ``(token_index_tensor, label_tensor)`` pairs.

    Texts are lower-cased and tokenized with ``word_tokenize`` on every access,
    mapped through ``word2idx`` (unknown tokens become ``'<UNK>'``), and padded
    with ``'<PAD>'`` or truncated to exactly ``max_len`` indices.
    """

    def __init__(self, texts, labels, word2idx, max_len):
        self.texts, self.labels = texts, labels
        self.word2idx, self.max_len = word2idx, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text, target = self.texts[idx], self.labels[idx]

        # Token -> vocabulary index, falling back to the unknown-token index
        token_ids = [
            self.word2idx.get(tok, self.word2idx['<UNK>'])
            for tok in word_tokenize(raw_text.lower())
        ]

        # Force the sequence to exactly max_len entries
        shortfall = self.max_len - len(token_ids)
        if shortfall > 0:
            token_ids.extend([self.word2idx['<PAD>']] * shortfall)
        else:
            token_ids = token_ids[:self.max_len]

        return torch.tensor(token_ids), torch.tensor(target)

def preprocess_text(text):
    """Drop punctuation/symbols and normalize whitespace.

    Every character that is neither a word character nor whitespace is removed,
    runs of whitespace collapse to a single space, and the ends are trimmed.
    """
    without_symbols = re.sub(r'[^\w\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip()

def build_vocab(texts, min_freq=2):
    """Build a vocabulary from an iterable of texts.

    Each text is lower-cased and tokenized with ``word_tokenize``. Tokens seen
    at least ``min_freq`` times are kept, in first-seen order, after the two
    special tokens ``'<PAD>'`` (index 0) and ``'<UNK>'`` (index 1).

    Returns:
        ``(vocab, word2idx)``: the token list and its token -> index mapping.
    """
    # Accumulate frequencies text by text
    token_counts = Counter()
    for text in texts:
        token_counts.update(word_tokenize(text.lower()))

    # Special tokens first, then every sufficiently frequent token
    frequent = [tok for tok, freq in token_counts.items() if freq >= min_freq]
    vocab = ['<PAD>', '<UNK>'] + frequent

    word2idx = dict(zip(vocab, range(len(vocab))))

    return vocab, word2idx

def main():
    """Train the GRU title classifier and checkpoint the best validation epoch.

    Reads ``train_set.csv`` (columns ``title`` and ``label_numeric``), builds the
    vocabulary, trains with AdamW and a OneCycleLR schedule, and saves the
    weights of the epoch with the highest validation accuracy to
    ``best_gru_model.pth``. Training stops early after ``patience`` consecutive
    epochs without a validation-accuracy improvement.

    Training and validation both use the same label-smoothed cross-entropy, so
    the two printed losses are directly comparable.
    """
    # Hyperparameters
    EMBED_SIZE = 300
    HIDDEN_SIZE = 512
    MAX_LEN = 200
    BATCH_SIZE = 32
    NUM_EPOCHS = 30
    LEARNING_RATE = 0.0003
    WEIGHT_DECAY = 2e-4

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load data
    df = pd.read_csv('train_set.csv')
    texts = df['title'].apply(preprocess_text).values
    raw_labels = df['label_numeric'].values

    # Encode labels as contiguous class indices 0..num_classes-1
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(raw_labels)

    # Build vocabulary.
    # NOTE: the vocabulary is built from the full CSV (before the split). The
    # inference cell rebuilds it the same way from train_set.csv, so the two
    # must stay in sync for the checkpoint's embedding size to match.
    vocab, word2idx = build_vocab(texts, min_freq=2)
    vocab_size = len(vocab)
    num_classes = len(label_encoder.classes_)

    # Stratified train/validation split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels
    )

    # Create datasets
    train_dataset = TextDataset(train_texts, train_labels, word2idx, MAX_LEN)
    val_dataset = TextDataset(val_texts, val_labels, word2idx, MAX_LEN)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Initialize model
    model = GRUModel(
        vocab_size=vocab_size,
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        output_size=num_classes,
        num_layers=2,
        dropout=0.5
    ).to(device)

    # Loss: label-smoothed cross-entropy, used for training AND validation.
    # This is the objective the training loop always optimized. The previous
    # `if model.training` guard was always true, so its focal-loss branch never
    # ran, and validation was scored with a different loss than training.
    criterion = nn.CrossEntropyLoss(label_smoothing=model.label_smoothing)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        betas=(0.9, 0.999),
        eps=1e-8
    )

    # Learning rate scheduler with warmup (stepped once per batch)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=LEARNING_RATE,
        epochs=NUM_EPOCHS,
        steps_per_epoch=len(train_loader),
        pct_start=0.1,
        div_factor=25,
        final_div_factor=1000
    )

    # Early-stopping bookkeeping
    best_val_acc = 0.0
    patience = 7
    patience_counter = 0

    for epoch in range(NUM_EPOCHS):
        # Training phase
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0

        for inputs, batch_labels in train_loader:
            inputs, batch_labels = inputs.to(device), batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, batch_labels)

            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            train_total += batch_labels.size(0)
            train_correct += predicted.eq(batch_labels).sum().item()

        train_acc = 100. * train_correct / train_total

        # Validation phase
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for inputs, batch_labels in val_loader:
                inputs, batch_labels = inputs.to(device), batch_labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, batch_labels)

                val_loss += loss.item()
                _, predicted = outputs.max(1)
                val_total += batch_labels.size(0)
                val_correct += predicted.eq(batch_labels).sum().item()

        val_acc = 100. * val_correct / val_total

        print(f'Epoch {epoch+1}/{NUM_EPOCHS}:')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_acc:.2f}%')

        # Checkpoint on improvement; otherwise count towards early stopping
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_gru_model.pth')
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping triggered after {epoch + 1} epochs')
                break

if __name__ == '__main__':
    main()

Epoch 1/30:
Train Loss: 1.9145, Train Acc: 33.24%
Val Loss: 1.2526, Val Acc: 36.11%
Epoch 2/30:
Train Loss: 1.6557, Train Acc: 51.44%
Val Loss: 0.7351, Val Acc: 65.09%
Epoch 3/30:
Train Loss: 1.3497, Train Acc: 65.11%
Val Loss: 0.5124, Val Acc: 73.87%
Epoch 4/30:
Train Loss: 1.2037, Train Acc: 70.97%
Val Loss: 0.4762, Val Acc: 76.43%
Epoch 5/30:
Train Loss: 1.0956, Train Acc: 75.53%
Val Loss: 0.4497, Val Acc: 77.55%
Epoch 6/30:
Train Loss: 1.0199, Train Acc: 78.50%
Val Loss: 0.3855, Val Acc: 80.71%
Epoch 7/30:
Train Loss: 0.9608, Train Acc: 80.85%
Val Loss: 0.3806, Val Acc: 81.53%
Epoch 8/30:
Train Loss: 0.9147, Train Acc: 83.08%
Val Loss: 0.4014, Val Acc: 81.76%
Epoch 9/30:
Train Loss: 0.8693, Train Acc: 85.20%
Val Loss: 0.4154, Val Acc: 82.43%
Epoch 10/30:
Train Loss: 0.8412, Train Acc: 85.92%
Val Loss: 0.3991, Val Acc: 83.11%
Epoch 11/30:
Train Loss: 0.8016, Train Acc: 87.94%
Val Loss: 0.4027, Val Acc: 83.48%
Epoch 12/30:
Train Loss: 0.7645, Train Acc: 89.58%
Val Loss: 0.3976, Val A

In [None]:
# Download the public test set from Google Drive by file id
# (the cell output shows it is saved as test_set_public.csv).
!gdown 1isX7s6hChuJ4GHkVwP80C9DZtbfv1FXl

Downloading...
From: https://drive.google.com/uc?id=1isX7s6hChuJ4GHkVwP80C9DZtbfv1FXl
To: /content/test_set_public.csv
  0% 0.00/99.5k [00:00<?, ?B/s]100% 99.5k/99.5k [00:00<00:00, 129MB/s]


In [None]:
import torch
import pandas as pd
import numpy as np
from underthesea import word_tokenize
from collections import Counter
import re

def preprocess_text(text):
    """Remove non-word, non-space characters and normalize whitespace.

    Runs of whitespace become a single space and the result is trimmed.
    """
    stripped_of_symbols = re.sub(r'[^\w\s]', '', text)
    collapsed = re.sub(r'\s+', ' ', stripped_of_symbols)
    return collapsed.strip()

def build_vocab(texts, min_freq=2):
    """Return ``(vocab, word2idx)`` built from an iterable of texts.

    Texts are lower-cased and tokenized with ``word_tokenize``. The vocabulary
    starts with ``'<PAD>'`` and ``'<UNK>'``, followed by every token occurring
    at least ``min_freq`` times, in first-seen order.
    """
    token_counts = Counter()
    for text in texts:
        token_counts.update(word_tokenize(text.lower()))

    kept_tokens = [tok for tok, freq in token_counts.items() if freq >= min_freq]
    vocab = ['<PAD>', '<UNK>'] + kept_tokens

    word2idx = dict(zip(vocab, range(len(vocab))))

    return vocab, word2idx

def predict_sentence(model, sentence, word2idx, max_len=200, device='cuda'):
    """
    Classify one sentence with the trained GRU model.

    The sentence is cleaned with ``preprocess_text``, lower-cased, tokenized,
    mapped to vocabulary indices (unknown tokens use ``'<UNK>'``), and padded
    with ``'<PAD>'`` or truncated to ``max_len`` before a single forward pass.

    Parameters:
    - model: Trained GRU model (switched to eval mode by this call)
    - sentence: Input sentence (string)
    - word2idx: Word to index mapping dictionary
    - max_len: Maximum sequence length
    - device: Device to run inference on

    Returns:
    - Index of the highest-scoring output class (integer)
    """
    model.eval()

    cleaned = preprocess_text(sentence)
    token_ids = [
        word2idx.get(tok, word2idx['<UNK>'])
        for tok in word_tokenize(cleaned.lower())
    ]

    # Force the sequence to exactly max_len entries
    shortfall = max_len - len(token_ids)
    if shortfall > 0:
        token_ids = token_ids + [word2idx['<PAD>']] * shortfall
    else:
        token_ids = token_ids[:max_len]

    # Shape (1, max_len): a batch containing the single sentence
    batch = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(batch)
        best = torch.max(logits, dim=1)[1]

    return best.item()

# Load the test data
test_df = pd.read_csv('test_set_public.csv')

# Rebuild the vocabulary exactly as training did (same CSV, same min_freq) so
# that token indices and the embedding-table size match the saved checkpoint.
train_df = pd.read_csv('train_set.csv')
train_texts = train_df['title'].apply(preprocess_text).values

vocab, word2idx = build_vocab(train_texts, min_freq=2)

# Training encoded labels with LabelEncoder, which numbers the sorted distinct
# values 0..K-1. Keep that sorted list so a predicted class index can be mapped
# back to the original label value.
label_classes = sorted(train_df['label_numeric'].unique())

# Model hyperparameters must match those used for training the checkpoint.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GRUModel(
    vocab_size=len(vocab),
    embed_size=300,
    hidden_size=512,
    output_size=len(label_classes),
    num_layers=2,
    dropout=0.5
).to(device)

# map_location lets a checkpoint saved on GPU load on a CPU-only machine
model.load_state_dict(torch.load('best_gru_model.pth', map_location=device))
model.eval()

# Predict a class index per title, then translate it back to the original label
predicted_indices = test_df['title'].apply(
    lambda x: predict_sentence(model, x, word2idx, max_len=200, device=device)
)
test_df['label_numeric'] = predicted_indices.map(lambda class_idx: label_classes[class_idx])

# Prepare submission file (rename returns a new frame; test_df is left as is)
submission_df = test_df[['_id', 'label_numeric']].rename(columns={'_id': 'id'})

# Save predictions
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv
