In [11]:
TRAIN_PATH = "/kaggle/input/ml-challenge-udhgam-2/train.jsonl"
TEST_PATH  = "/kaggle/input/ml-challenge-udhgam-2/test.jsonl"

def load_jsonl_safe(path):
    data=[]
    with open(path) as f:
        for line in f:
            try:
                data.append(json.loads(line))
            except:
                pass
    return pd.DataFrame(data)

train_df = load_jsonl_safe(TRAIN_PATH)
test_df  = load_jsonl_safe(TEST_PATH)

print(train_df.shape, test_df.shape)
print(train_df.columns)

(79806, 4) (19952, 3)
Index(['example_id', 'input_ids', 'attention_mask', 'label'], dtype='object')


In [15]:
import os
import json
import gc
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from gensim.models import Word2Vec

# --- Config ---
CONFIG = {
    'seed': 42,
    'epochs': 6,            # Optimized for pretrained convergence
    'batch_size': 32,
    'lr': 5e-4,
    'max_len': 512,
    'embed_dim': 128,       # Must match Word2Vec vector_size
    'n_layers': 3,
    'n_heads': 4,
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu")
}

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(CONFIG['seed'])
print(f"Using device: {CONFIG['device']}")

  from google.cloud.aiplatform.utils import gcs_utils


Using device: cuda


In [16]:
# -----------------------------
# 2. Data Loading
# -----------------------------
print("\n--- Loading Data ---")
train_path = "/kaggle/input/ml-challenge-udhgam-2/train.jsonl"
test_path  = "/kaggle/input/ml-challenge-udhgam-2/test.jsonl"

def load_jsonl(path):
    data = []
    with open(path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)

train_df = load_jsonl(train_path)
test_df = load_jsonl(test_path)

# Dynamic Vocab Calculation
max_id_train = max([max(x) for x in train_df['input_ids'] if len(x) > 0], default=0)
max_id_test = max([max(x) for x in test_df['input_ids'] if len(x) > 0], default=0)
VOCAB_SIZE = max(max_id_train, max_id_test) + 100
print(f"Vocabulary Size: {VOCAB_SIZE}")


--- Loading Data ---
Vocabulary Size: 50467


In [17]:
# -----------------------------
# 3. Word2Vec Pretraining
# -----------------------------
print("\n--- Training Word2Vec Embeddings ---")
# Combine all data to learn token relationships
all_sentences = train_df['input_ids'].tolist() + test_df['input_ids'].tolist()
# Convert to strings for Gensim (Gensim requires string tokens)
all_sentences_str = [[str(x) for x in seq] for seq in all_sentences]

# Train W2V
w2v = Word2Vec(sentences=all_sentences_str, vector_size=CONFIG['embed_dim'], 
               window=5, min_count=1, workers=4, epochs=10, seed=CONFIG['seed'])

# Extract Weights into a matrix
embedding_matrix = np.zeros((VOCAB_SIZE, CONFIG['embed_dim']))
for i in range(VOCAB_SIZE):
    token = str(i)
    if token in w2v.wv:
        embedding_matrix[i] = w2v.wv[token]
    else:
        # Random init for unseen tokens
        embedding_matrix[i] = np.random.normal(scale=0.1, size=(CONFIG['embed_dim'],))

pretrained_embeddings = torch.tensor(embedding_matrix, dtype=torch.float32)
print("Word2Vec Ready!")


--- Training Word2Vec Embeddings ---
Word2Vec Ready!


In [18]:
# -----------------------------
# 4. Feature Engineering (For LightGBM)
# -----------------------------
print("\n--- Engineering Features for LightGBM ---")
def get_handcrafted_features(input_ids):
    length = len(input_ids)
    if length == 0: return [0]*6
    arr = np.array(input_ids)
    return [length, len(np.unique(arr)), 1 - (len(np.unique(arr))/length), 
            np.mean(arr), np.std(arr), np.max(arr)]

train_feats = np.array([get_handcrafted_features(x) for x in train_df['input_ids']])
test_feats = np.array([get_handcrafted_features(x) for x in test_df['input_ids']])

# TF-IDF + SVD
train_str = train_df['input_ids'].apply(lambda x: ' '.join(map(str, x)))
test_str = test_df['input_ids'].apply(lambda x: ' '.join(map(str, x)))

tfidf = TfidfVectorizer(max_features=15000, token_pattern=r'\b\w+\b')
train_tfidf = tfidf.fit_transform(train_str)
test_tfidf = tfidf.transform(test_str)

svd = TruncatedSVD(n_components=32, random_state=CONFIG['seed'])
train_svd = svd.fit_transform(train_tfidf)
test_svd = svd.transform(test_tfidf)

X_train_gbm = np.hstack([train_feats, train_svd])
X_test_gbm = np.hstack([test_feats, test_svd])

scaler = StandardScaler()
X_train_gbm = scaler.fit_transform(X_train_gbm)
X_test_gbm = scaler.transform(X_test_gbm)


--- Engineering Features for LightGBM ---


In [19]:
# -----------------------------
# 5. PyTorch Dataset & Collator
# -----------------------------
class ObfuscatedDataset(Dataset):
    def __init__(self, df, labels=None):
        self.input_ids = df['input_ids'].tolist()
        self.labels = labels
    def __len__(self): return len(self.input_ids)
    def __getitem__(self, idx):
        ids = torch.tensor(self.input_ids[idx], dtype=torch.long)
        if self.labels is not None:
            return ids, torch.tensor(self.labels[idx], dtype=torch.float)
        return ids

def collate_fn(batch):
    if isinstance(batch[0], tuple):
        ids, labels = zip(*batch)
        ids_pad = pad_sequence(ids, batch_first=True, padding_value=0)
        return ids_pad, (ids_pad != 0).float(), torch.stack(labels)
    else:
        ids = batch
        ids_pad = pad_sequence(ids, batch_first=True, padding_value=0)
        return ids_pad, (ids_pad != 0).float()

In [20]:
# -----------------------------
# 6. Model Architectures
# -----------------------------
class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(in_dim, in_dim), nn.LayerNorm(in_dim), nn.Tanh(), nn.Linear(in_dim, 1)
        )
    def forward(self, last_hidden_state, mask):
        w = self.attention(last_hidden_state).squeeze(-1)
        w = w.masked_fill(mask == 0, -1e4)
        w = torch.softmax(w, dim=1)
        return torch.sum(last_hidden_state * w.unsqueeze(-1), dim=1)

# Model 1: Transformer
class TransformerModel(nn.Module):
    def __init__(self, pretrained_emb):
        super().__init__()
        # Load Word2Vec weights
        self.embedding = nn.Embedding.from_pretrained(pretrained_emb, freeze=False, padding_idx=0)
        self.pos_emb = nn.Embedding(CONFIG['max_len'], CONFIG['embed_dim'])
        enc_layer = nn.TransformerEncoderLayer(d_model=CONFIG['embed_dim'], nhead=CONFIG['n_heads'], 
                                             dim_feedforward=512, dropout=0.1, batch_first=True)
        self.transformer = nn.TransformerEncoder(enc_layer, num_layers=CONFIG['n_layers'])
        self.pooler = AttentionPooling(CONFIG['embed_dim'])
        self.head = nn.Sequential(nn.Linear(CONFIG['embed_dim'], 64), nn.ReLU(), 
                                  nn.Dropout(0.1), nn.Linear(64, 1), nn.Sigmoid())

    def forward(self, ids, mask):
        if ids.size(1) > CONFIG['max_len']: ids, mask = ids[:, :CONFIG['max_len']], mask[:, :CONFIG['max_len']]
        x = self.embedding(ids) + self.pos_emb(torch.arange(ids.size(1), device=ids.device).unsqueeze(0))
        x = self.transformer(x, src_key_padding_mask=(mask==0))
        return self.head(self.pooler(x, mask)).squeeze()

# Model 2: Bi-Directional GRU
class GRUModel(nn.Module):
    def __init__(self, pretrained_emb):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_emb, freeze=False, padding_idx=0)
        self.gru = nn.GRU(CONFIG['embed_dim'], 64, num_layers=2, batch_first=True, bidirectional=True)
        self.pooler = AttentionPooling(128) # 64 * 2 (Bidirectional)
        self.head = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), 
                                  nn.Dropout(0.1), nn.Linear(64, 1), nn.Sigmoid())

    def forward(self, ids, mask):
        if ids.size(1) > CONFIG['max_len']: ids, mask = ids[:, :CONFIG['max_len']], mask[:, :CONFIG['max_len']]
        x = self.embedding(ids)
        x, _ = self.gru(x)
        return self.head(self.pooler(x, mask)).squeeze()

In [21]:
# -----------------------------
# 7. Training Engine
# -----------------------------
def train_model(model_class, model_name, train_df, labels, pretrained_emb):
    kf = KFold(n_splits=5, shuffle=True, random_state=CONFIG['seed'])
    oof_preds, test_preds = np.zeros(len(train_df)), np.zeros(len(test_df))
    
    test_loader = DataLoader(ObfuscatedDataset(test_df), batch_size=CONFIG['batch_size']*2, 
                             collate_fn=collate_fn, shuffle=False)

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
        print(f"--- {model_name} Fold {fold+1} ---")
        train_loader = DataLoader(ObfuscatedDataset(train_df.iloc[train_idx], labels[train_idx]), 
                                  batch_size=CONFIG['batch_size'], collate_fn=collate_fn, shuffle=True)
        val_loader = DataLoader(ObfuscatedDataset(train_df.iloc[val_idx], labels[val_idx]), 
                                batch_size=CONFIG['batch_size']*2, collate_fn=collate_fn, shuffle=False)
        
        model = model_class(pretrained_emb).to(CONFIG['device'])
        optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['lr'])
        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=CONFIG['lr'], 
                                                        steps_per_epoch=len(train_loader), epochs=CONFIG['epochs'])
        criterion = nn.L1Loss()
        scaler = torch.amp.GradScaler('cuda')

        for epoch in range(CONFIG['epochs']):
            model.train()
            for ids, mask, y in train_loader:
                ids, mask, y = ids.to(CONFIG['device']), mask.to(CONFIG['device']), y.to(CONFIG['device'])
                optimizer.zero_grad()
                with torch.amp.autocast('cuda'):
                    loss = criterion(model(ids, mask), y)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()

        model.eval()
        val_preds_fold = []
        with torch.no_grad():
            for ids, mask, y in val_loader:
                val_preds_fold.extend(model(ids.to(CONFIG['device']), mask.to(CONFIG['device'])).cpu().numpy())
        oof_preds[val_idx] = np.array(val_preds_fold)
        
        test_preds_fold = []
        with torch.no_grad():
            for ids, mask in test_loader:
                test_preds_fold.extend(model(ids.to(CONFIG['device']), mask.to(CONFIG['device'])).cpu().numpy())
        test_preds += np.array(test_preds_fold) / 5
    
    print(f"{model_name} CV MAE: {mean_absolute_error(labels, oof_preds):.5f}")
    return oof_preds, test_preds

In [22]:
# -----------------------------
# 8. Execution & Submission
# -----------------------------
labels = train_df['label'].values

# 1. Train Transformer
print("\n=== Training Transformer ===")
oof_trans, pred_trans = train_model(TransformerModel, "Transformer", train_df, labels, pretrained_embeddings)

# 2. Train GRU (Added Diversity)
print("\n=== Training Bi-GRU ===")
oof_gru, pred_gru = train_model(GRUModel, "Bi-GRU", train_df, labels, pretrained_embeddings)

# 3. Train LightGBM
print("\n=== Training LightGBM ===")
kf = KFold(n_splits=5, shuffle=True, random_state=CONFIG['seed'])
oof_lgb, pred_lgb = np.zeros(len(labels)), np.zeros(len(test_df))
params = {'objective': 'mae', 'metric': 'mae', 'verbosity': -1, 'learning_rate': 0.05, 
          'num_leaves': 32, 'feature_fraction': 0.8, 'bagging_fraction': 0.8}

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train_gbm)):
    model = lgb.train(params, lgb.Dataset(X_train_gbm[tr_idx], labels[tr_idx]), num_boost_round=3000,
                      valid_sets=[lgb.Dataset(X_train_gbm[val_idx], labels[val_idx])],
                      callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
    oof_lgb[val_idx] = model.predict(X_train_gbm[val_idx])
    pred_lgb += model.predict(X_test_gbm) / 5
print(f"LightGBM CV MAE: {mean_absolute_error(labels, oof_lgb):.5f}")

# 4. Final Ensemble (Inverse Error Weighting)
print("\n=== Blending Models ===")
scores = np.array([mean_absolute_error(labels, oof_trans),
                   mean_absolute_error(labels, oof_gru),
                   mean_absolute_error(labels, oof_lgb)])

# Calculate weights: Better score (lower error) -> Higher weight
weights = 1 / scores
weights /= weights.sum()

print(f"Final Weights -> Trans: {weights[0]:.3f}, GRU: {weights[1]:.3f}, LGB: {weights[2]:.3f}")

final_preds = (pred_trans * weights[0]) + (pred_gru * weights[1]) + (pred_lgb * weights[2])
final_preds = np.clip(final_preds, 0, 1)

pd.DataFrame({"example_id": test_df.example_id, "label": final_preds}).to_csv("submission.csv", index=False)
print("Submission Generated Successfully!")


=== Training Transformer ===
--- Transformer Fold 1 ---
--- Transformer Fold 2 ---
--- Transformer Fold 3 ---
--- Transformer Fold 4 ---
--- Transformer Fold 5 ---
Transformer CV MAE: 0.16026

=== Training Bi-GRU ===
--- Bi-GRU Fold 1 ---
--- Bi-GRU Fold 2 ---
--- Bi-GRU Fold 3 ---
--- Bi-GRU Fold 4 ---
--- Bi-GRU Fold 5 ---
Bi-GRU CV MAE: 0.15418

=== Training LightGBM ===
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2907]	valid_0's l1: 0.180393
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2391]	valid_0's l1: 0.182203
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[3000]	valid_0's l1: 0.178003
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2998]	valid_0's l1: 0.179903
Training until validation scores don't improve for 100 rounds
Did not meet 