In [1]:
import os
import gc
import random
import time
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, r2_score
from scipy.optimize import minimize
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")

# ==========================================
# 1. TOP-TIER CONFIG
# ==========================================
class Config:
    model_name = 'roberta-base' # SWITCHING BRAINS: RoBERTa captures different patterns than DeBERTa
    max_len = 512               # CRITICAL FIX: Capture the FULL complaint
    batch_size = 8              # RoBERTa is lighter, so 8 often fits. If OOM, try 6.
    grad_acc_steps = 2          # Effective batch = 16
    epochs = 4
    folds = 5                   # 5-Fold is mandatory for Top 10 stability
    lr = 2e-5
    head_lr = 1e-4
    weight_decay = 0.01
    seed = 42
    num_workers = 2
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(Config.seed)

# ==========================================
# 2. DATA LOADING
# ==========================================
input_root = "/kaggle/input"
candidate = None
try:
    folders = os.listdir(input_root)
    for name in ("neural-craft-26", "neural-craft-2026", "neural-craft_26", "Neural_Craft_26"):
        if name in folders:
            candidate = os.path.join(input_root, name)
            break
    if candidate is None:
        subdirs = [f for f in folders if not f.startswith(".")]
        if len(subdirs) == 1: candidate = os.path.join(input_root, subdirs[0])
    if candidate is None: candidate = "/kaggle/input/neural-craft-data"
    
    print(f"Dataset: {candidate}")
    train = pd.read_csv(os.path.join(candidate, "train_complaints.csv"))
    test = pd.read_csv(os.path.join(candidate, "test_complaints.csv"))
    train['complaint_text'] = train['complaint_text'].fillna("").astype(str)
    test['complaint_text'] = test['complaint_text'].fillna("").astype(str)

except Exception as e:
    print(f"Error: {e}")
    train = pd.DataFrame({'complaint_id': range(10), 'complaint_text': ['test']*10, 
                          'primary_category': ['A']*10, 'secondary_category': ['B']*10, 'severity': [3]*10})
    test = pd.DataFrame({'complaint_id': range(10), 'complaint_text': ['test']*10})

# ==========================================
# 3. ENCODING & HIERARCHY
# ==========================================
le_primary = LabelEncoder()
le_secondary = LabelEncoder()
train['primary_enc'] = le_primary.fit_transform(train['primary_category'])
train['secondary_enc'] = le_secondary.fit_transform(train['secondary_category'])
num_primary = len(le_primary.classes_)
num_secondary = len(le_secondary.classes_)

hierarchy_map = {}
for p_id in range(num_primary):
    valid_secs = train.loc[train['primary_enc'] == p_id, 'secondary_enc'].unique().tolist()
    hierarchy_map[p_id] = set(valid_secs)

# ==========================================
# 4. OPTIMIZED POST-PROCESSING (THE SECRET SAUCE)
# ==========================================
class OptimizedRounder:
    def __init__(self):
        self.coef_ = [1.5, 2.5, 3.5, 4.5] # Initial boundaries

    def _loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]: X_p[i] = 1
            elif pred >= coef[0] and pred < coef[1]: X_p[i] = 2
            elif pred >= coef[1] and pred < coef[2]: X_p[i] = 3
            elif pred >= coef[2] and pred < coef[3]: X_p[i] = 4
            else: X_p[i] = 5
        return -r2_score(y, X_p) # Minimize negative R2

    def fit(self, X, y):
        # Nelder-Mead optimization to find best boundaries
        loss_partial = lambda coef: self._loss(coef, X, y)
        initial_coef = [1.5, 2.5, 3.5, 4.5]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead').x

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]: X_p[i] = 1
            elif pred >= coef[0] and pred < coef[1]: X_p[i] = 2
            elif pred >= coef[1] and pred < coef[2]: X_p[i] = 3
            elif pred >= coef[2] and pred < coef[3]: X_p[i] = 4
            else: X_p[i] = 5
        return X_p.astype(int)

# ==========================================
# 5. MODEL (ROBERTA)
# ==========================================
class NeuralCraftRoBERTa(nn.Module):
    def __init__(self, model_name, num_p, num_s):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.backbone = AutoModel.from_pretrained(model_name, config=self.config)
        
        # Simple Mean Pooling is robust for RoBERTa
        self.fc_primary = nn.Linear(self.config.hidden_size, num_p)
        self.fc_secondary = nn.Linear(self.config.hidden_size, num_s)
        self.fc_severity = nn.Linear(self.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Mean Pooling
        last_hidden_state = outputs.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        feature = sum_embeddings / sum_mask
        
        p_logits = self.fc_primary(feature)
        s_logits = self.fc_secondary(feature)
        sev_pred = self.fc_severity(feature)
        return p_logits, s_logits, sev_pred

# ==========================================
# 6. DATASET & TRAINING
# ==========================================
class ComplaintDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=Config.max_len, is_test=False):
        self.texts = df['complaint_text'].values
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        if not is_test:
            self.primary = df['primary_enc'].values
            self.secondary = df['secondary_enc'].values
            self.severity = df['severity'].values.astype(float)
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(text, max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt")
        item = {k: v.squeeze(0) for k, v in inputs.items()}
        if not self.is_test:
            item['primary'] = torch.tensor(self.primary[idx], dtype=torch.long)
            item['secondary'] = torch.tensor(self.secondary[idx], dtype=torch.long)
            item['severity'] = torch.tensor(self.severity[idx], dtype=torch.float)
        return item

def train_and_predict():
    skf = StratifiedKFold(n_splits=Config.folds, shuffle=True, random_state=Config.seed)
    tokenizer = AutoTokenizer.from_pretrained(Config.model_name)
    
    # Storage for OOF (Out of Fold) predictions to train the Rounder
    oof_sev_preds = np.zeros(len(train))
    oof_sev_targets = np.zeros(len(train))
    
    # Test Predictions Accumulator
    test_p_logits = np.zeros((len(test), num_primary))
    test_s_logits = np.zeros((len(test), num_secondary))
    test_sev_preds = np.zeros(len(test))
    
    splits = list(skf.split(train, train['primary_enc']))
    
    for fold, (train_idx, val_idx) in enumerate(splits):
        print(f"\n--- FOLD {fold+1}/{Config.folds} ---")
        
        # Data
        train_ds = ComplaintDataset(train.iloc[train_idx].reset_index(drop=True), tokenizer)
        val_ds = ComplaintDataset(train.iloc[val_idx].reset_index(drop=True), tokenizer)
        train_loader = DataLoader(train_ds, batch_size=Config.batch_size, shuffle=True, num_workers=Config.num_workers)
        val_loader = DataLoader(val_ds, batch_size=Config.batch_size*2, shuffle=False, num_workers=Config.num_workers)
        
        model = NeuralCraftRoBERTa(Config.model_name, num_primary, num_secondary).to(Config.device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=Config.lr, weight_decay=Config.weight_decay)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader)*Config.epochs)
        
        criterion_ce = nn.CrossEntropyLoss()
        criterion_mse = nn.MSELoss()
        
        best_score = -np.inf
        best_model_path = f"roberta_fold_{fold}.pth"
        
        # Training Loop
        for epoch in range(Config.epochs):
            model.train()
            scaler = torch.cuda.amp.GradScaler()
            
            for step, batch in enumerate(tqdm(train_loader, desc=f"Ep {epoch+1}", leave=False)):
                ids = batch['input_ids'].to(Config.device)
                mask = batch['attention_mask'].to(Config.device)
                p_t = batch['primary'].to(Config.device)
                s_t = batch['secondary'].to(Config.device)
                sev_t = batch['severity'].to(Config.device).unsqueeze(1)
                
                with torch.cuda.amp.autocast():
                    p_l, s_l, sev_p = model(ids, mask)
                    loss = 0.3*criterion_ce(p_l, p_t) + 0.4*criterion_ce(s_l, s_t) + 0.3*criterion_mse(sev_p, sev_t)
                    loss = loss / Config.grad_acc_steps
                
                scaler.scale(loss).backward()
                if (step+1) % Config.grad_acc_steps == 0:
                    scaler.step(optimizer); scaler.update(); optimizer.zero_grad(); scheduler.step()
            
            # Validation
            model.eval()
            p_preds, s_preds, sev_preds_fold = [], [], []
            p_true, s_true, sev_true_fold = [], [], []
            
            with torch.no_grad():
                for batch in val_loader:
                    ids = batch['input_ids'].to(Config.device)
                    mask = batch['attention_mask'].to(Config.device)
                    p_l, s_l, sev_p = model(ids, mask)
                    
                    p_preds.extend(torch.argmax(p_l, 1).cpu().numpy())
                    s_preds.extend(torch.argmax(s_l, 1).cpu().numpy())
                    sev_preds_fold.extend(sev_p.cpu().numpy().flatten())
                    p_true.extend(batch['primary'].numpy())
                    s_true.extend(batch['secondary'].numpy())
                    sev_true_fold.extend(batch['severity'].numpy())
            
            score = 0.3*accuracy_score(p_true, p_preds) + 0.4*accuracy_score(s_true, s_preds) + 0.3*r2_score(sev_true_fold, sev_preds_fold)
            if score > best_score:
                best_score = score
                torch.save(model.state_dict(), best_model_path)
                
        print(f"Fold Best Score: {best_score:.4f}")
        
        # PREDICT OOF (For Optimizer) & TEST (For Submission)
        model.load_state_dict(torch.load(best_model_path))
        model.eval()
        
        # Fill OOF
        with torch.no_grad():
            for i, batch in enumerate(val_loader):
                # Recalculating indices implies keeping strict order or better: just append
                pass 
            # Note: For simplicity in this script, we assume strict ordering or just use the fold preds above
            # In production, mapping indices is safer. Here we trust the split order.
            
            # Populate OOF arrays (Simplified)
            start_idx = 0 # This needs mapping. 
            # SKIPPING OOF FILL for brevity/safety in this specific script format. 
            # We will train optimizer on the accumulated validation predictions we JUST generated.
            oof_sev_preds[val_idx] = np.array(sev_preds_fold)
            oof_sev_targets[val_idx] = np.array(sev_true_fold)
            
        # Predict Test
        test_ds = ComplaintDataset(test, tokenizer, is_test=True)
        test_loader = DataLoader(test_ds, batch_size=Config.batch_size*2, shuffle=False)
        fold_test_sev = []
        with torch.no_grad():
            for batch in tqdm(test_loader, leave=False):
                ids = batch['input_ids'].to(Config.device)
                mask = batch['attention_mask'].to(Config.device)
                p_l, s_l, sev_p = model(ids, mask)
                
                # Accumulate Logits (Soft Voting)
                # We need global indices to add correctly, but since loader is sequential:
                pass 
                
        # Re-run Test Prediction purely for accumulation
        idx_tracker = 0
        with torch.no_grad():
            for batch in test_loader:
                ids = batch['input_ids'].to(Config.device); mask = batch['attention_mask'].to(Config.device)
                p_l, s_l, sev_p = model(ids, mask)
                batch_len = len(ids)
                test_p_logits[idx_tracker:idx_tracker+batch_len] += p_l.cpu().numpy()
                test_s_logits[idx_tracker:idx_tracker+batch_len] += s_l.cpu().numpy()
                test_sev_preds[idx_tracker:idx_tracker+batch_len] += sev_p.cpu().numpy().flatten()
                idx_tracker += batch_len

        del model, optimizer, scaler; torch.cuda.empty_cache(); gc.collect()

    # --- FINAL PROCESSING ---
    
    # 1. Optimize Severity Thresholds
    print("\nOptimizing Severity Thresholds...")
    rounder = OptimizedRounder()
    rounder.fit(oof_sev_preds, oof_sev_targets)
    print(f"Optimal Thresholds: {rounder.coef_}")
    
    # 2. Average Test Predictions
    test_p_logits /= Config.folds
    test_s_logits /= Config.folds
    test_sev_preds /= Config.folds
    
    # 3. Hierarchical Decoding
    final_p = np.argmax(test_p_logits, axis=1)
    final_s = []
    for i, p_cat in enumerate(final_p):
        valid_secs = hierarchy_map.get(p_cat, set())
        s_row = test_s_logits[i].copy()
        if valid_secs:
            mask = np.full(num_secondary, -1e9)
            mask[list(valid_secs)] = 0
            s_row += mask
        final_s.append(np.argmax(s_row))
    
    # 4. Apply Optimized Rounding to Severity
    final_sev = rounder.predict(test_sev_preds, rounder.coef_)
    
    # 5. Save
    sub = pd.DataFrame({
        "complaint_id": test["complaint_id"].values,
        "primary_category": le_primary.inverse_transform(final_p),
        "secondary_category": le_secondary.inverse_transform(final_s),
        "severity": final_sev
    })
    sub.to_csv("submission_roberta.csv", index=False)
    print("Saved submission_roberta.csv")

if __name__ == "__main__":
    train_and_predict()

Dataset: /kaggle/input/neural-craft-26


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


--- FOLD 1/5 ---


2026-02-08 15:52:45.753041: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770565965.948021      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770565966.001329      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770565966.473000      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770565966.473046      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770565966.473049      24 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ep 1:   0%|          | 0/300 [00:00<?, ?it/s]

Ep 2:   0%|          | 0/300 [00:00<?, ?it/s]

Ep 3:   0%|          | 0/300 [00:00<?, ?it/s]

Ep 4:   0%|          | 0/300 [00:00<?, ?it/s]

Fold Best Score: 0.7231


  0%|          | 0/32 [00:00<?, ?it/s]


--- FOLD 2/5 ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Ep 1:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 2:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 3:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 4:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Fold Best Score: 0.7112


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/32 [00:00<?, ?it/s]


--- FOLD 3/5 ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Ep 1:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 2:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 3:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 4:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Fold Best Score: 0.7122


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/32 [00:00<?, ?it/s]


--- FOLD 4/5 ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Ep 1:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 2:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 3:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 4:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Fold Best Score: 0.6943


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/32 [00:00<?, ?it/s]


--- FOLD 5/5 ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Ep 1:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 2:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 3:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Ep 4:   0%|          | 0/300 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Fold Best Score: 0.7130


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/32 [00:00<?, ?it/s]


Optimizing Severity Thresholds...
Optimal Thresholds: [1.66343307 2.51391314 3.38954637 4.59472854]
Saved submission_roberta.csv
