In [None]:
import pandas as pd

import numpy as np

import torch

import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

from torch.optim import AdamW 

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, r2_score

import re

# --- Configuration ---

class Config:
    """Hyperparameters and runtime settings read by the rest of the notebook."""

    # Pretrained checkpoint name used for both the tokenizer and the encoder.
    BERT_MODEL_NAME = 'bert-base-uncased'

    # Sequence length passed to the tokenizer as `max_length`.
    MAX_LEN = 512

    # Mini-batch sizes for the training loader and the validation/test loaders.
    TRAIN_BATCH_SIZE = 16
    VALID_BATCH_SIZE = 16

    # Optimisation schedule.
    EPOCHS = 10
    LEARNING_RATE = 3e-5

    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Using device: {Config.device}")



In [None]:
def clean_text(text):
    """Normalise one raw complaint narrative before tokenisation.

    Steps, in order:
      1. drop redaction placeholders (runs of two or more capital ``X``),
      2. collapse money amounts written as ``{$1,234.56}`` into ``[MONEY]``,
      3. strip every character that is not a letter, digit, whitespace or
         one of ``. , ! ?`` while leaving the ``[MONEY]`` token intact,
      4. squeeze whitespace runs into single spaces and trim both ends.

    Args:
        text: Raw complaint text. Any value that is not a ``str`` (for
            instance ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # 1. Remove the "XXXX" redaction placeholders
    text = re.sub(r'X{2,}', '', text)

    # 2. Standardize money format {$100.00} -> [MONEY]
    text = re.sub(r'\{\$[\d,]+\.?\d*\}', '[MONEY]', text)

    # 3. Remove non-alphanumeric characters (keep basic punctuation).
    #    The brackets of [MONEY] are outside the allowed character set, so a
    #    plain character filter would reduce the token to "MONEY". The first
    #    alternative matches the whole token and puts it back unchanged.
    text = re.sub(
        r'(\[MONEY\])|[^a-zA-Z0-9\s\.,!?]',
        lambda match: match.group(1) or '',
        text,
    )

    # 4. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text



In [None]:
# --- Load Data ---
# Read the train/test complaint tables from the Kaggle input mount.
df_train = pd.read_csv("/kaggle/input/neural-craft/train_complaints.csv")
df_test = pd.read_csv("/kaggle/input/neural-craft/test_complaints.csv")

# --- Clean Text ---
# Add a `clean_text` column to both frames; the raw `complaint_text` column is kept.
df_train['clean_text'] = df_train['complaint_text'].map(clean_text)
df_test['clean_text'] = df_test['complaint_text'].map(clean_text)

# --- Label Encoding ---
# Each category column gets its own encoder so that predicted integer ids can
# be mapped back to the original strings with `inverse_transform` later on.
primary_encoder = LabelEncoder()
primary_encoder.fit(df_train['primary_category'])
df_train['primary_label'] = primary_encoder.transform(df_train['primary_category'])

secondary_encoder = LabelEncoder()
secondary_encoder.fit(df_train['secondary_category'])
df_train['secondary_label'] = secondary_encoder.transform(df_train['secondary_category'])

# Severity is the regression target, so it is stored as float.
df_train['severity'] = df_train['severity'].astype(float)

# The class counts size the two classification heads of the model.
NUM_PRIMARY_LABELS = len(primary_encoder.classes_)
NUM_SECONDARY_LABELS = len(secondary_encoder.classes_)

print(f"Primary Categories: {NUM_PRIMARY_LABELS}")
print(f"Secondary Categories: {NUM_SECONDARY_LABELS}")




In [None]:

class ComplaintDataset(Dataset):
    """Map-style dataset that tokenizes one cleaned complaint per item.

    Reads the ``clean_text`` column of ``df`` and, unless ``is_test`` is set,
    the ``primary_label``, ``secondary_label`` and ``severity`` columns too.
    """

    def __init__(self, df, tokenizer, max_len, is_test=False):
        """Cache the column arrays that ``__getitem__`` indexes into.

        Args:
            df: Frame with a ``clean_text`` column, plus the three target
                columns when ``is_test`` is false.
            tokenizer: Object exposing ``encode_plus``.
            max_len: Value passed to the tokenizer as ``max_length``.
            is_test: When true, no target columns are read and items carry
                no labels.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        self.text = df['clean_text'].values

        if self.is_test:
            return

        self.primary = df['primary_label'].values
        self.secondary = df['secondary_label'].values
        self.severity = df['severity'].values

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors as a dict.

        The dict always holds ``input_ids`` and ``attention_mask`` (long
        tensors). For labelled data it also holds ``primary_labels`` and
        ``secondary_labels`` (long) and ``severity_labels`` (float, because
        severity is trained as a regression target).
        """
        encoded = self.tokenizer.encode_plus(
            str(self.text[index]),
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
        )

        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        if self.is_test:
            return sample

        sample['primary_labels'] = torch.tensor(self.primary[index], dtype=torch.long)
        sample['secondary_labels'] = torch.tensor(self.secondary[index], dtype=torch.long)
        sample['severity_labels'] = torch.tensor(self.severity[index], dtype=torch.float)
        return sample



class MultiTaskBERT(nn.Module):
    """BERT encoder with one shared dense layer feeding three task heads.

    The heads are two classifiers (primary and secondary category) and a
    single-output regressor (severity). All three read the same hidden state.
    """

    def __init__(self, n_primary, n_secondary):
        """Load the pretrained encoder and create the shared layer and heads.

        Args:
            n_primary: Output size of the primary-category head.
            n_secondary: Output size of the secondary-category head.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(Config.BERT_MODEL_NAME)
        width = self.bert.config.hidden_size

        # Shared trunk on top of the pooled output: dense -> tanh -> dropout.
        self.pre_classifier = nn.Linear(width, width)
        self.classifier_dropout = nn.Dropout(p=0.3)
        self.activation = nn.Tanh()

        # Task heads.
        self.out_primary = nn.Linear(width, n_primary)
        self.out_secondary = nn.Linear(width, n_secondary)
        self.out_severity = nn.Linear(width, 1)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return the outputs of the three heads.

        Returns:
            A tuple ``(primary_logits, secondary_logits, severity_logits)``.
            The last dimension of ``severity_logits`` has size 1.
        """
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        shared = self.classifier_dropout(self.activation(self.pre_classifier(pooled)))
        return (
            self.out_primary(shared),
            self.out_secondary(shared),
            self.out_severity(shared),
        )



def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training epoch and return the mean combined loss per batch.

    The combined loss is ``1.0 * primary CE + 2.0 * secondary CE +
    1.0 * severity MSE``; the secondary task is the one weighted up.

    Both cross-entropy losses use the module-level tensors ``weights_prim``
    and ``weights_sec`` as class weights, so those globals must exist before
    this function is called.

    Args:
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_mask``, ``primary_labels``,
            ``secondary_labels`` and ``severity_labels``.
        model: Module returning ``(primary_logits, secondary_logits, severity)``.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: LR scheduler stepped once per batch, after the optimizer.

    Returns:
        Sum of the per-batch combined losses divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: If ``len(data_loader)`` is zero.
    """
    model.train()

    primary_criterion = nn.CrossEntropyLoss(weight=weights_prim)
    secondary_criterion = nn.CrossEntropyLoss(weight=weights_sec)
    # Severity is a regression target, so it takes no class weights.
    severity_criterion = nn.MSELoss()

    running_loss = 0
    for batch in data_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)

        y_primary = batch["primary_labels"].to(device)
        y_secondary = batch["secondary_labels"].to(device)
        y_severity = batch["severity_labels"].to(device)

        optimizer.zero_grad()

        logits_primary, logits_secondary, severity_out = model(input_ids=ids, attention_mask=mask)

        # The severity output is flattened so it lines up with the 1-D targets.
        loss = (
            (1.0 * primary_criterion(logits_primary, y_primary))
            + (2.0 * secondary_criterion(logits_secondary, y_secondary))
            + (1.0 * severity_criterion(severity_out.view(-1), y_severity))
        )

        running_loss += loss.item()
        loss.backward()

        # Cap the global gradient norm before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

    return running_loss / len(data_loader)

def eval_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` without gradients and collect results.

    Args:
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_mask``, ``primary_labels``,
            ``secondary_labels`` and ``severity_labels``.
        model: Module returning ``(primary_logits, secondary_logits, severity)``.
        device: Device the input tensors are moved to.

    Returns:
        A tuple of six lists, in this order: primary predictions, primary
        targets, secondary predictions, secondary targets, severity
        predictions, severity targets. Classification predictions are argmax
        indices; severity predictions are the raw model outputs. List
        elements are NumPy scalars.
    """
    model.eval()

    predicted = {"prim": [], "sec": [], "sev": []}
    expected = {"prim": [], "sec": [], "sev": []}

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)

            # Targets are only needed on the host for metric computation.
            expected["prim"].extend(batch["primary_labels"].cpu().detach().numpy())
            expected["sec"].extend(batch["secondary_labels"].cpu().detach().numpy())
            expected["sev"].extend(batch["severity_labels"].cpu().detach().numpy())

            logits_prim, logits_sec, sev_out = model(input_ids=ids, attention_mask=mask)

            # Classification: index of the largest logit. Regression: raw value.
            predicted["prim"].extend(torch.argmax(logits_prim, dim=1).cpu().detach().numpy())
            predicted["sec"].extend(torch.argmax(logits_sec, dim=1).cpu().detach().numpy())
            predicted["sev"].extend(sev_out.view(-1).cpu().detach().numpy())

    return (predicted["prim"], expected["prim"],
            predicted["sec"], expected["sec"],
            predicted["sev"], expected["sev"])





In [None]:


from sklearn.utils.class_weight import compute_class_weight

# --- Class Weights ---
# 'balanced' weights are computed over the full labelled frame so that every
# encoded class id receives a weight. The resulting tensors are read as
# globals by train_fn when it builds its cross-entropy losses.
class_weights_prim = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df_train['primary_label']),
    y=df_train['primary_label']
)
weights_prim = torch.tensor(class_weights_prim, dtype=torch.float).to(Config.device)

class_weights_sec = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df_train['secondary_label']),
    y=df_train['secondary_label']
)
weights_sec = torch.tensor(class_weights_sec, dtype=torch.float).to(Config.device)

print("Class weights calculated and moved to device.")

# --- Train / Validation Split ---
# Hold out 10% of the labelled rows; the fixed random_state keeps the split
# identical across runs.
df_train_split, df_val_split = train_test_split(df_train, test_size=0.1, random_state=42)

tokenizer = BertTokenizer.from_pretrained(Config.BERT_MODEL_NAME)

train_dataset = ComplaintDataset(df_train_split, tokenizer, Config.MAX_LEN)
valid_dataset = ComplaintDataset(df_val_split, tokenizer, Config.MAX_LEN)

train_data_loader = DataLoader(train_dataset, batch_size=Config.TRAIN_BATCH_SIZE, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=Config.VALID_BATCH_SIZE)

# --- Setup ---
best_score = -float('inf')
best_model_path = "best_multitask_bert.bin"

# Initialize Model
model = MultiTaskBERT(n_primary=NUM_PRIMARY_LABELS, n_secondary=NUM_SECONDARY_LABELS)
model = model.to(Config.device)

# Optimizer & Scheduler
# Parameters whose name contains one of the `no_decay` fragments (biases and
# LayerNorm parameters) are put in a group with weight decay disabled.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

# train_fn steps the scheduler once per batch, so the schedule must span
# (batches per epoch) * (epochs). len(train_data_loader) already counts the
# final partial batch. Deriving the total from len(df_train_split) / batch
# size undercounts whenever the split is not a multiple of the batch size,
# which drives the learning rate to zero before the last epoch finishes.
num_train_steps = len(train_data_loader) * Config.EPOCHS

optimizer = AdamW(optimizer_grouped_parameters, lr=Config.LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

# --- Training Loop ---
print(f"Starting training for {Config.EPOCHS} epochs...")

for epoch in range(Config.EPOCHS):
    # 1. Train
    train_loss = train_fn(train_data_loader, model, optimizer, Config.device, scheduler)

    # 2. Validate
    o_prim, t_prim, o_sec, t_sec, o_sev, t_sev = eval_fn(valid_data_loader, model, Config.device)

    # 3. Calculate Metrics (R2 is computed on the raw, unrounded severity outputs)
    acc_prim = accuracy_score(t_prim, o_prim)
    acc_sec = accuracy_score(t_sec, o_sec)
    r2_sev = r2_score(t_sev, o_sev)

    # Weighted model-selection score: 30% primary, 40% secondary, 30% severity.
    final_score = (0.3 * acc_prim) + (0.4 * acc_sec) + (0.3 * r2_sev)

    print(f"Epoch {epoch+1}/{Config.EPOCHS} | Train Loss: {train_loss:.4f}")
    print(f"Scores -> Primary: {acc_prim:.4f} | Secondary: {acc_sec:.4f} | Severity R2: {r2_sev:.4f}")
    print(f"Weighted Score: {final_score:.4f}")

    # 4. Checkpoint only when the validation score improves on the best so far.
    if final_score > best_score:
        print(f"--> Improvement detected! Saving model (Old Best: {best_score:.4f} -> New Best: {final_score:.4f})")
        torch.save(model.state_dict(), best_model_path)
        best_score = final_score
    else:
        print(f"--> No improvement (Best is still: {best_score:.4f})")

    print("-" * 30)

print(f"\nTraining Complete. Best Weighted Score achieved: {best_score:.4f}")




In [None]:

# --- Load the Best Model ---
print("Loading the best saved model for inference...")

model = MultiTaskBERT(n_primary=NUM_PRIMARY_LABELS, n_secondary=NUM_SECONDARY_LABELS)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written from a CUDA run still loads in a CPU-only session.
model.load_state_dict(torch.load("best_multitask_bert.bin", map_location=Config.device))
model = model.to(Config.device)
model.eval()

# --- Setup Test Data ---
# is_test=True: the test frame has no label columns, so items carry inputs only.
# shuffle=False keeps predictions aligned with the row order of df_test.
test_dataset = ComplaintDataset(df_test, tokenizer, Config.MAX_LEN, is_test=True)
test_data_loader = DataLoader(test_dataset, batch_size=Config.VALID_BATCH_SIZE, shuffle=False)

predictions_prim = []
predictions_sec = []
predictions_sev = []

# --- Predict ---
with torch.no_grad():
    for d in test_data_loader:
        input_ids = d["input_ids"].to(Config.device)
        attention_mask = d["attention_mask"].to(Config.device)

        o_prim, o_sec, o_sev = model(input_ids=input_ids, attention_mask=attention_mask)

        # Classification: index of the largest logit
        predictions_prim.extend(torch.argmax(o_prim, dim=1).cpu().detach().numpy())
        predictions_sec.extend(torch.argmax(o_sec, dim=1).cpu().detach().numpy())

        # Regression: raw float output, flattened to one value per row
        predictions_sev.extend(o_sev.view(-1).cpu().detach().numpy())

# --- Inverse Transform Labels ---
# Map the predicted integer ids back to the original category strings.
final_prim_labels = primary_encoder.inverse_transform(predictions_prim)
final_sec_labels = secondary_encoder.inverse_transform(predictions_sec)

# --- Post-process Severity ---
# Round to the nearest integer (1.9 -> 2) and clip to the range [1, 5]
final_sev_labels = [int(max(1, min(5, round(x)))) for x in predictions_sev]

# --- Create Submission ---
submission = pd.DataFrame({
    'complaint_id': df_test['complaint_id'],
    'complaint_text': df_test['complaint_text'],
    'primary_category': final_prim_labels,
    'secondary_category': final_sec_labels,
    'severity': final_sev_labels
})

submission.to_csv("submission_best_model.csv", index=False)

print("Submission file 'submission_best_model.csv' created successfully using the best model.")
print(submission.head())

In [None]:
import pandas as pd

import numpy as np

import torch

import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

from torch.optim import AdamW 

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, r2_score

import re

# --- Configuration ---

class Config:
    """Hyperparameters and runtime settings read by the rest of the notebook."""

    # Pretrained checkpoint name used for both the tokenizer and the encoder.
    BERT_MODEL_NAME = 'bert-base-uncased'

    # Sequence length passed to the tokenizer as `max_length`.
    MAX_LEN = 512

    # Mini-batch sizes for the training loader and the validation/test loaders.
    TRAIN_BATCH_SIZE = 16
    VALID_BATCH_SIZE = 16

    # Optimisation schedule.
    EPOCHS = 10
    LEARNING_RATE = 3e-5

    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Using device: {Config.device}")



def clean_text(text):
    """Normalise one raw complaint narrative before tokenisation.

    Steps, in order:
      1. drop redaction placeholders (runs of two or more capital ``X``),
      2. collapse money amounts written as ``{$1,234.56}`` into ``[MONEY]``,
      3. strip every character that is not a letter, digit, whitespace or
         one of ``. , ! ?`` while leaving the ``[MONEY]`` token intact,
      4. squeeze whitespace runs into single spaces and trim both ends.

    Args:
        text: Raw complaint text. Any value that is not a ``str`` (for
            instance ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # 1. Remove the "XXXX" redaction placeholders
    text = re.sub(r'X{2,}', '', text)

    # 2. Standardize money format {$100.00} -> [MONEY]
    text = re.sub(r'\{\$[\d,]+\.?\d*\}', '[MONEY]', text)

    # 3. Remove non-alphanumeric characters (keep basic punctuation).
    #    The brackets of [MONEY] are outside the allowed character set, so a
    #    plain character filter would reduce the token to "MONEY". The first
    #    alternative matches the whole token and puts it back unchanged.
    text = re.sub(
        r'(\[MONEY\])|[^a-zA-Z0-9\s\.,!?]',
        lambda match: match.group(1) or '',
        text,
    )

    # 4. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# --- Load Data ---
# Read the train/test complaint tables from the Kaggle input mount.
df_train = pd.read_csv("/kaggle/input/neural-craft/train_complaints.csv")
df_test = pd.read_csv("/kaggle/input/neural-craft/test_complaints.csv")

# --- Clean Text ---
# Add a `clean_text` column to both frames; the raw `complaint_text` column is kept.
df_train['clean_text'] = df_train['complaint_text'].map(clean_text)
df_test['clean_text'] = df_test['complaint_text'].map(clean_text)

# --- Label Encoding ---
# Each category column gets its own encoder so that predicted integer ids can
# be mapped back to the original strings with `inverse_transform` later on.
primary_encoder = LabelEncoder()
primary_encoder.fit(df_train['primary_category'])
df_train['primary_label'] = primary_encoder.transform(df_train['primary_category'])

secondary_encoder = LabelEncoder()
secondary_encoder.fit(df_train['secondary_category'])
df_train['secondary_label'] = secondary_encoder.transform(df_train['secondary_category'])

# Severity is the regression target, so it is stored as float.
df_train['severity'] = df_train['severity'].astype(float)

# The class counts size the two classification heads of the model.
NUM_PRIMARY_LABELS = len(primary_encoder.classes_)
NUM_SECONDARY_LABELS = len(secondary_encoder.classes_)

print(f"Primary Categories: {NUM_PRIMARY_LABELS}")
print(f"Secondary Categories: {NUM_SECONDARY_LABELS}")



class ComplaintDataset(Dataset):
    """Map-style dataset that tokenizes one cleaned complaint per item.

    Reads the ``clean_text`` column of ``df`` and, unless ``is_test`` is set,
    the ``primary_label``, ``secondary_label`` and ``severity`` columns too.
    """

    def __init__(self, df, tokenizer, max_len, is_test=False):
        """Cache the column arrays that ``__getitem__`` indexes into.

        Args:
            df: Frame with a ``clean_text`` column, plus the three target
                columns when ``is_test`` is false.
            tokenizer: Object exposing ``encode_plus``.
            max_len: Value passed to the tokenizer as ``max_length``.
            is_test: When true, no target columns are read and items carry
                no labels.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        self.text = df['clean_text'].values

        if self.is_test:
            return

        self.primary = df['primary_label'].values
        self.secondary = df['secondary_label'].values
        self.severity = df['severity'].values

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors as a dict.

        The dict always holds ``input_ids`` and ``attention_mask`` (long
        tensors). For labelled data it also holds ``primary_labels`` and
        ``secondary_labels`` (long) and ``severity_labels`` (float, because
        severity is trained as a regression target).
        """
        encoded = self.tokenizer.encode_plus(
            str(self.text[index]),
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
        )

        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        if self.is_test:
            return sample

        sample['primary_labels'] = torch.tensor(self.primary[index], dtype=torch.long)
        sample['secondary_labels'] = torch.tensor(self.secondary[index], dtype=torch.long)
        sample['severity_labels'] = torch.tensor(self.severity[index], dtype=torch.float)
        return sample



class MultiTaskBERT(nn.Module):
    """BERT encoder with one shared dense layer feeding three task heads.

    The heads are two classifiers (primary and secondary category) and a
    single-output regressor (severity). All three read the same hidden state.
    """

    def __init__(self, n_primary, n_secondary):
        """Load the pretrained encoder and create the shared layer and heads.

        Args:
            n_primary: Output size of the primary-category head.
            n_secondary: Output size of the secondary-category head.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(Config.BERT_MODEL_NAME)
        width = self.bert.config.hidden_size

        # Shared trunk on top of the pooled output: dense -> tanh -> dropout.
        self.pre_classifier = nn.Linear(width, width)
        self.classifier_dropout = nn.Dropout(p=0.3)
        self.activation = nn.Tanh()

        # Task heads.
        self.out_primary = nn.Linear(width, n_primary)
        self.out_secondary = nn.Linear(width, n_secondary)
        self.out_severity = nn.Linear(width, 1)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return the outputs of the three heads.

        Returns:
            A tuple ``(primary_logits, secondary_logits, severity_logits)``.
            The last dimension of ``severity_logits`` has size 1.
        """
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        shared = self.classifier_dropout(self.activation(self.pre_classifier(pooled)))
        return (
            self.out_primary(shared),
            self.out_secondary(shared),
            self.out_severity(shared),
        )



def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training epoch and return the mean combined loss per batch.

    The combined loss is ``1.0 * primary CE + 2.0 * secondary CE +
    1.0 * severity MSE``; the secondary task is the one weighted up.

    Both cross-entropy losses use the module-level tensors ``weights_prim``
    and ``weights_sec`` as class weights, so those globals must exist before
    this function is called.

    Args:
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_mask``, ``primary_labels``,
            ``secondary_labels`` and ``severity_labels``.
        model: Module returning ``(primary_logits, secondary_logits, severity)``.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: LR scheduler stepped once per batch, after the optimizer.

    Returns:
        Sum of the per-batch combined losses divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: If ``len(data_loader)`` is zero.
    """
    model.train()

    primary_criterion = nn.CrossEntropyLoss(weight=weights_prim)
    secondary_criterion = nn.CrossEntropyLoss(weight=weights_sec)
    # Severity is a regression target, so it takes no class weights.
    severity_criterion = nn.MSELoss()

    running_loss = 0
    for batch in data_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)

        y_primary = batch["primary_labels"].to(device)
        y_secondary = batch["secondary_labels"].to(device)
        y_severity = batch["severity_labels"].to(device)

        optimizer.zero_grad()

        logits_primary, logits_secondary, severity_out = model(input_ids=ids, attention_mask=mask)

        # The severity output is flattened so it lines up with the 1-D targets.
        loss = (
            (1.0 * primary_criterion(logits_primary, y_primary))
            + (2.0 * secondary_criterion(logits_secondary, y_secondary))
            + (1.0 * severity_criterion(severity_out.view(-1), y_severity))
        )

        running_loss += loss.item()
        loss.backward()

        # Cap the global gradient norm before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

    return running_loss / len(data_loader)

def eval_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` without gradients and collect results.

    Args:
        data_loader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_mask``, ``primary_labels``,
            ``secondary_labels`` and ``severity_labels``.
        model: Module returning ``(primary_logits, secondary_logits, severity)``.
        device: Device the input tensors are moved to.

    Returns:
        A tuple of six lists, in this order: primary predictions, primary
        targets, secondary predictions, secondary targets, severity
        predictions, severity targets. Classification predictions are argmax
        indices; severity predictions are the raw model outputs. List
        elements are NumPy scalars.
    """
    model.eval()

    predicted = {"prim": [], "sec": [], "sev": []}
    expected = {"prim": [], "sec": [], "sev": []}

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)

            # Targets are only needed on the host for metric computation.
            expected["prim"].extend(batch["primary_labels"].cpu().detach().numpy())
            expected["sec"].extend(batch["secondary_labels"].cpu().detach().numpy())
            expected["sev"].extend(batch["severity_labels"].cpu().detach().numpy())

            logits_prim, logits_sec, sev_out = model(input_ids=ids, attention_mask=mask)

            # Classification: index of the largest logit. Regression: raw value.
            predicted["prim"].extend(torch.argmax(logits_prim, dim=1).cpu().detach().numpy())
            predicted["sec"].extend(torch.argmax(logits_sec, dim=1).cpu().detach().numpy())
            predicted["sev"].extend(sev_out.view(-1).cpu().detach().numpy())

    return (predicted["prim"], expected["prim"],
            predicted["sec"], expected["sec"],
            predicted["sev"], expected["sev"])





from sklearn.utils.class_weight import compute_class_weight

# --- Class Weights ---
# 'balanced' weights are computed over the full labelled frame so that every
# encoded class id receives a weight. The resulting tensors are read as
# globals by train_fn when it builds its cross-entropy losses.
class_weights_prim = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df_train['primary_label']),
    y=df_train['primary_label']
)
weights_prim = torch.tensor(class_weights_prim, dtype=torch.float).to(Config.device)

class_weights_sec = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df_train['secondary_label']),
    y=df_train['secondary_label']
)
weights_sec = torch.tensor(class_weights_sec, dtype=torch.float).to(Config.device)

print("Class weights calculated and moved to device.")

# --- Train / Validation Split ---
# Hold out 10% of the labelled rows; the fixed random_state keeps the split
# identical across runs.
df_train_split, df_val_split = train_test_split(df_train, test_size=0.1, random_state=42)

tokenizer = BertTokenizer.from_pretrained(Config.BERT_MODEL_NAME)

train_dataset = ComplaintDataset(df_train_split, tokenizer, Config.MAX_LEN)
valid_dataset = ComplaintDataset(df_val_split, tokenizer, Config.MAX_LEN)

train_data_loader = DataLoader(train_dataset, batch_size=Config.TRAIN_BATCH_SIZE, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=Config.VALID_BATCH_SIZE)

# --- Setup ---
best_score = -float('inf')
best_model_path = "best_multitask_bert.bin"

# Initialize Model
model = MultiTaskBERT(n_primary=NUM_PRIMARY_LABELS, n_secondary=NUM_SECONDARY_LABELS)
model = model.to(Config.device)

# Optimizer & Scheduler
# Parameters whose name contains one of the `no_decay` fragments (biases and
# LayerNorm parameters) are put in a group with weight decay disabled.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

# train_fn steps the scheduler once per batch, so the schedule must span
# (batches per epoch) * (epochs). len(train_data_loader) already counts the
# final partial batch. Deriving the total from len(df_train_split) / batch
# size undercounts whenever the split is not a multiple of the batch size,
# which drives the learning rate to zero before the last epoch finishes.
num_train_steps = len(train_data_loader) * Config.EPOCHS

optimizer = AdamW(optimizer_grouped_parameters, lr=Config.LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

# --- Training Loop ---
print(f"Starting training for {Config.EPOCHS} epochs...")

for epoch in range(Config.EPOCHS):
    # 1. Train
    train_loss = train_fn(train_data_loader, model, optimizer, Config.device, scheduler)

    # 2. Validate
    o_prim, t_prim, o_sec, t_sec, o_sev, t_sev = eval_fn(valid_data_loader, model, Config.device)

    # 3. Calculate Metrics (R2 is computed on the raw, unrounded severity outputs)
    acc_prim = accuracy_score(t_prim, o_prim)
    acc_sec = accuracy_score(t_sec, o_sec)
    r2_sev = r2_score(t_sev, o_sev)

    # Weighted model-selection score: 30% primary, 40% secondary, 30% severity.
    final_score = (0.3 * acc_prim) + (0.4 * acc_sec) + (0.3 * r2_sev)

    print(f"Epoch {epoch+1}/{Config.EPOCHS} | Train Loss: {train_loss:.4f}")
    print(f"Scores -> Primary: {acc_prim:.4f} | Secondary: {acc_sec:.4f} | Severity R2: {r2_sev:.4f}")
    print(f"Weighted Score: {final_score:.4f}")

    # 4. Checkpoint only when the validation score improves on the best so far.
    if final_score > best_score:
        print(f"--> Improvement detected! Saving model (Old Best: {best_score:.4f} -> New Best: {final_score:.4f})")
        torch.save(model.state_dict(), best_model_path)
        best_score = final_score
    else:
        print(f"--> No improvement (Best is still: {best_score:.4f})")

    print("-" * 30)

print(f"\nTraining Complete. Best Weighted Score achieved: {best_score:.4f}")



# --- Load the Best Model ---
print("Loading the best saved model for inference...")

model = MultiTaskBERT(n_primary=NUM_PRIMARY_LABELS, n_secondary=NUM_SECONDARY_LABELS)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written from a CUDA run still loads in a CPU-only session.
model.load_state_dict(torch.load("best_multitask_bert.bin", map_location=Config.device))
model = model.to(Config.device)
model.eval()

# --- Setup Test Data ---
# is_test=True: the test frame has no label columns, so items carry inputs only.
# shuffle=False keeps predictions aligned with the row order of df_test.
test_dataset = ComplaintDataset(df_test, tokenizer, Config.MAX_LEN, is_test=True)
test_data_loader = DataLoader(test_dataset, batch_size=Config.VALID_BATCH_SIZE, shuffle=False)

predictions_prim = []
predictions_sec = []
predictions_sev = []

# --- Predict ---
with torch.no_grad():
    for d in test_data_loader:
        input_ids = d["input_ids"].to(Config.device)
        attention_mask = d["attention_mask"].to(Config.device)

        o_prim, o_sec, o_sev = model(input_ids=input_ids, attention_mask=attention_mask)

        # Classification: index of the largest logit
        predictions_prim.extend(torch.argmax(o_prim, dim=1).cpu().detach().numpy())
        predictions_sec.extend(torch.argmax(o_sec, dim=1).cpu().detach().numpy())

        # Regression: raw float output, flattened to one value per row
        predictions_sev.extend(o_sev.view(-1).cpu().detach().numpy())

# --- Inverse Transform Labels ---
# Map the predicted integer ids back to the original category strings.
final_prim_labels = primary_encoder.inverse_transform(predictions_prim)
final_sec_labels = secondary_encoder.inverse_transform(predictions_sec)

# --- Post-process Severity ---
# Round to the nearest integer (1.9 -> 2) and clip to the range [1, 5]
final_sev_labels = [int(max(1, min(5, round(x)))) for x in predictions_sev]

# --- Create Submission ---
submission = pd.DataFrame({
    'complaint_id': df_test['complaint_id'],
    'complaint_text': df_test['complaint_text'],
    'primary_category': final_prim_labels,
    'secondary_category': final_sec_labels,
    'severity': final_sev_labels
})

submission.to_csv("submission_best_model.csv", index=False)

print("Submission file 'submission_best_model.csv' created successfully using the best model.")
print(submission.head())

In [None]:
import pandas as pd

import numpy as np

import torch

import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

from torch.optim import AdamW 

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, r2_score

import re

# --- Configuration ---

class Config:
    """Hyperparameters and runtime settings read by the rest of the notebook."""

    # Pretrained checkpoint name used for both the tokenizer and the encoder.
    BERT_MODEL_NAME = 'bert-base-uncased'

    # Sequence length passed to the tokenizer as `max_length`.
    MAX_LEN = 512

    # Mini-batch sizes for the training loader and the validation/test loaders.
    TRAIN_BATCH_SIZE = 16
    VALID_BATCH_SIZE = 16

    # Optimisation schedule.
    EPOCHS = 10
    LEARNING_RATE = 3e-5

    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Using device: {Config.device}")



def clean_text(text):
    """Normalise one raw complaint narrative before tokenisation.

    Steps, in order:
      1. drop redaction placeholders (runs of two or more capital ``X``),
      2. collapse money amounts written as ``{$1,234.56}`` into ``[MONEY]``,
      3. strip every character that is not a letter, digit, whitespace or
         one of ``. , ! ?`` while leaving the ``[MONEY]`` token intact,
      4. squeeze whitespace runs into single spaces and trim both ends.

    Args:
        text: Raw complaint text. Any value that is not a ``str`` (for
            instance ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""

    # 1. Remove the "XXXX" redaction placeholders
    text = re.sub(r'X{2,}', '', text)

    # 2. Standardize money format {$100.00} -> [MONEY]
    text = re.sub(r'\{\$[\d,]+\.?\d*\}', '[MONEY]', text)

    # 3. Remove non-alphanumeric characters (keep basic punctuation).
    #    The brackets of [MONEY] are outside the allowed character set, so a
    #    plain character filter would reduce the token to "MONEY". The first
    #    alternative matches the whole token and puts it back unchanged.
    text = re.sub(
        r'(\[MONEY\])|[^a-zA-Z0-9\s\.,!?]',
        lambda match: match.group(1) or '',
        text,
    )

    # 4. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# --- Load Data ---
# Read the train/test complaint tables from the Kaggle input mount.
df_train = pd.read_csv("/kaggle/input/neural-craft/train_complaints.csv")
df_test = pd.read_csv("/kaggle/input/neural-craft/test_complaints.csv")

# --- Clean Text ---
# Add a `clean_text` column to both frames; the raw `complaint_text` column is kept.
df_train['clean_text'] = df_train['complaint_text'].map(clean_text)
df_test['clean_text'] = df_test['complaint_text'].map(clean_text)

# --- Label Encoding ---
# Each category column gets its own encoder so that predicted integer ids can
# be mapped back to the original strings with `inverse_transform` later on.
primary_encoder = LabelEncoder()
primary_encoder.fit(df_train['primary_category'])
df_train['primary_label'] = primary_encoder.transform(df_train['primary_category'])

secondary_encoder = LabelEncoder()
secondary_encoder.fit(df_train['secondary_category'])
df_train['secondary_label'] = secondary_encoder.transform(df_train['secondary_category'])

# Severity is the regression target, so it is stored as float.
df_train['severity'] = df_train['severity'].astype(float)

# The class counts size the two classification heads of the model.
NUM_PRIMARY_LABELS = len(primary_encoder.classes_)
NUM_SECONDARY_LABELS = len(secondary_encoder.classes_)

print(f"Primary Categories: {NUM_PRIMARY_LABELS}")
print(f"Secondary Categories: {NUM_SECONDARY_LABELS}")



class ComplaintDataset(Dataset):
    """Map-style dataset that tokenizes one cleaned complaint per item.

    Reads the ``clean_text`` column of ``df`` and, unless ``is_test`` is set,
    the ``primary_label``, ``secondary_label`` and ``severity`` columns too.
    """

    def __init__(self, df, tokenizer, max_len, is_test=False):
        """Cache the column arrays that ``__getitem__`` indexes into.

        Args:
            df: Frame with a ``clean_text`` column, plus the three target
                columns when ``is_test`` is false.
            tokenizer: Object exposing ``encode_plus``.
            max_len: Value passed to the tokenizer as ``max_length``.
            is_test: When true, no target columns are read and items carry
                no labels.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        self.text = df['clean_text'].values

        if self.is_test:
            return

        self.primary = df['primary_label'].values
        self.secondary = df['secondary_label'].values
        self.severity = df['severity'].values

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors as a dict.

        The dict always holds ``input_ids`` and ``attention_mask`` (long
        tensors). For labelled data it also holds ``primary_labels`` and
        ``secondary_labels`` (long) and ``severity_labels`` (float, because
        severity is trained as a regression target).
        """
        encoded = self.tokenizer.encode_plus(
            str(self.text[index]),
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
        )

        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        if self.is_test:
            return sample

        sample['primary_labels'] = torch.tensor(self.primary[index], dtype=torch.long)
        sample['secondary_labels'] = torch.tensor(self.secondary[index], dtype=torch.long)
        sample['severity_labels'] = torch.tensor(self.severity[index], dtype=torch.float)
        return sample



class MultiTaskBERT(nn.Module):
    """BERT encoder followed by a shared dense layer and three output heads.

    The heads produce, from the same hidden representation: primary-category
    logits (``n_primary`` wide), secondary-category logits (``n_secondary``
    wide) and a single severity value.
    """

    def __init__(self, n_primary, n_secondary):
        super().__init__()
        self.bert = BertModel.from_pretrained(Config.BERT_MODEL_NAME)
        hidden_size = self.bert.config.hidden_size

        # Shared trunk on top of the pooled output: Linear -> Tanh -> Dropout
        # (applied in that order in forward()).
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.classifier_dropout = nn.Dropout(p=0.3)
        self.activation = nn.Tanh()

        # Task-specific heads, all fed by the shared trunk.
        self.out_primary = nn.Linear(hidden_size, n_primary)
        self.out_secondary = nn.Linear(hidden_size, n_secondary)
        self.out_severity = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return ``(primary_logits, secondary_logits, severity_logits)``."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output

        shared = self.classifier_dropout(
            self.activation(self.pre_classifier(pooled))
        )

        return (
            self.out_primary(shared),
            self.out_secondary(shared),
            self.out_severity(shared),
        )



def train_fn(data_loader, model, optimizer, device, scheduler,
             class_weights=None, loss_weights=(1.0, 2.0, 1.0), max_grad_norm=1.0):
    """Train the multi-task model for one pass over ``data_loader``.

    Args:
        data_loader: Iterable of batch dicts with the keys "input_ids",
            "attention_mask", "primary_labels", "secondary_labels" and
            "severity_labels" (tensors).
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning ``(primary_logits, secondary_logits, severity_out)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors (and class weights) are moved to.
        scheduler: Learning-rate scheduler stepped once per batch, right
            after the optimizer.
        class_weights: Optional ``(primary, secondary)`` pair of per-class
            weight tensors for the two cross-entropy losses; either entry may
            be ``None`` for an unweighted loss. When the argument is omitted,
            the module-level ``weights_prim`` / ``weights_sec`` are used,
            which is what this function always did before.
        loss_weights: Multipliers ``(primary, secondary, severity)`` applied
            to the three task losses before summing. The default doubles the
            secondary-category loss and leaves the other two at 1.0.
        max_grad_norm: Gradient-norm clipping threshold.

    Returns:
        float: Combined loss averaged over the batches that were processed.

    Raises:
        ValueError: If ``data_loader`` yields no batches (previously this
            surfaced as an unexplained ``ZeroDivisionError``).
    """
    model.train()

    if class_weights is None:
        # Backward-compatible default: fall back to the notebook-level tensors.
        class_weights = (weights_prim, weights_sec)
    prim_class_w, sec_class_w = (
        w.to(device) if w is not None else None for w in class_weights
    )
    w_prim, w_sec, w_sev = loss_weights

    # Weighted cross-entropy for the two classification heads; severity is a
    # regression target, so it uses plain MSE.
    loss_fct_prim = nn.CrossEntropyLoss(weight=prim_class_w)
    loss_fct_sec = nn.CrossEntropyLoss(weight=sec_class_w)
    loss_fct_sev = nn.MSELoss()

    total_loss = 0.0
    n_batches = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)

        primary_targets = d["primary_labels"].to(device)
        secondary_targets = d["secondary_labels"].to(device)
        severity_targets = d["severity_labels"].to(device)

        optimizer.zero_grad()

        o_prim, o_sec, o_sev = model(input_ids=input_ids, attention_mask=attention_mask)

        loss_prim = loss_fct_prim(o_prim, primary_targets)
        loss_sec = loss_fct_sec(o_sec, secondary_targets)
        # Flatten the (batch, 1) severity output to match the 1-D targets.
        loss_sev = loss_fct_sev(o_sev.view(-1), severity_targets)

        # Combined multi-task objective.
        loss = (w_prim * loss_prim) + (w_sec * loss_sec) + (w_sev * loss_sev)

        total_loss += loss.item()
        n_batches += 1

        loss.backward()

        # Clip before stepping so one bad batch cannot blow up the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)

        optimizer.step()
        scheduler.step()

    if n_batches == 0:
        raise ValueError("train_fn received a data_loader that yielded no batches")

    return total_loss / n_batches

def eval_fn(data_loader, model, device):
    """Run ``model`` over ``data_loader`` without gradients and collect results.

    Each batch must provide "input_ids", "attention_mask", "primary_labels",
    "secondary_labels" and "severity_labels". Classification predictions are
    the argmax over dim 1 of the logits; severity predictions are the raw
    flattened model output.

    Returns:
        A 6-tuple of lists, in this order: primary predictions, primary
        targets, secondary predictions, secondary targets, severity
        predictions, severity targets.
    """
    model.eval()

    def as_numpy(tensor):
        return tensor.cpu().detach().numpy()

    predicted = {"prim": [], "sec": [], "sev": []}
    expected = {"prim": [], "sec": [], "sev": []}

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)

            batch_targets = {
                "prim": as_numpy(batch["primary_labels"]),
                "sec": as_numpy(batch["secondary_labels"]),
                "sev": as_numpy(batch["severity_labels"]),
            }

            logits_prim, logits_sec, severity_out = model(input_ids=ids, attention_mask=mask)

            batch_preds = {
                # Class index with the highest logit.
                "prim": as_numpy(torch.argmax(logits_prim, dim=1)),
                "sec": as_numpy(torch.argmax(logits_sec, dim=1)),
                # Regression output is kept as a raw float.
                "sev": as_numpy(severity_out.view(-1)),
            }

            for task in ("prim", "sec", "sev"):
                expected[task].extend(batch_targets[task])
                predicted[task].extend(batch_preds[task])

    return (
        predicted["prim"], expected["prim"],
        predicted["sec"], expected["sec"],
        predicted["sev"], expected["sev"],
    )





from sklearn.utils.class_weight import compute_class_weight


def _balanced_weights(labels):
    """Return sklearn 'balanced' class weights for the distinct values in ``labels``."""
    return compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=labels,
    )


# Primary category: weights as a numpy array, then as a float tensor on the
# configured device.
class_weights_prim = _balanced_weights(df_train['primary_label'])
weights_prim = torch.tensor(class_weights_prim, dtype=torch.float).to(Config.device)

# Secondary category: same procedure.
class_weights_sec = _balanced_weights(df_train['secondary_label'])
weights_sec = torch.tensor(class_weights_sec, dtype=torch.float).to(Config.device)

print("Class weights calculated and moved to device.")



# Split Train into Train/Validation (90% / 10%, fixed seed for the split)
df_train_split, df_val_split = train_test_split(df_train, test_size=0.1, random_state=42)

tokenizer = BertTokenizer.from_pretrained(Config.BERT_MODEL_NAME)

train_dataset = ComplaintDataset(df_train_split, tokenizer, Config.MAX_LEN)
valid_dataset = ComplaintDataset(df_val_split, tokenizer, Config.MAX_LEN)

train_data_loader = DataLoader(train_dataset, batch_size=Config.TRAIN_BATCH_SIZE, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=Config.VALID_BATCH_SIZE)

# --- Setup ---
# Best validation score seen so far and where the matching weights are saved.
best_score = -float('inf')
best_model_path = "best_multitask_bert.bin"

# Initialize Model
model = MultiTaskBERT(n_primary=NUM_PRIMARY_LABELS, n_secondary=NUM_SECONDARY_LABELS)
model = model.to(Config.device)

# Optimizer & Scheduler
# Parameters whose name contains one of the `no_decay` fragments get no
# weight decay; everything else uses 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

# train_fn steps the scheduler once per batch, so the schedule length must be
# the number of batches actually iterated. len(train_data_loader) includes the
# final partial batch; the previous int(len(df) / batch_size * epochs) formula
# undercounted it, which made the linear schedule hit lr=0 before the last
# steps of training.
num_train_steps = len(train_data_loader) * Config.EPOCHS

optimizer = AdamW(optimizer_grouped_parameters, lr=Config.LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

# --- Training Loop ---
print(f"Starting training for {Config.EPOCHS} epochs...")

for epoch in range(Config.EPOCHS):
    # One training pass, then predictions on the validation loader.
    train_loss = train_fn(train_data_loader, model, optimizer, Config.device, scheduler)
    o_prim, t_prim, o_sec, t_sec, o_sev, t_sev = eval_fn(valid_data_loader, model, Config.device)

    # Per-task validation metrics.
    acc_prim = accuracy_score(t_prim, o_prim)
    acc_sec = accuracy_score(t_sec, o_sec)
    r2_sev = r2_score(t_sev, o_sev)

    # Weighted blend of the three metrics; this is what checkpoint selection uses.
    final_score = 0.3 * acc_prim + 0.4 * acc_sec + 0.3 * r2_sev

    print(
        f"Epoch {epoch+1}/{Config.EPOCHS} | Train Loss: {train_loss:.4f}",
        f"Scores -> Primary: {acc_prim:.4f} | Secondary: {acc_sec:.4f} | Severity R2: {r2_sev:.4f}",
        f"Weighted Score: {final_score:.4f}",
        sep="\n",
    )

    # Keep only the weights of the best-scoring epoch on disk.
    is_new_best = final_score > best_score
    if is_new_best:
        print(f"--> Improvement detected! Saving model (Old Best: {best_score:.4f} -> New Best: {final_score:.4f})")
        torch.save(model.state_dict(), best_model_path)
        best_score = final_score
    else:
        print(f"--> No improvement (Best is still: {best_score:.4f})")

    print("-" * 30)

print(f"\nTraining Complete. Best Weighted Score achieved: {best_score:.4f}")



# --- Load the Best Model ---
print("Loading the best saved model for inference...")

model = MultiTaskBERT(n_primary=NUM_PRIMARY_LABELS, n_secondary=NUM_SECONDARY_LABELS)

# Load from the same `best_model_path` the training loop saved to (instead of
# repeating the filename literal, which could silently drift), and map the
# stored tensors onto the current device so a checkpoint written on GPU also
# loads on a CPU-only session.
model.load_state_dict(torch.load(best_model_path, map_location=Config.device))
model = model.to(Config.device)

# Inference mode: disables dropout in the classifier head.
model.eval()

# --- Setup Test Data ---
# is_test=True: the dataset yields only input_ids / attention_mask (no labels).
test_dataset = ComplaintDataset(df_test, tokenizer, Config.MAX_LEN, is_test=True)
test_data_loader = DataLoader(test_dataset, batch_size=Config.VALID_BATCH_SIZE, shuffle=False)

predictions_prim, predictions_sec, predictions_sev = [], [], []

# --- Predict ---
with torch.no_grad():
    for d in test_data_loader:
        o_prim, o_sec, o_sev = model(
            input_ids=d["input_ids"].to(Config.device),
            attention_mask=d["attention_mask"].to(Config.device),
        )

        # Classification heads: index of the largest logit per row.
        for collected, logits in ((predictions_prim, o_prim), (predictions_sec, o_sec)):
            collected.extend(torch.argmax(logits, dim=1).cpu().detach().numpy())

        # Regression head: flattened raw float output.
        predictions_sev.extend(o_sev.view(-1).cpu().detach().numpy())

# --- Inverse Transform Labels ---
# Map predicted class indices back to the original category values.
final_prim_labels = primary_encoder.inverse_transform(predictions_prim)
final_sec_labels = secondary_encoder.inverse_transform(predictions_sec)


# --- Post-process Severity ---
def _to_severity_level(raw_value):
    """Round a raw regression output and clamp the result into the range 1..5."""
    return int(max(1, min(5, round(raw_value))))


final_sev_labels = [_to_severity_level(x) for x in predictions_sev]

# --- Create Submission ---
submission_columns = {
    'complaint_id': df_test['complaint_id'],
    'complaint_text': df_test['complaint_text'],
    'primary_category': final_prim_labels,
    'secondary_category': final_sec_labels,
    'severity': final_sev_labels,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv("submission_best_model.csv", index=False)

print("Submission file 'submission_best_model.csv' created successfully using the best model.")
print(submission.head())