In [1]:
# Setup cell: installs the notebook's dependencies with `uv pip`.
# Run this first if the packages aren't installed yet; `accelerate` is the
# only requirement with a minimum version (>=0.26.0), the rest are unpinned.
!uv pip install transformers torch scikit-learn datasets pandas numpy emoji 'accelerate>=0.26.0'

[2mUsing Python 3.13.5 environment at: /home/kjnyua/miniconda3/envs/daystar[0m
[2mAudited [1m8 packages[0m [2min 15ms[0m[0m


In [2]:
import gc
import random
import re

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)

# Reproducibility: feed one shared seed to every random number generator
# this notebook relies on (Python, NumPy, PyTorch CPU and, if present, CUDA).
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    # Seed all CUDA devices as well.
    torch.cuda.manual_seed_all(SEED)

print("Libraries imported successfully!")
print(f"Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")

Libraries imported successfully!
Using device: CPU


In [3]:
# Read the labelled training tweets and the unlabelled test tweets.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("Train.csv", "Test.csv"))

# Sanity report: sizes, schema, a preview and the class balance of 'type'.
print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
print("\nTraining data columns:", list(train_df.columns))
print("\nFirst few rows:", train_df.head(), sep="\n")
print("\nLabel distribution:", train_df['type'].value_counts(), sep="\n")

Training data shape: (39650, 3)
Test data shape: (15581, 2)

Training data columns: ['Tweet_ID', 'tweet', 'type']

First few rows:
      Tweet_ID                                              tweet  \
0  ID_0022DWKP  Had a dream i got raped last night. By a guy i...   
1  ID_00395QYM  he thought the word raped means sex and told m...   
2  ID_003EOSSF  She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...   
3  ID_004BBHOD  I was sexually abused for 3 years at age 4 to ...   
4  ID_004F7516  Chessy Prout can do better by telling the trut...   

              type  
0  sexual_violence  
1  sexual_violence  
2  sexual_violence  
3  sexual_violence  
4  sexual_violence  

Label distribution:
type
sexual_violence                 32648
Physical_violence                5946
emotional_violence                651
economic_violence                 217
Harmful_Traditional_practice      188
Name: count, dtype: int64


In [4]:
def preprocess_text(text):
    """Clean a single tweet before tokenization.

    Normalization follows the placeholder tokens used by BERTweet:
    URLs become ``HTTPURL`` and user mentions become ``@USER``. Hashtags are
    kept, but a space is inserted when a ``#`` is glued to the preceding
    text so that each hashtag becomes its own whitespace-separated token.

    Args:
        text: Raw tweet text. Missing values (``None`` / ``NaN``) are accepted.

    Returns:
        str: The cleaned tweet, or ``""`` for a missing value.
    """
    if pd.isna(text):
        return ""
    text = str(text)
    # URLs first, so that '@' or '#' characters inside a link are not touched below.
    text = re.sub(r"https?://\S+|www\.\S+", "HTTPURL", text)
    # Replace the whole handle (not just the '@'); the look-behind skips
    # e-mail-like text such as "name@host".
    text = re.sub(r"(?<!\w)@\w+", "@USER", text)
    # Separate hashtags that are attached to the previous token, without
    # doubling spaces that are already there.
    text = re.sub(r"(?<=\S)#", " #", text)
    return text.strip()

# Apply preprocessing to the 'tweet' column of BOTH frames. The tokenization
# step reads 'tweet', so train and test must be cleaned into that same column
# (writing the training result to a differently named column would leave the
# training text raw while the test text is cleaned).
# NOTE: this overwrites 'tweet' in place; reload the CSVs before re-running.
train_df['tweet'] = train_df['tweet'].apply(preprocess_text)
test_df['tweet'] = test_df['tweet'].apply(preprocess_text)

print("Text preprocessing completed!")
print("Sample processed text:", train_df['tweet'].iloc[0])

Text preprocessing completed!
Sample processed text: Had a dream i got raped last night. By a guy i work with. Actually a guy i smoked with once at my house but he was doing too much tryna be sexual and it wasn’t even like that for me just wanted to smoke.


In [5]:
# Label encoding: map each class name in 'type' to an integer id and back.
label_list = sorted(train_df['type'].unique().tolist())  # sorted so ids are stable across runs
n_labels = len(label_list)
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}
train_df['label'] = train_df['type'].map(label2id)

print(f"Number of labels: {n_labels}")
print(f"Labels: {label_list}")
print(f"Label mapping: {label2id}")

Number of labels: 5
Labels: ['Harmful_Traditional_practice', 'Physical_violence', 'economic_violence', 'emotional_violence', 'sexual_violence']
Label mapping: {'Harmful_Traditional_practice': 0, 'Physical_violence': 1, 'economic_violence': 2, 'emotional_violence': 3, 'sexual_violence': 4}


In [6]:
# Class weights for the imbalanced label distribution ('balanced' mode:
# the rarer a class is, the larger its weight).
encoded_labels = train_df['label']
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(encoded_labels),
    y=encoded_labels,
)
# Same weights keyed by integer class id.
class_weight_dict = dict(enumerate(class_weights))

print("Class weights (to handle imbalanced data):")
for label, weight in zip(label_list, class_weights):
    print(f"  {label}: {weight:.3f}")

Class weights (to handle imbalanced data):
  Harmful_Traditional_practice: 42.181
  Physical_violence: 1.334
  economic_violence: 36.544
  emotional_violence: 12.181
  sexual_violence: 0.243


In [7]:
# Load the tokenizer that matches the BERTweet checkpoint fine-tuned below
# (the fast implementation is requested via use_fast).
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    use_fast=True,
)

def tokenize_function(examples):
    """Tokenize a batch of tweets, truncated to at most 128 tokens.

    Padding is intentionally NOT applied here. The notebook builds a
    ``DataCollatorWithPadding`` that pads every batch to the length of its
    longest member; padding each tweet to 128 tokens up front would make that
    collator a no-op and spend compute on padding tokens.

    Args:
        examples: Batch mapping with a ``'tweet'`` entry holding the texts
            (the form passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch (variable-length, unpadded).
    """
    return tokenizer(
        examples['tweet'],
        truncation=True,
        max_length=128,
    )

# Prepare datasets: wrap the frames as Hugging Face datasets and tokenize
# them in batches (train keeps its 'label' column, test has text only).
print("Preparing datasets...")
full_dataset = Dataset.from_pandas(train_df[['tweet', 'label']]).map(
    tokenize_function, batched=True
)
test_dataset = Dataset.from_pandas(test_df[['tweet']]).map(
    tokenize_function, batched=True
)

# Collator that pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("Datasets prepared successfully!")

Preparing datasets...


Map:   0%|          | 0/39650 [00:00<?, ? examples/s]

Map:   0%|          | 0/15581 [00:00<?, ? examples/s]

Datasets prepared successfully!


In [None]:
# Metric for the stock `Trainer` (no custom Trainer subclass is used in this notebook variant).

def compute_metrics(eval_pred):
    """Compute classification accuracy for a `Trainer` evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of per-class scores per example; ``labels`` holds the integer
            class ids.

    Returns:
        dict: ``{'accuracy': float}`` - the fraction of examples whose
        highest-scoring class equals the label.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=1)
    # Plain NumPy keeps the metric self-contained: no import is executed
    # inside the function on every evaluation call.
    accuracy = float(np.mean(predicted_ids == np.asarray(labels)))
    return {'accuracy': accuracy}

import transformers

# Status banner printed before cross-validation with the stock Trainer starts.
print(
    "Simple trainer setup completed!",
    "Starting 5-fold cross-validation...",
    "This may take 15-30 minutes - grab a coffee! ☕",
    sep="\n",
)

# Record the installed library version next to the run output.
print(f"Transformers version: {transformers.__version__}")

# --- Stratified k-fold cross-validation with the stock Trainer ---
N_SPLITS = 5
# Per-device batch sizes tried in order; the smaller one is the fallback
# used after a CUDA out-of-memory error.
FOLD_BATCH_SIZES = (16, 8)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(train_df), n_labels))  # out-of-fold model outputs, one row per training tweet
fold_scores = []  # validation accuracy per fold; 0.0 marks a fold that failed


def _free_accelerator_memory():
    """Run the garbage collector and, when CUDA is available, release cached GPU memory."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _build_fold_trainer(fold, train_ds, val_ds, batch_size):
    """Create a fresh BERTweet classifier and a `Trainer` for one fold.

    Args:
        fold: Zero-based fold index (only used to name the output directory).
        train_ds: Tokenized training split of this fold.
        val_ds: Tokenized validation split of this fold.
        batch_size: Per-device batch size for training and evaluation.

    Returns:
        Trainer: Ready-to-train trainer that owns the newly loaded model.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        "vinai/bertweet-base",
        num_labels=n_labels,
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )
    args = TrainingArguments(
        output_dir=f"out_fold{fold}",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        seed=SEED,
        logging_steps=100,
        # The string "none" disables experiment-tracker integrations;
        # passing None falls back to reporting to all installed ones.
        report_to="none",
        save_strategy="no",
    )
    # The tokenizer is not handed to the Trainer: padding is done by the
    # explicit data_collator and nothing is saved (save_strategy="no"), and
    # the `tokenizer=` keyword is deprecated in recent transformers releases.
    # No class weights are used with the standard Trainer.
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )


for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
    print(f"\n### Fold {fold+1}/{N_SPLITS}")

    train_ds = full_dataset.select(train_idx)
    val_ds = full_dataset.select(val_idx)

    for attempt, batch_size in enumerate(FOLD_BATCH_SIZES):
        can_retry = attempt + 1 < len(FOLD_BATCH_SIZES)
        trainer = _build_fold_trainer(fold, train_ds, val_ds, batch_size)
        try:
            trainer.train()
            # One pass over the validation split yields both the out-of-fold
            # predictions and the accuracy (reported as 'eval_accuracy').
            output = trainer.predict(val_ds, metric_key_prefix="eval")
        except RuntimeError as e:
            if "out of memory" not in str(e):
                # Best effort: record a dummy score and move on to the next fold.
                print(f"Error in fold {fold+1}: {e}")
                fold_scores.append(0.0)
                break
            if not can_retry:
                # Already at the smallest batch size - nothing left to try.
                raise
            print(f"Out of memory error in fold {fold+1}. Trying with smaller batch size...")
            # The retry happens in the next loop iteration, i.e. after this
            # handler has finished and the traceback no longer pins tensors.
            continue
        finally:
            # Always drop the model/trainer, including after a failure.
            del trainer
            _free_accelerator_memory()

        fold_accuracy = output.metrics['eval_accuracy']
        oof_preds[val_idx] = output.predictions
        fold_scores.append(fold_accuracy)
        suffix = " (reduced batch)" if attempt else ""
        print(f"Fold {fold+1} accuracy{suffix}: {fold_accuracy:.4f}")
        break

# Summarize only the folds with a positive score (a fold that errored out
# was recorded as 0.0 and is excluded from the report and the average).
valid_scores = [score for score in fold_scores if score > 0]

print(f"\n🎉 Cross-validation completed!")
print(f"Fold scores: {[f'{score:.4f}' for score in valid_scores]}")

if valid_scores:
    print(f"Average CV accuracy: {np.mean(valid_scores):.4f}")
else:
    print("⚠️ No successful folds completed. Check your setup.")

Simple trainer setup completed!
Starting 5-fold cross-validation...
This may take 15-30 minutes - grab a coffee! ☕
Transformers version: 4.53.3

### Fold 1/5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
