### Training DistlBERT model for LLM augmented features

Firstly, we will prepare the 2 LLM-augmented feature columns by fine-tuning the DistilBERT model. This process will be greatly sped up by leveraging a GPU for the training phase.  
If you have a GPU but torch is not detecting it, you can try running the below lines of code in terminal to try fix it.

1. Check if NVIDIA drivers are installed by running `nvidia-simi` and noting the version number e.g. 12.1, 11.8
    - If they are not installed, install them from the NVIDIA Website
2. Run `pip uninstall torch`
3. Reinstall torch
    - For version number 12.1, run `pip install torch --index-url https://download.pytorch.org/whl/cu121`
    - For version 11.8, run `pip install torch --index-url https://download.pytorch.org/whl/cu118`


In [1]:
import torch

if torch.cuda.is_available():
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available, running on CPU")

# Import libraries
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
import json

CUDA available: True
GPU: NVIDIA GeForce RTX 3060 Ti


We will being by training the model to produce probabilities that the user is bot based on  `description` feature only first.  
  
This code assumes that you have a folder in the current directory named 'data' and contains 'train.csv' for the training dataset and 'test.csv' as the pre-split test dataset.  
  
Please change `train_data_path` and `test_data_path` if this is not the case.

In [2]:
def load_extract_data():
    """Load train and test data"""
    train_data_path = "data/train.csv"
    test_data_path = "data/test.csv"

    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)

    # Extract out description and target from train, description from test
    train_texts = train_df['description'].fillna("").astype(str).tolist()
    train_labels = train_df['target'].tolist()
    test_texts = test_df['description'].fillna("").astype(str).tolist()

    print(f"Train dataset size: {len(train_df)} samples")
    print(f"Test dataset size: {len(test_df)} samples")

    return train_texts, train_labels, test_texts

# Load the preprocessed data
train_texts, train_labels, test_texts = load_extract_data()

# Model Setup for Binary Classification

model_name = "distilbert-base-uncased" 

# Load tokenizer (shared across all folds)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Model configured: {model_name}")
print("Tokenizer loaded successfully!")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        max_length=128  # Reasonable for Twitter descriptions
    )

def create_model():
    """Create a fresh model for each fold"""
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,  # Binary classification
        id2label={0: "human", 1: "bot"},
        label2id={"human": 0, "bot": 1}
    )

print("Model and tokenizer configured successfully!")

Train dataset size: 26206 samples
Test dataset size: 11232 samples
Model configured: distilbert-base-uncased
Tokenizer loaded successfully!
Model and tokenizer configured successfully!


In [3]:
# 5-Fold Cross-Validation Training with Immediate Test Predictions
# Train 5 models using cross-validation, generate bot_prob_from_desc feature

from sklearn.metrics import roc_auc_score

# Define evaluation metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(predictions), dim=-1)
    bot_probs = probs[:, 1].numpy()
    pred_labels = np.argmax(predictions, axis=1)

    return {
        'accuracy': accuracy_score(labels, pred_labels),
        'f1': f1_score(labels, pred_labels, average='weighted'),
        'auc': roc_auc_score(labels, bot_probs)
    }

# Training arguments optimized for Colab
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,                    # Reduced for faster CV
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=1000,                    # Reduced logging frequency
    eval_strategy="no",                    # Skip evaluation during training for speed
    save_strategy="no",
    fp16=True,
    dataloader_pin_memory=False,
    seed=42,
    report_to="none"
)

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Setup 5-fold cross-validation
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Convert to numpy arrays for easier indexing
train_texts_array = np.array(train_texts)
train_labels_array = np.array(train_labels)

# Store predictions for each fold
fold_predictions = np.zeros(len(train_texts))
test_predictions = []  # Store test predictions from each fold

# Create test dataset once
test_dataset = Dataset.from_dict({'text': test_texts, 'labels': [0] * len(test_texts)})  # Dummy labels
test_tokenized = test_dataset.map(tokenize_function, batched=True)

print("Starting 5-fold cross-validation training...")
print("=" * 60)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_texts_array, train_labels_array)):
    print(f"\nFold {fold + 1}/{n_folds}")
    print("-" * 30)

    # Split data for this fold
    fold_train_texts = train_texts_array[train_idx].tolist()
    fold_train_labels = train_labels_array[train_idx].tolist()
    fold_val_texts = train_texts_array[val_idx].tolist()
    fold_val_labels = train_labels_array[val_idx].tolist()

    # Create datasets for this fold
    fold_train_dataset = Dataset.from_dict({'text': fold_train_texts, 'labels': fold_train_labels})
    fold_val_dataset = Dataset.from_dict({'text': fold_val_texts, 'labels': fold_val_labels})

    # Tokenize datasets
    fold_train_tokenized = fold_train_dataset.map(tokenize_function, batched=True)
    fold_val_tokenized = fold_val_dataset.map(tokenize_function, batched=True)

    # Create fresh model for this fold
    model = create_model()

    # Create trainer for this fold
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=fold_train_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    print(f"Training fold {fold + 1}...")
    trainer.train()

    # Get predictions on validation set
    val_predictions = trainer.predict(fold_val_tokenized)
    val_probs = torch.nn.functional.softmax(torch.tensor(val_predictions.predictions), dim=-1)
    val_bot_probs = val_probs[:, 1].numpy()  # Bot probabilities

    # Store validation predictions in the correct positions
    fold_predictions[val_idx] = val_bot_probs

    # Get predictions on test set immediately
    print(f"Generating test predictions with fold {fold + 1} model...")
    test_pred = trainer.predict(test_tokenized)
    test_probs = torch.nn.functional.softmax(torch.tensor(test_pred.predictions), dim=-1)
    test_bot_probs = test_probs[:, 1].numpy()
    test_predictions.append(test_bot_probs)

    # Calculate fold performance
    val_pred_labels = np.argmax(val_predictions.predictions, axis=1)
    fold_accuracy = accuracy_score(fold_val_labels, val_pred_labels)
    fold_auc = roc_auc_score(fold_val_labels, val_bot_probs)
    print(f"Fold {fold + 1} - Accuracy: {fold_accuracy:.4f}, AUC: {fold_auc:.4f}")

    # Clear GPU memory - now safe to delete
    del model, trainer
    torch.cuda.empty_cache()

print("\n" + "=" * 60)
print("Cross-validation training completed!")
overall_auc = roc_auc_score(train_labels, fold_predictions)
overall_accuracy = accuracy_score(train_labels, (fold_predictions > 0.5).astype(int))
print(f"Overall CV - Accuracy: {overall_accuracy:.4f}, AUC: {overall_auc:.4f}")

Map:   0%|          | 0/11232 [00:00<?, ? examples/s]

Starting 5-fold cross-validation training...

Fold 1/5
------------------------------


Map:   0%|          | 0/20964 [00:00<?, ? examples/s]

Map:   0%|          | 0/5242 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 1...


Step,Training Loss
1000,0.5383
2000,0.4868
3000,0.4683
4000,0.4342
5000,0.4358
6000,0.373
7000,0.3636


Generating test predictions with fold 1 model...




Fold 1 - Accuracy: 0.7667, AUC: 0.8160

Fold 2/5
------------------------------


Map:   0%|          | 0/20965 [00:00<?, ? examples/s]

Map:   0%|          | 0/5241 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 2...


Step,Training Loss
1000,0.5286
2000,0.5039
3000,0.4753
4000,0.4432
5000,0.4237
6000,0.372
7000,0.3502


Generating test predictions with fold 2 model...




Fold 2 - Accuracy: 0.7689, AUC: 0.8199

Fold 3/5
------------------------------


Map:   0%|          | 0/20965 [00:00<?, ? examples/s]

Map:   0%|          | 0/5241 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 3...


Step,Training Loss
1000,0.5302
2000,0.4982
3000,0.4726
4000,0.4362
5000,0.4291
6000,0.3754
7000,0.3546


Generating test predictions with fold 3 model...




Fold 3 - Accuracy: 0.7596, AUC: 0.8113

Fold 4/5
------------------------------


Map:   0%|          | 0/20965 [00:00<?, ? examples/s]

Map:   0%|          | 0/5241 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 4...


Step,Training Loss
1000,0.5316
2000,0.4911
3000,0.4702
4000,0.4382
5000,0.4397
6000,0.3701
7000,0.3696


Generating test predictions with fold 4 model...




Fold 4 - Accuracy: 0.7651, AUC: 0.8125

Fold 5/5
------------------------------


Map:   0%|          | 0/20965 [00:00<?, ? examples/s]

Map:   0%|          | 0/5241 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 5...


Step,Training Loss
1000,0.5291
2000,0.4974
3000,0.4701
4000,0.4359
5000,0.4381
6000,0.3721
7000,0.3688


Generating test predictions with fold 5 model...




Fold 5 - Accuracy: 0.7478, AUC: 0.8025

Cross-validation training completed!
Overall CV - Accuracy: 0.7616, AUC: 0.8115


The single column of probabilities are stored as a csv file in `train_output_path` and `test_output_path`.  
  
Edit these accordingly if they are to be saved somewhere else. Changing this location has downstream implications as it these locations will be referenced during the execution of the main pipeline.

In [4]:
# Average test predictions across all folds
ensemble_test_predictions = np.mean(test_predictions, axis=0)

# Save bot_prob_from_desc directly as CSV
train_bot_prob_df = pd.DataFrame({'bot_prob_from_desc': fold_predictions})
test_bot_prob_df = pd.DataFrame({'bot_prob_from_desc': ensemble_test_predictions})

train_output_path = "data/train_bot_prob_from_desc.csv"
test_output_path = "data/test_bot_prob_from_desc.csv"

train_bot_prob_df.to_csv(train_output_path, index=False)
test_bot_prob_df.to_csv(test_output_path, index=False)

print(f"\nTraining bot_prob_from_desc saved to: {train_output_path}")
print(f"Test bot_prob_from_desc saved to: {test_output_path}")


Training bot_prob_from_desc saved to: data/train_bot_prob_from_desc.csv
Test bot_prob_from_desc saved to: data/test_bot_prob_from_desc.csv


We will proceed to train the other feature column - bot_prob_from_cat, which produces a single column reflecting probabilities predicted by the trained model that the user is a bot based on all the categorical variables.

In [5]:
def load_extract_data_cat():
    """Load train and test data"""
    train_data_path = "data/train.csv"
    test_data_path = "data/test.csv"

    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)

    # Extract features as JSON with description first
    train_texts = []
    for _, row in train_df.iterrows():
        data = {
            "description": row['description'] if pd.notna(row['description']) else "",
            "lang": row['lang'] if pd.notna(row['lang']) else "",
            "location": row['location'] if pd.notna(row['location']) else "",
            "screen_name": row['screen_name'] if pd.notna(row['screen_name']) else ""
        }
        train_texts.append(json.dumps(data))

    train_labels = train_df['target'].tolist()

    test_texts = []
    for _, row in test_df.iterrows():
        data = {
            "description": row['description'] if pd.notna(row['description']) else "",
            "lang": row['lang'] if pd.notna(row['lang']) else "",
            "location": row['location'] if pd.notna(row['location']) else "",
            "screen_name": row['screen_name'] if pd.notna(row['screen_name']) else ""
        }
        test_texts.append(json.dumps(data))

    print(f"Train dataset size: {len(train_df)} samples")
    print(f"Test dataset size: {len(test_df)} samples")

    return train_texts, train_labels, test_texts, train_df, test_df

# Load the preprocessed data
train_texts, train_labels, test_texts, train_df, test_df = load_extract_data_cat()

Train dataset size: 26206 samples
Test dataset size: 11232 samples


In [6]:
# 5-Fold Cross-Validation Training with Immediate Test Predictions
# Train 5 models using cross-validation, generate bot_prob_from_desc feature

from sklearn.metrics import roc_auc_score

# Define evaluation metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(predictions), dim=-1)
    bot_probs = probs[:, 1].numpy()
    pred_labels = np.argmax(predictions, axis=1)

    return {
        'accuracy': accuracy_score(labels, pred_labels),
        'f1': f1_score(labels, pred_labels, average='weighted'),
        'auc': roc_auc_score(labels, bot_probs)
    }

# Training arguments optimized for Colab
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,                    # Reduced for faster CV
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=1000,                    # Reduced logging frequency
    eval_strategy="no",                    # Skip evaluation during training for speed
    save_strategy="no",
    fp16=True,
    dataloader_pin_memory=False,
    seed=42,
    report_to="none"
)

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Setup 5-fold cross-validation
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Convert to numpy arrays for easier indexing
train_texts_array = np.array(train_texts)
train_labels_array = np.array(train_labels)

# Store predictions for each fold
fold_predictions_cat = np.zeros(len(train_texts))
test_predictions_cat = []  # Store test predictions from each fold

# Create test dataset once
test_dataset = Dataset.from_dict({'text': test_texts, 'labels': [0] * len(test_texts)})  # Dummy labels
test_tokenized = test_dataset.map(tokenize_function, batched=True)

print("Starting 5-fold cross-validation training...")
print("=" * 60)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_texts_array, train_labels_array)):
    print(f"\nFold {fold + 1}/{n_folds}")
    print("-" * 30)

    # Split data for this fold
    fold_train_texts = train_texts_array[train_idx].tolist()
    fold_train_labels = train_labels_array[train_idx].tolist()
    fold_val_texts = train_texts_array[val_idx].tolist()
    fold_val_labels = train_labels_array[val_idx].tolist()

    # Create datasets for this fold
    fold_train_dataset = Dataset.from_dict({'text': fold_train_texts, 'labels': fold_train_labels})
    fold_val_dataset = Dataset.from_dict({'text': fold_val_texts, 'labels': fold_val_labels})

    # Tokenize datasets
    fold_train_tokenized = fold_train_dataset.map(tokenize_function, batched=True)
    fold_val_tokenized = fold_val_dataset.map(tokenize_function, batched=True)

    # Create fresh model for this fold
    model = create_model()

    # Create trainer for this fold
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=fold_train_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    print(f"Training fold {fold + 1}...")
    trainer.train()

    # Get predictions on validation set
    val_predictions = trainer.predict(fold_val_tokenized)
    val_probs = torch.nn.functional.softmax(torch.tensor(val_predictions.predictions), dim=-1)
    val_bot_probs = val_probs[:, 1].numpy()  # Bot probabilities

    # Store validation predictions in the correct positions
    fold_predictions_cat[val_idx] = val_bot_probs

    # Get predictions on test set immediately
    print(f"Generating test predictions with fold {fold + 1} model...")
    test_pred = trainer.predict(test_tokenized)
    test_probs = torch.nn.functional.softmax(torch.tensor(test_pred.predictions), dim=-1)
    test_bot_probs = test_probs[:, 1].numpy()
    test_predictions_cat.append(test_bot_probs)

    # Calculate fold performance
    val_pred_labels = np.argmax(val_predictions.predictions, axis=1)
    fold_accuracy = accuracy_score(fold_val_labels, val_pred_labels)
    fold_auc = roc_auc_score(fold_val_labels, val_bot_probs)
    print(f"Fold {fold + 1} - Accuracy: {fold_accuracy:.4f}, AUC: {fold_auc:.4f}")

    # Clear GPU memory - now safe to delete
    del model, trainer
    torch.cuda.empty_cache()

print("\n" + "=" * 60)
print("Cross-validation training completed!")
overall_auc = roc_auc_score(train_labels, fold_predictions_cat)
overall_accuracy = accuracy_score(train_labels, (fold_predictions_cat > 0.5).astype(int))
print(f"Overall CV - Accuracy: {overall_accuracy:.4f}, AUC: {overall_auc:.4f}")

Map:   0%|          | 0/11232 [00:00<?, ? examples/s]

Starting 5-fold cross-validation training...

Fold 1/5
------------------------------


Map:   0%|          | 0/20964 [00:00<?, ? examples/s]

Map:   0%|          | 0/5242 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 1...


Step,Training Loss
1000,0.5396
2000,0.4794
3000,0.4562
4000,0.4217
5000,0.4159
6000,0.3435
7000,0.3389


Generating test predictions with fold 1 model...




Fold 1 - Accuracy: 0.7751, AUC: 0.8272

Fold 2/5
------------------------------


Map:   0%|          | 0/20965 [00:00<?, ? examples/s]

Map:   0%|          | 0/5241 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 2...


Step,Training Loss
1000,0.5439
2000,0.4947
3000,0.4681
4000,0.4313
5000,0.4111
6000,0.3655
7000,0.3386


Generating test predictions with fold 2 model...




Fold 2 - Accuracy: 0.7783, AUC: 0.8372

Fold 3/5
------------------------------


Map:   0%|          | 0/20965 [00:00<?, ? examples/s]

Map:   0%|          | 0/5241 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 3...


Step,Training Loss
1000,0.542
2000,0.4824
3000,0.459
4000,0.4144
5000,0.407
6000,0.3503
7000,0.3253


Generating test predictions with fold 3 model...




Fold 3 - Accuracy: 0.7598, AUC: 0.8168

Fold 4/5
------------------------------


Map:   0%|          | 0/20965 [00:00<?, ? examples/s]

Map:   0%|          | 0/5241 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 4...


Step,Training Loss
1000,0.5444
2000,0.4791
3000,0.4577
4000,0.4219
5000,0.4212
6000,0.348
7000,0.3462


Generating test predictions with fold 4 model...




Fold 4 - Accuracy: 0.7686, AUC: 0.8231

Fold 5/5
------------------------------


Map:   0%|          | 0/20965 [00:00<?, ? examples/s]

Map:   0%|          | 0/5241 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Training fold 5...


Step,Training Loss
1000,0.5352
2000,0.485
3000,0.4561
4000,0.4174
5000,0.4183
6000,0.3537
7000,0.3394


Generating test predictions with fold 5 model...




Fold 5 - Accuracy: 0.7605, AUC: 0.8152

Cross-validation training completed!
Overall CV - Accuracy: 0.7684, AUC: 0.8231


Similarly, the columns will be saved as csv files.

In [7]:
# Average test predictions across all folds
ensemble_test_predictions_cat = np.mean(test_predictions_cat, axis=0)

# Save bot_prob_from_desc directly as CSV
train_bot_prob_df = pd.DataFrame({'bot_prob_from_cat': fold_predictions_cat})
test_bot_prob_df = pd.DataFrame({'bot_prob_from_cat': ensemble_test_predictions_cat})

train_output_path = "data/train_bot_prob_from_cat.csv"
test_output_path = "data/test_bot_prob_from_cat.csv"

train_bot_prob_df.to_csv(train_output_path, index=False)
test_bot_prob_df.to_csv(test_output_path, index=False)

print(f"\nTraining bot_prob_from_desc saved to: {train_output_path}")
print(f"Test bot_prob_from_desc saved to: {test_output_path}")


Training bot_prob_from_desc saved to: data/train_bot_prob_from_cat.csv
Test bot_prob_from_desc saved to: data/test_bot_prob_from_cat.csv


## Main Pipeline

Edit `train_path` and `test_path` if the train and test datasets are not in the specified paths.

In [15]:
import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import re
import pandas as pd

train_path = 'data/train.csv'
test_path = 'data/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

### Data preprocessing and Feature engineering  
In this section we conduct data preprocessing and feature engineering. The LLM augmented features from earlier is appended to the dataframe at the end. Please update the `read_csv` paths if the csv files where saved to different locations paths than the default.

In [16]:
# log transform some features
log_features = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count','average_tweets_per_day']
for feature in log_features:
    train_df[f'log_{feature}'] = np.log1p(train_df[feature])
    test_df[f'log_{feature}'] = np.log1p(test_df[feature])

# add binary indicator if there is description
train_df[f'description_ind'] = (~train_df['description'].isna()).astype(int)
test_df[f'description_ind'] = (~test_df['description'].isna()).astype(int)

# add description length feature
train_df['desc_length'] = train_df['description'].fillna('').str.len()
test_df['desc_length'] = test_df['description'].fillna('').str.len()

# convert bool features to binary
bool_cols = ['default_profile', 'default_profile_image', 'geo_enabled', 'verified']
for col in bool_cols:
    train_df[f'{col}_bin'] = train_df[col].astype(int)
    test_df[f'{col}_bin'] = test_df[col].astype(int)

# Create separate dataframes for CatBoost (before dropping categorical features)
train_df_cat = train_df.copy()
test_df_cat = test_df.copy()

# Fill missing values for categorical features in CatBoost dataframes
train_df_cat['lang'] = train_df_cat['lang'].fillna("unknown")
test_df_cat['lang'] = test_df_cat['lang'].fillna("unknown")
train_df_cat['location'] = train_df_cat['location'].fillna("unknown")
test_df_cat['location'] = test_df_cat['location'].fillna("unknown")
train_df_cat['description'] = train_df_cat['description'].fillna("unknown")
test_df_cat['description'] = test_df_cat['description'].fillna("unknown")

# Drop only non-categorical features for CatBoost
cat_features_to_drop = ['created_at', 'profile_background_image_url', 'profile_image_url', 'screen_name', 'id']
train_df_cat = train_df_cat.drop(cat_features_to_drop, axis=1)
test_df_cat = test_df_cat.drop(cat_features_to_drop, axis=1)

# encode categorical features with OneHotEncoder for XGBoost/LightGBM
cat_cols = ['lang', 'location']
for col in cat_cols:
    train_df[col] = train_df[col].fillna("unknown")
    test_df[col] = test_df[col].fillna("unknown")

    # Get top 30 most frequent categories
    value_counts = train_df[col].value_counts()
    top_categories = value_counts.head(30).index.tolist()

    # Group less frequent categories as 'other'
    train_col = train_df[col].apply(lambda x: x if x in top_categories else 'other')
    test_col = test_df[col].apply(lambda x: x if x in top_categories else 'other')

    # Apply one-hot encoding
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    train_encoded = encoder.fit_transform(train_col.values.reshape(-1, 1))
    test_encoded = encoder.transform(test_col.values.reshape(-1, 1))

    # Create column names and DataFrames with sanitized names
    def sanitize_feature_name(name):
        # Replace special characters with underscore
        name = re.sub(r'[^\w\s]', '_', str(name))
        # Replace spaces with underscore
        name = re.sub(r'\s+', '_', name)
        # Remove consecutive underscores
        name = re.sub(r'_+', '_', name)
        # Remove leading/trailing underscores
        name = name.strip('_')
        return name

    feature_names = [f'{col}_{sanitize_feature_name(category)}' for category in encoder.categories_[0]]
    train_encoded_df = pd.DataFrame(train_encoded, columns=feature_names, index=train_df.index)
    test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names, index=test_df.index)

    # Concatenate with original DataFrames
    train_df = pd.concat([train_df, train_encoded_df], axis=1)
    test_df = pd.concat([test_df, test_encoded_df], axis=1)

# drop features
# drop id to reduce noise
features_to_drop = ['created_at', 'description', 'profile_background_image_url',
                    'profile_image_url', 'screen_name','default_profile', 'default_profile_image', 'geo_enabled', 'verified',
                    'lang', 'location', 'id']
train_df = train_df.drop(features_to_drop, axis=1)
test_df = test_df.drop(features_to_drop, axis=1)

# append the llm augmented features from earlier
# prob from all cat features
train_df['bot_prob_from_cat'] = pd.read_csv('data/train_bot_prob_from_cat.csv')['bot_prob_from_cat']
test_df['bot_prob_from_cat'] = pd.read_csv('data/test_bot_prob_from_cat.csv')['bot_prob_from_cat']
train_df_cat['bot_prob_from_cat'] = pd.read_csv('data/train_bot_prob_from_cat.csv')['bot_prob_from_cat']
test_df_cat['bot_prob_from_cat'] = pd.read_csv('data/train_bot_prob_from_cat.csv')['bot_prob_from_cat']

# add desc feature from other llm output
train_df['bot_prob_from_desc'] = pd.read_csv('data/train_bot_prob_from_desc.csv')['bot_prob_from_desc']
test_df['bot_prob_from_desc'] = pd.read_csv('data/test_bot_prob_from_desc.csv')['bot_prob_from_desc']
train_df_cat['bot_prob_from_desc'] = pd.read_csv('data/train_bot_prob_from_desc.csv')['bot_prob_from_desc']
test_df_cat['bot_prob_from_desc'] = pd.read_csv('data/test_bot_prob_from_desc.csv')['bot_prob_from_desc']

In [17]:
for df in [train_df, test_df, train_df_cat, test_df_cat]:
    # Network interaction feature
    df['network_log'] = df['log_friends_count'] * df['log_followers_count']

    # Average metrics per tweet (using log-transformed features, avoid division by zero)
    df['avg_faves_per_tweets'] = df['log_favourites_count'] / (df['log_statuses_count'] + 1e-8)
    df['avg_followers_per_tweet'] = df['log_followers_count'] / (df['log_statuses_count'] + 1e-8)
    df['avg_friends_per_tweet'] = df['log_friends_count'] / (df['log_statuses_count'] + 1e-8)

    # Daily acquisition rates (log-transformed, avoid division by zero)
    df['follower_acq_rate'] = np.log1p(df['followers_count'] / (df['account_age_days'] + 1))
    df['friends_acq_rate'] = np.log1p(df['friends_count'] / (df['account_age_days'] + 1))
    df['favs_rate'] = np.log1p(df['favourites_count'] / (df['account_age_days'] + 1))

    # Behavioral ratios (bots often have unusual patterns)
    df['followers_friends_ratio'] = df['followers_count'] / (df['friends_count'] + 1)
    df['tweets_per_follower'] = df['statuses_count'] / (df['followers_count'] + 1)
    df['favorites_per_tweet'] = df['favourites_count'] / (df['statuses_count'] + 1)

    # Activity intensity
    df['total_activity'] = df['statuses_count'] + df['favourites_count']
    df['activity_per_day'] = df['total_activity'] / (df['account_age_days'] + 1)

# Default profile indicators (only for train_df and test_df that have _bin columns)
for df in [train_df, test_df]:
    df['has_defaults'] = (df['default_profile_bin'] + df['default_profile_image_bin'])
    df['profile_completeness'] = df['description_ind'] + df['geo_enabled_bin'] + df['verified_bin']

# For CatBoost dataframes (they have original boolean columns)
for df in [train_df_cat, test_df_cat]:
    df['has_defaults'] = (df['default_profile'].astype(int) + df['default_profile_image'].astype(int))
    df['profile_completeness'] = df['description_ind'] + df['geo_enabled'].astype(int) + df['verified'].astype(int)

### Training and Evaluation
In this section we conduct training and evaluation.   
  
This pipeline uses the full training data. To see the version of the pipeline where the training data is further split into a validation set for fast iterative testing, view the last 2 cells.

In [18]:
# Prepare data for XGBoost and LightGBM
feature_cols = [col for col in train_df.columns if col != 'target']
X = train_df[feature_cols]
y = train_df['target']

print(f"Using {len(feature_cols)} features for XGBoost/LightGBM")

# Prepare data for CatBoost with categorical features
cat_feature_cols = [col for col in train_df_cat.columns if col != 'target']
X_cat = train_df_cat[cat_feature_cols]
y_cat = train_df_cat['target']

# Identify categorical features for CatBoost
cat_features = ['default_profile', 'default_profile_image', 'geo_enabled', 'verified', 'lang', 'location', 'description']
cat_feature_indices = [X_cat.columns.get_loc(col) for col in cat_features if col in X_cat.columns]

print(f"Using {len(cat_feature_cols)} features for CatBoost (including {len(cat_feature_indices)} categorical features)")

# Train XGBoost with hyperparameter tuning
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.2],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 10]
}

xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
xgb_grid_search.fit(X, y)

best_xgb = xgb_grid_search.best_estimator_
print(f"Best XGBoost params: {xgb_grid_search.best_params_}")
print(f"Best XGBoost CV AUC: {xgb_grid_search.best_score_:.4f}")

# Train CatBoost with hyperparameter tuning
cat_param_grid = {
    'iterations': [100, 200],
    'depth': [4, 6],
    'learning_rate': [0.1, 0.2],
    'l2_leaf_reg': [3, 5]
}

cat_model = CatBoostClassifier(random_seed=42, verbose=False, cat_features=cat_feature_indices)
cat_grid_search = GridSearchCV(cat_model, cat_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
cat_grid_search.fit(X_cat, y_cat)

best_cat = cat_grid_search.best_estimator_
print(f"Best CatBoost params: {cat_grid_search.best_params_}")
print(f"Best CatBoost CV AUC: {cat_grid_search.best_score_:.4f}")

# Train LightGBM
lgbm_model = LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42, verbose=-1)
lgbm_model.fit(X, y)

# Store models and CV scores
ensemble_models = {
    'xgboost': best_xgb,
    'catboost': best_cat,
    'lightgbm': lgbm_model
}

cv_auc = xgb_grid_search.best_score_
print(f"\nCross-Validation AUC: {cv_auc:.4f}")

Using 95 features for XGBoost/LightGBM
Using 40 features for CatBoost (including 7 categorical features)
Best XGBoost params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 10}
Best XGBoost CV AUC: 0.9480
Best CatBoost params: {'depth': 6, 'iterations': 200, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
Best CatBoost CV AUC: 0.9472

Cross-Validation AUC: 0.9480


### Submission generation
The final predictions are generated and saved to 'data' folder as a csv file with the naming convention: `submission-{cv_auc:.5f}.csv`. Please edit the save path accordingly if needed.

In [19]:
X_test = test_df[feature_cols]
X_test_cat = test_df_cat[cat_feature_cols]

# Get predictions from each model
predictions_xgb = ensemble_models['xgboost'].predict_proba(X_test)[:, 1]
predictions_cat = ensemble_models['catboost'].predict_proba(X_test_cat)[:, 1]
predictions_lgbm = ensemble_models['lightgbm'].predict_proba(X_test)[:, 1]

# Ensemble predictions with weighted average
predictions = (2 * predictions_xgb + 1 * predictions_cat + 1 * predictions_lgbm) / 4

submission = pd.DataFrame({
    'index': range(len(predictions)),
    'target': predictions
})
output_path = f"data/submission-{cv_auc:.5f}.csv"
submission.to_csv(output_path, index=False)
print(f"Submission saved to {output_path}")

Submission saved to data/submission-0.94801.csv


# DO NOT RUN
The below cell is to illustrate the pipeline during the iterative testing phase, where the training data is split into train and validation for faster iterative testing, as well as to ensure that the final models are not overfitted.

In [20]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [21]:
# Prepare data for XGBoost and LightGBM
feature_cols = [col for col in train_df.columns if col != 'target']
X = train_df[feature_cols]
y = train_df['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Using {len(feature_cols)} features for XGBoost/LightGBM")

# Prepare data for CatBoost with categorical features
cat_feature_cols = [col for col in train_df_cat.columns if col != 'target']
X_cat = train_df_cat[cat_feature_cols]
y_cat = train_df_cat['target']

X_cat_train, X_cat_val, y_cat_train, y_cat_val = train_test_split(X_cat, y_cat, test_size=0.2, random_state=42, stratify=y_cat)

# Identify categorical features for CatBoost
cat_features = ['default_profile', 'default_profile_image', 'geo_enabled', 'verified', 'lang', 'location', 'description']
cat_feature_indices = [X_cat.columns.get_loc(col) for col in cat_features if col in X_cat.columns]

print(f"Using {len(cat_feature_cols)} features for CatBoost (including {len(cat_feature_indices)} categorical features)")

# Train XGBoost with hyperparameter tuning

xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.2],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 10]
}

xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)

best_xgb = xgb_grid_search.best_estimator_
print(f"Best XGBoost params: {xgb_grid_search.best_params_}")
print(f"Best XGBoost CV AUC: {xgb_grid_search.best_score_:.4f}")

# Train CatBoost with hyperparameter tuning

cat_param_grid = {
    'iterations': [100, 200],
    'depth': [4, 6],
    'learning_rate': [0.1, 0.2],
    'l2_leaf_reg': [3, 5]
}

cat_model = CatBoostClassifier(random_seed=42, verbose=False, cat_features=cat_feature_indices)
cat_grid_search = GridSearchCV(cat_model, cat_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
cat_grid_search.fit(X_cat_train, y_cat_train)

best_cat = cat_grid_search.best_estimator_
print(f"Best CatBoost params: {cat_grid_search.best_params_}")
print(f"Best CatBoost CV AUC: {cat_grid_search.best_score_:.4f}")

# Train LightGBM

lgbm_model = LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42, verbose=-1)
lgbm_model.fit(X_train, y_train)

# Evaluate individual models
print("\nModel Performance Comparison:")
print("=" * 50)

# XGBoost predictions
y_val_proba_xgb = best_xgb.predict_proba(X_val)[:, 1]
auc_xgb = roc_auc_score(y_val, y_val_proba_xgb)
print(f"XGBoost Val AUC: {auc_xgb:.4f}")

# CatBoost predictions (using categorical data)
y_cat_val_proba_cat = best_cat.predict_proba(X_cat_val)[:, 1]
auc_cat = roc_auc_score(y_cat_val, y_cat_val_proba_cat)
print(f"CatBoost Val AUC: {auc_cat:.4f}")

# LightGBM predictions
y_val_proba_lgbm = lgbm_model.predict_proba(X_val)[:, 1]
auc_lgbm = roc_auc_score(y_val, y_val_proba_lgbm)
print(f"LightGBM Val AUC: {auc_lgbm:.4f}")

# Manual ensemble predictions (weighted average)
y_train_proba_xgb = best_xgb.predict_proba(X_train)[:, 1]
y_train_proba_cat = best_cat.predict_proba(X_cat_train)[:, 1]
y_train_proba_lgbm = lgbm_model.predict_proba(X_train)[:, 1]
# higher weight to XGBoost
y_train_proba = (2 * y_train_proba_xgb + 1 * y_train_proba_cat + 1 * y_train_proba_lgbm) / 4
y_train_pred = (y_train_proba >= 0.5).astype(int)

y_val_proba = (2 * y_val_proba_xgb + 1 * y_cat_val_proba_cat + 1 * y_val_proba_lgbm) / 4
y_val_pred = (y_val_proba >= 0.5).astype(int)

train_metrics = {
    'auc': roc_auc_score(y_train, y_train_proba),
    'accuracy': accuracy_score(y_train, y_train_pred),
    'precision': precision_score(y_train, y_train_pred),
    'recall': recall_score(y_train, y_train_pred),
    'f1': f1_score(y_train, y_train_pred)
}

val_metrics = {
    'auc': roc_auc_score(y_val, y_val_proba),
    'accuracy': accuracy_score(y_val, y_val_pred),
    'precision': precision_score(y_val, y_val_pred),
    'recall': recall_score(y_val, y_val_pred),
    'f1': f1_score(y_val, y_val_pred)
}

print(f"Ensemble Val AUC: {val_metrics['auc']:.4f}")

# Store models and feature columns
ensemble_models = {
    'xgboost': best_xgb,
    'catboost': best_cat,
    'lightgbm': lgbm_model
}

Using 95 features for XGBoost/LightGBM
Using 40 features for CatBoost (including 7 categorical features)
Best XGBoost params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 10}
Best XGBoost CV AUC: 0.9457
Best CatBoost params: {'depth': 6, 'iterations': 200, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
Best CatBoost CV AUC: 0.9452

Model Performance Comparison:
XGBoost Val AUC: 0.9517
CatBoost Val AUC: 0.9508
LightGBM Val AUC: 0.9508
Ensemble Val AUC: 0.9528


In [22]:
print(f"Train - AUC: {train_metrics['auc']:.4f}, Acc: {train_metrics['accuracy']:.4f}, Prec: {train_metrics['precision']:.4f}, Rec: {train_metrics['recall']:.4f}, F1: {train_metrics['f1']:.4f}")
print(f"Val   - AUC: {val_metrics['auc']:.4f}, Acc: {val_metrics['accuracy']:.4f}, Prec: {val_metrics['precision']:.4f}, Rec: {val_metrics['recall']:.4f}, F1: {val_metrics['f1']:.4f}")

Train - AUC: 0.9789, Acc: 0.9287, Prec: 0.9348, Rec: 0.8464, F1: 0.8884
Val   - AUC: 0.9528, Acc: 0.8932, Prec: 0.8800, Rec: 0.7888, F1: 0.8319
