## 1. Setup and Imports

In [None]:
# Install required packages into the active kernel's environment (run once)
# %pip install pandas numpy scikit-learn catboost matplotlib seaborn imbalanced-learn

In [None]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Sklearn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)

# CatBoost
from catboost import CatBoostClassifier

# Settings
# Silence every warning category for the rest of the session. This is set
# first so it is already active while the remaining options are applied.
warnings.simplefilter('ignore')

# Random seed for reproducibility: seeds NumPy's global RNG and is passed to
# the estimators further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot theme, and no limit on the number of DataFrame columns displayed
plt.style.use('seaborn-v0_8-whitegrid')
pd.options.display.max_columns = None

print("Libraries imported successfully!")

## 2. Load and Explore Data

In [None]:
def create_synthetic_dataset(n_samples=5000):
    """
    Build a shuffled table of synthetic Twitter-style accounts.

    The table is meant to mimic the structure of the Cresci-2017 dataset so
    the rest of the notebook can run without the real data. ``n_samples // 2``
    rows are "genuine" accounts (label 0) and the remaining rows are "bot"
    accounts (label 1); the two groups are drawn from different
    distributions.

    The global NumPy RNG is re-seeded with ``RANDOM_STATE`` on every call, so
    repeated calls with the same ``n_samples`` return identical data.

    Args:
        n_samples: Total number of accounts to generate.

    Returns:
        pandas.DataFrame with one row per account and the columns ``id``,
        the five ``*_count`` columns, the four 0/1 flag columns, ``label``,
        ``category`` and ``created_at``, in shuffled row order with a fresh
        0..n-1 index.
    """
    np.random.seed(RANDOM_STATE)

    genuine_total = n_samples // 2
    bot_total = n_samples - genuine_total
    first_bot_group = bot_total // 2
    epoch = pd.Timestamp('2010-01-01')

    # NOTE: the generated values depend on the order of the np.random calls
    # below (they all share the global RNG), so that order must not change.

    # --- genuine accounts -------------------------------------------------
    genuine_cols = dict(
        id=range(1, genuine_total + 1),
        statuses_count=np.random.lognormal(mean=6, sigma=1.5, size=genuine_total).astype(int),
        followers_count=np.random.lognormal(mean=4, sigma=2, size=genuine_total).astype(int),
        friends_count=np.random.lognormal(mean=4, sigma=1.5, size=genuine_total).astype(int),
        favourites_count=np.random.lognormal(mean=5, sigma=2, size=genuine_total).astype(int),
        listed_count=np.random.lognormal(mean=1, sigma=1.5, size=genuine_total).astype(int),
        default_profile=np.random.choice([1, 0], size=genuine_total, p=[0.2, 0.8]),
        geo_enabled=np.random.choice([1, 0], size=genuine_total, p=[0.4, 0.6]),
        profile_use_background_image=np.random.choice([1, 0], size=genuine_total, p=[0.7, 0.3]),
        verified=np.random.choice([1, 0], size=genuine_total, p=[0.05, 0.95]),
        label=0,
        category='genuine',
    )

    # Creation dates spread uniformly over the 3000 days after the epoch
    genuine_created = []
    for _ in range(genuine_total):
        offset_days = np.random.randint(0, 3000)
        genuine_created.append(epoch + pd.Timedelta(days=offset_days))
    genuine_cols['created_at'] = genuine_created

    # --- bot accounts -----------------------------------------------------
    bot_cols = dict(
        id=range(genuine_total + 1, n_samples + 1),
        # Two sub-populations: very high and very low tweet volume
        statuses_count=np.concatenate([
            np.random.lognormal(mean=8, sigma=1, size=first_bot_group).astype(int),
            np.random.lognormal(mean=2, sigma=1, size=bot_total - first_bot_group).astype(int),
        ]),
        followers_count=np.random.lognormal(mean=2, sigma=2, size=bot_total).astype(int),
        friends_count=np.random.lognormal(mean=6, sigma=1, size=bot_total).astype(int),
        favourites_count=np.random.lognormal(mean=2, sigma=2, size=bot_total).astype(int),
        listed_count=np.random.lognormal(mean=0.5, sigma=1, size=bot_total).astype(int),
        default_profile=np.random.choice([1, 0], size=bot_total, p=[0.6, 0.4]),
        geo_enabled=np.random.choice([1, 0], size=bot_total, p=[0.1, 0.9]),
        profile_use_background_image=np.random.choice([1, 0], size=bot_total, p=[0.3, 0.7]),
        verified=np.random.choice([1, 0], size=bot_total, p=[0.001, 0.999]),
        label=1,
        category='bot',
    )

    # Creation dates clustered in four 30-day windows after the epoch
    bot_created = []
    for _ in range(bot_total):
        window_start = np.random.choice([100, 500, 1000, 1500])
        jitter = np.random.randint(0, 30)
        bot_created.append(epoch + pd.Timedelta(days=window_start + jitter))
    bot_cols['created_at'] = bot_created

    # Stack both groups, then shuffle rows deterministically
    accounts = pd.concat(
        [pd.DataFrame(genuine_cols), pd.DataFrame(bot_cols)],
        ignore_index=True,
    )
    shuffled = accounts.sample(frac=1, random_state=RANDOM_STATE)
    return shuffled.reset_index(drop=True)

# Generate the working dataset, then report its size and class balance
print("Creating dataset...")
df = create_synthetic_dataset(5000)
print(f"Dataset shape: {df.shape}")
print("\nLabel distribution:")
print(df['label'].value_counts())

In [None]:
# Preview the first ten rows; as the cell's last expression the slice is
# rendered as the cell output.
print("Sample data:")
df.iloc[:10]

In [None]:
# Dataset statistics
# Summary statistics for the dataset's columns. describe() is the cell's
# last expression, so its result is rendered as the cell output.
print("Dataset Statistics:")
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Class distribution: share of each class (pie) next to absolute counts (bar)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Pie chart
labels = ['Genuine Users', 'Bots']
# value_counts() orders its result by frequency, not by label value, so the
# counts are re-sorted by label (0 = genuine, 1 = bot). Without this the
# hard-coded labels/colours below could be attached to the wrong wedge
# whenever bots outnumber genuine accounts.
sizes = df['label'].value_counts().sort_index().values
colors = ['#2ecc71', '#e74c3c']
axes[0].pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
axes[0].set_title('Class Distribution', fontsize=14, fontweight='bold')

# Bar chart
sns.countplot(data=df, x='label', palette=['#2ecc71', '#e74c3c'], ax=axes[1])
axes[1].set_xticklabels(['Genuine', 'Bot'])
axes[1].set_xlabel('Label')
axes[1].set_ylabel('Count')
axes[1].set_title('Account Count by Class', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Box plots of each raw count feature, split by class, on a log-scaled y-axis
numeric_features = [
    'statuses_count',
    'followers_count',
    'friends_count',
    'favourites_count',
    'listed_count',
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for idx, feature in enumerate(numeric_features):
    panel = axes[idx]
    sns.boxplot(data=df, x='label', y=feature, palette=['#2ecc71', '#e74c3c'], ax=panel)
    panel.set_xticklabels(['Genuine', 'Bot'])
    panel.set_title(f'{feature} Distribution', fontsize=12)
    panel.set_yscale('log')

# The 2x3 grid has six slots but only five features are plotted
axes[5].axis('off')
plt.suptitle('Feature Distributions by Class (Log Scale)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# For each 0/1 flag, plot the within-class proportion of each flag value
bool_features = [
    'default_profile',
    'geo_enabled',
    'profile_use_background_image',
    'verified',
]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for idx, feature in enumerate(bool_features):
    panel = axes[idx]
    # normalize='index' makes every class row sum to 1
    crosstab = pd.crosstab(df['label'], df[feature], normalize='index')
    crosstab.plot.bar(ax=panel, color=['#3498db', '#e74c3c'])
    panel.set_xticklabels(['Genuine', 'Bot'], rotation=0)
    panel.set_title(feature, fontsize=12)
    panel.set_ylabel('Proportion')
    panel.legend(['False', 'True'])

plt.suptitle('Boolean Features Distribution by Class', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Feature Engineering

In [None]:
def extract_features(df, reference_date=None):
    """
    Extract and engineer numeric features from Twitter user data.

    Args:
        df: DataFrame of account records. Recognised columns are the count
            columns (``statuses_count``, ``followers_count``,
            ``friends_count``, ``favourites_count``, ``listed_count``), the
            flag columns (``default_profile``, ``geo_enabled``,
            ``profile_use_background_image``, ``verified``) and
            ``created_at``. A missing count column is treated as all zeros;
            a missing flag column or ``created_at`` simply produces no
            corresponding features.
        reference_date: Moment from which account age is measured. Accepts
            anything ``pd.Timestamp`` accepts; a timezone-naive value is
            interpreted as UTC. Defaults to the current time, which makes the
            age-based features change from run to run, so pass a fixed date
            when results must be reproducible.

    Returns:
        DataFrame indexed like ``df`` containing the raw counts and flags,
        clipped ratio features, account-age features (when ``created_at`` is
        present), ``log1p`` transforms of the counts, and two composite
        scores. NaN and +/-inf values are replaced by 0.
    """
    epsilon = 1e-6  # keeps ratio denominators strictly positive
    features = pd.DataFrame(index=df.index)

    def capped_ratio(numerator, denominator, upper):
        """Return numerator / (denominator + epsilon), clipped to [0, upper]."""
        return (features[numerator] / (features[denominator] + epsilon)).clip(0, upper)

    # Basic count features. Every later step reads these columns, so an
    # absent one is filled with zeros instead of failing later with KeyError.
    count_features = ['statuses_count', 'followers_count', 'friends_count',
                      'favourites_count', 'listed_count']
    for feature in count_features:
        if feature in df.columns:
            features[feature] = df[feature].fillna(0).astype(float)
        else:
            features[feature] = 0.0

    # Boolean features (optional; nothing downstream depends on them)
    bool_features = ['default_profile', 'geo_enabled',
                     'profile_use_background_image', 'verified']
    for feature in bool_features:
        if feature in df.columns:
            features[feature] = df[feature].fillna(0).astype(int)

    # Ratio features
    features['follower_friend_ratio'] = capped_ratio('followers_count', 'friends_count', 100)
    features['tweets_per_follower'] = capped_ratio('statuses_count', 'followers_count', 1000)
    features['favorites_per_tweet'] = capped_ratio('favourites_count', 'statuses_count', 100)
    features['listed_per_follower'] = capped_ratio('listed_count', 'followers_count', 10)

    # Account age features
    if 'created_at' in df.columns:
        # Normalise both sides to UTC: subtracting tz-aware creation times
        # from a tz-naive reference raises TypeError.
        created_at = pd.to_datetime(df['created_at'], errors='coerce', utc=True)
        if reference_date is None:
            reference = pd.Timestamp.now(tz='UTC')
        else:
            reference = pd.Timestamp(reference_date)
            if reference.tzinfo is None:
                reference = reference.tz_localize('UTC')
            else:
                reference = reference.tz_convert('UTC')

        # Unparseable dates become age 0; dates after the reference clip to 0
        features['account_age_days'] = (
            (reference - created_at).dt.days.fillna(0).clip(lower=0)
        )
        features['tweets_per_day'] = capped_ratio('statuses_count', 'account_age_days', 1000)
        features['followers_per_day'] = capped_ratio('followers_count', 'account_age_days', 1000)

    # Log transformations
    for feature in count_features:
        features[f'{feature}_log'] = np.log1p(features[feature])

    # Composite scores
    features['engagement_score'] = (
        np.log1p(features['followers_count']) +
        np.log1p(features['listed_count']) * 2 +
        np.log1p(features['favourites_count'])
    )

    features['activity_score'] = (
        np.log1p(features['statuses_count']) +
        np.log1p(features['favourites_count'])
    )

    # Handle missing/infinite values
    features = features.fillna(0)
    features = features.replace([np.inf, -np.inf], 0)

    return features

# Build the model inputs: engineered feature matrix X and label vector y
X = extract_features(df)
y = df['label'].to_numpy()

print(f"Feature matrix shape: {X.shape}")
print("\nExtracted features:")
print(list(X.columns))

In [None]:
# Pairwise correlations between all engineered features, drawn as a heatmap
correlation_matrix = X.corr()

plt.figure(figsize=(16, 14))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    center=0,
    annot=False,
    square=True,
    linewidths=0.5,
)
heatmap_ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 5. Data Splitting and Preprocessing

In [None]:
# Keep the column names: the split below works on bare NumPy arrays
feature_names = list(X.columns)

# Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print("\nTraining label distribution:")
print(f"  Genuine: {(y_train == 0).sum()}")
print(f"  Bot: {(y_train == 1).sum()}")

# Scale features for SVM. The scaler is fitted on the training split only and
# then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6. Model Training and Evaluation

In [None]:
def evaluate_model(y_true, y_pred, y_proba=None):
    """
    Calculate binary-classification evaluation metrics.

    Args:
        y_true: Ground-truth labels (any array-like).
        y_pred: Predicted labels (any array-like; column vectors are
            flattened).
        y_proba: Optional predicted scores. Either a 1-D array of
            positive-class scores or a 2-D array, in which case column 1 is
            used (or column 0 if there is only one column).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1_score``,
        plus ``roc_auc`` when ``y_proba`` is given. ``roc_auc`` is NaN when
        ``y_true`` contains a single class, because ROC-AUC is undefined
        there.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # zero_division=0 states explicitly that a metric with an empty
    # denominator (e.g. no positive predictions) is reported as 0, rather
    # than relying on the default warn-and-return-0 behaviour.
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1_score': f1_score(y_true, y_pred, zero_division=0)
    }

    if y_proba is not None:
        y_proba = np.asarray(y_proba)  # also accepts plain lists
        if y_proba.ndim > 1:
            # Positive class lives in column 1; tolerate single-column output
            y_proba_pos = y_proba[:, 1] if y_proba.shape[1] > 1 else y_proba[:, 0]
        else:
            y_proba_pos = y_proba

        if np.unique(y_true).size < 2:
            # roc_auc_score raises on single-class targets; report NaN so one
            # degenerate split does not abort the whole comparison.
            metrics['roc_auc'] = float('nan')
        else:
            metrics['roc_auc'] = roc_auc_score(y_true, y_proba_pos)

    return metrics

# Per-model bookkeeping, each keyed by the model's display name and filled in
# by the training cells: metric dicts, predicted labels, predicted
# probabilities and the fitted estimators. Four independent dicts.
results, predictions, probabilities, trained_models = {}, {}, {}, {}

### 6.1 Support Vector Machine (SVM)

In [None]:
print("Training SVM...")

# RBF-kernel SVM with balanced class weights, trained on the standardised
# features; probability=True so predict_proba can be called below.
svm_model = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=RANDOM_STATE,
)
svm_model.fit(X_train_scaled, y_train)

# Hard labels and class probabilities for the held-out set
y_pred_svm = svm_model.predict(X_test_scaled)
y_proba_svm = svm_model.predict_proba(X_test_scaled)

# Score the model and file everything under its display name
results['SVM'] = evaluate_model(y_test, y_pred_svm, y_proba_svm)
trained_models['SVM'] = svm_model
predictions['SVM'] = y_pred_svm
probabilities['SVM'] = y_proba_svm

print("\nSVM Results:")
for metric_name, score in results['SVM'].items():
    print(f"  {metric_name}: {score:.4f}")

### 6.2 Random Forest

In [None]:
print("Training Random Forest...")

# Depth-limited forest with balanced class weights, trained on the unscaled
# features using all available cores.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=5,
    class_weight='balanced',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
rf_model.fit(X_train, y_train)

# Hard labels and class probabilities for the held-out set
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)

# Score the model and file everything under its display name
results['Random Forest'] = evaluate_model(y_test, y_pred_rf, y_proba_rf)
trained_models['Random Forest'] = rf_model
predictions['Random Forest'] = y_pred_rf
probabilities['Random Forest'] = y_proba_rf

print("\nRandom Forest Results:")
for metric_name, score in results['Random Forest'].items():
    print(f"  {metric_name}: {score:.4f}")

### 6.3 Gradient Boosting

In [None]:
print("Training Gradient Boosting...")

# scikit-learn gradient boosting on the unscaled features. Unlike the other
# models in this notebook, no class weighting is configured here.
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=RANDOM_STATE,
)
gb_model.fit(X_train, y_train)

# Hard labels and class probabilities for the held-out set
y_pred_gb = gb_model.predict(X_test)
y_proba_gb = gb_model.predict_proba(X_test)

# Score the model and file everything under its display name
results['Gradient Boosting'] = evaluate_model(y_test, y_pred_gb, y_proba_gb)
trained_models['Gradient Boosting'] = gb_model
predictions['Gradient Boosting'] = y_pred_gb
probabilities['Gradient Boosting'] = y_proba_gb

print("\nGradient Boosting Results:")
for metric_name, score in results['Gradient Boosting'].items():
    print(f"  {metric_name}: {score:.4f}")

### 6.4 CatBoost

In [None]:
print("Training CatBoost...")

# CatBoost on the unscaled features with automatic balanced class weights;
# verbose=False turns off its training log.
catboost_model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    auto_class_weights='Balanced',
    verbose=False,
    random_state=RANDOM_STATE,
)
catboost_model.fit(X_train, y_train)

# Hard labels and class probabilities for the held-out set
y_pred_cb = catboost_model.predict(X_test)
y_proba_cb = catboost_model.predict_proba(X_test)

# Score the model and file everything under its display name
results['CatBoost'] = evaluate_model(y_test, y_pred_cb, y_proba_cb)
trained_models['CatBoost'] = catboost_model
predictions['CatBoost'] = y_pred_cb
probabilities['CatBoost'] = y_proba_cb

print("\nCatBoost Results:")
for metric_name, score in results['CatBoost'].items():
    print(f"  {metric_name}: {score:.4f}")

## 7. Model Comparison

In [None]:
# One row per model, one column per metric, rounded to four decimals
comparison_df = pd.DataFrame(results).T.rename_axis('Model').round(4)

print("=" * 70, "MODEL COMPARISON RESULTS", "=" * 70, sep="\n")
display(comparison_df)

In [None]:
# Grouped bar chart: one group per model, one bar per metric
fig, ax = plt.subplots(figsize=(14, 6))

metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
x = np.arange(len(comparison_df.index))
width = 0.15

for idx, (metric, bar_color) in enumerate(zip(metrics_to_plot, colors)):
    # Shift each metric's bars so the whole group is centred on the model tick
    offset = (idx - len(metrics_to_plot)/2 + 0.5) * width
    bars = ax.bar(
        x + offset,
        comparison_df[metric],
        width,
        color=bar_color,
        label=metric.replace('_', ' ').title(),
    )

    # Write the value just above each bar
    for rect in bars:
        height = rect.get_height()
        ax.annotate(
            f'{height:.3f}',
            xy=(rect.get_x() + rect.get_width()/2, height),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
            fontsize=8,
            rotation=45,
        )

ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(comparison_df.index, fontsize=11)
ax.set_ylim(0, 1.15)  # headroom above 1.0 for the rotated value labels
ax.legend(loc='lower right', fontsize=10)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

In [None]:
# ROC curve of every model on shared axes, with the chance-level diagonal
plt.figure(figsize=(10, 8))

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

for idx, model_name in enumerate(probabilities):
    y_proba = probabilities[model_name]
    # Take the positive-class column when a 2-D probability matrix is stored
    if y_proba.ndim > 1:
        y_proba_pos = y_proba[:, 1]
    else:
        y_proba_pos = y_proba
    fpr, tpr, _ = roc_curve(y_test, y_proba_pos)
    auc = roc_auc_score(y_test, y_proba_pos)
    plt.plot(
        fpr,
        tpr,
        lw=2,
        color=colors[idx],
        label=f'{model_name} (AUC = {auc:.4f})',
    )

plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('ROC Curves - Model Comparison', fontsize=14, fontweight='bold')
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.grid(True, alpha=0.3)
plt.legend(loc='lower right', fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# One confusion-matrix heatmap per model on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for idx, model_name in enumerate(predictions):
    y_pred = predictions[model_name]
    panel = axes[idx]
    cm = confusion_matrix(y_test, y_pred)
    # Rows are true classes, columns are predicted classes
    sns.heatmap(
        cm,
        ax=panel,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Genuine', 'Bot'],
        yticklabels=['Genuine', 'Bot'],
    )
    panel.set_title(model_name, fontsize=12, fontweight='bold')
    panel.set_xlabel('Predicted Label')
    panel.set_ylabel('True Label')

plt.suptitle('Confusion Matrices - Model Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 8. Feature Importance Analysis

In [None]:
# Top-15 feature importances for each of the three tree-based models
fig, axes = plt.subplots(1, 3, figsize=(18, 8))

tree_models = {
    name: trained_models[name]
    for name in ('Random Forest', 'Gradient Boosting', 'CatBoost')
}

for idx, (model_name, model) in enumerate(tree_models.items()):
    panel = axes[idx]
    importances = model.feature_importances_
    # Positions of the 15 largest importances, largest first
    indices = np.argsort(importances)[::-1][:15]
    # Plot in reverse so the largest bar gets the highest y position
    plot_order = indices[::-1]
    bar_positions = range(len(indices))

    panel.barh(bar_positions, importances[plot_order], color='steelblue')
    panel.set_yticks(bar_positions)
    panel.set_yticklabels([feature_names[i] for i in plot_order])
    panel.set_xlabel('Importance')
    panel.set_title(model_name, fontsize=12, fontweight='bold')

plt.suptitle('Top 15 Feature Importances', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 9. Classification Reports

In [None]:
# Per-class precision / recall / F1 report for every model, under a banner
for model_name, y_pred in predictions.items():
    print("=" * 60, f"Classification Report - {model_name}", "=" * 60, sep="\n")
    report_text = classification_report(y_test, y_pred, target_names=['Genuine', 'Bot'])
    print(report_text)
    print()

## 10. Summary and Conclusions

In [None]:
# Find the best model under each headline metric (row label of the maximum)
best_f1_model = comparison_df['f1_score'].idxmax()
best_auc_model = comparison_df['roc_auc'].idxmax()
best_accuracy_model = comparison_df['accuracy'].idxmax()

print("=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"\nBest Model by F1 Score: {best_f1_model} ({comparison_df.loc[best_f1_model, 'f1_score']:.4f})")
print(f"Best Model by ROC-AUC: {best_auc_model} ({comparison_df.loc[best_auc_model, 'roc_auc']:.4f})")
print(f"Best Model by Accuracy: {best_accuracy_model} ({comparison_df.loc[best_accuracy_model, 'accuracy']:.4f})")

# The recommendation is the F1 winner.
print("\n" + "=" * 70)
# The trophy emoji is written as a Unicode escape: the literal character had
# been corrupted into mojibake, and the escape cannot be damaged by a
# re-encoding of the notebook file.
print(f"\U0001F3C6 RECOMMENDED MODEL: {best_f1_model}")
print("=" * 70)
print(f"\nThe {best_f1_model} model achieves the best balance between")
print("precision and recall for Twitter bot detection.")

In [None]:
# Write the comparison table (model names as the first CSV column) to a file
# in the current working directory, then confirm on stdout.
comparison_df.to_csv(path_or_buf='model_comparison_results.csv')
print("Results saved to 'model_comparison_results.csv'")