# Advanced AI Text Detection - Complete Training Pipeline

This notebook implements a comprehensive training pipeline for AI text detection, optimized to achieve 99%+ accuracy.

**Features:**
- Advanced preprocessing using project utilities
- Optimized feature engineering (word + character n-grams)
- Extensive hyperparameter tuning
- Cross-validation for robust evaluation
- Multiple model comparison
- Comprehensive visualizations
- Detailed performance metrics


In [None]:
# Import all necessary libraries
import json
import os
import sys
import pickle
import warnings
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend for headless execution
import matplotlib.pyplot as plt
import seaborn as sns
plt.ioff()  # Turn off interactive mode
from pathlib import Path
from typing import Dict, List, Tuple, Any
from collections import Counter
from time import time

# Scikit-learn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import (
    train_test_split, 
    GridSearchCV, 
    cross_val_score,
    StratifiedKFold,
    learning_curve
)
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    f1_score
)
from sklearn.utils import shuffle
from scipy.sparse import hstack, csr_matrix
from scipy.stats import uniform

# Make the project root importable whether the kernel was started from the
# repository root or from its notebooks/ sub-directory.
notebook_dir = Path.cwd()
if notebook_dir.name == 'notebooks':
    project_root = notebook_dir.parent
else:
    project_root = notebook_dir
sys.path.insert(0, str(project_root))

# Project-local helpers; they resolve only after the sys.path tweak above.
from preprocessing.advanced_preprocessing import clean_text, preprocess_article
from utils.text_utils import get_text_statistics

# NOTE(review): this silences every warning category for the whole session,
# which would also hide solver convergence warnings - confirm that is intended.
warnings.filterwarnings('ignore')

# One seed, used for NumPy's global RNG and exposed as RANDOM_STATE.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("‚úì All libraries imported successfully!")
print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"Project root: {project_root}")


## 1. Load and Explore Dataset


In [None]:
# Read the combined, pre-cleaned corpus from <project_root>/data/combined.
dataset_path = project_root.joinpath("data", "combined", "combined_dataset_clean.json")

print("=" * 70)
print("LOADING DATASET")
print("=" * 70)

# Fail early with the resolved path so a wrong working directory is obvious.
if not dataset_path.exists():
    raise FileNotFoundError(f"Dataset not found at {dataset_path}")

with dataset_path.open("r", encoding="utf-8") as f:
    dataset = json.load(f)

print(f"‚úì Loaded {len(dataset):,} articles from {dataset_path}")

# An empty corpus cannot be trained on; otherwise show the first record so the
# schema is visible in the notebook output.
if len(dataset) == 0:
    raise ValueError("Dataset is empty!")

print("\nSample article:")
print(json.dumps(dataset[0], indent=2, ensure_ascii=False))


In [None]:
# Class balance of the raw corpus (labels are compared case-insensitively).
labels = [article.get("label", "").lower() for article in dataset]
label_counts = Counter(labels)

print("=" * 70)
print("DATASET ANALYSIS")
print("=" * 70)
print(f"\nLabel Distribution:")
for label, count in label_counts.items():
    print(f"  {label.upper()}: {count:,} ({(count / len(dataset)) * 100:.2f}%)")

# Per-article size in characters and in whitespace-separated words.
contents = [article.get("content", "") for article in dataset]
text_lengths = [len(content) for content in contents]
word_counts = [len(content.split()) for content in contents]

avg_chars = np.mean(text_lengths)
avg_words = np.mean(word_counts)
shortest = np.min(text_lengths)
longest = np.max(text_lengths)

print(f"\nText Statistics:")
print(f"  Average length: {avg_chars:.0f} characters")
print(f"  Average words: {avg_words:.0f} words")
print(f"  Min length: {shortest} characters")
print(f"  Max length: {longest:,} characters")


In [None]:
# Three side-by-side panels: class counts, character lengths, word counts.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panel 1: one bar per label, annotated with its exact count.
axes[0].bar(label_counts.keys(), label_counts.values(), color=['#3498db', '#e74c3c'])
axes[0].set_title('Label Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Label')
axes[0].set_ylabel('Count')
axes[0].grid(axis='y', alpha=0.3)
for bar_position, bar_height in enumerate(label_counts.values()):
    axes[0].text(bar_position, bar_height, f'{bar_height:,}',
                 ha='center', va='bottom', fontweight='bold')

# Panels 2 and 3 differ only in data, colour and captions.
histogram_panels = (
    (axes[1], text_lengths, '#2ecc71', 'Text Length Distribution', 'Character Count'),
    (axes[2], word_counts, '#9b59b6', 'Word Count Distribution', 'Word Count'),
)
for panel, values, colour, title, x_caption in histogram_panels:
    panel.hist(values, bins=50, color=colour, edgecolor='black', alpha=0.7)
    panel.set_title(title, fontsize=14, fontweight='bold')
    panel.set_xlabel(x_caption)
    panel.set_ylabel('Frequency')
    panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()


## 2. Advanced Preprocessing


In [None]:
# Clean every article, keep only the two known classes, then down-sample the
# larger class so both classes contribute the same number of articles.
print("=" * 70)
print("PREPROCESSING DATA")
print("=" * 70)

# Normalize labels and clean texts
processed_articles = []
empty_count = 0

for article in dataset:
    # `or ""` also covers records whose field is present but null in the JSON,
    # which would otherwise fail on .lower() / .strip().
    label = (article.get("label") or "").lower()
    if label not in ["ai", "human"]:
        continue

    content = (article.get("content") or "").strip()
    if not content:
        empty_count += 1
        continue

    # Apply advanced cleaning (project helper)
    cleaned_content = clean_text(content)

    # Drop texts that cleaning reduced to 20 characters or fewer.
    if cleaned_content and len(cleaned_content) > 20:
        processed_articles.append({
            "content": cleaned_content,
            "label": label
        })

print(f"‚úì Processed {len(processed_articles):,} articles")
print(f"  Removed {len(dataset) - len(processed_articles):,} invalid/empty articles")
print(f"  Empty content count: {empty_count}")

# Balance dataset
ai_articles = [a for a in processed_articles if a["label"] == "ai"]
human_articles = [a for a in processed_articles if a["label"] == "human"]

print(f"\nBefore balancing:")
print(f"  AI articles: {len(ai_articles):,}")
print(f"  Human articles: {len(human_articles):,}")

min_size = min(len(ai_articles), len(human_articles))

# With an empty class nothing survives balancing; stop before shuffling.
if min_size == 0:
    raise ValueError("No valid texts after preprocessing!")

# Shuffle each class before truncating: keeping the first `min_size` records
# would make the retained subset depend on the order of the source file.
ai_articles = shuffle(ai_articles, random_state=RANDOM_STATE)[:min_size]
human_articles = shuffle(human_articles, random_state=RANDOM_STATE)[:min_size]

balanced_dataset = ai_articles + human_articles
balanced_dataset = shuffle(balanced_dataset, random_state=RANDOM_STATE)

print(f"\nAfter balancing:")
print(f"  AI articles: {len(ai_articles):,}")
print(f"  Human articles: {len(human_articles):,}")
print(f"  Total: {len(balanced_dataset):,}")

# Extract texts and labels (parallel lists, same order as balanced_dataset)
texts = [article["content"] for article in balanced_dataset]
labels = [article["label"] for article in balanced_dataset]

print(f"\n‚úì Final dataset: {len(texts):,} samples ready for training")


## 3. Advanced Feature Engineering


In [None]:
# Encode the string labels as integers and hold out a stratified test split.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# LabelEncoder numbers classes in sorted order, so 'ai' is NOT guaranteed to be
# code 1. Look the codes up instead of assuming them when reporting counts.
ai_idx, human_idx = (int(code) for code in label_encoder.transform(["ai", "human"]))

print("=" * 70)
print("FEATURE ENGINEERING")
print("=" * 70)
print(f"\nLabel encoding:")
print(f"  Classes: {label_encoder.classes_}")
print(f"  Distribution: AI={np.sum(y==ai_idx):,}, Human={np.sum(y==human_idx):,}")

# Split data first (before feature extraction to avoid data leakage)
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y
)

print(f"\nTrain/Test Split:")
print(f"  Training: {len(X_train_texts):,} samples")
print(f"  Testing: {len(X_test_texts):,} samples")
print(f"  Train labels - AI: {np.sum(y_train==ai_idx):,}, Human: {np.sum(y_train==human_idx):,}")
print(f"  Test labels - AI: {np.sum(y_test==ai_idx):,}, Human: {np.sum(y_test==human_idx):,}")


In [None]:
# Word-level TF-IDF over 1- to 3-grams, fitted on the training texts only.
print("\n" + "-" * 70)
print("Creating Word-Level Features...")
print("-" * 70)

word_tfidf_params = dict(
    analyzer='word',
    ngram_range=(1, 3),               # unigrams, bigrams, trigrams
    max_features=25000,               # cap on vocabulary size
    min_df=2,                         # ignore terms seen in fewer than 2 documents
    max_df=0.95,                      # ignore terms seen in more than 95% of documents
    sublinear_tf=True,                # sublinear (log) scaling of term frequency
    lowercase=True,
    strip_accents='unicode',
    token_pattern=r'(?u)\b\w+\b',     # also keeps single-character tokens
)
word_vectorizer = TfidfVectorizer(**word_tfidf_params)

print("Fitting word vectorizer on training data...")
start_time = time()
X_train_word = word_vectorizer.fit_transform(X_train_texts)
X_test_word = word_vectorizer.transform(X_test_texts)
word_elapsed = time() - start_time

print(f"‚úì Word features created in {word_elapsed:.2f} seconds")
print(f"  Training shape: {X_train_word.shape}")
print(f"  Test shape: {X_test_word.shape}")
print(f"  Vocabulary size: {len(word_vectorizer.vocabulary_):,}")


In [None]:
# Character-level TF-IDF over 3- to 6-grams, fitted on the training texts only.
print("\n" + "-" * 70)
print("Creating Character-Level Features...")
print("-" * 70)

char_tfidf_params = dict(
    analyzer='char',
    ngram_range=(3, 6),       # character n-grams of length 3 to 6
    max_features=35000,       # cap on vocabulary size
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
    lowercase=True,
    strip_accents='unicode',
)
char_vectorizer = TfidfVectorizer(**char_tfidf_params)

print("Fitting character vectorizer on training data...")
start_time = time()
X_train_char = char_vectorizer.fit_transform(X_train_texts)
X_test_char = char_vectorizer.transform(X_test_texts)
char_elapsed = time() - start_time

print(f"‚úì Character features created in {char_elapsed:.2f} seconds")
print(f"  Training shape: {X_train_char.shape}")
print(f"  Test shape: {X_test_char.shape}")


In [None]:
# Concatenate the word and character TF-IDF blocks column-wise.
print("\n" + "-" * 70)
print("Combining Features...")
print("-" * 70)

# Ask for CSR explicitly: without `format`, hstack may return a COO matrix,
# which cannot be row-indexed (the grid-search cell subsamples rows of X_train).
X_train = hstack([X_train_word, X_train_char], format='csr')
X_test = hstack([X_test_word, X_test_char], format='csr')

n_train_rows, n_features = X_train.shape
sparsity_pct = (1 - X_train.nnz / (n_train_rows * n_features)) * 100
# A CSR matrix stores values plus column indices and row pointers; count all
# three so the estimate is not limited to the value array.
csr_bytes = X_train.data.nbytes + X_train.indices.nbytes + X_train.indptr.nbytes

print(f"‚úì Combined feature matrix created")
print(f"  Training shape: {X_train.shape}")
print(f"  Test shape: {X_test.shape}")
print(f"  Total features: {n_features:,}")
print(f"  Sparsity: {sparsity_pct:.2f}%")
print(f"  Memory usage: ~{csr_bytes / (1024**2):.2f} MB")


## 4. Model Training with Hyperparameter Optimization


In [None]:
# Baseline: a single logistic-regression fit on the combined features.
print("=" * 70)
print("TRAINING LOGISTIC REGRESSION (BASELINE)")
print("=" * 70)

lr_model = LogisticRegression(
    C=1.0,
    solver='lbfgs',
    max_iter=2000,
    class_weight='balanced',  # weight classes inversely to their frequency
    n_jobs=-1,
    random_state=RANDOM_STATE
)

print("Training Logistic Regression...")
start_time = time()
lr_model.fit(X_train, y_train)
lr_train_time = time() - start_time

# Score on the held-out test split.
y_pred_lr = lr_model.predict(X_test)
lr_test_proba = lr_model.predict_proba(X_test)
y_proba_lr = lr_test_proba[:, 1]  # second probability column
lr_accuracy = accuracy_score(y_test, y_pred_lr)
# NOTE(review): f1_score defaults to pos_label=1; which class that is depends
# on the order of label_encoder.classes_ - confirm it is the intended one.
lr_f1 = f1_score(y_test, y_pred_lr)

print(f"‚úì Training completed in {lr_train_time:.2f} seconds")
print(f"  Accuracy: {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")
print(f"  F1-Score: {lr_f1:.4f}")


In [None]:
# Grid-search C and class_weight for a linear SVM with stratified 5-fold CV.
print("\n" + "=" * 70)
print("HYPERPARAMETER TUNING FOR SVM")
print("=" * 70)

# Use stratified K-fold for better validation
cv_folds = 5
skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=RANDOM_STATE)

# Sample data for faster grid search (if dataset is large)
if X_train.shape[0] > 10000:
    sample_size = 10000
    # A dedicated RandomState makes the subsample identical on every run of
    # this cell, independent of how far the global NumPy RNG has advanced.
    sample_rng = np.random.RandomState(RANDOM_STATE)
    sample_indices = sample_rng.choice(X_train.shape[0], sample_size, replace=False)
    # Row indexing needs CSR; hstack's default output may be COO, which is not
    # subscriptable. tocsr() is cheap when the matrix already is CSR.
    X_train_sample = X_train.tocsr()[sample_indices]
    y_train_sample = y_train[sample_indices]
    print(f"Using {sample_size:,} samples for grid search (for speed)...")
else:
    X_train_sample = X_train
    y_train_sample = y_train

# Extended parameter grid for better accuracy
param_grid = {
    'C': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0],  # Wider range
    'class_weight': [None, 'balanced']  # Try both
}

print(f"\nGrid search parameters:")
print(f"  C values: {param_grid['C']}")
print(f"  Class weights: {param_grid['class_weight']}")
print(f"  CV folds: {cv_folds}")

# Create SVM with linear kernel. Probability calibration is switched off here:
# the search scores accuracy, which only needs predict(), and calibration adds
# an internal cross-validation to every single fit. The final model is built
# separately from best_params_.
svm_base = SVC(
    kernel='linear',
    probability=False,
    random_state=RANDOM_STATE,
    max_iter=10000  # Increased for convergence
)

print("\nRunning grid search (this may take several minutes)...")
start_time = time()

grid_search = GridSearchCV(
    svm_base,
    param_grid,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    refit=True
)

grid_search.fit(X_train_sample, y_train_sample)

grid_time = time() - start_time

print(f"\n‚úì Grid search completed in {grid_time/60:.2f} minutes")
print(f"  Best parameters: {grid_search.best_params_}")
print(f"  Best CV score: {grid_search.best_score_:.4f} ({grid_search.best_score_*100:.2f}%)")


In [None]:
# Refit a linear SVM on the whole training split using the tuned parameters.
print("\n" + "=" * 70)
print("TRAINING FINAL SVM MODEL")
print("=" * 70)

best_params = grid_search.best_params_
print(f"Using best parameters: {best_params}")

final_svm_kwargs = {
    'kernel': 'linear',
    'C': best_params['C'],
    'class_weight': best_params['class_weight'],
    'probability': True,       # needed for predict_proba below
    'random_state': RANDOM_STATE,
    'max_iter': 10000,
}
final_svm = SVC(**final_svm_kwargs)

print("Training on full training set...")
start_time = time()
final_svm.fit(X_train, y_train)
svm_train_time = time() - start_time

print(f"‚úì Training completed in {svm_train_time:.2f} seconds")

# Held-out evaluation.
y_pred_svm = final_svm.predict(X_test)
svm_test_proba = final_svm.predict_proba(X_test)
y_proba_svm = svm_test_proba[:, 1]  # second probability column

svm_accuracy = accuracy_score(y_test, y_pred_svm)
# NOTE(review): f1_score defaults to pos_label=1; which class that is depends
# on the order of label_encoder.classes_ - confirm it is the intended one.
svm_f1 = f1_score(y_test, y_pred_svm)

print(f"\n--- Test Set Results ---")
print(f"  Accuracy: {svm_accuracy:.4f} ({svm_accuracy*100:.2f}%)")
print(f"  F1-Score: {svm_f1:.4f}")


In [None]:
# Fit two tree ensembles for comparison and collect every model's test scores.
print("\n" + "=" * 70)
print("TRAINING ADDITIONAL MODELS FOR COMPARISON")
print("=" * 70)

models = {}
results = {}


def _score_on_test(predictions, fit_seconds):
    """Bundle test-set accuracy, F1 and fit time for one model."""
    return {
        'accuracy': accuracy_score(y_test, predictions),
        'f1': f1_score(y_test, predictions),
        'time': fit_seconds,
    }


# --- Random Forest ---
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=0
)
start_time = time()
rf_model.fit(X_train, y_train)
rf_time = time() - start_time
y_pred_rf = rf_model.predict(X_test)
models['Random Forest'] = rf_model
results['Random Forest'] = _score_on_test(y_pred_rf, rf_time)
rf_test_accuracy = results['Random Forest']['accuracy']
print(f"  Accuracy: {rf_test_accuracy:.4f} ({rf_test_accuracy*100:.2f}%)")

# --- Gradient Boosting ---
print("\nTraining Gradient Boosting...")
gb_model = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
    verbose=0
)
start_time = time()
gb_model.fit(X_train, y_train)
gb_time = time() - start_time
y_pred_gb = gb_model.predict(X_test)
models['Gradient Boosting'] = gb_model
results['Gradient Boosting'] = _score_on_test(y_pred_gb, gb_time)
gb_test_accuracy = results['Gradient Boosting']['accuracy']
print(f"  Accuracy: {gb_test_accuracy:.4f} ({gb_test_accuracy*100:.2f}%)")

# Register the SVM and logistic-regression models trained in earlier cells,
# reusing the scores and timings measured there.
models['SVM'] = final_svm
models['Logistic Regression'] = lr_model
results['SVM'] = dict(accuracy=svm_accuracy, f1=svm_f1, time=svm_train_time)
results['Logistic Regression'] = dict(accuracy=lr_accuracy, f1=lr_f1, time=lr_train_time)


## 5. Comprehensive Model Evaluation


In [None]:
# Per-class and aggregate test metrics for the final SVM.
print("=" * 70)
print("DETAILED EVALUATION - SVM MODEL")
print("=" * 70)

# LabelEncoder numbers classes in sorted order, so the integer codes of 'ai'
# and 'human' are looked up rather than assumed. Every per-class result below
# is requested in [human, ai] order so index 0 is Human and index 1 is AI.
ai_idx, human_idx = (int(code) for code in label_encoder.transform(["ai", "human"]))
class_order = [human_idx, ai_idx]

# Calculate all metrics
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_pred_svm, average=None, labels=class_order
)

print(f"\nPer-Class Metrics:")
print(f"  Human (Class {human_idx}):")
print(f"    Precision: {precision[0]:.4f}")
print(f"    Recall: {recall[0]:.4f}")
print(f"    F1-Score: {f1[0]:.4f}")
print(f"    Support: {support[0]:,}")
print(f"  AI (Class {ai_idx}):")
print(f"    Precision: {precision[1]:.4f}")
print(f"    Recall: {recall[1]:.4f}")
print(f"    F1-Score: {f1[1]:.4f}")
print(f"    Support: {support[1]:,}")

# Weighted averages
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
    y_test, y_pred_svm, average='weighted'
)

print(f"\nWeighted Averages:")
print(f"  Precision: {weighted_precision:.4f}")
print(f"  Recall: {weighted_recall:.4f}")
print(f"  F1-Score: {weighted_f1:.4f}")

# ROC-AUC. The value does not depend on which class is called positive as long
# as labels and scores refer to the same class, which holds here: y_proba_svm
# is the probability column for encoded class 1.
try:
    roc_auc = roc_auc_score(y_test, y_proba_svm)
except ValueError as e:
    # Raised by scikit-learn when the inputs are unusable, e.g. a single class.
    print(f"  ROC-AUC: Could not calculate ({e})")
else:
    print(f"  ROC-AUC: {roc_auc:.4f}")

# Confusion matrix with rows/columns ordered [Human, AI], i.e. AI is the
# positive class in the TN/FP/FN/TP wording below.
cm = confusion_matrix(y_test, y_pred_svm, labels=class_order)
print(f"\nConfusion Matrix:")
print(f"  True Negatives (Human‚ÜíHuman):  {cm[0][0]:,}")
print(f"  False Positives (Human‚ÜíAI):    {cm[0][1]:,}")
print(f"  False Negatives (AI‚ÜíHuman):    {cm[1][0]:,}")
print(f"  True Positives (AI‚ÜíAI):        {cm[1][1]:,}")

# Classification Report (names follow the same [Human, AI] order)
print(f"\nClassification Report:")
print(classification_report(
    y_test, y_pred_svm, labels=class_order, target_names=['Human', 'AI']
))


In [None]:
# Stratified 5-fold cross-validation of the tuned SVM on the training split.
# NOTE: the TF-IDF vectorizers were fitted on all training texts, so the folds
# share vocabulary/IDF statistics and the scores can be slightly optimistic.
from scipy.stats import t as student_t

print("\n" + "=" * 70)
print("CROSS-VALIDATION EVALUATION")
print("=" * 70)

# Accuracy only needs predict(), which does not use SVC's probability
# calibration; a copy with calibration disabled avoids an internal
# cross-validation inside every fold's fit.
cv_estimator = SVC(**{**final_svm.get_params(), 'probability': False})

print("Running 5-fold cross-validation on training set...")
cv_scores = cross_val_score(
    cv_estimator,
    X_train,
    y_train,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1
)

# Confidence interval for the MEAN accuracy: standard error of the mean times
# the Student-t critical value (few folds, so t rather than 1.96). Fold scores
# are not strictly independent, so treat the interval as approximate.
n_scores = len(cv_scores)
cv_mean = cv_scores.mean()
cv_sem = cv_scores.std(ddof=1) / np.sqrt(n_scores)
ci_half_width = student_t.ppf(0.975, df=n_scores - 1) * cv_sem

print(f"\nCross-Validation Results:")
print(f"  Scores: {cv_scores}")
print(f"  Mean: {cv_mean:.4f} ({cv_mean*100:.2f}%)")
print(f"  Std: {cv_scores.std():.4f}")
print(f"  95% CI: [{cv_mean - ci_half_width:.4f}, {cv_mean + ci_half_width:.4f}]")


In [None]:
# 2x2 figure: accuracy, F1 and training time per model, plus the SVM's
# row-normalized confusion matrix.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

model_names = list(results.keys())
accuracies = [results[m]['accuracy'] * 100 for m in model_names]
f1_scores = [results[m]['f1'] * 100 for m in model_names]
times = [results[m]['time'] for m in model_names]

bar_colors = ['#3498db', '#e74c3c', '#2ecc71', '#9b59b6']

# The three bar panels differ only in data, captions and value format.
bar_panels = [
    (axes[0, 0], accuracies, 'Accuracy (%)', 'Model Accuracy Comparison', '{:.2f}%'),
    (axes[0, 1], f1_scores, 'F1-Score (%)', 'Model F1-Score Comparison', '{:.2f}%'),
    (axes[1, 0], times, 'Training Time (seconds)', 'Training Time Comparison', '{:.2f}s'),
]
for panel, values, x_caption, title, value_format in bar_panels:
    panel.barh(model_names, values, color=bar_colors)
    panel.set_xlabel(x_caption, fontsize=12)
    panel.set_title(title, fontsize=14, fontweight='bold')
    panel.grid(axis='x', alpha=0.3)
    # Neutral loop names: reusing `f1` here would overwrite the per-class F1
    # array computed in the evaluation cell.
    for row, value in enumerate(values):
        panel.text(value, row, value_format.format(value),
                   va='center', ha='left', fontweight='bold')

# Confusion matrix heatmap. The matrix is rebuilt here in explicit
# [Human, AI] order so the tick labels always match its rows and columns;
# LabelEncoder numbers classes in sorted order, so codes are looked up.
ai_idx, human_idx = (int(code) for code in label_encoder.transform(["ai", "human"]))
cm_plot = confusion_matrix(y_test, y_pred_svm, labels=[human_idx, ai_idx])
cm_normalized = cm_plot.astype('float') / cm_plot.sum(axis=1)[:, np.newaxis]
sns.heatmap(cm_normalized, annot=True, fmt='.3f', cmap='Blues',
            xticklabels=['Human', 'AI'], yticklabels=['Human', 'AI'],
            ax=axes[1, 1], cbar_kws={'label': 'Normalized Count'})
axes[1, 1].set_title('Confusion Matrix (Normalized)', fontsize=14, fontweight='bold')
axes[1, 1].set_ylabel('True Label', fontsize=12)
axes[1, 1].set_xlabel('Predicted Label', fontsize=12)

plt.tight_layout()
plt.show()

# Report whichever model actually scored highest instead of assuming the SVM.
best_model_name = max(results, key=lambda name: results[name]['accuracy'])
best_model_accuracy = results[best_model_name]['accuracy']
print(f"\n‚úì Best Model: {best_model_name} with {best_model_accuracy*100:.2f}% accuracy")


In [None]:
# ROC and precision-recall curves for the SVM, with AI as the positive class.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# LabelEncoder numbers classes in sorted order, so 'ai' need not be code 1.
# y_proba_svm is the probability of encoded class 1; for this two-class
# problem the AI probability is therefore either that value or its complement.
ai_idx = int(label_encoder.transform(["ai"])[0])
y_true_ai = (y_test == ai_idx).astype(int)
ai_scores = y_proba_svm if ai_idx == 1 else 1.0 - y_proba_svm

# ROC Curve
fpr, tpr, _ = roc_curve(y_true_ai, ai_scores)
roc_auc = roc_auc_score(y_true_ai, ai_scores)

axes[0].plot(fpr, tpr, color='#3498db', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
axes[0].plot([0, 1], [0, 1], color='#e74c3c', lw=2, linestyle='--', label='Random')
axes[0].set_xlim([0.0, 1.0])
axes[0].set_ylim([0.0, 1.05])
axes[0].set_xlabel('False Positive Rate', fontsize=12)
axes[0].set_ylabel('True Positive Rate', fontsize=12)
axes[0].set_title('ROC Curve', fontsize=14, fontweight='bold')
axes[0].legend(loc="lower right")
axes[0].grid(alpha=0.3)

# Precision-Recall Curve (precision/recall of the AI class)
precision_curve, recall_curve, _ = precision_recall_curve(y_true_ai, ai_scores)

axes[1].plot(recall_curve, precision_curve, color='#2ecc71', lw=2, label='Precision-Recall curve')
axes[1].set_xlabel('Recall', fontsize=12)
axes[1].set_ylabel('Precision', fontsize=12)
axes[1].set_title('Precision-Recall Curve', fontsize=14, fontweight='bold')
axes[1].legend(loc="lower left")
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Learning curve: train/validation accuracy at ten training-set sizes.
print("\n" + "=" * 70)
print("GENERATING LEARNING CURVE")
print("=" * 70)

# The curve is scored on accuracy, which only needs predict(). A copy of the
# tuned SVM without probability calibration spares each of the 10 x n_splits
# fits an extra internal cross-validation; predict() does not use calibration.
lc_estimator = SVC(**{**final_svm.get_params(), 'probability': False})

train_sizes, train_scores, val_scores = learning_curve(
    lc_estimator, X_train, y_train,
    cv=skf, n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy'
)

# Mean and spread across CV folds for every training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

plt.figure(figsize=(12, 6))
plt.plot(train_sizes, train_mean, 'o-', color='#3498db', label='Training Score')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='#3498db')
plt.plot(train_sizes, val_mean, 'o-', color='#e74c3c', label='Validation Score')
plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='#e74c3c')
plt.xlabel('Training Set Size', fontsize=12)
plt.ylabel('Accuracy Score', fontsize=12)
plt.title('Learning Curve - SVM Model', fontsize=14, fontweight='bold')
plt.legend(loc='best')
plt.grid(alpha=0.3)
plt.show()

print("‚úì Learning curve generated")


## 6. Save Models and Vectorizers


In [None]:
# Pickle the fitted vectorizers, both linear models and the label encoder.
print("=" * 70)
print("SAVING MODELS")
print("=" * 70)

model_save_dir = project_root / "models" / "saved_models"
model_save_dir.mkdir(parents=True, exist_ok=True)

# File name -> fitted object, in the order the files are written and listed.
artifacts = {
    "word_vectorizer.pkl": word_vectorizer,
    "char_vectorizer.pkl": char_vectorizer,
    "svm_model.pkl": final_svm,
    "logreg_model.pkl": lr_model,
    "label_encoder.pkl": label_encoder,
}

for filename, artifact in artifacts.items():
    with open(str(model_save_dir / filename), "wb") as f:
        pickle.dump(artifact, f)
    print(f"‚úì Saved {filename}")

print(f"\n‚úì All models saved to: {model_save_dir}")

# Report the on-disk size of every artifact that exists.
print("\nModel file sizes:")
for filename in artifacts:
    filepath = model_save_dir / filename
    if not filepath.exists():
        continue
    size_mb = filepath.stat().st_size / (1024 * 1024)
    print(f"  {filename}: {size_mb:.2f} MB")


## 7. Test Predictions on Sample Texts


In [None]:
# Run the saved feature pipeline and the final SVM on a few held-out texts.
print("=" * 70)
print("SAMPLE PREDICTIONS")
print("=" * 70)

# Preview up to five test samples; a smaller test split no longer raises.
n_preview = min(5, len(X_test_texts))
test_samples = list(X_test_texts[:n_preview])
true_labels = list(label_encoder.inverse_transform(y_test[:n_preview]))

# LabelEncoder numbers classes in sorted order, so the codes of 'ai' and
# 'human' are looked up; predict_proba columns follow final_svm.classes_.
ai_idx, human_idx = (int(code) for code in label_encoder.transform(["ai", "human"]))
proba_column = {int(code): column for column, code in enumerate(final_svm.classes_)}

for i, (text, true_label) in enumerate(zip(test_samples, true_labels)):
    # Create features (same word + character layout as the training matrix)
    X_word_sample = word_vectorizer.transform([text])
    X_char_sample = char_vectorizer.transform([text])
    X_sample = hstack([X_word_sample, X_char_sample], format='csr')

    # Predict
    prediction = final_svm.predict(X_sample)[0]
    probabilities = final_svm.predict_proba(X_sample)[0]
    predicted_label = label_encoder.inverse_transform([prediction])[0]
    confidence = probabilities[proba_column[int(prediction)]] * 100
    human_pct = probabilities[proba_column[human_idx]] * 100
    ai_pct = probabilities[proba_column[ai_idx]] * 100

    print(f"\n--- Sample {i+1} ---")
    print(f"True Label: {true_label.upper()}")
    print(f"Predicted: {predicted_label.upper()} (Confidence: {confidence:.2f}%)")
    print(f"Probabilities - Human: {human_pct:.2f}%, AI: {ai_pct:.2f}%")
    print(f"Text preview: {text[:150]}...")
    print(f"‚úì Correct!" if true_label == predicted_label else f"‚úó Incorrect!")


## 8. Final Summary


In [None]:
# Recap of data sizes, feature counts, test scores and saved artifacts.
summary_rule = "=" * 70
rf_summary_accuracy = results['Random Forest']['accuracy']
gb_summary_accuracy = results['Gradient Boosting']['accuracy']

summary_lines = [
    summary_rule,
    "TRAINING SUMMARY",
    summary_rule,
    # Dataset sizes
    f"\nüìä Dataset:",
    f"  Total samples: {len(balanced_dataset):,}",
    f"  Training samples: {len(X_train_texts):,}",
    f"  Test samples: {len(X_test_texts):,}",
    f"  Balanced classes: {len(ai_articles):,} AI, {len(human_articles):,} Human",
    # Feature-matrix widths
    f"\nüîß Features:",
    f"  Word features: {X_train_word.shape[1]:,}",
    f"  Character features: {X_train_char.shape[1]:,}",
    f"  Total features: {X_train.shape[1]:,}",
    # Held-out scores
    f"\nüéØ Model Performance (Test Set):",
    f"  SVM Accuracy: {svm_accuracy*100:.2f}%",
    f"  SVM F1-Score: {svm_f1:.4f}",
    f"  Logistic Regression Accuracy: {lr_accuracy*100:.2f}%",
    f"  Random Forest Accuracy: {rf_summary_accuracy*100:.2f}%",
    f"  Gradient Boosting Accuracy: {gb_summary_accuracy*100:.2f}%",
    # Cross-validation
    f"\n‚úÖ Cross-Validation:",
    f"  Mean CV Accuracy: {cv_scores.mean()*100:.2f}%",
    f"  Std: {cv_scores.std():.4f}",
    # Saved artifacts
    f"\nüíæ Models Saved:",
    f"  Location: {str(model_save_dir)}",
    f"  Files: word_vectorizer.pkl, char_vectorizer.pkl, svm_model.pkl, logreg_model.pkl, label_encoder.pkl",
]

# Verdict against the 99% accuracy target.
if svm_accuracy < 0.99:
    summary_lines.append(f"\n‚ö†Ô∏è  Current accuracy: {svm_accuracy*100:.2f}% (Target: 99%+)")
    summary_lines.append(f"   Consider: increasing max_features, tuning hyperparameters further, or adding more data")
else:
    summary_lines.append(f"\nüéâ SUCCESS! Achieved 99%+ accuracy: {svm_accuracy*100:.2f}%")

summary_lines.append("\n" + summary_rule)

for summary_line in summary_lines:
    print(summary_line)
