In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import xgboost as xgb
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import re
import warnings
warnings.filterwarnings('ignore')

In [13]:
def load_csv_robust(filepath):
    """
    Load a CSV file, falling back through progressively more lenient parsers.

    Strategies are attempted in order and the first one that succeeds wins:
      1. plain ``pd.read_csv``
      2. python engine, skipping malformed lines
      3. python engine with quoting disabled, skipping malformed lines
      4. python engine with latin-1 decoding, skipping malformed lines
      5. manual line-by-line parsing that splits each line on its LAST comma
         (assumes two columns and always names them 'text' / 'generated',
         regardless of what the file's header says)

    Args:
        filepath: Path to CSV file
    Returns:
        DataFrame
    Raises:
        ValueError: if every strategy fails. The exception raised by the last
            strategy is attached as ``__cause__`` so the root problem (missing
            file, permission error, ...) is not lost.
    """
    import csv

    print(f"Attempting to load: {filepath}")

    def _parse_line_by_line():
        # Slowest fallback: treat every physical line as one record.
        rows = []
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            header = f.readline().strip().split(',')
            print(f"Header: {header}")

            for i, line in enumerate(f):
                # Split from the right so commas inside the text stay in the text.
                parts = line.strip().rsplit(',', 1)
                if len(parts) == 2:
                    text = parts[0].strip('"').strip("'")
                    label = parts[1].strip()
                    rows.append({'text': text, 'generated': label})

                if (i + 1) % 10000 == 0:
                    print(f"Processed {i + 1} lines...")
        return pd.DataFrame(rows)

    # (announcement, loader, success-message builder, failure-message prefix)
    strategies = [
        (
            "Trying standard CSV loading...",
            lambda: pd.read_csv(filepath),
            lambda df: "✓ Successfully loaded with standard method",
            "✗ Standard loading failed",
        ),
        (
            "Trying with on_bad_lines='skip'...",
            lambda: pd.read_csv(filepath, on_bad_lines='skip', engine='python'),
            lambda df: "✓ Successfully loaded, some bad lines were skipped",
            "✗ Failed",
        ),
        (
            "Trying with quoting=csv.QUOTE_NONE...",
            lambda: pd.read_csv(filepath, quoting=csv.QUOTE_NONE, engine='python', on_bad_lines='skip'),
            lambda df: "✓ Successfully loaded with no quoting",
            "✗ Failed",
        ),
        (
            "Trying with different encoding (latin-1)...",
            lambda: pd.read_csv(filepath, encoding='latin-1', on_bad_lines='skip', engine='python'),
            lambda df: "✓ Successfully loaded with latin-1 encoding",
            "✗ Failed",
        ),
        (
            "Trying line-by-line parsing (this may take a while)...",
            _parse_line_by_line,
            lambda df: f"✓ Successfully loaded {len(df)} rows with line-by-line parsing",
            "✗ Line-by-line parsing failed",
        ),
    ]

    last_error = None
    for announcement, loader, describe_success, failure_prefix in strategies:
        try:
            print(announcement)
            df = loader()
            print(describe_success(df))
            return df
        except Exception as e:
            # Remember the failure so the final error can point at a real cause.
            last_error = e
            print(f"{failure_prefix}: {str(e)[:100]}")

    raise ValueError("All loading strategies failed. Please check your CSV file format.") from last_error

def load_and_preprocess_data(df, min_chars=50, max_chars=10000):
    """
    Clean a raw text/label DataFrame and return a model-ready copy.

    Steps, in order: normalise column names (lower-case, stripped), drop
    duplicate texts, drop rows with missing text/label, strip the text, map
    the label column to 0/1, drop rows whose label could not be mapped, drop
    texts outside the ``[min_chars, max_chars]`` length range, reset the index,
    and print a class-distribution summary.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: DataFrame with 'text' and 'generated' columns (any letter case,
            surrounding whitespace in the names is tolerated)
        min_chars: texts shorter than this many characters are removed
        max_chars: texts longer than this many characters are removed
    Returns:
        Preprocessed DataFrame with a fresh 0..n-1 index and an integer
        'generated' column (1 = AI, 0 = human)
    Raises:
        ValueError: if the 'text' or 'generated' column is missing
    """
    print("\n" + "="*80)
    print("DATA PREPROCESSING")
    print("="*80)

    print(f"\nInitial shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    # Copy first: renaming columns on the caller's frame would be a hidden
    # side effect (and breaks re-running the cell on the same raw frame).
    df = df.copy()

    # Handle different column name cases (str() also tolerates non-string names)
    df.columns = [str(col).lower().strip() for col in df.columns]

    # Check for required columns
    if 'text' not in df.columns or 'generated' not in df.columns:
        print(f"\nAvailable columns: {df.columns.tolist()}")
        raise ValueError("DataFrame must have 'text' and 'generated' columns")

    # Remove duplicates
    before = len(df)
    df = df.drop_duplicates(subset=['text'])
    print(f"Removed {before - len(df)} duplicate rows")

    # Remove null values (.copy() so the assignments below hit a real frame,
    # not a slice of the previous one)
    before = len(df)
    df = df.dropna(subset=['text', 'generated']).copy()
    print(f"Removed {before - len(df)} rows with null values")

    # Convert to proper types
    df['text'] = df['text'].astype(str).str.strip()

    # Map labels to binary (handles 0/1, true/false, yes/no, ai/human, ...).
    # Anything not listed becomes NaN and is dropped just below.
    label_map = {
        '1': 1, '1.0': 1, 'true': 1, 'yes': 1, 'ai': 1, 'generated': 1,
        '0': 0, '0.0': 0, 'false': 0, 'no': 0, 'human': 0,
    }
    normalised_labels = df['generated'].astype(str).str.strip().str.lower()
    df['generated'] = normalised_labels.map(label_map)

    # Remove rows where mapping failed
    before = len(df)
    df = df.dropna(subset=['generated']).copy()
    df['generated'] = df['generated'].astype(int)
    print(f"Removed {before - len(df)} rows with invalid labels")

    # Remove very short texts. The bound is inclusive: a text of exactly
    # min_chars characters is kept, matching the "< min_chars" message.
    before = len(df)
    df = df[df['text'].str.len() >= min_chars]
    print(f"Removed {before - len(df)} rows with text < {min_chars} characters")

    # Remove very long texts that might cause issues
    before = len(df)
    df = df[df['text'].str.len() <= max_chars]
    print(f"Removed {before - len(df)} rows with text > {max_chars} characters")

    # Reset index
    df = df.reset_index(drop=True)

    print(f"\nFinal dataset shape: {df.shape}")
    print(f"\nClass distribution:")
    print(df['generated'].value_counts())
    print(f"\nClass distribution (%):")
    print(df['generated'].value_counts(normalize=True)*100)

    # Check for class imbalance
    class_counts = df['generated'].value_counts()
    if len(class_counts) < 2:
        # max/min would report a ratio of 1 here and hide the problem.
        print(f"\n⚠ WARNING: Only {len(class_counts)} class(es) present after preprocessing")
    else:
        ratio = class_counts.max() / class_counts.min()
        if ratio > 3:
            print(f"\n⚠ WARNING: Significant class imbalance detected (ratio: {ratio:.2f})")
            print("Consider using class_weight='balanced' in models")

    return df


In [14]:
def extract_linguistic_features(text):
    """Extract linguistic features that might distinguish AI from human text.

    Args:
        text: the document as a string
    Returns:
        dict mapping feature name -> number, with keys (in this order):
        char_count, word_count, avg_word_length, sentence_count,
        avg_sentence_length, comma_count, semicolon_count, exclamation_count,
        question_count, unique_word_ratio, sentence_length_variance.
        Every averaged/ratio feature falls back to 0 when there is nothing to
        average (empty or whitespace-only text), so the result never holds NaN.
    """
    # Tokenise once and reuse; words are whitespace-delimited.
    words = text.split()

    # Sentences are runs of text between ., ! or ? ; blank fragments are ignored.
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    sentence_lengths = [len(s.split()) for s in sentences]

    features = {}

    # Length features
    features['char_count'] = len(text)
    features['word_count'] = len(words)
    # Guarded: np.mean([]) would return NaN for empty text.
    features['avg_word_length'] = np.mean([len(word) for word in words]) if words else 0

    # Sentence features
    features['sentence_count'] = len(sentences)
    features['avg_sentence_length'] = np.mean(sentence_lengths) if sentences else 0

    # Punctuation features
    features['comma_count'] = text.count(',')
    features['semicolon_count'] = text.count(';')
    features['exclamation_count'] = text.count('!')
    features['question_count'] = text.count('?')

    # Vocabulary diversity (case-insensitive)
    lowered_words = text.lower().split()
    features['unique_word_ratio'] = len(set(lowered_words)) / len(lowered_words) if lowered_words else 0

    # AI text often has more consistent structure
    features['sentence_length_variance'] = np.var(sentence_lengths) if len(sentences) > 1 else 0

    return features

def create_feature_dataframe(df):
    """Build a table of linguistic features, one row per entry of df['text'].

    Each text is passed through extract_linguistic_features and the resulting
    dicts are assembled into a new DataFrame whose columns are the feature
    names. The returned frame gets a fresh positional index.
    """
    per_text_features = [extract_linguistic_features(entry) for entry in df['text']]
    return pd.DataFrame(per_text_features)

In [15]:
def train_traditional_ml_models(X_train, X_test, y_train, y_test, text_train, text_test):
    """
    Train a soft-voting ensemble (Logistic Regression, XGBoost, Random Forest)
    on TF-IDF n-gram features combined with the hand-crafted linguistic features.

    The three member models are fitted exactly once, inside the
    VotingClassifier; their individual test accuracies are then read from the
    fitted members. (Fitting them standalone first and again via the ensemble
    doubles the training cost without changing any result.)

    Args:
        X_train, X_test: DataFrames of linguistic features (rows aligned with
            text_train / text_test)
        y_train, y_test: class labels for the two splits
        text_train, text_test: raw texts for the two splits
    Returns:
        (ensemble, tfidf, ensemble_acc): the fitted VotingClassifier, the
        fitted TfidfVectorizer, and the ensemble's accuracy on the test split.
    """
    from scipy.sparse import hstack

    print("\n" + "="*80)
    print("TRAINING TRADITIONAL ML MODELS")
    print("="*80)

    # TF-IDF Vectorization (vocabulary is learned from the training texts only)
    print("\nCreating TF-IDF features...")
    tfidf = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )

    X_train_tfidf = tfidf.fit_transform(text_train)
    X_test_tfidf = tfidf.transform(text_test)

    # Combine TF-IDF with linguistic features. Ask for CSR explicitly instead
    # of relying on whichever sparse format hstack picks by default.
    X_train_combined = hstack([X_train_tfidf, X_train.values], format='csr')
    X_test_combined = hstack([X_test_tfidf, X_test.values], format='csr')

    # Member model definitions (unfitted; VotingClassifier clones and fits them)
    lr_model = LogisticRegression(C=2.0, max_iter=1000, class_weight='balanced', random_state=42)
    xgb_model = xgb.XGBClassifier(
        n_estimators=300,
        max_depth=7,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss'
    )
    rf_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        random_state=42,
        class_weight='balanced'
    )

    # Ensemble with soft voting — this single fit trains all three members.
    print("\nCreating Ensemble Model...")
    print("Training Logistic Regression, XGBoost and Random Forest...")
    ensemble = VotingClassifier(
        estimators=[
            ('lr', lr_model),
            ('xgb', xgb_model),
            ('rf', rf_model)
        ],
        voting='soft'
    )
    ensemble.fit(X_train_combined, y_train)

    # Per-model accuracy from the fitted members. Members are trained on
    # label-encoded targets, so decode their predictions with the ensemble's
    # encoder before comparing against y_test.
    for member_key, member_label in (
        ('lr', 'Logistic Regression'),
        ('xgb', 'XGBoost'),
        ('rf', 'Random Forest'),
    ):
        encoded_pred = ensemble.named_estimators_[member_key].predict(X_test_combined)
        member_pred = ensemble.le_.inverse_transform(np.asarray(encoded_pred).astype(int))
        member_acc = accuracy_score(y_test, member_pred)
        print(f"{member_label} Accuracy: {member_acc:.4f}")

    ensemble_pred = ensemble.predict(X_test_combined)
    ensemble_acc = accuracy_score(y_test, ensemble_pred)
    ensemble_auc = roc_auc_score(y_test, ensemble.predict_proba(X_test_combined)[:, 1])

    print("\n" + "-"*80)
    print("TRADITIONAL ML RESULTS")
    print("-"*80)
    print(f"Ensemble Accuracy: {ensemble_acc:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_test, ensemble_pred, target_names=['Human', 'AI']))
    print(f"\nROC-AUC Score: {ensemble_auc:.4f}")

    return ensemble, tfidf, ensemble_acc

In [16]:
class TextDataset(Dataset):
    """Custom Dataset for transformer models.

    Pairs each text with its label and tokenizes lazily, one item per
    ``__getitem__`` call, padding/truncating every sample to ``max_length``.

    Args:
        texts: sequence of documents (list, tuple, numpy array or pandas Series)
        labels: sequence of integer class labels, same length as ``texts``
        tokenizer: callable invoked as ``tokenizer(text, add_special_tokens=...,
            max_length=..., padding=..., truncation=..., return_tensors='pt')``
            and returning a mapping with 'input_ids' and 'attention_mask' tensors
        max_length: token length every sample is padded/truncated to
    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length
    """
    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialise as plain lists so __getitem__ indexes by POSITION.
        # A pandas Series with a non-default index would otherwise be looked
        # up by label and raise KeyError (or silently return another row).
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns a batch of one; flatten drops that batch axis
        # so the DataLoader can stack samples itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def train_transformer_model(text_train, text_test, y_train, y_test, val_size=0.1):
    """
    Fine-tune a transformer classifier (DistilBERT for efficiency) and report
    its performance on the held-out test split.

    A stratified validation split is carved out of the training data and used
    for per-epoch evaluation and best-checkpoint selection. The test split is
    only touched once, for the final metrics — selecting the checkpoint on the
    test set would make the reported accuracy optimistically biased.

    Args:
        text_train, text_test: texts for the training and test splits
        y_train, y_test: labels (0 = human, 1 = AI) for the two splits
        val_size: fraction of the training data held out for validation
    Returns:
        (model, tokenizer, transformer_acc): the fine-tuned model, its
        tokenizer, and the accuracy on the test split.
    """
    print("\n" + "="*80)
    print("TRAINING TRANSFORMER MODEL")
    print("="*80)

    # Use DistilBERT for faster training while maintaining high accuracy
    model_name = 'distilbert-base-uncased'

    print(f"\nLoading tokenizer and model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    )

    # Hold out a validation split for model selection (stratified so both
    # classes appear in it in the same proportion as in the training data).
    train_labels_all = list(y_train)
    fit_texts, val_texts, fit_labels, val_labels = train_test_split(
        list(text_train),
        train_labels_all,
        test_size=val_size,
        random_state=42,
        stratify=train_labels_all
    )

    # Create datasets
    train_dataset = TextDataset(fit_texts, fit_labels, tokenizer, max_length=512)
    val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_length=512)
    test_dataset = TextDataset(list(text_test), list(y_test), tokenizer, max_length=512)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        learning_rate=2e-5,
    )

    # Metrics
    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc}

    # Trainer — evaluates on the validation split, never on the test split
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train
    print("\nStarting training...")
    trainer.train()

    # Evaluate
    print("\nEvaluating on test set...")
    predictions = trainer.predict(test_dataset)
    preds = predictions.predictions.argmax(-1)
    transformer_acc = accuracy_score(y_test, preds)

    # Probability of the positive (AI) class, from the raw logits
    positive_proba = torch.softmax(torch.tensor(predictions.predictions), dim=1)[:, 1].numpy()

    print("\n" + "-"*80)
    print("TRANSFORMER MODEL RESULTS")
    print("-"*80)
    print(f"Accuracy: {transformer_acc:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_test, preds, target_names=['Human', 'AI']))
    print(f"\nROC-AUC Score: {roc_auc_score(y_test, positive_proba):.4f}")

    return model, tokenizer, transformer_acc

In [17]:
def run_full_pipeline(df):
    """
    Execute every stage end to end: preprocessing, linguistic feature
    extraction, a stratified 80/20 split, the traditional ML ensemble, and the
    transformer model, followed by a printed comparison of the two.

    Args:
        df: DataFrame with 'text' and 'generated' columns
    Returns:
        dict with the fitted objects and scores under the keys
        'ensemble_model', 'tfidf', 'transformer_model', 'tokenizer',
        'ensemble_acc' and 'transformer_acc'
    """
    banner = "="*80
    print(banner)
    print("AI TEXT DETECTION PIPELINE")
    print(banner)

    # Step 1: Preprocess data
    df = load_and_preprocess_data(df)

    # Step 2: Create linguistic features
    print("\nExtracting linguistic features...")
    features_df = create_feature_dataframe(df)

    # Step 3: Split data (stratified to maintain class balance)
    labels = df['generated']
    X_train, X_test, y_train, y_test = train_test_split(
        features_df,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels
    )

    # Look up the texts belonging to each split via the split's row labels,
    # then give every piece a clean positional index.
    train_rows, test_rows = X_train.index, X_test.index
    text_train, text_test = (
        df.loc[rows, 'text'].reset_index(drop=True)
        for rows in (train_rows, test_rows)
    )
    X_train, X_test, y_train, y_test = (
        part.reset_index(drop=True)
        for part in (X_train, X_test, y_train, y_test)
    )

    # Step 4: Train traditional ML models
    ensemble_model, tfidf, ensemble_acc = train_traditional_ml_models(
        X_train, X_test, y_train, y_test, text_train, text_test
    )

    # Step 5: Train transformer model
    transformer_model, tokenizer, transformer_acc = train_transformer_model(
        text_train, text_test, y_train, y_test
    )

    # Final summary
    best_acc = max(ensemble_acc, transformer_acc)
    if transformer_acc > ensemble_acc:
        best_name = 'Transformer'
    else:
        best_name = 'Traditional ML Ensemble'

    print("\n" + banner)
    print("FINAL SUMMARY")
    print(banner)
    print(f"Traditional ML Ensemble Accuracy: {ensemble_acc:.4f} ({ensemble_acc*100:.2f}%)")
    print(f"Transformer Model Accuracy: {transformer_acc:.4f} ({transformer_acc*100:.2f}%)")
    print(f"\nBest Model: {best_name}")
    print(f"Best Accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")

    if best_acc >= 0.95:
        print("\n✓ TARGET ACHIEVED: >95% accuracy")
    else:
        print("\n✗ TARGET NOT MET: Consider:")
        for suggestion in (
            "More training data",
            "Longer training (more epochs)",
            "Larger transformer model (roberta-base, bert-large)",
            "Hyperparameter tuning",
        ):
            print(f"  - {suggestion}")

    return dict(
        ensemble_model=ensemble_model,
        tfidf=tfidf,
        transformer_model=transformer_model,
        tokenizer=tokenizer,
        ensemble_acc=ensemble_acc,
        transformer_acc=transformer_acc,
    )

In [None]:
# This function will clean and save a new CSV
def fix_csv_file(input_path, output_path):
    """
    Rewrite a two-column (text, label) CSV into a consistently quoted file.

    Every physical line after the header is treated as one record and split
    on its LAST comma: everything before it is the text, everything after it
    is the label. The text is unquoted, re-escaped, and written back as
    ``"text",label`` under a fixed ``text,generated`` header.

    Limitation: quoted fields that span several physical lines are NOT
    reassembled; each line is parsed on its own.

    Args:
        input_path: path of the CSV to read (decoded as UTF-8, undecodable
            bytes are dropped)
        output_path: path the cleaned CSV is written to
    Returns:
        output_path
    """
    print(f"Fixing CSV: {input_path}")
    data = []
    errors = 0

    def _unquote(field):
        # Remove at most ONE enclosing double quote per side, then collapse
        # CSV-escaped quotes ("" -> "). Stripping every edge quote would eat
        # legitimate trailing quotes, and leaving "" in place would make the
        # re-escaping below double them again.
        if field.startswith('"'):
            field = field[1:]
        if field.endswith('"'):
            field = field[:-1]
        field = field.replace('""', '"')
        # Single-quote wrappers are stripped as well.
        return field.strip("'")

    with open(input_path, 'r', encoding='utf-8', errors='ignore') as infile:
        # The original header is discarded; a canonical one is written below.
        infile.readline()

        for i, line in enumerate(infile):
            line = line.strip()

            if line:
                # Assume the last comma separates text from label
                parts = line.rsplit(',', 1)
                if len(parts) == 2:
                    text = _unquote(parts[0]).replace('"', '""')
                    label = parts[1].strip()
                    data.append(f'"{text}",{label}')
                else:
                    # No comma at all: the line cannot be split into
                    # text/label, so it is dropped and counted.
                    errors += 1

            if (i + 1) % 10000 == 0:
                print(f"Processed {i + 1} lines, {errors} errors")

    # Write cleaned CSV (after reading has finished, so input_path and
    # output_path may safely be the same file)
    with open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.write('text,generated\n')
        for row in data:
            outfile.write(row + '\n')

    print(f"\nFixed CSV saved to: {output_path}")
    print(f"Total rows: {len(data)}, Errors: {errors}")
    return output_path


# Driver: normalise the raw dataset into a cleanly quoted two-column CSV,
# load that cleaned copy, then run preprocessing, feature extraction and both
# model families end to end. `results` holds the fitted models and accuracies.
# NOTE(review): load_csv_robust is defined earlier in this notebook but is not
# used here — confirm whether plain pd.read_csv on the fixed file is intended.
fixed_file = fix_csv_file('AI_Human.csv', 'fixed_data.csv')
df = pd.read_csv(fixed_file)
results = run_full_pipeline(df)

Fixing CSV: AI_Human.csv
Processed 10000 lines, 0 errors
Processed 20000 lines, 0 errors
Processed 30000 lines, 0 errors
Processed 40000 lines, 0 errors
Processed 50000 lines, 0 errors
Processed 60000 lines, 0 errors
Processed 70000 lines, 0 errors
Processed 80000 lines, 0 errors
Processed 90000 lines, 0 errors
Processed 100000 lines, 0 errors
Processed 110000 lines, 0 errors
Processed 120000 lines, 0 errors
Processed 130000 lines, 0 errors
Processed 140000 lines, 0 errors
Processed 150000 lines, 0 errors
Processed 160000 lines, 0 errors
Processed 170000 lines, 0 errors
Processed 180000 lines, 0 errors
Processed 190000 lines, 0 errors
Processed 200000 lines, 0 errors
Processed 210000 lines, 0 errors
Processed 220000 lines, 0 errors
Processed 230000 lines, 0 errors
Processed 240000 lines, 0 errors
Processed 250000 lines, 0 errors
Processed 260000 lines, 0 errors
Processed 270000 lines, 0 errors
Processed 280000 lines, 0 errors
Processed 290000 lines, 0 errors
Processed 300000 lines, 0 e