In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
import time

def clean_text(text):
    """Normalize one raw text value before vectorization.

    Values that ``pd.isna`` reports as missing become the empty string.
    Anything else is converted with ``str``, lower-cased, has every match of
    ``http\\S+`` or ``www\\S+`` removed, and finally has each run of
    whitespace collapsed to a single space with both ends trimmed.
    """
    if not pd.isna(text):
        lowered = str(text).lower()
        # Drop URL-like runs first so the gaps they leave are collapsed below.
        without_urls = re.sub(r'http\S+|www\S+', '', lowered)
        return re.sub(r'\s+', ' ', without_urls).strip()
    return ""

def _build_pipeline(max_features, k_best, ngram_range):
    """Assemble the TF-IDF -> chi2 feature selection -> logistic regression pipeline."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=max_features,    # Cap on the vocabulary size
            ngram_range=ngram_range,      # Range of n-gram lengths to extract
            min_df=2,                     # Ignore very rare terms
            max_df=0.95,                  # Ignore very common terms
            stop_words='english',         # Remove English stop words
            sublinear_tf=True,            # Apply sublinear TF scaling
            strip_accents='unicode',      # Handle unicode characters
        )),
        ('feature_selection', SelectKBest(score_func=chi2, k=k_best)),
        ('classifier', LogisticRegression(
            C=1.0,                        # Regularization parameter
            random_state=42,
            max_iter=1000,
            class_weight='balanced',      # Handle class imbalance
        )),
    ])


def _print_top_features(pipeline, top_n=10):
    """Print the ``top_n`` lowest and highest classifier coefficients by feature name."""
    feature_names = pipeline.named_steps['tfidf'].get_feature_names_out()
    selected_mask = pipeline.named_steps['feature_selection'].get_support()
    # The classifier only sees the columns kept by the selector, so its
    # coefficients line up with the masked feature names.
    selected_names = feature_names[selected_mask]
    # NOTE(review): only the first coefficient row is inspected and the labels
    # below assume a binary task whose positive class is "hateful" - confirm.
    coefficients = pipeline.named_steps['classifier'].coef_[0]

    ranked = sorted(zip(selected_names, coefficients), key=lambda pair: pair[1])

    print(f"Top {top_n} features for non-hateful class:")
    for feature, coef in ranked[:top_n]:
        print(f"  {feature}: {coef:.4f}")

    print(f"\nTop {top_n} features for hateful class:")
    for feature, coef in ranked[-top_n:]:
        print(f"  {feature}: {coef:.4f}")


def optimized_tfidf_lr_approach(
    *,
    train_path='train.csv',
    test_path='test.csv',
    output_path='Optimized_TFIDF_LR_Prediction.csv',
    max_features=15000,
    k_best=5000,
    ngram_range=(1, 3),
):
    """Train, validate and apply a TF-IDF + logistic regression text classifier.

    The training CSV must contain a ``label`` column; the first column that is
    not named ``id`` or ``label`` is used as the text column. The test CSV must
    contain an ``id`` column and the same text column. The model is fitted on
    an 80% stratified split, scored on the remaining 20%, then refitted on the
    full training data before predicting the test set.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.
        output_path: Path the ``id``/``label`` prediction CSV is written to.
        max_features: Maximum TF-IDF vocabulary size.
        k_best: Number of features kept by chi2 selection.
        ngram_range: ``(min_n, max_n)`` n-gram lengths for the vectorizer.

    Returns:
        A ``(val_f1, pipeline)`` tuple: the macro F1 measured on the
        validation split, and the pipeline refitted on all training data.
    """
    print("Loading data...")
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Identify text column
    text_col = [c for c in train_df.columns if c not in ('id', 'label')][0]

    # Prepare data
    X_train_raw = train_df[text_col].apply(clean_text)
    y_train = train_df['label'].values
    X_test_raw = test_df[text_col].apply(clean_text)
    test_ids = test_df['id'].values

    # Create validation set
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train_raw, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    print("Building pipeline...")
    pipeline = _build_pipeline(max_features, k_best, ngram_range)

    print("Training model...")
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    pipeline.fit(X_train_split, y_train_split)
    training_time = time.perf_counter() - start_time
    print(f"Training completed in {training_time:.2f} seconds")

    # Validate
    print("Validating model...")
    y_val_pred = pipeline.predict(X_val_split)
    val_f1 = f1_score(y_val_split, y_val_pred, average='macro')

    print(f"Validation Macro F1 Score: {val_f1:.4f}")
    print("\nDetailed Validation Report:")
    print(classification_report(y_val_split, y_val_pred))

    # Feature analysis (on the model fitted to the training split)
    print("\nAnalyzing important features...")
    _print_top_features(pipeline, top_n=10)

    # Retrain on full dataset
    print("\nRetraining on full dataset...")
    pipeline.fit(X_train_raw, y_train)

    # Predict on test set
    print("Making predictions on test set...")
    y_test_pred = pipeline.predict(X_test_raw)

    # Save predictions
    submission_df = pd.DataFrame({
        'id': test_ids,
        'label': y_test_pred,
    })
    submission_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

    # Report the sizes the final pipeline actually produced: the vocabulary
    # can be smaller than max_features, so the configured values may not match.
    n_generated = len(pipeline.named_steps['tfidf'].get_feature_names_out())
    n_selected = int(pipeline.named_steps['feature_selection'].get_support().sum())

    # Summary
    print("\n" + "=" * 60)
    print("OPTIMIZED TF-IDF + LOGISTIC REGRESSION SUMMARY")
    print("=" * 60)
    print(f"Validation Macro F1 Score: {val_f1:.4f}")
    print(f"Training Time: {training_time:.2f} seconds")
    print(f"Total Features Generated: {n_generated}")
    print(f"Features Selected: {n_selected}")
    print(f"N-gram Range: ({ngram_range[0]},{ngram_range[1]})")
    print("Model: Logistic Regression with L2 regularization")
    print(f"File Saved: {output_path}")

    return val_f1, pipeline

# Script entry point: print the banner, run the full workflow, report the score.
if __name__ == "__main__":
    print(
        "Starting Optimized TF-IDF + Logistic Regression Approach",
        "Expected training time: 10-20 minutes",
        "="*60,
        sep="\n",
    )

    # Kept as module-level names so they remain available after the run.
    val_score, model = optimized_tfidf_lr_approach()

    print(
        f"\nFinal Validation Score: {val_score:.4f}",
        "This approach typically achieves 0.75-0.82 Macro F1 on hate speech datasets",
        sep="\n",
    )

Starting Optimized TF-IDF + Logistic Regression Approach
Expected training time: 10-20 minutes
Loading data...
Building pipeline...
Training model...
Training completed in 1.35 seconds
Validating model...
Validation Macro F1 Score: 0.7000

Detailed Validation Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      2127
           1       0.61      0.67      0.64      1310

    accuracy                           0.71      3437
   macro avg       0.70      0.70      0.70      3437
weighted avg       0.72      0.71      0.71      3437


Analyzing important features...
Top 10 features for non-hateful class:
  cruz: -2.8363
  people: -2.7592
  abortion: -2.3426
  sure: -2.3085
  optics: -2.2109
  bjp: -2.0953
  india: -2.0748
  white house: -1.9187
  prolife: -1.9155
  trump: -1.9150

Top 10 features for hateful class:
  white people: 2.8562
  muslims: 2.9775
  blacks: 2.9908
  deport: 3.0883
  islam: 3.0922
  whitegenocide: 3.1115
  i