# Base Models
### AdaBoost

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class AdaBoostModel:
    """
    AdaBoost wrapper for hate speech classification.

    Boosts shallow decision trees and remembers the cross-validated F1
    scores gathered while training.
    """

    def __init__(self, random_state=42):
        # Weak learner that gets boosted: a depth-3 decision tree.
        weak_learner = DecisionTreeClassifier(
            max_depth=3,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=random_state,
        )
        self.model = AdaBoostClassifier(
            estimator=weak_learner,
            n_estimators=100,
            learning_rate=1.0,
            random_state=random_state,
        )
        self.is_trained = False
        self.cv_scores = None

    def _require_trained(self, purpose):
        """
        Raise ValueError unless train() has already completed
        """
        if not self.is_trained:
            raise ValueError(f"Model must be trained before {purpose}")

    def train(self, X_train, y_train, cv_folds=5):
        """
        Cross-validate on the training data (F1), then fit on all of it.

        Returns the dict stored in self.cv_scores, holding the mean, the
        standard deviation and the per-fold F1 scores.
        """
        print("Training AdaBoost...")

        # Stratified, shuffled folds with a fixed seed
        folds = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        fold_f1 = cross_val_score(self.model, X_train, y_train, cv=folds, scoring='f1')
        mean_f1, std_f1 = fold_f1.mean(), fold_f1.std()

        self.cv_scores = {'f1_mean': mean_f1, 'f1_std': std_f1, 'f1_scores': fold_f1}

        # Final fit on the complete training set
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"AdaBoost CV F1: {mean_f1:.4f} (+/- {std_f1 * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Return the predicted class labels for X
        """
        self._require_trained("making predictions")
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Return the per-class probabilities for X
        """
        self._require_trained("making predictions")
        return self.model.predict_proba(X)

    def evaluate(self, X_test, y_test):
        """
        Score the trained model on held-out data and print the metrics.

        Returns a dict with accuracy, precision, recall, f1 and auc.
        """
        self._require_trained("evaluation")

        labels = self.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC AUC
        positive_scores = self.predict_proba(X_test)[:, 1]

        label_metrics = (
            ('accuracy', accuracy_score),
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
        results = {}
        for key, scorer in label_metrics:
            results[key] = scorer(y_test, labels)
        results['auc'] = roc_auc_score(y_test, positive_scores)

        print("AdaBoost Results:")
        for key in results:
            print(f"  {key.capitalize()}: {results[key]:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Return the model's feature importances.

        Without feature_names the raw importance array is returned; with
        them, a DataFrame sorted by descending importance.
        """
        self._require_trained("getting feature importance")

        scores = self.model.feature_importances_
        if feature_names is None:
            return scores

        table = pd.DataFrame({'feature': feature_names, 'importance': scores})
        return table.sort_values('importance', ascending=False)

    def get_estimator_weights(self):
        """
        Return the weight of each boosted estimator
        """
        self._require_trained("getting estimator weights")
        return self.model.estimator_weights_

    def get_estimator_errors(self):
        """
        Return the error of each boosted estimator
        """
        self._require_trained("getting estimator errors")
        return self.model.estimator_errors_

### Bagging

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class BaggingClassifierModel:
    """
    Bagging Classifier model for hate speech classification
    Combines multiple decision-tree estimators with bootstrap sampling

    The wrapped scikit-learn estimator lives in ``self.model``. It is
    configured through the ``estimator`` keyword, the same spelling the
    AdaBoost wrapper in this file uses, so both run on one scikit-learn
    version.
    """

    def __init__(self, random_state=42):
        # Kept so hyperparameter tuning can reuse the caller's seed
        self.random_state = random_state

        base_estimator = DecisionTreeClassifier(
            random_state=random_state,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2
        )

        self.model = BaggingClassifier(
            estimator=base_estimator,
            n_estimators=10,
            max_samples=1.0,
            max_features=1.0,
            bootstrap=True,
            bootstrap_features=False,
            oob_score=False,
            random_state=random_state,
            n_jobs=-1
        )
        self.is_trained = False
        self.cv_scores = None
        self.best_params = None

    @staticmethod
    def _select_columns(X, feature_indices):
        """
        Return the columns of X that one sub-estimator was fitted on
        """
        if hasattr(X, 'iloc'):
            # pandas input: switch to positional numpy indexing
            X = X.to_numpy()
        elif not hasattr(X, 'shape'):
            # plain Python sequences
            X = np.asarray(X)
        return X[:, feature_indices]

    def _fitted_estimators(self):
        """
        Pair every fitted sub-estimator with the feature indices it uses
        """
        return zip(self.model.estimators_, self.model.estimators_features_)

    def train(self, X_train, y_train, cv_folds=5, tune_hyperparameters=True):
        """
        Train the Bagging Classifier model with optional hyperparameter tuning

        Args:
            X_train: training features
            y_train: training labels (scored with binary F1)
            cv_folds: number of stratified folds for the reported CV score
            tune_hyperparameters: when True, run a 3-fold grid search first
                and replace ``self.model`` with the best estimator found

        Returns:
            dict with 'f1_mean', 'f1_std' and 'f1_scores' (per-fold array)
        """
        print("Training Bagging Classifier...")

        if tune_hyperparameters:
            # Hyperparameter tuning; nested keys address the wrapped tree
            param_grid = {
                'n_estimators': [5, 10, 15, 20],
                'max_samples': [0.7, 0.8, 0.9, 1.0],
                'max_features': [0.7, 0.8, 0.9, 1.0],
                'estimator__max_depth': [5, 10, 15],
                'estimator__min_samples_split': [2, 5, 10]
            }

            # Use the seed given to the constructor, not a fixed 42
            base_estimator = DecisionTreeClassifier(random_state=self.random_state)

            grid_search = GridSearchCV(
                BaggingClassifier(
                    estimator=base_estimator,
                    bootstrap=True,
                    bootstrap_features=False,
                    random_state=self.random_state,
                    n_jobs=-1
                ),
                param_grid,
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
                scoring='f1',
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)
            self.model = grid_search.best_estimator_
            self.best_params = grid_search.best_params_
            print(f"Best parameters: {self.best_params}")

        # Perform cross-validation
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(self.model, X_train, y_train, cv=skf, scoring='f1')

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"Bagging Classifier CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions

        Raises:
            ValueError: if the model has not been trained
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Get prediction probabilities

        Raises:
            ValueError: if the model has not been trained
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        return self.model.predict_proba(X)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, f1 and auc; the AUC
        is computed from column 1 of ``predict_proba``.

        Raises:
            ValueError: if the model has not been trained
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("Bagging Classifier Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Get feature importance (average across all estimators)

        Each sub-estimator may have been fitted on a subset of the columns
        (``max_features`` < 1.0), so its importances are scattered back to
        the original column positions before averaging. Columns an
        estimator never saw contribute zero for that estimator.

        Args:
            feature_names: optional names, one per input feature

        Returns:
            importance array, or a DataFrame sorted by descending
            importance when ``feature_names`` is given

        Raises:
            ValueError: if the model has not been trained
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        summed = np.zeros(self.model.n_features_in_)
        contributors = 0
        for estimator, feature_indices in self._fitted_estimators():
            if hasattr(estimator, 'feature_importances_'):
                # add.at accumulates correctly even if an index repeats
                np.add.at(summed, feature_indices, estimator.feature_importances_)
                contributors += 1

        # With no usable estimator the result stays all zeros
        avg_importance = summed / contributors if contributors else summed

        if feature_names is not None:
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': avg_importance
            }).sort_values('importance', ascending=False)
            return importance_df
        else:
            return avg_importance

    def get_estimator_predictions(self, X):
        """
        Get predictions from individual estimators

        Every estimator only receives the columns it was fitted on.

        Returns:
            array with one row of predictions per estimator

        Raises:
            ValueError: if the model has not been trained
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting estimator predictions")

        return np.array([
            estimator.predict(self._select_columns(X, feature_indices))
            for estimator, feature_indices in self._fitted_estimators()
        ])

    def get_estimator_probabilities(self, X):
        """
        Get probability predictions from individual estimators

        Estimators without ``predict_proba`` are skipped; the others only
        receive the columns they were fitted on.

        Returns:
            array with one row of positive-class probabilities per estimator

        Raises:
            ValueError: if the model has not been trained
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting estimator probabilities")

        return np.array([
            # Column 1 is the probability of the positive class
            estimator.predict_proba(self._select_columns(X, feature_indices))[:, 1]
            for estimator, feature_indices in self._fitted_estimators()
            if hasattr(estimator, 'predict_proba')
        ])

    def get_model_info(self):
        """
        Get detailed model information

        Raises:
            ValueError: if the model has not been trained
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting model info")

        return {
            'n_estimators': self.model.n_estimators,
            'max_samples': self.model.max_samples,
            'max_features': self.model.max_features,
            'bootstrap': self.model.bootstrap,
            'bootstrap_features': self.model.bootstrap_features,
            'oob_score': self.model.oob_score,
            # Key name kept for existing callers; value read from `estimator`
            'base_estimator_type': type(self.model.estimator).__name__,
            'best_params': self.best_params
        }

    def get_estimator_variance(self, X):
        """
        Get variance of predictions across estimators (measure of uncertainty)

        Returns:
            array with one variance value per sample in X

        Raises:
            ValueError: if the model has not been trained
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting estimator variance")

        predictions = self.get_estimator_predictions(X)
        return np.var(predictions, axis=0)

### XGBoost

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class XGBoostModel:
    """
    XGBoost wrapper for hate speech classification.

    Wraps xgb.XGBClassifier (created with use_label_encoder=False) and
    records the cross-validated F1 scores gathered while training.
    """

    def __init__(self, random_state=42):
        settings = dict(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=random_state,
            use_label_encoder=False,
            eval_metric='logloss',
            n_jobs=-1,
        )
        self.model = xgb.XGBClassifier(**settings)
        self.is_trained = False
        self.cv_scores = None

    def _require_trained(self, purpose):
        """
        Raise ValueError unless train() has already completed
        """
        if not self.is_trained:
            raise ValueError(f"Model must be trained before {purpose}")

    def train(self, X_train, y_train, cv_folds=5):
        """
        Cross-validate on the training data (F1), then fit on all of it.

        Returns the dict stored in self.cv_scores, holding the mean, the
        standard deviation and the per-fold F1 scores.
        """
        print("Training XGBoost...")

        # Stratified, shuffled folds with a fixed seed
        splitter = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        per_fold = cross_val_score(self.model, X_train, y_train, cv=splitter, scoring='f1')

        summary = {}
        summary['f1_mean'] = per_fold.mean()
        summary['f1_std'] = per_fold.std()
        summary['f1_scores'] = per_fold
        self.cv_scores = summary

        # Final fit on the complete training set
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"XGBoost CV F1: {per_fold.mean():.4f} (+/- {per_fold.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Return the predicted class labels for X
        """
        self._require_trained("making predictions")
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Return the per-class probabilities for X
        """
        self._require_trained("making predictions")
        return self.model.predict_proba(X)

    def evaluate(self, X_test, y_test):
        """
        Score the trained model on held-out data and print the metrics.

        Returns a dict with accuracy, precision, recall, f1 and auc.
        """
        self._require_trained("evaluation")

        hard_labels = self.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC AUC
        soft_scores = self.predict_proba(X_test)[:, 1]

        results = {}
        results['accuracy'] = accuracy_score(y_test, hard_labels)
        results['precision'] = precision_score(y_test, hard_labels)
        results['recall'] = recall_score(y_test, hard_labels)
        results['f1'] = f1_score(y_test, hard_labels)
        results['auc'] = roc_auc_score(y_test, soft_scores)

        print("XGBoost Results:")
        for name in results:
            print(f"  {name.capitalize()}: {results[name]:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Return the model's feature importances.

        Without feature_names the raw importance array is returned; with
        them, a DataFrame sorted by descending importance.
        """
        self._require_trained("getting feature importance")

        weights = self.model.feature_importances_
        if feature_names is None:
            return weights

        frame = pd.DataFrame({'feature': feature_names, 'importance': weights})
        return frame.sort_values('importance', ascending=False)

### Decision Tree

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class DecisionTreeModel:
    """
    Decision Tree model for hate speech classification
    Good for interpretability and handling non-linear relationships

    The wrapped scikit-learn estimator lives in ``self.model``; tuning
    replaces it with the best estimator found by the grid search.
    """

    def __init__(self, random_state=42):
        # Kept so hyperparameter tuning can reuse the caller's seed
        self.random_state = random_state

        self.model = DecisionTreeClassifier(
            random_state=random_state,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            criterion='gini'
        )
        self.is_trained = False
        self.cv_scores = None
        self.best_params = None

    def _require_trained(self, purpose):
        """
        Raise ValueError unless train() has already completed
        """
        if not self.is_trained:
            raise ValueError(f"Model must be trained before {purpose}")

    def train(self, X_train, y_train, cv_folds=5, tune_hyperparameters=True):
        """
        Train the decision tree model with optional hyperparameter tuning

        Args:
            X_train: training features
            y_train: training labels (scored with binary F1)
            cv_folds: number of stratified folds for the reported CV score
            tune_hyperparameters: when True, run a 3-fold grid search first
                and replace ``self.model`` with the best estimator found

        Returns:
            dict with 'f1_mean', 'f1_std' and 'f1_scores' (per-fold array)
        """
        print("Training Decision Tree...")

        if tune_hyperparameters:
            # Hyperparameter tuning
            param_grid = {
                'max_depth': [5, 10, 15, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'criterion': ['gini', 'entropy']
            }

            grid_search = GridSearchCV(
                # Use the seed given to the constructor; a hard-coded value
                # here would silently discard it whenever tuning is enabled.
                DecisionTreeClassifier(random_state=self.random_state),
                param_grid,
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
                scoring='f1',
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)
            self.model = grid_search.best_estimator_
            self.best_params = grid_search.best_params_
            print(f"Best parameters: {self.best_params}")

        # Perform cross-validation
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(self.model, X_train, y_train, cv=skf, scoring='f1')

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"Decision Tree CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("making predictions")
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Get prediction probabilities

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("making predictions")
        return self.model.predict_proba(X)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, f1 and auc; the AUC
        is computed from column 1 of ``predict_proba``.

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("Decision Tree Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Get feature importance

        Args:
            feature_names: optional names, one per input feature

        Returns:
            importance array, or a DataFrame sorted by descending
            importance when ``feature_names`` is given

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("getting feature importance")

        importance = self.model.feature_importances_
        if feature_names is None:
            return importance

        return pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False)

    def get_tree_depth(self):
        """
        Get the depth of the trained tree

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("getting tree depth")
        return self.model.get_depth()

### Elastic Net

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

class ElasticNetModel:
    """
    Elastic Net model for hate speech classification
    Combines L1 (Lasso) and L2 (Ridge) regularization

    Implemented as a LogisticRegression with the 'elasticnet' penalty and
    the 'saga' solver. Inputs are standardized with ``self.scaler`` before
    they reach ``self.model``.
    """

    def __init__(self, random_state=42):
        # Kept so hyperparameter tuning can reuse the caller's seed
        self.random_state = random_state

        self.model = LogisticRegression(
            random_state=random_state,
            penalty='elasticnet',  # Elastic Net regularization
            solver='saga',
            C=1.0,  # Inverse of regularization strength
            l1_ratio=0.5,  # Mixing parameter (0=Ridge, 1=Lasso)
            max_iter=1000
        )
        self.scaler = StandardScaler()
        self.is_trained = False
        self.cv_scores = None
        self.best_params = None

    def _require_trained(self, purpose):
        """
        Raise ValueError unless train() has already completed
        """
        if not self.is_trained:
            raise ValueError(f"Model must be trained before {purpose}")

    def train(self, X_train, y_train, cv_folds=5, tune_hyperparameters=True):
        """
        Train the Elastic Net model with optional hyperparameter tuning

        Args:
            X_train: training features
            y_train: training labels (scored with binary F1)
            cv_folds: number of stratified folds for the reported CV score
            tune_hyperparameters: when True, run a 3-fold grid search first
                and replace ``self.model`` with the best estimator found

        Returns:
            dict with 'f1_mean', 'f1_std' and 'f1_scores' (per-fold array)
        """
        print("Training Elastic Net...")

        # Scale features for Elastic Net
        # NOTE(review): the scaler is fitted on all of X_train before the
        # grid search and cross-validation below, so each validation fold
        # has already contributed to the scaling statistics.
        X_train_scaled = self.scaler.fit_transform(X_train)

        if tune_hyperparameters:
            # Hyperparameter tuning
            param_grid = {
                'C': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0],
                'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
                'max_iter': [1000, 2000]
            }

            grid_search = GridSearchCV(
                LogisticRegression(
                    # Use the seed given to the constructor; a hard-coded
                    # value here would silently discard it when tuning.
                    random_state=self.random_state,
                    penalty='elasticnet',
                    solver='saga'
                ),
                param_grid,
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
                scoring='f1',
                n_jobs=-1
            )

            grid_search.fit(X_train_scaled, y_train)
            self.model = grid_search.best_estimator_
            self.best_params = grid_search.best_params_
            print(f"Best parameters: {self.best_params}")

        # Perform cross-validation
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(self.model, X_train_scaled, y_train, cv=skf, scoring='f1')

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        self.model.fit(X_train_scaled, y_train)
        self.is_trained = True

        print(f"Elastic Net CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions (X is scaled with the fitted scaler first)

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("making predictions")
        return self.model.predict(self.scaler.transform(X))

    def predict_proba(self, X):
        """
        Get prediction probabilities (X is scaled with the fitted scaler first)

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("making predictions")
        return self.model.predict_proba(self.scaler.transform(X))

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, f1 and auc; the AUC
        is computed from column 1 of ``predict_proba``.

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("Elastic Net Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Get feature importance (absolute value of the first coefficient row)

        Args:
            feature_names: optional names, one per input feature

        Returns:
            importance array, or a DataFrame sorted by descending
            importance when ``feature_names`` is given

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("getting feature importance")

        importance = np.abs(self.model.coef_[0])
        if feature_names is None:
            return importance

        return pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False)

    def get_regularization_info(self):
        """
        Get information about regularization parameters

        Returns:
            dict with C, l1_ratio, the counts of total / non-zero / zero
            coefficients, the fraction of zero coefficients
            ('sparsity_ratio') and the L1 / L2 mixing shares

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("getting regularization info")

        coefficients = self.model.coef_[0]
        n_features = len(coefficients)
        n_nonzero = np.count_nonzero(coefficients)
        sparsity_ratio = 1 - (n_nonzero / n_features)

        return {
            'C': self.model.C,
            'l1_ratio': self.model.l1_ratio,
            'total_features': n_features,
            'non_zero_features': n_nonzero,
            'zero_features': n_features - n_nonzero,
            'sparsity_ratio': sparsity_ratio,
            'l1_contribution': self.model.l1_ratio,
            'l2_contribution': 1 - self.model.l1_ratio
        }

    def get_selected_features(self, feature_names=None, threshold=0.0):
        """
        Get features selected by Elastic Net (non-zero coefficients)

        Args:
            feature_names: optional names, one per input feature
            threshold: a feature counts as selected when the absolute value
                of its coefficient is strictly greater than this

        Returns:
            DataFrame (feature, coefficient, abs_coefficient) sorted by
            descending abs_coefficient when ``feature_names`` is given,
            otherwise a dict with 'indices' and 'coefficients'

        Raises:
            ValueError: if the model has not been trained
        """
        self._require_trained("getting selected features")

        coefficients = self.model.coef_[0]
        selected_indices = np.where(np.abs(coefficients) > threshold)[0]
        selected_coefficients = coefficients[selected_indices]

        if feature_names is None:
            return {
                'indices': selected_indices,
                'coefficients': selected_coefficients
            }

        return pd.DataFrame({
            'feature': [feature_names[i] for i in selected_indices],
            'coefficient': selected_coefficients,
            'abs_coefficient': np.abs(selected_coefficients)
        }).sort_values('abs_coefficient', ascending=False)

    def get_model_params(self):
        """
        Get current model parameters (available before training as well)
        """
        return {
            'C': self.model.C,
            'l1_ratio': self.model.l1_ratio,
            'penalty': self.model.penalty,
            'solver': self.model.solver,
            'max_iter': self.model.max_iter,
            'best_params': self.best_params
        }

### Extra Trees

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class ExtraTreesModel:
    """
    Extra Trees wrapper for hate speech classification.

    Wraps ExtraTreesClassifier and records the cross-validated F1 scores
    gathered while training.
    """

    def __init__(self, random_state=42):
        forest = ExtraTreesClassifier(
            n_estimators=100,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features='sqrt',
            bootstrap=False,  # every tree is fitted without bootstrap resampling
            random_state=random_state,
            n_jobs=-1,
        )
        self.model = forest
        self.is_trained = False
        self.cv_scores = None

    def _require_trained(self, purpose):
        """
        Raise ValueError unless train() has already completed
        """
        if not self.is_trained:
            raise ValueError(f"Model must be trained before {purpose}")

    def train(self, X_train, y_train, cv_folds=5):
        """
        Cross-validate on the training data (F1), then fit on all of it.

        Returns the dict stored in self.cv_scores, holding the mean, the
        standard deviation and the per-fold F1 scores.
        """
        print("Training Extra Trees...")

        # Stratified, shuffled folds with a fixed seed
        cv_plan = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        f1_by_fold = cross_val_score(self.model, X_train, y_train, cv=cv_plan, scoring='f1')
        f1_mean = f1_by_fold.mean()
        f1_std = f1_by_fold.std()

        self.cv_scores = dict(f1_mean=f1_mean, f1_std=f1_std, f1_scores=f1_by_fold)

        # Final fit on the complete training set
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"Extra Trees CV F1: {f1_mean:.4f} (+/- {f1_std * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Return the predicted class labels for X
        """
        self._require_trained("making predictions")
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Return the per-class probabilities for X
        """
        self._require_trained("making predictions")
        return self.model.predict_proba(X)

    def evaluate(self, X_test, y_test):
        """
        Score the trained model on held-out data and print the metrics.

        Returns a dict with accuracy, precision, recall, f1 and auc.
        """
        self._require_trained("evaluation")

        predicted = self.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC AUC
        scores = self.predict_proba(X_test)[:, 1]

        results = dict(
            accuracy=accuracy_score(y_test, predicted),
            precision=precision_score(y_test, predicted),
            recall=recall_score(y_test, predicted),
            f1=f1_score(y_test, predicted),
            auc=roc_auc_score(y_test, scores),
        )

        print("Extra Trees Results:")
        for label, number in results.items():
            print(f"  {label.capitalize()}: {number:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Return the model's feature importances.

        Without feature_names the raw importance array is returned; with
        them, a DataFrame sorted by descending importance.
        """
        self._require_trained("getting feature importance")

        ranking = self.model.feature_importances_
        if feature_names is None:
            return ranking

        listing = pd.DataFrame({'feature': feature_names, 'importance': ranking})
        return listing.sort_values('importance', ascending=False)

### Gradient Boosting

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class GradientBoostingModel:
    """
    Gradient Boosting model for hate speech classification
    Excellent performance with sequential weak learners

    Wraps sklearn's GradientBoostingClassifier and adds optional grid-search
    tuning, stratified cross-validation, evaluation and staged-prediction
    helpers.
    """

    def __init__(self, random_state=42):
        """
        Build the default (untuned) classifier.

        random_state seeds the default model and the estimator that
        train() tunes with the grid search.
        """
        # Remembered so that hyperparameter tuning honours the caller's seed
        self.random_state = random_state
        self.model = GradientBoostingClassifier(
            random_state=random_state,
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            min_samples_split=2,
            min_samples_leaf=1,
            subsample=1.0
        )
        self.is_trained = False
        self.cv_scores = None
        self.best_params = None

    def train(self, X_train, y_train, cv_folds=5, tune_hyperparameters=True):
        """
        Train the gradient boosting model with optional hyperparameter tuning

        When tune_hyperparameters is true, a 3-fold grid search (F1 scoring)
        replaces self.model with the best estimator found and stores the
        winning parameters in self.best_params. The model is then scored
        with cv_folds-fold stratified cross-validation and fitted on the
        full training data.

        Returns a dict with 'f1_mean', 'f1_std' and the per-fold 'f1_scores'.
        """
        print("Training Gradient Boosting...")

        if tune_hyperparameters:
            # Hyperparameter tuning
            param_grid = {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.05, 0.1, 0.15],
                'max_depth': [3, 5, 7],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'subsample': [0.8, 0.9, 1.0]
            }

            grid_search = GridSearchCV(
                # Use the seed given to __init__ instead of a hard-coded one
                GradientBoostingClassifier(random_state=self.random_state),
                param_grid,
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
                scoring='f1',
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)
            self.model = grid_search.best_estimator_
            self.best_params = grid_search.best_params_
            print(f"Best parameters: {self.best_params}")

        # Perform cross-validation
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(self.model, X_train, y_train, cv=skf, scoring='f1')

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"Gradient Boosting CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Get prediction probabilities

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        return self.model.predict_proba(X)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, F1 (from the hard
        predictions) and ROC AUC (from the second probability column).

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("Gradient Boosting Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Get feature importance

        Returns the raw importance array, or - when feature_names is
        given - a DataFrame ('feature', 'importance') sorted by
        descending importance.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        importance = self.model.feature_importances_

        if feature_names is not None:
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importance
            }).sort_values('importance', ascending=False)
            return importance_df
        else:
            return importance

    def get_staged_predictions(self, X, n_stages=None):
        """
        Get predictions from each stage of the boosting process

        n_stages limits how many leading stages are returned; it defaults
        to the model's n_estimators. A value <= 0 yields an empty array.

        Returns an array with one row of predictions per stage.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting staged predictions")

        if n_stages is None:
            n_stages = self.model.n_estimators

        # zip() against range() stops after n_stages items and takes
        # nothing at all when n_stages <= 0
        staged_preds = [
            pred for _, pred in zip(range(n_stages), self.model.staged_predict(X))
        ]

        return np.array(staged_preds)

    def get_staged_probabilities(self, X, n_stages=None):
        """
        Get probability predictions from each stage of the boosting process

        n_stages limits how many leading stages are returned; it defaults
        to the model's n_estimators. A value <= 0 yields an empty array.

        Returns an array with one row per stage holding the second
        probability column (positive class) for every sample.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting staged probabilities")

        if n_stages is None:
            n_stages = self.model.n_estimators

        staged_probs = [
            prob[:, 1]  # Probability of positive class
            for _, prob in zip(range(n_stages), self.model.staged_predict_proba(X))
        ]

        return np.array(staged_probs)

    def get_model_info(self):
        """
        Get detailed model information

        Returns the current model's main hyperparameters plus the
        grid-search result stored in best_params (None when no tuning ran).

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting model info")

        return {
            'n_estimators': self.model.n_estimators,
            'learning_rate': self.model.learning_rate,
            'max_depth': self.model.max_depth,
            'min_samples_split': self.model.min_samples_split,
            'min_samples_leaf': self.model.min_samples_leaf,
            'subsample': self.model.subsample,
            'best_params': self.best_params
        }

### K-Nearest Neighbours

In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

class KNNModel:
    """
    K-Nearest Neighbors model for hate speech classification
    Good for capturing local patterns and non-linear decision boundaries

    Features are standardised before they reach the classifier. During
    cross-validation and grid search the scaler is fitted inside each
    training fold so that validation folds do not influence the scaling.
    """

    def __init__(self, random_state=42):
        """
        Build the default (untuned) classifier and its scaler.

        random_state is accepted so the constructor matches the other
        model wrappers in this file; this class does not use it.
        """
        self.model = KNeighborsClassifier(
            n_neighbors=5,
            weights='uniform',
            algorithm='auto',
            leaf_size=30,
            p=2  # Euclidean distance
        )
        self.scaler = StandardScaler()
        self.is_trained = False
        self.cv_scores = None
        self.best_params = None

    def _build_pipeline(self, estimator):
        """Chain a fresh StandardScaler with estimator for leak-free CV."""
        from sklearn.pipeline import Pipeline

        return Pipeline([('scaler', StandardScaler()), ('model', estimator)])

    def train(self, X_train, y_train, cv_folds=5, tune_hyperparameters=False):
        """
        Train the KNN model with optional hyperparameter tuning

        When tune_hyperparameters is true, a 3-fold grid search (F1 scoring)
        over a scaler+KNN pipeline replaces self.model with the best
        classifier found and stores the winning parameters in
        self.best_params. The model is then scored with cv_folds-fold
        stratified cross-validation and fitted on the scaled full
        training data.

        Returns a dict with 'f1_mean', 'f1_std' and the per-fold 'f1_scores'.
        """
        print("Training K-Nearest Neighbors...")

        if tune_hyperparameters:
            # Hyperparameter tuning; the 'model__' prefix addresses the
            # classifier step of the pipeline
            param_grid = {
                'model__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
                'model__weights': ['uniform', 'distance'],
                'model__p': [1, 2],  # Manhattan and Euclidean distance
                'model__leaf_size': [20, 30, 40]
            }

            grid_search = GridSearchCV(
                self._build_pipeline(KNeighborsClassifier()),
                param_grid,
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
                scoring='f1',
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)
            self.model = grid_search.best_estimator_.named_steps['model']
            # Report the parameters under the classifier's own names
            self.best_params = {
                name.split('__', 1)[1]: value
                for name, value in grid_search.best_params_.items()
            }
            print(f"Best parameters: {self.best_params}")

        # Perform cross-validation; scaling happens inside each fold
        # (cross_val_score clones the pipeline, self.model is not touched)
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(
            self._build_pipeline(self.model), X_train, y_train, cv=skf, scoring='f1'
        )

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        # Scale features for KNN (important for distance-based algorithms)
        X_train_scaled = self.scaler.fit_transform(X_train)
        self.model.fit(X_train_scaled, y_train)
        self.is_trained = True

        print(f"KNN CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions

        X is scaled with the scaler fitted in train().

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """
        Get prediction probabilities

        X is scaled with the scaler fitted in train().

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, F1 (from the hard
        predictions) and ROC AUC (from the second probability column).

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("KNN Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_neighbors_info(self, X, k=5):
        """
        Get information about k nearest neighbors for given samples

        Returns a dict with the 'distances' to and the 'indices' of the
        k nearest training samples, as reported by the model's kneighbors().

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting neighbors info")

        X_scaled = self.scaler.transform(X)
        distances, indices = self.model.kneighbors(X_scaled, n_neighbors=k)

        return {
            'distances': distances,
            'indices': indices
        }

    def get_model_params(self):
        """
        Get current model parameters

        Includes 'best_params' (None when no tuning ran), matching the
        other model wrappers in this file.
        """
        return {
            'n_neighbors': self.model.n_neighbors,
            'weights': self.model.weights,
            'algorithm': self.model.algorithm,
            'leaf_size': self.model.leaf_size,
            'p': self.model.p,
            'best_params': self.best_params
        }

### Lasso Classifier

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

class LassoClassifierModel:
    """
    Lasso Classifier model for hate speech classification
    Uses L1 regularization for feature selection and sparse solutions

    Implemented as an L1-penalised LogisticRegression on standardised
    features. During cross-validation and grid search the scaler is fitted
    inside each training fold so that validation folds do not influence
    the scaling.
    """

    def __init__(self, random_state=42):
        """
        Build the default (untuned) classifier and its scaler.

        random_state seeds the default model and the estimator that
        train() tunes with the grid search.
        """
        # Remembered so that hyperparameter tuning honours the caller's seed
        self.random_state = random_state
        self.model = LogisticRegression(
            random_state=random_state,
            penalty='l1',  # Lasso regularization
            solver='liblinear',
            C=1.0,  # Inverse of regularization strength
            max_iter=1000
        )
        self.scaler = StandardScaler()
        self.is_trained = False
        self.cv_scores = None
        self.best_params = None

    def _build_pipeline(self, estimator):
        """Chain a fresh StandardScaler with estimator for leak-free CV."""
        from sklearn.pipeline import Pipeline

        return Pipeline([('scaler', StandardScaler()), ('model', estimator)])

    def train(self, X_train, y_train, cv_folds=5, tune_hyperparameters=True):
        """
        Train the Lasso Classifier model with optional hyperparameter tuning

        When tune_hyperparameters is true, a 3-fold grid search (F1 scoring)
        over a scaler+classifier pipeline replaces self.model with the best
        classifier found and stores the winning parameters in
        self.best_params. The model is then scored with cv_folds-fold
        stratified cross-validation and fitted on the scaled full
        training data.

        Returns a dict with 'f1_mean', 'f1_std' and the per-fold 'f1_scores'.
        """
        print("Training Lasso Classifier...")

        if tune_hyperparameters:
            # Hyperparameter tuning; the 'model__' prefix addresses the
            # classifier step of the pipeline
            param_grid = {
                'model__C': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
                'model__max_iter': [1000, 2000]
            }

            grid_search = GridSearchCV(
                self._build_pipeline(
                    LogisticRegression(
                        # Use the seed given to __init__ instead of a hard-coded one
                        random_state=self.random_state,
                        penalty='l1',
                        solver='liblinear'
                    )
                ),
                param_grid,
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
                scoring='f1',
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)
            self.model = grid_search.best_estimator_.named_steps['model']
            # Report the parameters under the classifier's own names
            self.best_params = {
                name.split('__', 1)[1]: value
                for name, value in grid_search.best_params_.items()
            }
            print(f"Best parameters: {self.best_params}")

        # Perform cross-validation; scaling happens inside each fold
        # (cross_val_score clones the pipeline, self.model is not touched)
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(
            self._build_pipeline(self.model), X_train, y_train, cv=skf, scoring='f1'
        )

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        # Scale features for Lasso
        X_train_scaled = self.scaler.fit_transform(X_train)
        self.model.fit(X_train_scaled, y_train)
        self.is_trained = True

        print(f"Lasso Classifier CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions

        X is scaled with the scaler fitted in train().

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """
        Get prediction probabilities

        X is scaled with the scaler fitted in train().

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, F1 (from the hard
        predictions) and ROC AUC (from the second probability column).

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("Lasso Classifier Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Get feature importance (coefficients)

        Importance is the absolute value of the first row of the model's
        coefficient matrix. Returns the raw array, or - when feature_names
        is given - a DataFrame sorted by descending importance.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        importance = np.abs(self.model.coef_[0])

        if feature_names is not None:
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importance
            }).sort_values('importance', ascending=False)
            return importance_df
        else:
            return importance

    def get_selected_features(self, feature_names=None, threshold=0.0):
        """
        Get features selected by Lasso (non-zero coefficients)

        A feature counts as selected when the absolute value of its
        coefficient is strictly greater than threshold.

        With feature_names: returns a DataFrame ('feature', 'coefficient',
        'abs_coefficient') sorted by descending absolute coefficient.
        Without: returns a dict with the selected 'indices' and their
        'coefficients'.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting selected features")

        coefficients = self.model.coef_[0]
        selected_indices = np.where(np.abs(coefficients) > threshold)[0]

        if feature_names is not None:
            selected_features = [feature_names[i] for i in selected_indices]
            selected_coefficients = coefficients[selected_indices]

            selected_df = pd.DataFrame({
                'feature': selected_features,
                'coefficient': selected_coefficients,
                'abs_coefficient': np.abs(selected_coefficients)
            }).sort_values('abs_coefficient', ascending=False)

            return selected_df
        else:
            return {
                'indices': selected_indices,
                'coefficients': coefficients[selected_indices]
            }

    def get_sparsity_info(self):
        """
        Get information about model sparsity

        Returns the total, non-zero and zero coefficient counts and the
        fraction of coefficients that are exactly zero.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting sparsity info")

        coefficients = self.model.coef_[0]
        n_features = len(coefficients)
        n_nonzero = np.count_nonzero(coefficients)
        sparsity_ratio = 1 - (n_nonzero / n_features)

        return {
            'total_features': n_features,
            'non_zero_features': n_nonzero,
            'zero_features': n_features - n_nonzero,
            'sparsity_ratio': sparsity_ratio
        }

    def get_model_params(self):
        """
        Get current model parameters

        'best_params' is None when no tuning ran.
        """
        return {
            'C': self.model.C,
            'penalty': self.model.penalty,
            'solver': self.model.solver,
            'max_iter': self.model.max_iter,
            'best_params': self.best_params
        }

### LightGBM

In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class LightGBMModel:
    """
    LightGBM model for hate speech classification
    Fast gradient boosting with leaf-wise tree growth

    Wraps LGBMClassifier and adds optional grid-search tuning, stratified
    cross-validation, evaluation and introspection helpers.
    """

    def __init__(self, random_state=42):
        """
        Build the default (untuned) classifier.

        random_state seeds the default model and the estimator that
        train() tunes with the grid search.
        """
        # Remembered so that hyperparameter tuning honours the caller's seed
        self.random_state = random_state
        self.model = LGBMClassifier(
            random_state=random_state,
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            num_leaves=31,
            min_child_samples=20,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.0,
            reg_lambda=0.0,
            verbose=-1
        )
        self.is_trained = False
        self.cv_scores = None
        self.best_params = None

    def train(self, X_train, y_train, cv_folds=5, tune_hyperparameters=True):
        """
        Train the LightGBM model with optional hyperparameter tuning

        When tune_hyperparameters is true, a 3-fold grid search (F1 scoring)
        replaces self.model with the best estimator found and stores the
        winning parameters in self.best_params. The model is then scored
        with cv_folds-fold stratified cross-validation and fitted on the
        full training data.

        Returns a dict with 'f1_mean', 'f1_std' and the per-fold 'f1_scores'.
        """
        print("Training LightGBM...")

        if tune_hyperparameters:
            # Hyperparameter tuning
            param_grid = {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.05, 0.1, 0.15],
                'max_depth': [4, 6, 8],
                'num_leaves': [15, 31, 63],
                'min_child_samples': [10, 20, 30],
                'subsample': [0.7, 0.8, 0.9],
                'colsample_bytree': [0.7, 0.8, 0.9]
            }

            grid_search = GridSearchCV(
                # Use the seed given to __init__ instead of a hard-coded one
                LGBMClassifier(random_state=self.random_state, verbose=-1),
                param_grid,
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
                scoring='f1',
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)
            self.model = grid_search.best_estimator_
            self.best_params = grid_search.best_params_
            print(f"Best parameters: {self.best_params}")

        # Perform cross-validation
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(self.model, X_train, y_train, cv=skf, scoring='f1')

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"LightGBM CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Get prediction probabilities

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        return self.model.predict_proba(X)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, F1 (from the hard
        predictions) and ROC AUC (from the second probability column).

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("LightGBM Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_feature_importance(self, feature_names=None, importance_type='gain'):
        """
        Get feature importance

        importance_type is forwarded to the underlying booster's
        feature_importance() ('gain' or 'split').

        Returns the raw importance array, or - when feature_names is
        given - a DataFrame ('feature', 'importance') sorted by
        descending importance.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        # Ask the booster directly: the classifier's feature_importances_
        # property cannot take a per-call importance type
        importance = self.model.booster_.feature_importance(importance_type=importance_type)

        if feature_names is not None:
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importance
            }).sort_values('importance', ascending=False)
            return importance_df
        else:
            return importance

    def get_feature_importance_split(self, feature_names=None):
        """
        Get feature importance based on split count

        Returns the raw importance array, or - when feature_names is
        given - a DataFrame ('feature', 'split_importance') sorted by
        descending importance.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        # feature_importances_ is an array-valued property, not a method,
        # so the split counts have to come from the booster
        importance = self.model.booster_.feature_importance(importance_type='split')

        if feature_names is not None:
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'split_importance': importance
            }).sort_values('split_importance', ascending=False)
            return importance_df
        else:
            return importance

    def get_model_info(self):
        """
        Get detailed model information

        Returns the current model's main hyperparameters plus the
        grid-search result stored in best_params (None when no tuning ran).

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting model info")

        return {
            'n_estimators': self.model.n_estimators,
            'learning_rate': self.model.learning_rate,
            'max_depth': self.model.max_depth,
            'num_leaves': self.model.num_leaves,
            'min_child_samples': self.model.min_child_samples,
            'subsample': self.model.subsample,
            'colsample_bytree': self.model.colsample_bytree,
            'reg_alpha': self.model.reg_alpha,
            'reg_lambda': self.model.reg_lambda,
            'best_params': self.best_params
        }

    def get_booster_info(self):
        """
        Get information about the underlying booster

        Returns the number of trees and features reported by the booster
        and the number of classes reported by the fitted classifier.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting booster info")

        booster = self.model.booster_
        return {
            'num_trees': booster.num_trees(),
            # The Booster method is num_feature() (singular)
            'num_features': booster.num_feature(),
            # Booster exposes no class count; the sklearn wrapper does
            'num_classes': self.model.n_classes_
        }

### Linear SVC

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
import warnings
warnings.filterwarnings('ignore')

class LinearSVCModel:
    """
    Linear Support Vector Classifier model for hate speech classification
    Good for high-dimensional data and linear separability

    The LinearSVC is wrapped in CalibratedClassifierCV (3 folds, sigmoid)
    to obtain probability estimates, and features are standardised first.
    Calibration fits one LinearSVC per calibration fold, so helpers that
    expose coefficients or decision values aggregate over those per-fold
    models. During cross-validation and grid search the scaler is fitted
    inside each training fold so that validation folds do not influence
    the scaling.
    """

    def __init__(self, random_state=42):
        """
        Build the default (untuned) calibrated classifier and its scaler.

        random_state seeds the default LinearSVC and the one that
        train() tunes with the grid search.
        """
        # Remembered so that hyperparameter tuning honours the caller's seed
        self.random_state = random_state
        self.base_model = LinearSVC(
            random_state=random_state,
            C=1.0,
            loss='squared_hinge',
            max_iter=1000,
            dual=True
        )
        # Calibrated classifier for probability estimates
        self.model = CalibratedClassifierCV(
            self.base_model,
            cv=3,
            method='sigmoid'
        )
        self.scaler = StandardScaler()
        self.is_trained = False
        self.cv_scores = None
        self.best_params = None

    def _build_pipeline(self, estimator):
        """Chain a fresh StandardScaler with estimator for leak-free CV."""
        from sklearn.pipeline import Pipeline

        return Pipeline([('scaler', StandardScaler()), ('model', estimator)])

    def _fold_estimators(self):
        """Return the LinearSVC fitted inside each calibration fold."""
        return [calibrated.estimator for calibrated in self.model.calibrated_classifiers_]

    def _mean_decision(self, X_scaled):
        """Average the per-fold decision values for already-scaled samples."""
        return np.mean(
            [svc.decision_function(X_scaled) for svc in self._fold_estimators()],
            axis=0
        )

    def train(self, X_train, y_train, cv_folds=5, tune_hyperparameters=True):
        """
        Train the Linear SVC model with optional hyperparameter tuning

        When tune_hyperparameters is true, a 3-fold grid search (F1 scoring)
        over a scaler+classifier pipeline replaces self.model with the best
        calibrated classifier found and stores the winning parameters in
        self.best_params (keys are prefixed with 'estimator__' because they
        belong to the LinearSVC inside the calibrator). The model is then
        scored with cv_folds-fold stratified cross-validation and fitted on
        the scaled full training data.

        Returns a dict with 'f1_mean', 'f1_std' and the per-fold 'f1_scores'.
        """
        print("Training Linear SVC...")

        if tune_hyperparameters:
            # Hyperparameter tuning; 'model__' addresses the calibrator step
            # of the pipeline and 'estimator__' the LinearSVC inside it
            param_grid = {
                'model__estimator__C': [0.1, 0.5, 1.0, 2.0, 5.0],
                'model__estimator__loss': ['hinge', 'squared_hinge'],
                'model__estimator__max_iter': [1000, 2000]
            }

            grid_search = GridSearchCV(
                self._build_pipeline(
                    CalibratedClassifierCV(
                        # dual=True matches the default model and is
                        # needed for loss='hinge'
                        LinearSVC(random_state=self.random_state, dual=True),
                        cv=3,
                        method='sigmoid'
                    )
                ),
                param_grid,
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
                scoring='f1',
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)
            self.model = grid_search.best_estimator_.named_steps['model']
            # Drop the pipeline-step prefix from the reported parameters
            self.best_params = {
                name.split('__', 1)[1]: value
                for name, value in grid_search.best_params_.items()
            }
            print(f"Best parameters: {self.best_params}")

        # Perform cross-validation; scaling happens inside each fold
        # (cross_val_score clones the pipeline, self.model is not touched)
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(
            self._build_pipeline(self.model), X_train, y_train, cv=skf, scoring='f1'
        )

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        # Scale features for SVC
        X_train_scaled = self.scaler.fit_transform(X_train)
        self.model.fit(X_train_scaled, y_train)
        self.is_trained = True

        print(f"Linear SVC CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions

        X is scaled with the scaler fitted in train().

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """
        Get prediction probabilities

        X is scaled with the scaler fitted in train().

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

    def predict_decision_function(self, X):
        """
        Get decision function values (distance from hyperplane)

        The calibrated wrapper has no decision function of its own, so
        the values of the per-fold LinearSVC models are averaged.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        return self._mean_decision(X_scaled)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, F1 (from the hard
        predictions) and ROC AUC (from the second probability column).

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("Linear SVC Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Get feature importance (coefficients)

        Importance is the absolute value of the coefficient vector
        averaged over the per-fold LinearSVC models. Returns the raw
        array, or - when feature_names is given - a DataFrame sorted by
        descending importance.

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        # Average the first coefficient row of every fold model
        mean_coef = np.mean([svc.coef_[0] for svc in self._fold_estimators()], axis=0)
        importance = np.abs(mean_coef)

        if feature_names is not None:
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importance
            }).sort_values('importance', ascending=False)
            return importance_df
        else:
            return importance

    def get_support_vectors_info(self, X_train, y_train):
        """
        Get information about support vectors

        LinearSVC does not record its support vectors, so they are
        recovered as the training samples whose averaged decision value
        lies on or inside the margin (absolute value at most 1).

        Returns a dict with the count, the positional indices, the scaled
        support vectors and their labels (as a NumPy array).

        Raises ValueError when the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting support vectors info")

        X_train_scaled = self.scaler.transform(X_train)
        decision = self._mean_decision(X_train_scaled)

        # Small tolerance so samples exactly on the margin are kept
        support_indices = np.where(np.abs(decision) <= 1 + 1e-15)[0]
        support_vectors = X_train_scaled[support_indices]

        return {
            'n_support_vectors': len(support_indices),
            'support_indices': support_indices,
            'support_vectors': support_vectors,
            # Positional lookup that also works for pandas Series labels
            'support_labels': np.asarray(y_train)[support_indices]
        }

    def get_model_params(self):
        """
        Get current model parameters

        Reads the hyperparameters from the LinearSVC template held by the
        calibrated classifier; 'best_params' is None when no tuning ran.
        """
        base_estimator = self.model.estimator
        return {
            'C': base_estimator.C,
            'loss': base_estimator.loss,
            'max_iter': base_estimator.max_iter,
            'dual': base_estimator.dual,
            'best_params': self.best_params
        }

### Logistic Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class LogisticRegressionModel:
    """
    Logistic Regression baseline for hate speech classification.

    Wraps scikit-learn's LogisticRegression with cross-validated training,
    metric reporting and coefficient-based feature importance.
    """

    def __init__(self, random_state=42):
        self.cv_scores = None
        self.is_trained = False
        self.model = LogisticRegression(
            solver='liblinear',
            C=1.0,
            max_iter=1000,
            random_state=random_state,
        )

    def _require_trained(self, message):
        """Raise ValueError(message) unless train() has completed."""
        if not self.is_trained:
            raise ValueError(message)

    def train(self, X_train, y_train, cv_folds=5):
        """
        Cross-validate the model, then fit it on the full training set.

        F1 is scored on `cv_folds` shuffled stratified folds; the per-fold
        scores with their mean and standard deviation are stored in
        self.cv_scores and returned.
        """
        print("Training Logistic Regression...")

        splitter = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        fold_f1 = cross_val_score(self.model, X_train, y_train, cv=splitter, scoring='f1')

        summary = {}
        summary['f1_mean'] = fold_f1.mean()
        summary['f1_std'] = fold_f1.std()
        summary['f1_scores'] = fold_f1
        self.cv_scores = summary

        # Final fit uses every training row
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"Logistic Regression CV F1: {fold_f1.mean():.4f} (+/- {fold_f1.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Return the predicted class labels for X.
        """
        self._require_trained("Model must be trained before making predictions")
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Return the per-class probability estimates for X.
        """
        self._require_trained("Model must be trained before making predictions")
        return self.model.predict_proba(X)

    def evaluate(self, X_test, y_test):
        """
        Score the model on a test set and print the metrics.

        Returns a dict with accuracy, precision, recall, f1 and auc; AUC is
        computed from the second probability column.
        """
        self._require_trained("Model must be trained before evaluation")

        labels = self.predict(X_test)
        positive_scores = self.predict_proba(X_test)[:, 1]

        results = {}
        results['accuracy'] = accuracy_score(y_test, labels)
        results['precision'] = precision_score(y_test, labels)
        results['recall'] = recall_score(y_test, labels)
        results['f1'] = f1_score(y_test, labels)
        results['auc'] = roc_auc_score(y_test, positive_scores)

        print("Logistic Regression Results:")
        for metric in results:
            print(f"  {metric.capitalize()}: {results[metric]:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Return the absolute values of the first coefficient row.

        Without feature_names the raw array is returned; with them, a
        DataFrame sorted by descending importance.
        """
        self._require_trained("Model must be trained before getting feature importance")

        weights = np.abs(self.model.coef_[0])
        if feature_names is None:
            return weights

        table = pd.DataFrame({'feature': feature_names, 'importance': weights})
        return table.sort_values('importance', ascending=False)
        
        
        



### Naive Bayes

In [None]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class NaiveBayesModel:
    """
    Multinomial Naive Bayes wrapper for hate speech classification.

    Adds cross-validated training, metric reporting and a log-probability
    based feature importance on top of scikit-learn's MultinomialNB.
    """

    def __init__(self, random_state=42):
        # random_state is accepted for a uniform constructor signature;
        # it is not passed to MultinomialNB.
        self.is_trained = False
        self.cv_scores = None
        self.model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

    def train(self, X_train, y_train, cv_folds=5):
        """
        Run stratified cross-validation (F1), then fit on all training data.

        Stores and returns a dict with 'f1_mean', 'f1_std' and 'f1_scores'.
        """
        print("Training Naive Bayes...")

        folds = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        f1_per_fold = cross_val_score(self.model, X_train, y_train, cv=folds, scoring='f1')

        self.cv_scores = dict(
            f1_mean=f1_per_fold.mean(),
            f1_std=f1_per_fold.std(),
            f1_scores=f1_per_fold,
        )

        # Refit on the complete training set
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"Naive Bayes CV F1: {f1_per_fold.mean():.4f} (+/- {f1_per_fold.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Return predicted class labels for X.
        """
        if self.is_trained:
            return self.model.predict(X)
        raise ValueError("Model must be trained before making predictions")

    def predict_proba(self, X):
        """
        Return per-class probability estimates for X.
        """
        if self.is_trained:
            return self.model.predict_proba(X)
        raise ValueError("Model must be trained before making predictions")

    def evaluate(self, X_test, y_test):
        """
        Compute accuracy, precision, recall, F1 and AUC on a test set.

        The metrics are printed and returned as a dict; AUC uses the
        second probability column.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        predicted = self.predict(X_test)
        scores = self.predict_proba(X_test)[:, 1]

        results = {}
        results['accuracy'] = accuracy_score(y_test, predicted)
        results['precision'] = precision_score(y_test, predicted)
        results['recall'] = recall_score(y_test, predicted)
        results['f1'] = f1_score(y_test, predicted)
        results['auc'] = roc_auc_score(y_test, scores)

        print("Naive Bayes Results:")
        for label, number in results.items():
            print(f"  {label.capitalize()}: {number:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Rank features by the gap between the two classes' log probabilities.

        Importance is the absolute difference between rows 1 and 0 of
        feature_log_prob_. Returns the array, or a DataFrame sorted by
        descending importance when feature_names is given.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        log_prob = self.model.feature_log_prob_
        gap = np.abs(log_prob[1] - log_prob[0])

        if feature_names is None:
            return gap

        frame = pd.DataFrame({'feature': feature_names, 'importance': gap})
        return frame.sort_values('importance', ascending=False)

### Random Forest

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class RandomForestModel:
    """
    Random Forest wrapper for hate speech classification.

    Provides cross-validated training, metric reporting and access to the
    forest's feature importances.
    """

    def __init__(self, random_state=42):
        forest_settings = {
            'n_estimators': 100,
            'max_depth': None,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'random_state': random_state,
            'n_jobs': -1,
        }
        self.model = RandomForestClassifier(**forest_settings)
        self.is_trained = False
        self.cv_scores = None

    def train(self, X_train, y_train, cv_folds=5):
        """
        Cross-validate with shuffled stratified folds, then fit on all data.

        Stores and returns the F1 mean, standard deviation and per-fold
        scores.
        """
        print("Training Random Forest...")

        kfold = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        scores = cross_val_score(self.model, X_train, y_train, cv=kfold, scoring='f1')

        self.cv_scores = {
            'f1_mean': scores.mean(),
            'f1_std': scores.std(),
            'f1_scores': scores,
        }

        # Fit the final forest on the whole training set
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"Random Forest CV F1: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Return predicted class labels for X.
        """
        if self.is_trained:
            return self.model.predict(X)
        raise ValueError("Model must be trained before making predictions")

    def predict_proba(self, X):
        """
        Return per-class probability estimates for X.
        """
        if self.is_trained:
            return self.model.predict_proba(X)
        raise ValueError("Model must be trained before making predictions")

    def evaluate(self, X_test, y_test):
        """
        Print and return accuracy, precision, recall, F1 and AUC.

        AUC is computed from the second probability column.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        hard_labels = self.predict(X_test)
        soft_scores = self.predict_proba(X_test)[:, 1]

        results = {}
        results['accuracy'] = accuracy_score(y_test, hard_labels)
        results['precision'] = precision_score(y_test, hard_labels)
        results['recall'] = recall_score(y_test, hard_labels)
        results['f1'] = f1_score(y_test, hard_labels)
        results['auc'] = roc_auc_score(y_test, soft_scores)

        print("Random Forest Results:")
        for key in results:
            print(f"  {key.capitalize()}: {results[key]:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Return the model's feature_importances_.

        With feature_names, the values are returned as a DataFrame sorted
        by descending importance; otherwise the raw array is returned.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        scores = self.model.feature_importances_
        if feature_names is None:
            return scores

        ranking = pd.DataFrame({'feature': feature_names, 'importance': scores})
        return ranking.sort_values('importance', ascending=False)

### Ridge Classifier

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
import warnings
warnings.filterwarnings('ignore')

class RidgeClassifierModel:
    """
    Ridge Classifier model for hate speech classification
    Good for high-dimensional data with L2 regularization

    The RidgeClassifier is wrapped in a CalibratedClassifierCV (sigmoid
    method, 3 internal folds) so that predict_proba is available, and all
    inputs pass through a StandardScaler that is fitted in train().
    """

    def __init__(self, random_state=42):
        self.base_model = RidgeClassifier(
            random_state=random_state,
            alpha=1.0,
            solver='auto',
            max_iter=1000
        )
        # Calibrated classifier for probability estimates
        self.model = CalibratedClassifierCV(
            self.base_model,
            cv=3,
            method='sigmoid'
        )
        self.scaler = StandardScaler()
        self.is_trained = False
        self.cv_scores = None
        self.best_params = None

    def _base_estimator_param(self):
        """
        Return the CalibratedClassifierCV parameter name holding the wrapped
        estimator ('estimator' on scikit-learn >= 1.2, else 'base_estimator').
        """
        shallow_params = self.model.get_params(deep=False)
        return 'estimator' if 'estimator' in shallow_params else 'base_estimator'

    def _configured_base_estimator(self):
        """Return the (unfitted) estimator the calibrator was configured with."""
        configured = getattr(self.model, 'estimator', None)
        if configured is None:
            configured = getattr(self.model, 'base_estimator')
        return configured

    def _fitted_base_estimators(self):
        """
        Return the fitted base estimators held by the calibrator.

        A fitted CalibratedClassifierCV keeps its trained estimators in
        `calibrated_classifiers_` (one per internal fold); there is no single
        `base_estimator_` attribute on the calibrator itself.
        """
        fitted = []
        for calibrated in self.model.calibrated_classifiers_:
            estimator = getattr(calibrated, 'estimator', None)
            if estimator is None:
                # attribute name used before the scikit-learn 1.2 rename
                estimator = getattr(calibrated, 'base_estimator')
            fitted.append(estimator)
        return fitted

    def train(self, X_train, y_train, cv_folds=5, tune_hyperparameters=True):
        """
        Train the Ridge Classifier model with optional hyperparameter tuning

        Steps: fit the scaler on X_train, optionally grid-search alpha and
        solver of the wrapped RidgeClassifier (3-fold, F1), cross-validate the
        resulting model with `cv_folds` stratified folds, then fit it on the
        full scaled training set.

        Returns a dict with 'f1_mean', 'f1_std' and 'f1_scores'.
        """
        print("Training Ridge Classifier...")

        # Scale features for Ridge Classifier
        X_train_scaled = self.scaler.fit_transform(X_train)

        if tune_hyperparameters:
            # Nested parameter names must use the calibrator's actual
            # parameter name, which depends on the scikit-learn version.
            prefix = self._base_estimator_param()
            param_grid = {
                f'{prefix}__alpha': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
                f'{prefix}__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
            }

            # GridSearchCV clones the estimator it is given, so passing
            # self.model keeps the random_state / max_iter chosen in __init__
            # instead of silently resetting them.
            grid_search = GridSearchCV(
                self.model,
                param_grid,
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
                scoring='f1',
                n_jobs=-1
            )

            grid_search.fit(X_train_scaled, y_train)
            self.model = grid_search.best_estimator_
            self.best_params = grid_search.best_params_
            print(f"Best parameters: {self.best_params}")

        # Perform cross-validation
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(self.model, X_train_scaled, y_train, cv=skf, scoring='f1')

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        self.model.fit(X_train_scaled, y_train)
        self.is_trained = True

        print(f"Ridge Classifier CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """
        Get prediction probabilities
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

    def predict_decision_function(self, X):
        """
        Get decision function values

        The calibrator itself has no decision_function, so the scores of the
        fitted ridge estimators are averaged.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        X_scaled = self.scaler.transform(X)
        fold_scores = [
            estimator.decision_function(X_scaled)
            for estimator in self._fitted_base_estimators()
        ]
        return np.mean(fold_scores, axis=0)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, F1 and AUC (AUC uses
        the second probability column).
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("Ridge Classifier Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Get feature importance (coefficients)

        Importance is the absolute value of the first coefficient row,
        averaged over the fitted ridge estimators inside the calibrator.
        Returns the array, or a DataFrame sorted by descending importance
        when feature_names is given.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        fold_coefs = [estimator.coef_[0] for estimator in self._fitted_base_estimators()]
        importance = np.abs(np.mean(fold_coefs, axis=0))

        if feature_names is not None:
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importance
            }).sort_values('importance', ascending=False)
            return importance_df
        else:
            return importance

    def get_model_params(self):
        """
        Get current model parameters

        Reads alpha, solver and max_iter from the estimator the calibrator is
        configured with, plus the best_params found by the grid search (None
        when no tuning has run).
        """
        base_estimator = self._configured_base_estimator()
        return {
            'alpha': base_estimator.alpha,
            'solver': base_estimator.solver,
            'max_iter': base_estimator.max_iter,
            'best_params': self.best_params
        }

    def get_intercept(self):
        """
        Get model intercept

        Returns the first intercept entry averaged over the fitted ridge
        estimators inside the calibrator.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting intercept")
        intercepts = [estimator.intercept_[0] for estimator in self._fitted_base_estimators()]
        return float(np.mean(intercepts))

### SVM

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class SVMModel:
    """
    Linear Support Vector Machine (LinearSVC) for hate speech classification.

    Adds cross-validated training, sigmoid-squashed pseudo-probabilities,
    metric reporting and coefficient-based feature importance.
    """

    def __init__(self, random_state=42):
        self.cv_scores = None
        self.is_trained = False
        self.model = LinearSVC(
            loss='squared_hinge',
            C=1.0,
            max_iter=1000,
            random_state=random_state,
        )

    def _require_trained(self, message):
        """Raise ValueError(message) unless train() has completed."""
        if not self.is_trained:
            raise ValueError(message)

    def train(self, X_train, y_train, cv_folds=5):
        """
        Cross-validate (stratified, shuffled, F1) and then fit on all data.

        Stores and returns a dict with 'f1_mean', 'f1_std' and 'f1_scores'.
        """
        print("Training SVM (LinearSVC)...")

        splitter = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        fold_f1 = cross_val_score(self.model, X_train, y_train, cv=splitter, scoring='f1')

        self.cv_scores = dict(
            f1_mean=fold_f1.mean(),
            f1_std=fold_f1.std(),
            f1_scores=fold_f1,
        )

        # Final fit on the complete training set
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"SVM CV F1: {fold_f1.mean():.4f} (+/- {fold_f1.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Return predicted class labels for X.
        """
        self._require_trained("Model must be trained before making predictions")
        return self.model.predict(X)

    def predict_proba(self, X):
        """
        Approximate class probabilities from the decision function.

        LinearSVC offers no predict_proba, so the raw margins are passed
        through a logistic sigmoid. The result is a two-column array
        [1 - p, p]; it is an approximation, not a calibrated probability.
        """
        self._require_trained("Model must be trained before making predictions")

        margins = self.model.decision_function(X)

        # Logistic squashing of the margin into (0, 1)
        positive = 1 / (1 + np.exp(-margins))
        negative = 1 - positive

        return np.column_stack([negative, positive])

    def decision_function(self, X):
        """
        Return the raw decision function scores for X.
        """
        self._require_trained("Model must be trained before making predictions")
        return self.model.decision_function(X)

    def evaluate(self, X_test, y_test):
        """
        Print and return accuracy, precision, recall, F1 and AUC.

        AUC is computed from the second column of predict_proba.
        """
        self._require_trained("Model must be trained before evaluation")

        labels = self.predict(X_test)
        pseudo_proba = self.predict_proba(X_test)[:, 1]

        results = {}
        results['accuracy'] = accuracy_score(y_test, labels)
        results['precision'] = precision_score(y_test, labels)
        results['recall'] = recall_score(y_test, labels)
        results['f1'] = f1_score(y_test, labels)
        results['auc'] = roc_auc_score(y_test, pseudo_proba)

        print("SVM Results:")
        for name in results:
            print(f"  {name.capitalize()}: {results[name]:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Return the absolute values of the first coefficient row.

        With feature_names, a DataFrame sorted by descending importance is
        returned instead of the raw array.
        """
        self._require_trained("Model must be trained before getting feature importance")

        magnitudes = np.abs(self.model.coef_[0])
        if feature_names is None:
            return magnitudes

        table = pd.DataFrame({'feature': feature_names, 'importance': magnitudes})
        return table.sort_values('importance', ascending=False)

### XGBoost

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class XGBoostModel:
    """
    XGBoost (gradient boosted trees) wrapper for hate speech classification.

    The classifier is created with use_label_encoder=False and the
    'logloss' evaluation metric.
    """

    def __init__(self, random_state=42):
        booster_settings = {
            'n_estimators': 100,
            'max_depth': 6,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': random_state,
            'use_label_encoder': False,
            'eval_metric': 'logloss',
            'n_jobs': -1,
        }
        self.model = xgb.XGBClassifier(**booster_settings)
        self.is_trained = False
        self.cv_scores = None

    def train(self, X_train, y_train, cv_folds=5):
        """
        Cross-validate with shuffled stratified folds (F1), then fit on all data.

        Stores and returns a dict with 'f1_mean', 'f1_std' and 'f1_scores'.
        """
        print("Training XGBoost...")

        folds = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        f1_values = cross_val_score(self.model, X_train, y_train, cv=folds, scoring='f1')

        summary = {}
        summary['f1_mean'] = f1_values.mean()
        summary['f1_std'] = f1_values.std()
        summary['f1_scores'] = f1_values
        self.cv_scores = summary

        # Refit using the whole training set
        self.model.fit(X_train, y_train)
        self.is_trained = True

        print(f"XGBoost CV F1: {f1_values.mean():.4f} (+/- {f1_values.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Return predicted class labels for X.
        """
        if self.is_trained:
            return self.model.predict(X)
        raise ValueError("Model must be trained before making predictions")

    def predict_proba(self, X):
        """
        Return per-class probability estimates for X.
        """
        if self.is_trained:
            return self.model.predict_proba(X)
        raise ValueError("Model must be trained before making predictions")

    def evaluate(self, X_test, y_test):
        """
        Print and return accuracy, precision, recall, F1 and AUC.

        AUC is computed from the second probability column.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before evaluation")

        predicted = self.predict(X_test)
        positive_proba = self.predict_proba(X_test)[:, 1]

        results = {}
        results['accuracy'] = accuracy_score(y_test, predicted)
        results['precision'] = precision_score(y_test, predicted)
        results['recall'] = recall_score(y_test, predicted)
        results['f1'] = f1_score(y_test, predicted)
        results['auc'] = roc_auc_score(y_test, positive_proba)

        print("XGBoost Results:")
        for key, score in results.items():
            print(f"  {key.capitalize()}: {score:.4f}")

        return results

    def get_feature_importance(self, feature_names=None):
        """
        Return the booster's feature_importances_.

        With feature_names, a DataFrame sorted by descending importance is
        returned instead of the raw array.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting feature importance")

        gains = self.model.feature_importances_
        if feature_names is None:
            return gains

        ranking = pd.DataFrame({'feature': feature_names, 'importance': gains})
        return ranking.sort_values('importance', ascending=False)

# Model Combination
### Meta Classifier

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

class MetaClassifier:
    """
    Meta-classifier that learns from base model outputs
    Uses Logistic Regression to combine predictions with meta-features

    Each registered base model contributes one score per sample; summary
    statistics across the models are appended, the result is standardised
    and fed to the Logistic Regression.
    """

    # Names of the summary statistics, in the column order produced by
    # extract_meta_features (they follow the per-model score columns).
    _SUMMARY_FEATURE_NAMES = (
        'model_disagreement',
        'confidence_gap',
        'prediction_variance',
        'mean_prediction',
        'median_prediction',
        'prediction_range',
        'models_above_threshold',
    )

    def __init__(self, random_state=42):
        self.meta_model = LogisticRegression(
            random_state=random_state,
            max_iter=1000,
            C=1.0,
            solver='liblinear'
        )
        self.scaler = StandardScaler()
        self.is_trained = False
        self.base_models = {}
        self.meta_features_names = []

    def add_base_model(self, name, model):
        """
        Add a base model to the ensemble

        The model must expose predict_proba or decision_function. Adding a
        model under an existing name replaces the previous one.
        """
        self.base_models[name] = model

    def extract_meta_features(self, X):
        """
        Extract meta-features from base model predictions

        Returns an array of shape (n_samples, n_models + 7): one score column
        per base model followed by the summary statistics named in
        _SUMMARY_FEATURE_NAMES. self.meta_features_names is refreshed on every
        call so it always matches the returned columns.

        Raises ValueError when no base model has been added.
        """
        if not self.base_models:
            raise ValueError("No base models added to meta-classifier")

        scores_by_model = self.get_base_model_predictions(X)
        predictions = np.array(list(scores_by_model.values())).T  # Shape: (n_samples, n_models)

        # Model disagreement (standard deviation of predictions)
        disagreement = np.std(predictions, axis=1, keepdims=True)

        # Max-min spread; emitted twice below because both 'confidence_gap'
        # and 'prediction_range' are defined as this same quantity.
        spread = np.max(predictions, axis=1, keepdims=True) - np.min(predictions, axis=1, keepdims=True)

        prediction_variance = np.var(predictions, axis=1, keepdims=True)
        mean_prediction = np.mean(predictions, axis=1, keepdims=True)
        median_prediction = np.median(predictions, axis=1, keepdims=True)

        # Number of models predicting above threshold
        # NOTE(review): models that only offer decision_function contribute
        # raw scores, not probabilities, yet share this 0.5 threshold --
        # confirm that is intended.
        threshold = 0.5
        above_threshold = np.sum(predictions > threshold, axis=1, keepdims=True)

        combined_features = np.hstack([
            predictions,
            disagreement,
            spread,
            prediction_variance,
            mean_prediction,
            median_prediction,
            spread,
            above_threshold,
        ])

        # Rebuild the names every time: caching them after the first call
        # left them stale when a base model was added later.
        self.meta_features_names = [f"{name}_pred" for name in self.base_models]
        self.meta_features_names.extend(self._SUMMARY_FEATURE_NAMES)

        return combined_features

    def train(self, X_train, y_train, X_val=None, y_val=None):
        """
        Train the meta-classifier on base model outputs

        Fits the scaler and the Logistic Regression on the meta-features of
        X_train. When both X_val and y_val are given, F1 and accuracy on the
        validation set are printed. Returns self.
        """
        print("Training Meta-Classifier...")

        # Extract meta-features for training
        X_meta_train = self.extract_meta_features(X_train)

        # Scale features
        X_meta_train_scaled = self.scaler.fit_transform(X_meta_train)

        # Train meta-classifier
        self.meta_model.fit(X_meta_train_scaled, y_train)
        self.is_trained = True

        # Evaluate on validation set if provided
        if X_val is not None and y_val is not None:
            X_meta_val = self.extract_meta_features(X_val)
            X_meta_val_scaled = self.scaler.transform(X_meta_val)

            y_pred_meta = self.meta_model.predict(X_meta_val_scaled)
            f1_meta = f1_score(y_val, y_pred_meta)
            accuracy_meta = accuracy_score(y_val, y_pred_meta)

            print(f"Meta-classifier validation F1: {f1_meta:.4f}")
            print(f"Meta-classifier validation Accuracy: {accuracy_meta:.4f}")

        return self

    def predict(self, X):
        """
        Make predictions using meta-classifier
        """
        if not self.is_trained:
            raise ValueError("Meta-classifier must be trained before making predictions")

        X_meta = self.extract_meta_features(X)
        X_meta_scaled = self.scaler.transform(X_meta)

        return self.meta_model.predict(X_meta_scaled)

    def predict_proba(self, X):
        """
        Get prediction probabilities from meta-classifier
        """
        if not self.is_trained:
            raise ValueError("Meta-classifier must be trained before making predictions")

        X_meta = self.extract_meta_features(X)
        X_meta_scaled = self.scaler.transform(X_meta)

        return self.meta_model.predict_proba(X_meta_scaled)

    def evaluate(self, X_test, y_test):
        """
        Evaluate meta-classifier performance

        Prints and returns accuracy, precision, recall, F1 and AUC (AUC uses
        the second probability column).
        """
        if not self.is_trained:
            raise ValueError("Meta-classifier must be trained before evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("Meta-Classifier Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_feature_importance(self):
        """
        Get meta-feature importance

        Returns a DataFrame of meta-feature names and the absolute value of
        their Logistic Regression coefficient, sorted descending.
        """
        if not self.is_trained:
            raise ValueError("Meta-classifier must be trained before getting feature importance")

        importance = np.abs(self.meta_model.coef_[0])

        importance_df = pd.DataFrame({
            'feature': self.meta_features_names,
            'importance': importance
        }).sort_values('importance', ascending=False)

        return importance_df

    def get_base_model_predictions(self, X):
        """
        Get predictions from all base models

        Returns a dict mapping model name to its score array: the second
        predict_proba column when available, otherwise the decision_function
        output. Raises ValueError for a model offering neither.
        """
        predictions = {}

        for name, model in self.base_models.items():
            if hasattr(model, 'predict_proba'):
                pred_proba = model.predict_proba(X)[:, 1]
            elif hasattr(model, 'decision_function'):
                pred_proba = model.decision_function(X)
            else:
                raise ValueError(f"Model {name} must have predict_proba or decision_function method")

            predictions[name] = pred_proba

        return predictions

    def analyze_model_agreement(self, X):
        """
        Analyze agreement between base models

        Scores are binarised at 0.5. Returns (agreement_matrix, model_names)
        where entry [i, j] is the fraction of samples on which models i and j
        give the same binary prediction.
        """
        predictions = self.get_base_model_predictions(X)

        # Convert to binary predictions
        binary_predictions = {
            name: (pred > 0.5).astype(int) for name, pred in predictions.items()
        }

        # Calculate agreement matrix
        model_names = list(binary_predictions)
        n_models = len(model_names)
        agreement_matrix = np.zeros((n_models, n_models))

        for i, first in enumerate(model_names):
            for j, second in enumerate(model_names):
                agreement_matrix[i, j] = np.mean(
                    binary_predictions[first] == binary_predictions[second]
                )

        return agreement_matrix, model_names

### Voting Classifier

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

class VotingClassifierModel:
    """
    Voting Classifier model for hate speech classification.

    Soft-voting ensemble of logistic regression, a depth-limited decision
    tree, an SVC with probability estimates, and multinomial naive Bayes.
    Inputs are standardized with ``self.scaler`` before reaching the
    ensemble; the naive Bayes member additionally rescales its input to
    [0, 1] because it cannot handle negative feature values.
    """

    def __init__(self, random_state=42):
        """
        Build the unfitted ensemble and its feature scaler.

        Args:
            random_state: seed forwarded to the estimators that accept one.
        """
        # Imported here because this cell's import block does not provide them.
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import MinMaxScaler

        # Define base estimators
        estimators = [
            ('lr', LogisticRegression(random_state=random_state, max_iter=1000)),
            ('dt', DecisionTreeClassifier(random_state=random_state, max_depth=10)),
            ('svc', SVC(random_state=random_state, probability=True)),
            # MultinomialNB raises ValueError on negative inputs, and
            # standardized features are centred on zero. Map them back to
            # [0, 1] for this member only; clip=True keeps unseen samples
            # that fall outside the training range non-negative as well.
            ('nb', Pipeline([
                ('non_negative', MinMaxScaler(clip=True)),
                ('nb', MultinomialNB()),
            ])),
        ]

        self.model = VotingClassifier(
            estimators=estimators,
            voting='soft',  # Use probability voting
            weights=None  # Equal weights for all estimators
        )
        self.scaler = StandardScaler()
        self.is_trained = False
        self.cv_scores = None

    def _check_trained(self, action):
        """Raise ValueError unless train() has completed; `action` ends the message."""
        if not self.is_trained:
            raise ValueError(f"Model must be trained before {action}")

    def train(self, X_train, y_train, cv_folds=5):
        """
        Cross-validate the ensemble, then fit it on the full training set.

        Args:
            X_train: training feature matrix.
            y_train: training labels (scored with scoring='f1').
            cv_folds: number of stratified folds.

        Returns:
            dict with 'f1_mean', 'f1_std' and the per-fold 'f1_scores'.
        """
        from sklearn.pipeline import make_pipeline

        print("Training Voting Classifier...")

        # Perform cross-validation on scaler + ensemble as one estimator so
        # the scaler is re-fitted on each training fold only. Scaling the
        # whole training set up front would leak validation-fold statistics
        # into the CV score. cross_val_score works on clones, so neither
        # self.scaler nor self.model is fitted by this step.
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_estimator = make_pipeline(self.scaler, self.model)
        cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=skf, scoring='f1')

        self.cv_scores = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'f1_scores': cv_scores
        }

        # Train on full dataset
        X_train_scaled = self.scaler.fit_transform(X_train)
        self.model.fit(X_train_scaled, y_train)
        self.is_trained = True

        print(f"Voting Classifier CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        return self.cv_scores

    def predict(self, X):
        """
        Make binary predictions

        Raises:
            ValueError: if the model has not been trained.
        """
        self._check_trained("making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def predict_proba(self, X):
        """
        Get prediction probabilities

        Raises:
            ValueError: if the model has not been trained.
        """
        self._check_trained("making predictions")
        X_scaled = self.scaler.transform(X)
        return self.model.predict_proba(X_scaled)

    def evaluate(self, X_test, y_test):
        """
        Evaluate model performance

        Prints and returns accuracy, precision, recall, F1 and ROC AUC; AUC
        is computed from the probability column at index 1.

        Raises:
            ValueError: if the model has not been trained.
        """
        self._check_trained("evaluation")

        y_pred = self.predict(X_test)
        y_pred_proba = self.predict_proba(X_test)[:, 1]

        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba)
        }

        print("Voting Classifier Results:")
        for metric, value in results.items():
            print(f"  {metric.capitalize()}: {value:.4f}")

        return results

    def get_individual_predictions(self, X):
        """
        Get predictions from individual estimators

        Returns:
            dict mapping estimator name -> predicted labels for X.

        Raises:
            ValueError: if the model has not been trained.
        """
        self._check_trained("getting individual predictions")

        X_scaled = self.scaler.transform(X)
        return {
            name: estimator.predict(X_scaled)
            for name, estimator in self.model.named_estimators_.items()
            if hasattr(estimator, 'predict')
        }

    def get_individual_probabilities(self, X):
        """
        Get probability predictions from individual estimators

        Returns:
            dict mapping estimator name -> probability column at index 1
            (the positive class). Estimators without predict_proba are
            omitted.

        Raises:
            ValueError: if the model has not been trained.
        """
        self._check_trained("getting individual probabilities")

        X_scaled = self.scaler.transform(X)
        return {
            name: estimator.predict_proba(X_scaled)[:, 1]
            for name, estimator in self.model.named_estimators_.items()
            if hasattr(estimator, 'predict_proba')
        }

    def get_estimator_weights(self):
        """
        Get current estimator weights (None means equal weights)
        """
        return self.model.weights

    def set_estimator_weights(self, weights):
        """
        Set custom weights for estimators

        Args:
            weights: one weight per estimator, or None to restore equal
                weighting.

        Raises:
            ValueError: if the number of weights does not match the number
                of estimators.
        """
        # None is the ensemble's own "equal weights" value, so it must be
        # accepted here rather than failing on len(None).
        if weights is not None and len(weights) != len(self.model.estimators):
            raise ValueError(f"Number of weights ({len(weights)}) must match number of estimators ({len(self.model.estimators)})")

        self.model.weights = weights

    def get_model_info(self):
        """
        Get detailed model information

        Returns:
            dict with the voting mode, the weights and, per fitted
            estimator, its name, class name and parameters.

        Raises:
            ValueError: if the model has not been trained.
        """
        self._check_trained("getting model info")

        estimator_info = [
            {
                'name': name,
                'type': type(estimator).__name__,
                'params': estimator.get_params()
            }
            for name, estimator in self.model.named_estimators_.items()
        ]

        return {
            'voting': self.model.voting,
            'weights': self.model.weights,
            'estimators': estimator_info
        }

    def get_consensus_analysis(self, X):
        """
        Analyze consensus among individual estimators

        Returns:
            dict with, per sample:
              - 'consensus_ratio': mean of the estimators' predicted labels
                (the share voting for class 1 when labels are 0/1),
              - 'agreement_count': size of the largest group of estimators
                predicting the same label,
              - 'disagreement_ratio': share of estimators outside that group,
            plus 'individual_predictions', the per-estimator labels.

        Raises:
            ValueError: if the model has not been trained.
        """
        self._check_trained("getting consensus analysis")

        individual_preds = self.get_individual_predictions(X)

        # Convert to array for analysis: shape (n_estimators, n_samples)
        pred_array = np.array(list(individual_preds.values()))

        # Calculate consensus metrics
        consensus_ratio = np.mean(pred_array, axis=0)  # Average prediction

        # For every estimator, count how many estimators (itself included)
        # predict the same label for each sample; the maximum over
        # estimators is the majority size. Measuring against the first
        # estimator only would make the result depend on estimator order.
        same_label = pred_array[:, np.newaxis, :] == pred_array[np.newaxis, :, :]
        agreement_count = same_label.sum(axis=0).max(axis=0)
        disagreement_ratio = 1 - (agreement_count / len(individual_preds))

        return {
            'consensus_ratio': consensus_ratio,
            'agreement_count': agreement_count,
            'disagreement_ratio': disagreement_ratio,
            'individual_predictions': individual_preds
        }