In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import joblib
import scipy.sparse as sp
import warnings
import os
import gensim
from gensim.models import FastText
from gensim.utils import simple_preprocess
import multiprocessing
warnings.filterwarnings('ignore')

In [2]:
# Task registry: for every prediction task, the kind of classification
# problem and the label column(s) (plus the listed class names, where given).
tasks = {
    'spam': {
        'type': 'binary',
        'column': 'spam',
    },
    'sentiment': {
        'type': 'multi-class',
        'column': 'sentiment',
        'classes': ['Positive', 'Neutral', 'Negative', 'Irrelevant'],
    },
    'toxicity': {
        'type': 'multi-label',
        # One label column per toxicity flag.
        'columns': [
            'toxic',
            'severe_toxic',
            'obscene',
            'threat',
            'insult',
            'identity_hate',
        ],
    },
    'hate_speech': {
        'type': 'multi-class',
        'column': 'hate_speech',
        'classes': ['normal', 'offensive', 'hatespeech'],
    },
}

In [3]:
# Tokeniser used wherever text is prepared for the FastText model
def preprocess_for_fasttext(text):
    """Tokenize one raw text with gensim's ``simple_preprocess``.

    The call passes ``deacc=True``.
    NOTE(review): in gensim that flag controls accent stripping; punctuation
    is presumably dropped by the tokenizer itself regardless of ``deacc`` --
    confirm against the installed gensim version.

    Args:
        text: The raw document to tokenize.

    Returns:
        The token sequence produced by ``simple_preprocess`` for ``text``.
    """
    tokens = simple_preprocess(text, deacc=True)
    return tokens


In [4]:
# Read the pre-split train and test CSVs (train first, then test)
print("Loading data files...")
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/processed/final_train_data.csv',
        '../datasets/processed/final_test_data.csv',
    )
)

Loading data files...


In [5]:
# Show the (rows, columns) shape of each split
print(
    f"Training data shape: {df_train.shape}",
    f"Testing data shape: {df_test.shape}",
    sep="\n",
)

Training data shape: (197118, 10)
Testing data shape: (49283, 10)


In [6]:
# Pull the raw 'text' column out of each split (no missing-value handling here)
X_train, X_test = df_train['text'], df_test['text']

In [15]:
# Replace missing text with '' and force every entry to str, for both splits
X_train, X_test = (
    split_frame['text'].fillna('').astype(str)
    for split_frame in (df_train, df_test)
)


In [11]:
# Add a 'processed_text' column of token lists to both frames; the training
# frame's column is what the FastText model is built from.
print("Preprocessing text for FastText...")
for _frame in (df_train, df_test):
    _frame['processed_text'] = (
        _frame['text'].fillna('').astype(str).apply(preprocess_for_fasttext)
    )
del _frame  # do not leave the loop variable behind in the notebook namespace

Preprocessing text for FastText...


In [16]:
# Create or load FastText model
# If a saved model already exists at `fasttext_model_path` it is loaded as-is
# (it is NOT retrained, even if the training data has changed since); otherwise
# a new model is trained on df_train['processed_text'] and saved to that path.
fasttext_model_path = '../src/models/fasttext_model.model'
if os.path.exists(fasttext_model_path):
    print(f"Loading existing FastText model from {fasttext_model_path}")
    fasttext_model = FastText.load(fasttext_model_path)
else:
    print("Training FastText model...")
    # Create the output directory *before* the expensive training run so the
    # final save() cannot fail on a missing folder and throw the work away.
    os.makedirs(os.path.dirname(fasttext_model_path), exist_ok=True)

    # Use all available cores for faster training
    cores = multiprocessing.cpu_count()

    # Materialise the token lists once; build_vocab() and train() both need them.
    fasttext_sentences = df_train['processed_text'].tolist()

    # Train FastText model
    fasttext_model = FastText(
        vector_size=100,     # Embedding dimension
        window=5,            # Context window size
        min_count=5,         # Minimum word count
        workers=cores,       # Use all available cores
        sg=1                 # Use skip-gram model (1) instead of CBOW (0)
    )

    # Build vocabulary
    fasttext_model.build_vocab(fasttext_sentences)

    # Train the model
    fasttext_model.train(
        fasttext_sentences,
        total_examples=len(fasttext_sentences),
        epochs=10
    )

    # Save the model
    fasttext_model.save(fasttext_model_path)
    print(f"FastText model saved to {fasttext_model_path}")

Loading existing FastText model from ../src/models/fasttext_model.model


In [17]:
# Function to get document vector from FastText
def get_document_vector(text, model, vector_size=100):
    """Average the FastText word vectors of one text into a document vector.

    The text is tokenized with ``preprocess_for_fasttext``; every token that
    ``model.wv`` reports as present contributes its vector to the mean.

    Args:
        text: Raw document string.
        model: Object exposing word vectors through ``model.wv`` (indexable by
            token and supporting ``in``).
        vector_size: Fallback length of the zero vector, used only when
            ``model.wv`` does not expose a ``vector_size`` attribute.

    Returns:
        A 1-D numpy array: the mean of the token vectors, or an all-zero vector
        when the text yields no usable token.
    """
    wv = model.wv
    words = preprocess_for_fasttext(text)
    word_vectors = [wv[word] for word in words if word in wv]

    if word_vectors:
        return np.mean(word_vectors, axis=0)

    # No usable tokens: return zeros with the model's own dimensionality, so
    # the fallback always lines up with the averaged vectors when the caller
    # stacks them into a 2-D array (a hard-coded 100 would not, for a model
    # trained with a different size).
    dim = getattr(wv, 'vector_size', vector_size)
    return np.zeros(dim)

In [18]:
# Build one averaged FastText vector per document, for the train split and
# then the test split
print("Creating FastText document vectors...")
train_fasttext_vectors, test_fasttext_vectors = (
    np.array([get_document_vector(doc, fasttext_model) for doc in corpus])
    for corpus in (X_train, X_test)
)


Creating FastText document vectors...


In [19]:
# Configure the TF-IDF vectorizer (it is only constructed here, not fitted)
print("Creating TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),      # unigrams and bigrams
    stop_words='english',    # scikit-learn's built-in English stop-word list
    min_df=5,                # lower document-frequency cut-off
    max_df=0.8,              # upper document-frequency cut-off
    max_features=10000,      # cap on the vocabulary size
)

Creating TF-IDF features...


In [20]:
# Fit the vectorizer on the training text only, then project both splits
# with that fitted vocabulary
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(
    f"TF-IDF features shape: {X_train_tfidf.shape}",
    f"FastText features shape: {train_fasttext_vectors.shape}",
    sep="\n",
)


TF-IDF features shape: (197118, 10000)
FastText features shape: (197118, 100)


In [21]:
# Combine TF-IDF and FastText features
print("Combining TF-IDF and FastText features...")
# Ask for CSR explicitly instead of relying on hstack's default output format:
# the training and evaluation code selects rows with boolean masks
# (e.g. X_train_combined[mask]), which needs a row-indexable sparse matrix.
X_train_combined = sp.hstack(
    (X_train_tfidf, sp.csr_matrix(train_fasttext_vectors)), format='csr'
)
X_test_combined = sp.hstack(
    (X_test_tfidf, sp.csr_matrix(test_fasttext_vectors)), format='csr'
)

print(f"Combined features shape: {X_train_combined.shape}")

Combining TF-IDF and FastText features...
Combined features shape: (197118, 10100)


In [22]:
# Private helpers shared by the per-task training code below
def _select_labelled_rows(column):
    """Return ``(features, labels)`` for training rows whose ``column`` is not NaN.

    Reads the notebook-level ``df_train`` and ``X_train_combined``. Returns
    ``(None, None)`` when no row carries a label for ``column``.
    """
    # Plain numpy boolean mask: usable for both the sparse matrix and .loc.
    has_label = df_train[column].notna().to_numpy()
    if not has_label.any():
        return None, None
    return X_train_combined[has_label], df_train.loc[has_label, column]


def _fit_binary_classifier(column, display_name, max_depth, skip_message):
    """Fit a binary XGBoost classifier on one 0/1 column, or return None if unlabelled."""
    features, labels = _select_labelled_rows(column)
    if features is None:
        print(skip_message)
        return None

    targets = labels.astype(int)
    print(f"Training {display_name} classifier with {len(targets)} samples")

    classifier = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=max_depth,
        learning_rate=0.1,
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False
    )
    classifier.fit(features, targets)
    return classifier


def _fit_multiclass_classifier(column, display_name):
    """Fit a multi-class XGBoost classifier; return its model/encoder/classes dict or None."""
    features, labels = _select_labelled_rows(column)
    if features is None:
        print(f"No {display_name} labels found in training data. Skipping {display_name} classifier.")
        return None

    # Map the label values to the integer codes the classifier is trained on.
    encoder = LabelEncoder()
    targets = encoder.fit_transform(labels)

    print(f"Training {display_name} classifier with {len(labels)} samples")

    classifier = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        objective='multi:softprob',
        num_class=len(encoder.classes_),
        eval_metric='mlogloss',
        use_label_encoder=False
    )
    classifier.fit(features, targets)
    return {
        'model': classifier,
        'encoder': encoder,
        'classes': sorted(list(encoder.classes_))
    }


# Function to train and evaluate models for each task
def train_and_evaluate_models():
    """Train one XGBoost model per task on the rows that carry that task's label.

    Despite its name, this function only trains; it performs no evaluation.
    It reads the notebook-level ``tasks``, ``df_train`` and
    ``X_train_combined``. A task (or toxicity column) with no labelled
    training row is skipped with a printed notice.

    Returns:
        dict with any of these keys, in this order:
            'spam'        -> fitted binary classifier
            'sentiment'   -> {'model', 'encoder', 'classes'}
            'toxicity'    -> {column name: fitted binary classifier}
            'hate_speech' -> {'model', 'encoder', 'classes'}
    """
    models = {}

    # 1. Binary Classification: Spam
    spam_model = _fit_binary_classifier(
        tasks['spam']['column'],
        'spam',
        max_depth=5,
        skip_message="No spam labels found in training data. Skipping spam classifier.",
    )
    if spam_model is not None:
        models['spam'] = spam_model

    # 2. Multi-class Classification: Sentiment
    sentiment_entry = _fit_multiclass_classifier(tasks['sentiment']['column'], 'sentiment')
    if sentiment_entry is not None:
        models['sentiment'] = sentiment_entry

    # 3. Multi-label Classification: Toxicity
    # Each toxicity column is handled as its own binary problem (shallower trees).
    toxicity_models = {}
    for col in tasks['toxicity']['columns']:
        col_model = _fit_binary_classifier(
            col,
            col,
            max_depth=4,
            skip_message=f"No {col} labels found in training data. Skipping this classifier.",
        )
        if col_model is not None:
            toxicity_models[col] = col_model

    if toxicity_models:
        models['toxicity'] = toxicity_models

    # 4. Multi-class Classification: Hate Speech
    hate_speech_entry = _fit_multiclass_classifier(tasks['hate_speech']['column'], 'hate speech')
    if hate_speech_entry is not None:
        models['hate_speech'] = hate_speech_entry

    return models

In [23]:
# Train all models
# `models` holds the task-name -> fitted-model mapping returned by
# train_and_evaluate_models(); the predictor and the evaluation code read it.
print("\nTraining models...")
models = train_and_evaluate_models()


Training models...
Training spam classifier with 1564 samples
Training sentiment classifier with 55592 samples
Training toxic classifier with 127656 samples
Training severe_toxic classifier with 127656 samples
Training obscene classifier with 127656 samples
Training threat classifier with 127656 samples
Training insult classifier with 127656 samples
Training identity_hate classifier with 127656 samples
Training hate speech classifier with 12306 samples


In [24]:
class EnsemblePredictor:
    """Bundle the fitted feature extractors and per-task models for inference.

    Attributes:
        tfidf_vectorizer: Fitted vectorizer exposing ``transform(texts)``.
        fasttext_model: Model passed to ``get_document_vector`` for embeddings.
        models: Mapping with optional keys 'spam', 'sentiment', 'toxicity' and
            'hate_speech' (see ``predict_with_confidence`` for the layouts).
        tasks: Task configuration; stored but not read by this class.
    """

    def __init__(self, tfidf_vectorizer, fasttext_model, models, tasks):
        self.tfidf_vectorizer = tfidf_vectorizer
        self.fasttext_model = fasttext_model
        self.models = models
        self.tasks = tasks

    def transform_features(self, texts):
        """Turn raw texts into the combined TF-IDF + FastText feature matrix.

        Args:
            texts: An iterable of strings, or a single string (treated as one
                document).

        Returns:
            A CSR sparse matrix with one row per text: the TF-IDF columns
            followed by the averaged FastText vector columns.
        """
        # `texts` is walked twice below (TF-IDF, then FastText). Materialise it
        # so one-shot iterables such as generators yield matching row counts,
        # and wrap a bare string so it counts as a single document.
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        # Create TF-IDF features
        tfidf_features = self.tfidf_vectorizer.transform(texts)

        # Create FastText features
        fasttext_features = np.array(
            [get_document_vector(text, self.fasttext_model) for text in texts]
        )

        # Combine features (explicit CSR rather than hstack's default format)
        return sp.hstack(
            (tfidf_features, sp.csr_matrix(fasttext_features)), format='csr'
        )

    @staticmethod
    def _binary_result(model, X_vec):
        """Run one binary model; 'confidence' is the probability of class 1."""
        probs = model.predict_proba(X_vec)
        return {
            'prediction': model.predict(X_vec),
            'confidence': probs[:, 1]
        }

    @staticmethod
    def _multiclass_result(entry, X_vec):
        """Run one multi-class entry ({'model', 'encoder', 'classes'}).

        'prediction' holds decoded labels, 'confidence' the highest class
        probability per row, and 'all_probs' maps each name in
        ``entry['classes']`` to the probability column at the same position.
        """
        model = entry['model']
        probs = model.predict_proba(X_vec)
        pred_labels = entry['encoder'].inverse_transform(model.predict(X_vec))
        return {
            'prediction': pred_labels,
            'confidence': np.max(probs, axis=1),
            'all_probs': {
                class_name: probs[:, i]
                for i, class_name in enumerate(entry['classes'])
            }
        }

    def predict_with_confidence(self, texts):
        """Predict every available task for ``texts``.

        Args:
            texts: Input accepted by ``transform_features``.

        Returns:
            dict containing one key per task present in ``self.models``:
                'spam'        -> {'prediction', 'confidence'}
                'sentiment'   -> {'prediction', 'confidence', 'all_probs'}
                'toxicity'    -> {column: {'prediction', 'confidence'}}
                'hate_speech' -> {'prediction', 'confidence', 'all_probs'}
            For the binary tasks 'confidence' is P(class 1), not the
            probability of the predicted class.
        """
        # Transform input texts
        X_vec = self.transform_features(texts)

        results = {}

        # 1. Spam prediction (if model exists)
        if 'spam' in self.models:
            results['spam'] = self._binary_result(self.models['spam'], X_vec)

        # 2. Sentiment prediction (if model exists)
        if 'sentiment' in self.models:
            results['sentiment'] = self._multiclass_result(self.models['sentiment'], X_vec)

        # 3. Toxicity predictions (if models exist)
        if 'toxicity' in self.models:
            results['toxicity'] = {
                col: self._binary_result(model, X_vec)
                for col, model in self.models['toxicity'].items()
            }

        # 4. Hate Speech prediction (if model exists)
        if 'hate_speech' in self.models:
            results['hate_speech'] = self._multiclass_result(self.models['hate_speech'], X_vec)

        return results

In [25]:
# Create the ensemble predictor
# Bundles the fitted TF-IDF vectorizer, the FastText model, the trained
# per-task models and the task configuration into one object.
predictor = EnsemblePredictor(tfidf_vectorizer, fasttext_model, models, tasks)


In [26]:
# Function to save the model
def save_model(predictor, filename='../src/models/fasttext_tfidf_ensemble.joblib'):
    """Serialize ``predictor`` to ``filename`` with joblib.

    Args:
        predictor: The object to persist.
        filename: Destination passed to ``joblib.dump``. When it is a path
            with a directory part, that directory is created if missing.
    """
    # Create the parent folder first so the dump cannot fail on a missing
    # directory; anything that is not a path is handed to joblib untouched.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(predictor, filename)
    print(f"Model saved as {filename}")

# Function to load the model
def load_model(filename='../src/models/fasttext_tfidf_ensemble.joblib'):
    """Read back an object stored with joblib.

    NOTE(review): joblib files are pickle-based, so loading one can execute
    arbitrary code -- only load files from a trusted source.

    Args:
        filename: Location handed to ``joblib.load``.

    Returns:
        The deserialized object.
    """
    loaded = joblib.load(filename)
    return loaded

In [27]:
# Persist the predictor to save_model()'s default location
save_model(predictor)

Model saved as ../src/models/fasttext_tfidf_ensemble.joblib


In [28]:
# Example of how to use the predictor
def demo_prediction():
    """Print every task's prediction for four hard-coded sample comments.

    Uses the notebook-level ``predictor``. A task is only printed when its key
    is present in the prediction result.
    """
    sample_texts = [
        "This video is amazing! I loved every minute of it.",
        "Check out my channel for free iPhone giveaway! Click the link now!",
        "You are so stupid and ugly, nobody likes you.",
        "I respectfully disagree with your opinion on this matter."
    ]

    print("\nPrediction Results:")
    predictions = predictor.predict_with_confidence(sample_texts)

    # Look each task up once; None means the result has no entry for it.
    spam_out = predictions.get('spam')
    sentiment_out = predictions.get('sentiment')
    toxicity_out = predictions.get('toxicity')
    hate_out = predictions.get('hate_speech')

    for i, text in enumerate(sample_texts):
        print(f"\nText {i+1}: {text}")

        # Spam
        if spam_out is not None:
            spam_pred, spam_conf = spam_out['prediction'][i], spam_out['confidence'][i]
            print(f"Spam: {'Yes' if spam_pred == 1 else 'No'} (confidence: {spam_conf:.4f})")

        # Sentiment
        if sentiment_out is not None:
            sentiment_pred, sentiment_conf = sentiment_out['prediction'][i], sentiment_out['confidence'][i]
            print(f"Sentiment: {sentiment_pred} (confidence: {sentiment_conf:.4f})")

        # Toxicity: one line per toxicity column
        if toxicity_out is not None:
            print("Toxicity:")
            for col, col_out in toxicity_out.items():
                tox_pred, tox_conf = col_out['prediction'][i], col_out['confidence'][i]
                print(f"  - {col}: {'Yes' if tox_pred == 1 else 'No'} (confidence: {tox_conf:.4f})")

        # Hate Speech
        if hate_out is not None:
            hate_pred, hate_conf = hate_out['prediction'][i], hate_out['confidence'][i]
            print(f"Hate Speech: {hate_pred} (confidence: {hate_conf:.4f})")


In [29]:
# Demo the model
# Prints the per-task predictions for the sample comments defined inside
# demo_prediction().
demo_prediction()


Prediction Results:

Text 1: This video is amazing! I loved every minute of it.
Spam: No (confidence: 0.0035)
Sentiment: Positive (confidence: 0.8063)
Toxicity:
  - toxic: No (confidence: 0.0262)
  - severe_toxic: No (confidence: 0.0003)
  - obscene: No (confidence: 0.0077)
  - threat: No (confidence: 0.0001)
  - insult: No (confidence: 0.0046)
  - identity_hate: No (confidence: 0.0006)
Hate Speech: normal (confidence: 0.7429)

Text 2: Check out my channel for free iPhone giveaway! Click the link now!
Spam: Yes (confidence: 0.9988)
Sentiment: Neutral (confidence: 0.5342)
Toxicity:
  - toxic: No (confidence: 0.0033)
  - severe_toxic: No (confidence: 0.0002)
  - obscene: No (confidence: 0.0019)
  - threat: No (confidence: 0.0000)
  - insult: No (confidence: 0.0014)
  - identity_hate: No (confidence: 0.0001)
Hate Speech: normal (confidence: 0.8477)

Text 3: You are so stupid and ugly, nobody likes you.
Spam: No (confidence: 0.0199)
Sentiment: Negative (confidence: 0.6409)
Toxicity:
  - t

In [30]:
# Private helpers shared by evaluate_model() below
def _labelled_test_rows(column):
    """Return ``(features, labels)`` for test rows whose ``column`` is not NaN.

    Reads the notebook-level ``df_test`` and ``X_test_combined``. Returns
    ``(None, None)`` when no test row carries a label for ``column``.
    """
    has_label = df_test[column].notna().to_numpy()
    if not has_label.any():
        return None, None
    return X_test_combined[has_label], df_test.loc[has_label, column]


def _report_binary(title, column, model):
    """Print a classification report for one binary model on its labelled test rows."""
    features, labels = _labelled_test_rows(column)
    if features is None:
        return

    y_test = labels.astype(int)
    y_pred = model.predict(features)

    print(title)
    print(classification_report(y_test, y_pred))


def _report_multiclass(title, task_label, column, entry):
    """Print a classification report for one multi-class entry ({'model', 'encoder', ...})."""
    features, labels = _labelled_test_rows(column)
    if features is None:
        return

    encoder = entry['encoder']

    # Only the label encoding is guarded: a ValueError here means the test
    # data holds a label the encoder never saw. Errors raised by predict()
    # are deliberately left to propagate instead of being mislabelled.
    try:
        y_test_encoded = encoder.transform(labels)
    except ValueError as e:
        print(f"\nError in {task_label} evaluation: {e}")
        print("This might be due to new classes in test data that weren't in training data")
        return

    y_pred = entry['model'].predict(features)

    # Report rows by class name rather than by integer code. `labels` pins
    # the code -> name alignment even if a class is absent from the test rows.
    class_ids = np.arange(len(encoder.classes_))
    class_names = [str(name) for name in encoder.classes_]

    print(title)
    print(classification_report(
        y_test_encoded, y_pred, labels=class_ids, target_names=class_names
    ))


# Evaluate the model on test data
def evaluate_model():
    """Print a classification report for every trained task on the test split.

    Reads the notebook-level ``models``, ``tasks``, ``df_test`` and
    ``X_test_combined``. Each task is scored only on the test rows that carry
    a label for it; tasks without a trained model or without labelled test
    rows are skipped silently. Returns None.
    """
    print("\nModel Evaluation:")

    # Evaluate Spam Detection (if model exists)
    if 'spam' in models:
        _report_binary(
            "\nSpam Detection Evaluation:",
            tasks['spam']['column'],
            models['spam'],
        )

    # Evaluate Sentiment Analysis (if model exists)
    if 'sentiment' in models:
        _report_multiclass(
            "\nSentiment Analysis Evaluation:",
            'sentiment',
            tasks['sentiment']['column'],
            models['sentiment'],
        )

    # Evaluate Toxicity Detection (if models exist)
    if 'toxicity' in models:
        for col, model in models['toxicity'].items():
            _report_binary(
                f"\n{col.capitalize()} Classification Evaluation:",
                col,
                model,
            )

    # Evaluate Hate Speech Detection (if model exists)
    if 'hate_speech' in models:
        _report_multiclass(
            "\nHate Speech Detection Evaluation:",
            'hate speech',
            tasks['hate_speech']['column'],
            models['hate_speech'],
        )


In [31]:
# Evaluate the model
# Prints one classification report per trained task, computed on the test split.
evaluate_model()


Model Evaluation:

Spam Detection Evaluation:
              precision    recall  f1-score   support

           0       0.92      0.95      0.94       176
           1       0.96      0.94      0.95       216

    accuracy                           0.94       392
   macro avg       0.94      0.94      0.94       392
weighted avg       0.94      0.94      0.94       392


Sentiment Analysis Evaluation:
              precision    recall  f1-score   support

           0       0.57      0.31      0.40      2474
           1       0.63      0.79      0.70      4260
           2       0.62      0.54      0.58      3397
           3       0.59      0.68      0.63      3768

    accuracy                           0.61     13899
   macro avg       0.60      0.58      0.58     13899
weighted avg       0.61      0.61      0.60     13899


Toxic Classification Evaluation:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28859
           1    

In [None]:
# Function to use the model for predictions on new data
def predict_youtube_comments(comments, model_path='fasttext_tfidf_ensemble.joblib'):
    """
    Predict labels for new YouTube comments

    Args:
        comments: List of strings containing YouTube comments (a single string
            is treated as one comment)
        model_path: Path to the saved model. A bare file name that does not
            exist in the working directory is also looked up under
            '../src/models/', the directory save_model() writes to by default.

    Returns:
        Dictionary with predictions and confidence scores
    """
    # The default here is a bare file name, while save_model()/load_model()
    # default to '../src/models/<same name>'. Fall back to that directory so
    # the default call finds the model the notebook just saved.
    if not os.path.dirname(model_path) and not os.path.exists(model_path):
        saved_copy = os.path.join('../src/models', model_path)
        if os.path.exists(saved_copy):
            model_path = saved_copy

    # Load the model
    predictor = load_model(model_path)

    # A lone string would not be a valid batch of documents; wrap it.
    if isinstance(comments, str):
        comments = [comments]

    # Make predictions
    predictions = predictor.predict_with_confidence(comments)

    return predictions