# IMT 547 Project Part V: Prediction

Chesie Yu

02/26/2024

<style type = "text/css">  
    body {
        font-family: "Serif"; 
        font-size: 12pt;
    }
    em {
        color: #4E7F9E;
    }
    strong {
        color: #436D87;
    }
    li {
        color: #4E7F9E;
    }
    ul {
        color: #4E7F9E;
    }
    img {
        display: block;
        margin: auto;
    } 
    .jp-RenderedHTMLCommon a:link { 
        color: #94C1C9;
    }
    .jp-RenderedHTMLCommon a:visited { 
        color: #94C1C9;
    }
    .jp-RenderedHTMLCommon code {
        color: #4E7F9E;
    }  
    .mark {
        color: #B00D00;
        background-color: #FFF7B1;
    }
</style>

In [1]:
# Load the packages
import time
import warnings
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt  
import seaborn as sns 

In [2]:
# Load the data
yt = pd.read_csv("../data/yt-labeled.csv")
yt.head(3)

Unnamed: 0,channel_id,channel_name,channel_description,channel_country,channel_uploads_id,channel_viewcount,channel_subscribercount,channel_videocount,video_id,video_title,...,comment_weapon,comment_children,comment_monster,comment_ocean,comment_giving,comment_contentment,comment_writing,comment_rural,comment_positive_emotion,comment_musical
0,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,I make videos.,JP,UU-lHJZR3Gqxm24_Vd_AJ5Yw,29238461440,111000000,4753,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,I make videos.,JP,UU-lHJZR3Gqxm24_Vd_AJ5Yw,29238461440,111000000,4753,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,0.0
2,UC-lHJZR3Gqxm24_Vd_AJ5Yw,PewDiePie,I make videos.,JP,UU-lHJZR3Gqxm24_Vd_AJ5Yw,29238461440,111000000,4753,F-yEoHL7MYY,I t̶r̶i̶e̶d̶ ̶t̶o̶ beat Elden Ring Without Dyi...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Average toxicity
toxicity_cols = ["comment_toxicity", "comment_severe_toxicity", "comment_identity_attack", 
                 "comment_insult", "comment_profanity", "comment_threat"]
yt[toxicity_cols].mean()

comment_toxicity           0.113801
comment_severe_toxicity    0.008682
comment_identity_attack    0.013592
comment_insult             0.043258
comment_profanity          0.064113
comment_threat             0.033292
dtype: float64

In [4]:
# Define the threshold alpha
alpha = 0.5

# Create binary labels for toxicity
for col in toxicity_cols:
    yt[f"{col}_label"] = yt[col] > alpha        

# Number of columns exhibiting toxicity
toxicity_label_cols = ["comment_toxicity_label", "comment_severe_toxicity_label", 
                    "comment_identity_attack_label", "comment_insult_label", 
                    "comment_profanity_label", "comment_threat_label"]

# Proportion of toxic comments
yt[toxicity_label_cols].sum() / yt.shape[0]

comment_toxicity_label           0.021964
comment_severe_toxicity_label    0.000016
comment_identity_attack_label    0.000257
comment_insult_label             0.004130
comment_profanity_label          0.017217
comment_threat_label             0.008693
dtype: float64

### Train-Test Split

In [5]:
# Specify the features and labels
X = yt["comment_cleaned"]
X1 = yt[["video_speed", "video_blocked_proportion"]].values
X2 = yt["video_speed"].values
y = yt["comment_toxicity_label"].values
y1 = yt["comment_toxicity_label"].values
y2 = yt["comment_toxicity_label"].values

In [6]:
# Import train_test_split library
from sklearn.model_selection import train_test_split

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify = y,
                                                    test_size = 0.2, 
                                                    random_state = 547)
print(f"Training Set: {X_train.shape, y_train.shape}")
print(f"Test Set: {X_test.shape, y_test.shape}") 

Training Set: ((99763,), (99763,))
Test Set: ((24941,), (24941,))


### Vectorization

In [7]:
# Import the vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

# Set up the vectorizer  
tfidf = TfidfVectorizer(stop_words = "english", ngram_range = (1, 2))

# Create the DTM
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [8]:
# Import sparse library
from scipy import sparse

# Convert DTM to sparse matrices
X_train = sparse.hstack([X_train_tfidf]).tocsr()
X_test = sparse.hstack([X_test_tfidf]).tocsr()

print(f"Training Set: {X_train.shape, y_train.shape}")
print(f"Test Set: {X_test.shape, y_test.shape}") 

Training Set: ((99763, 439247), (99763,))
Test Set: ((24941, 439247), (24941,))


### Models

In [9]:
# Load the packages
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from joblib import dump, load

In [10]:
# Import the classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [11]:
# Define function train_val that trains a given estimator with specified hyperparameters
# and evaluate its performance.  If a param_dist is given, the function performs a 
# random search for the best hyperparameters using the given distribution. 
# The function takes in eight arguments:
#     estimator: The class of the estimator used 
#     param_dist: The distribution of hyperparameters for random search.  If none, no 
#                 search will be performed; the model will be fit using default settings
#     n_iter: Number of iterations for random search (default = 60)
#     cv: Number of folds for cross-validation (default = 5)
#     X_train: Input matrix for training set (default = X_train)
#     y_train: Output vector for training set (default = y_train) 
#     X_test: Input matrix for test set (default = X_test)
#     y_test: Output vector for test set (default = y_test)
#     verbose: Whether to print the training result (default = True)
# The function will return a pandas DataFrame of the training results
def train_val(estimator, param_dist = None, n_iter = 60, cv = 5, 
              X_train = X_train, y_train = y_train,
              X_test = X_test, y_test = y_test,
              verbose = True):
    
    # Start timing
    start_time = time.time()
    
    # Initialize the estimator
    m = estimator()
    if verbose:
        print(f"-------- {estimator.__name__} --------")
    
    # Perform random search if given param_dist
    if param_dist:
        warnings.filterwarnings("ignore")
        rs = RandomizedSearchCV(estimator = m,
                                param_distributions = param_dist, 
                                scoring=["accuracy", "precision", "recall", "f1", "roc_auc"],
                                n_iter = n_iter, cv = cv, refit = "f1", 
                                verbose = 0)
        rs.fit(X_train, y_train)

        # Find the best hyperparameters
        if verbose:
            print(f"Best hyperparameters: {rs.best_params_}")
            print(f"Best score: {rs.best_score_:3f}")

        # Initialize the estimator with best hyperparameters
        m = estimator(**rs.best_params_)
    
    # Fit the model on training set 
    m.fit(X_train, y_train)

    # Make predictions on test set
    y_pred = m.predict(X_test)
    
    # Evaluate the result
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    # End timing 
    end_time = time.time()
    
    if verbose: 
        print(f"Validation Accuracy: {accuracy:.3f}")
        print(f"Validation Precision: {precision:.3f}")
        print(f"Validation Recall: {recall:.3f}")
        print(f"Validation F1 Score: {f1:.3f}")
        print(f"Validation ROC AUC Score: {roc_auc:.3f}")
        print(f"Runtime: {end_time - start_time}")
        print("------------------------------\n")
    
    # Store the result in a DataFrame
    if param_dist:
        result = pd.DataFrame({"estimator": [estimator.__name__],
                               "best_params": [rs.best_params_],
                               "accuracy": [accuracy],
                               "precision": [precision],
                               "recall": [recall],
                               "f1": [f1],
                               "roc_auc": [roc_auc],
                               "y_pred": [y_pred]})
    else:
        result = pd.DataFrame({"estimator": [estimator.__name__],
                               "best_params": ["default"],
                               "accuracy": [accuracy],
                               "precision": [precision],
                               "recall": [recall],
                               "f1": [f1],
                               "roc_auc": [roc_auc],
                               "y_pred": [y_pred]})
    
    return result

In [12]:
## Validation Result

# Define the models and corresponding parameter distributions
models = {# "Multinomial Naive Bayes": (MultinomialNB, None, 1),
          "Logistic Regression": (LogisticRegression, None, 1),
          # "K-Nearest Neighbors": (KNeighborsClassifier, None, 1),
          "Decision Tree": (DecisionTreeClassifier, None, 1),
          "Gradient Boosting": (GradientBoostingClassifier, None, 1)}
          # "Random Forest": (RandomForestClassifier, None, 1),
          # "Support Vector Machine": (SVC, None, 1),
          # "MLP Classifier": (MLPClassifier, None, 1)}

# Create empty list of results
results = []

# Loop through the models and append the results
for name, (model, param, cv) in models.items():
    result = train_val(model, param, cv, verbose = True)
    result["model"] = name
    results.append(result)

results_df = pd.concat(results, ignore_index = True)
results_df.sort_values("f1", ascending = False)

-------- LogisticRegression --------
Validation Accuracy: 0.984
Validation Precision: 0.883
Validation Recall: 0.290
Validation F1 Score: 0.437
Validation ROC AUC Score: 0.645
Runtime: 3.4957399368286133
------------------------------

-------- DecisionTreeClassifier --------
Validation Accuracy: 0.983
Validation Precision: 0.619
Validation Recall: 0.628
Validation F1 Score: 0.623
Validation ROC AUC Score: 0.810
Runtime: 78.1074800491333
------------------------------

-------- GradientBoostingClassifier --------
Validation Accuracy: 0.987
Validation Precision: 0.765
Validation Recall: 0.577
Validation F1 Score: 0.658
Validation ROC AUC Score: 0.786
Runtime: 58.94273114204407
------------------------------



Unnamed: 0,estimator,best_params,accuracy,precision,recall,f1,roc_auc,y_pred,model
2,GradientBoostingClassifier,default,0.986809,0.765133,0.576642,0.657648,0.786333,"[False, False, False, False, False, False, Fal...",Gradient Boosting
1,DecisionTreeClassifier,default,0.983321,0.618705,0.627737,0.623188,0.809523,"[False, False, False, False, False, False, Fal...",Decision Tree
0,LogisticRegression,default,0.983561,0.883333,0.290146,0.436813,0.644643,"[False, False, False, False, False, False, Fal...",Logistic Regression


In [13]:
# Import MultinomialNB library
from sklearn.naive_bayes import MultinomialNB

# Create empty list of results
results = []
top_toxic_words = []

# Cross validate naive bayes classifer on each column
for col in toxicity_label_cols:
    X = yt["comment_text"]
    y = yt[col].values
    
    # Clean and vectorize the data 
    tfidf = TfidfVectorizer(stop_words = "english", 
                            ngram_range = (1, 3))
    X_tfidf = tfidf.fit_transform(X)
    X = sparse.hstack([X_tfidf]).tocsr()
    
    # Initialize naive bayes classifer
    m = MultinomialNB(alpha = .3)
    
    # Report accuracy from cross validation
    scoring = ["accuracy", "precision", "recall", "f1", "roc_auc"]
    scores = cross_validate(m, X, y, cv = 10, scoring = scoring)
    results.append(np.mean(pd.DataFrame(scores), axis = 0))

    # Retrieve top toxic words
    toxic_lp = m.fit(X, y).feature_log_prob_[1]
    top_toxic_words.append([tfidf.get_feature_names_out()[i] for i in np.argsort(toxic_lp)[::-1][:10]])

# Display the result
results_nb = pd.DataFrame(results)
results_nb.insert(0, "toxic_type", toxicity_label_cols)
results_nb = results_nb.rename(columns = {"toxic_type": "Toxic Subtype",
                                          "fit_time": "Fit Time",
                                          "score_time": "Score Time",
                                          "test_accuracy": "Accuracy",
                                          "test_precision": "Precision",
                                          "test_recall": "Recall", 
                                          "test_f1": "F1 Score", 
                                          "test_roc_auc": "ROC-AUC"})
results_nb.style.hide(axis = "index")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
Traceback (most recent call last):
  File "/Users/chesie/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/Users/chesie/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 471, in _score
    return self._sign * self._score_func(y, y_pred, **scoring_kwargs)
  File "/Users/chesie/anaconda3/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func

Toxic Subtype,Fit Time,Score Time,Accuracy,Precision,Recall,F1 Score,ROC-AUC
comment_toxicity_label,0.057625,0.031918,0.977916,0.0,0.0,0.0,0.746468
comment_severe_toxicity_label,0.061973,0.032632,0.999984,0.0,0.0,0.0,0.557498
comment_identity_attack_label,0.060056,0.031167,0.999743,0.0,0.0,0.0,0.74492
comment_insult_label,0.062743,0.035721,0.99587,0.0,0.0,0.0,0.690454
comment_profanity_label,0.045641,0.028262,0.982695,0.0,0.0,0.0,0.78535
comment_threat_label,0.061558,0.033615,0.991275,0.0,0.0,0.0,0.722359


In [14]:
# Display the top toxic words
top_toxic_words = pd.DataFrame(top_toxic_words).T
top_toxic_words.columns = toxicity_label_cols
top_toxic_words

Unnamed: 0,comment_toxicity_label,comment_severe_toxicity_label,comment_identity_attack_label,comment_insult_label,comment_profanity_label,comment_threat_label
0,shit,kills,gay,stupid,damn,kill
1,damn,kills dog nooooo,black,mark,shit,killing
2,fucking,haha fucked kills,homosexual,dumb,fucking,killed
3,ass,human haha fucked,offensive black,idiot,fuck,kills
4,just,human haha,gay music,like,ass,die
5,like,kills human haha,tubbo homosexual,sick,holy,shot
6,fuck,kills human,irish,just,just,mark
7,mark,haha fucked,offensive,bitch,like,gonna
8,kill,lazarbeam kills human,tubbo,shit,game,just
9,holy,dog nooooo,tanzanite,jack,crap,shoot
