# Naives Bayes

### Set up

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import time
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import fasttext

### Data

In [2]:
data = pd.read_csv('data/merged_data.csv', nrows=1000)

In [3]:
df = data[['comment_text', 'toxic']]

### Data Split

In [4]:
X = df['comment_text']
y = df['toxic']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42, stratify=y) 

In [6]:
#Check for the lenght of train and test data split
print(f"The train data lenght check is: {X_train.shape[0] == y_train.shape[0]} - {X_train.shape[0]}")
print(f"The test data lenght check is: {X_test.shape[0] == y_test.shape[0]} - {X_test.shape[0]}")


The train data lenght check is: True - 800
The test data lenght check is: True - 200


### CountVectorizer

In [7]:
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)



In [8]:
'''
There are 3 options for Naive Bayes: GaussianNB, MultinomialNB and BernoulliNB 
To use GaussianNB we should check if the data has a normal distribution. Because of 
the assumption of the normal distribution, GaussianNB is used in cases when all our features are continuos.
Multinomial is used when we have discrete data. For example, rating ranging from 1 to 5.
In text learning we have the count of each word to predict the class or label.
BernoulliNB is used when you have only 1 or 0, Binary. 
'''

'\nThere are 3 options for Naive Bayes: GaussianNB, MultinomialNB and BernoulliNB \nTo use GaussianNB we should check if the data has a normal distribution. Because of \nthe assumption of the normal distribution, GaussianNB is used in cases when all our features are continuos.\nMultinomial is used when we have discrete data. For example, rating ranging from 1 to 5.\nIn text learning we have the count of each word to predict the class or label.\nBernoulliNB is used when you have only 1 or 0, Binary. \n'

### Naive Bayes - Multinomial 

In [9]:
model = MultinomialNB()
model.fit(X_train_cv,y_train)

In [10]:
X_test_cv = v.transform(X_test)

In [11]:
y_pred = model.predict(X_test_cv)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.29      0.04      0.06        56
           1       0.72      0.97      0.82       144

    accuracy                           0.70       200
   macro avg       0.50      0.50      0.44       200
weighted avg       0.60      0.70      0.61       200



### The same now, but this time using a pipeline

In [12]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

# the advantage of doing this way is that we don't need to define and use X_train_cv

clf.fit(X_train,y_train)

In [13]:
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.29      0.04      0.06        56
           1       0.72      0.97      0.82       144

    accuracy                           0.70       200
   macro avg       0.50      0.50      0.44       200
weighted avg       0.60      0.70      0.61       200



### Function to record different models performance (modified to use a pipeline)

In [14]:
# initialize dataframe that will include the results
results_table = pd.DataFrame()

In [15]:
def evaluate_model(clf, X_train,y_train,X_test,y_test,results_df,model_name="", parameters='', comments=''):
    start_time = time.time()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    predict_probab = clf.predict_proba(X_test)[:,1]
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    # Calculating all metrics

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    # Convert the dictionary to a DataFrame
    new_row_df = pd.DataFrame([results])
    # append the result to the results dataframe
    results_df = pd.concat([results_df, new_row_df], ignore_index=True)

    return results_df

### Naive Bayes - Multinomial 

In [16]:
# Initialize the pipeline you want to try
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

# Call the function and store the row in the variable result
results_table = evaluate_model(clf, X_train, y_train, X_test, y_test,results_table, parameters="", comments="Multinomial_cv" )

### Pre-Process function (Stop Words, Punctuation, Lemma)

In [17]:
# Load english language model and create nlp object from it
nlp = spacy.load('en_core_web_sm')

In [18]:
# Preprocess Function
def preprocess(text):
    doc = nlp(text)

    filtered_tokens = []

    for token in doc:
        if token.is_stop or token.is_punct:
           continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens) #this convert the list into a string separated by spaces


### Split (with preprocess corpus)

In [19]:

X_pp = [preprocess(text) for text in X]


In [20]:
X_train_pp, X_test_pp, y_train_pp, y_test_pp = train_test_split(X_pp,y,test_size=0.2, random_state=42)

### Naive Bayes - Multinomial NB with Preprocess Step

In [21]:
# Initialize the pipeline you want to try
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

# Call the function and store the row in the variable result
results_table = evaluate_model(clf, X_train_pp, y_train_pp, X_test_pp, y_test_pp, results_table, parameters="", comments="multinomial_cv_pp")


### Naive Bayes - Multinomial NB with Preprocess Step and Bi-grams

In [22]:
# Initialize the pipeline you want to try
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('nb', MultinomialNB())
])

# Call the function and store the row in the variable result
results_table = evaluate_model(clf, X_train_pp, y_train_pp, X_test_pp, y_test_pp,results_table, parameters="bi-grams", comments="multinomial_cv_pp" )

### TF - IDF

In [23]:
# Initialize the pipeline with TfidfVectorizer and MultinomialNB
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

# Call the function and store the row in the variable result
results_table = evaluate_model(clf, X_train, y_train, X_test, y_test,results_table, parameters="", comments="TfidfVectorizer" )

### TF - IDF with Preprocess Data

In [24]:
# Initialize the pipeline with TfidfVectorizer and MultinomialNB
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(min_df=5)),
    ('nb', MultinomialNB())
])

# Call the function and store the row in the variable result
results_table = evaluate_model(clf, X_train_pp, y_train_pp, X_test_pp, y_test_pp,results_table, parameters="min_df=5", comments="TfidfVectorizer_pp" )

### Split data using Word Vectors Spacy

In [25]:
nlp = spacy.load("en_core_web_lg")

In [26]:
df = df.copy()
df['vector_spacy'] = df['comment_text'].apply(lambda text: nlp(text).vector)
df

Unnamed: 0,comment_text,toxic,vector_spacy
0,OH yes - Were those evil Christian Missionarie...,1,"[-3.1565988, -0.40741652, -0.95940566, 0.25587..."
1,Why is this black racist crap still on the G&M...,1,"[-1.0249443, 1.5786849, -3.134766, -0.5616368,..."
2,even up here.......BLACKS!,1,"[1.3330283, 0.32543826, -3.1496303, 0.98925835..."
3,Blame men. There's always an excuse to blame ...,1,"[-1.3248723, 2.0334837, -1.770301, 0.16092223,..."
4,And the woman exposing herself saying grab thi...,1,"[0.2207822, 1.8134906, -1.8909769, -0.9731891,..."
...,...,...,...
995,Parliament is full of deadbeat parasites so ho...,1,"[-1.2902673, 2.660552, -3.99906, -2.1853173, 2..."
996,And Billy boy did the most despicable thing to...,1,"[-1.1255469, 2.2887607, -3.2337604, -1.9236423..."
997,I will take it! Its disgusting what you pay to...,0,"[0.11903482, 1.2084537, -4.553807, -2.252022, ..."
998,"Funny, you don't seem like a rogue Catholic to...",1,"[-1.6919639, 1.2666397, -3.3249094, -0.6504828..."


In [27]:
X = df['vector_spacy']
y = df['toxic']
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(X,y,test_size=0.2, random_state=42)
X_train_2Dvec = np.stack(X_train_vec)
X_test_2Dvec = np.stack(X_test_vec)

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2Dvec)
scaled_test_embed = scaler.transform(X_test_2Dvec)

# Initialize the pipeline 
clf = Pipeline([
        ('nb', MultinomialNB())
])

# Call the function and store the row in the variable result
results_table = evaluate_model(clf, scaled_train_embed, y_train_vec, scaled_test_embed, y_test_vec,results_table, parameters="", comments="vectors_spacy" )

### Word Vectors Gensim

In [28]:
import gensim.downloader as api
wv = api.load("glove-twitter-25")

### Preprocess and vectorization function

In [29]:
# Preprocess Function
def preprocess_and_vectorize(text):
    doc = nlp(text)

    filtered_tokens = []

    for token in doc:
        if token.is_stop or token.is_punct:
           continue
        filtered_tokens.append(token.lemma_)

    return wv.get_mean_vector(filtered_tokens)

In [30]:
df.copy()
df['vector_ginsim'] = df['comment_text'].apply(lambda text: preprocess_and_vectorize(text))
df

Unnamed: 0,comment_text,toxic,vector_spacy,vector_ginsim
0,OH yes - Were those evil Christian Missionarie...,1,"[-3.1565988, -0.40741652, -0.95940566, 0.25587...","[0.026361885, 0.06158323, -0.08723717, 0.08079..."
1,Why is this black racist crap still on the G&M...,1,"[-1.0249443, 1.5786849, -3.134766, -0.5616368,...","[0.042466734, 0.03531652, -0.04086705, 0.00051..."
2,even up here.......BLACKS!,1,"[1.3330283, 0.32543826, -3.1496303, 0.98925835...","[-0.0023623807, -0.17442286, 0.06280133, -0.07..."
3,Blame men. There's always an excuse to blame ...,1,"[-1.3248723, 2.0334837, -1.770301, 0.16092223,...","[0.037002154, 0.04110098, -0.12071987, 0.04797..."
4,And the woman exposing herself saying grab thi...,1,"[0.2207822, 1.8134906, -1.8909769, -0.9731891,...","[-0.0141696725, 0.098936565, -0.0466249, -0.04..."
...,...,...,...,...
995,Parliament is full of deadbeat parasites so ho...,1,"[-1.2902673, 2.660552, -3.99906, -2.1853173, 2...","[-0.16026868, 0.15069315, -0.08415713, 0.05298..."
996,And Billy boy did the most despicable thing to...,1,"[-1.1255469, 2.2887607, -3.2337604, -1.9236423...","[0.00063220196, 0.04593676, 0.034617014, 0.020..."
997,I will take it! Its disgusting what you pay to...,0,"[0.11903482, 1.2084537, -4.553807, -2.252022, ...","[0.057151046, 0.1470519, -0.040028956, 0.00079..."
998,"Funny, you don't seem like a rogue Catholic to...",1,"[-1.6919639, 1.2666397, -3.3249094, -0.6504828...","[-0.024161603, 0.035412274, -0.024118718, -0.0..."


In [31]:
X = df['vector_ginsim']
y = df['toxic']
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(X,y,test_size=0.2, random_state=42)
X_train_2Dvec = np.stack(X_train_vec)
X_test_2Dvec = np.stack(X_test_vec)

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2Dvec)
scaled_test_embed = scaler.transform(X_test_2Dvec)

# Initialize the pipeline 
clf = Pipeline([
        ('nb', MultinomialNB())
])

# Call the function and store the row in the variable result
results_table = evaluate_model(clf, scaled_train_embed, y_train_vec, scaled_test_embed, y_test_vec,results_table, parameters="", comments="vectors_ginsim" )

### Results

In [32]:
results_table

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,MultinomialNB,,0.824926,0.500496,0.720207,0.965278,0.705,[[ 2 54]\n [ 5 139]],0 minutes and 0.05 seconds,Multinomial_cv
1,MultinomialNB,,0.858824,0.537634,0.789189,0.941935,0.76,[[ 6 39]\n [ 9 146]],0 minutes and 0.02 seconds,multinomial_cv_pp
2,MultinomialNB,bi-grams,0.864553,0.517204,0.78125,0.967742,0.765,[[ 3 42]\n [ 5 150]],0 minutes and 0.04 seconds,multinomial_cv_pp
3,MultinomialNB,,0.837209,0.5,0.72,1.0,0.72,[[ 0 56]\n [ 0 144]],0 minutes and 0.03 seconds,TfidfVectorizer
4,MultinomialNB,min_df=5,0.87106,0.523656,0.783505,0.980645,0.775,[[ 3 42]\n [ 3 152]],0 minutes and 0.02 seconds,TfidfVectorizer_pp
5,MultinomialNB,,0.873239,0.5,0.775,1.0,0.775,[[ 0 45]\n [ 0 155]],0 minutes and 0.09 seconds,vectors_spacy
6,MultinomialNB,,0.873239,0.5,0.775,1.0,0.775,[[ 0 45]\n [ 0 155]],0 minutes and 0.0 seconds,vectors_ginsim


### Fast Text