# Naive Bayes

### Set up

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import time
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

### Data

In [None]:
# Load only the first 1000 rows of the merged dataset to keep the experiments fast.
data = pd.read_csv('data/merged_data.csv', nrows=1000)

In [None]:
# Keep only the comment text (feature) and the `toxic` label column.
df = data[['comment_text', 'toxic']]

### Data Split

In [None]:
# Features: the raw comment text. Target: the `toxic` label.
X = df['comment_text']
y = df['toxic']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42) 

In [None]:
# Check that features and labels have the same length in each split, and print the split size.
print(f"The train data lenght check is: {X_train.shape[0] == y_train.shape[0]} - {X_train.shape[0]}")
print(f"The test data lenght check is: {X_test.shape[0] == y_test.shape[0]} - {X_test.shape[0]}")


In [None]:
# Inspect the container type of the training features (expected to be a pandas Series).
type(X_train)

### CountVectorizer

In [None]:
# Learn the vocabulary on the training text only and build the training count matrix.
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)



In [None]:
'''
There are 3 options for Naive Bayes: GaussianNB, MultinomialNB and BernoulliNB.
GaussianNB assumes that each feature is normally distributed within each class, so it is
used when all our features are continuous (and we should check that assumption first).
MultinomialNB is used when we have discrete data, for example a rating ranging from 1 to 5.
In text learning we use the count of each word to predict the class or label.
BernoulliNB is used when every feature is binary (only 1 or 0).
'''

### Naive Bayes - Multinomial 

In [None]:
# Fit a multinomial Naive Bayes classifier on the training count matrix.
model = MultinomialNB()
model.fit(X_train_cv,y_train)

In [None]:
# Vectorize the test text with the already-fitted vectorizer (transform only, no re-fit).
X_test_cv = v.transform(X_test)

In [None]:
# Predict the test labels with the fitted model.
y_pred = model.predict(X_test_cv)

# Print the classification report comparing true and predicted test labels.
print(classification_report(y_test,y_pred))

### The same now, but this time using a pipeline

In [None]:
# Chain the vectorizer and the classifier so both are fitted and applied in a single call.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

# The advantage of doing it this way is that we don't need to define and use X_train_cv:
# the pipeline receives the raw text and vectorizes it itself.

clf.fit(X_train,y_train)

In [None]:
# The pipeline takes the raw test text directly; print the same report as for the manual steps.
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

### Function to record the performance of different models (modified to use a pipeline)

In [None]:
# Initialize the (empty) dataframe that accumulates one row per evaluated model.
results_table = pd.DataFrame()

In [None]:
def evaluate_model(clf, X_train,y_train,X_test,y_test,results_df,model_name="", parameters='', comments=''):
    """Fit ``clf``, score it on the test split and append one row to ``results_df``.

    Args:
        clf: Estimator or Pipeline exposing ``fit``, ``predict`` and
            ``predict_proba``. It is (re)fitted inside this function.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for every metric.
        results_df: DataFrame holding the rows of previous runs. It is not
            modified in place.
        model_name: Label for the 'Name' column. When empty, the class name of
            the estimator is used (for a Pipeline, the class of its last step).
        parameters: Free-text description of the hyper-parameters used.
        comments: Free-text note stored with the row.

    Returns:
        A new DataFrame containing ``results_df`` plus one row with the
        metrics, the confusion matrix (as a string) and the training time.
    """
    # Time the fit only, so that 'Training Time' does not include prediction.
    start_time = time.time()
    clf.fit(X_train, y_train)
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    predictions = clf.predict(X_test)
    # Score of the second class, taken as the positive class (binary labels assumed).
    predict_probab = clf.predict_proba(X_test)[:, 1]

    # Calculating all metrics

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    # ROC-AUC is a ranking metric: it needs the continuous scores, not the hard 0/1 labels.
    roc_auc = roc_auc_score(y_test, predict_probab)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Default name comes from the estimator that was passed in (not from a global):
    # use the final step of a Pipeline, otherwise the estimator itself.
    if not model_name:
        steps = getattr(clf, "steps", None)
        final_estimator = steps[-1][1] if steps else clf
        model_name = final_estimator.__class__.__name__

    # Create a dictionary including the results
    results = {
        'Name': model_name,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    # Convert the dictionary to a DataFrame
    new_row_df = pd.DataFrame([results])
    # append the result to the results dataframe
    results_df = pd.concat([results_df, new_row_df], ignore_index=True)

    return results_df

### Naive Bayes - Multinomial 

In [None]:
# Baseline: raw text -> word counts -> multinomial Naive Bayes
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

# Evaluate the pipeline and add its row to the results table
results_table = evaluate_model(
    clf,
    X_train, y_train,
    X_test, y_test,
    results_table,
    parameters="",
    comments="Multinomial_cv",
)

### Pre-Process function (Stop Words, Punctuation, Lemma)

In [None]:
# Load the small English spaCy model and create the module-level `nlp` object
# that `preprocess` uses.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Preprocess Function
def preprocess(text):
    """Return `text` reduced to the lemmas of its content tokens.

    The text is run through the module-level spaCy pipeline `nlp`; stop words
    and punctuation tokens are dropped, and the lemmas of the remaining tokens
    are joined into a single space-separated string.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    # Back to one string so a text vectorizer can consume it.
    return " ".join(kept_lemmas)


### Split (with preprocess corpus)

In [None]:

# Preprocess every comment once, up front, so the cleaned corpus can be reused by several models.
X_pp = [preprocess(text) for text in X]


In [None]:
# Split the preprocessed corpus with the same test size and seed as the raw-text split.
X_train_pp, X_test_pp, y_train_pp, y_test_pp = train_test_split(X_pp,y,test_size=0.2, random_state=42)

### Naive Bayes - Multinomial NB with Preprocess Step

In [None]:
# Preprocessed text -> word counts -> multinomial Naive Bayes
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

# Evaluate the pipeline and add its row to the results table
results_table = evaluate_model(
    clf,
    X_train_pp, y_train_pp,
    X_test_pp, y_test_pp,
    results_table,
    parameters="",
    comments="multinomial_cv_pp",
)


### Naive Bayes - Multinomial NB with Preprocess Step and Bi-grams

In [None]:
# Preprocessed text -> counts of unigrams and bigrams -> multinomial Naive Bayes
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('nb', MultinomialNB()),
])

# Evaluate the pipeline and add its row to the results table
results_table = evaluate_model(
    clf,
    X_train_pp, y_train_pp,
    X_test_pp, y_test_pp,
    results_table,
    parameters="bi-grams",
    comments="multinomial_cv_pp",
)

### TF - IDF

In [None]:
# Raw text -> TF-IDF weights -> multinomial Naive Bayes
clf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Evaluate the pipeline and add its row to the results table
results_table = evaluate_model(
    clf,
    X_train, y_train,
    X_test, y_test,
    results_table,
    parameters="",
    comments="TfidfVectorizer",
)

### TF - IDF with Preprocess Data

In [None]:
# Preprocessed text -> TF-IDF weights (terms with document frequency below 5 are ignored)
# -> multinomial Naive Bayes
clf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(min_df=5)),
    ('nb', MultinomialNB()),
])

# Evaluate the pipeline and add its row to the results table
results_table = evaluate_model(
    clf,
    X_train_pp, y_train_pp,
    X_test_pp, y_test_pp,
    results_table,
    parameters="min_df=5",
    comments="TfidfVectorizer_pp",
)

### Split data using Word Vectors

In [None]:
# Switch to the large English model for the document vectors computed below.
# NOTE: this rebinds the module-level `nlp` that `preprocess` reads, so any later call to
# `preprocess` will run with this model instead of 'en_core_web_sm'.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Work on an explicit copy so that adding a column does not write into a slice of `data`.
df = df.copy()
# Embed each comment: run it through the spaCy pipeline and keep the document vector.
df['vector'] = df['comment_text'].apply(lambda text: nlp(text).vector)
# Display the frame to inspect the new column.
df

In [None]:
# Features are now the per-document spaCy vectors; the target is unchanged.
# NOTE: this rebinds the module-level X, which previously held the raw comment text.
X = df['vector']
y = df['toxic']
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(X, y, test_size=0.2, random_state=42)

# The Series holds one vector per row; stack them into 2-D (n_samples, n_dims) arrays.
X_train_2Dvec = np.stack(X_train_vec)
X_test_2Dvec = np.stack(X_test_vec)

# MultinomialNB needs non-negative features, so rescale every dimension to [0, 1].
# The scaler is fitted on the training data only; clip=True (scikit-learn >= 0.24) keeps
# test values that fall outside the training range inside [0, 1] instead of letting
# them become negative.
scaler = MinMaxScaler(clip=True)
scaled_train_embed = scaler.fit_transform(X_train_2Dvec)
scaled_test_embed = scaler.transform(X_test_2Dvec)

# Initialize the pipeline (classifier only: the features are already numeric)
clf = Pipeline([
        ('nb', MultinomialNB())
])

# Call the function and store the row in the variable result
results_table = evaluate_model(clf, scaled_train_embed, y_train_vec, scaled_test_embed, y_test_vec,results_table, parameters="", comments="vectors_spacy" )

### Results

In [None]:
# Display the comparison table: one row per evaluated model.
results_table