# Set up

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
import spacy
import nltk # natural language tool kit
from nltk.tokenize import word_tokenize
from nltk import ngrams
from collections import Counter
from nltk.stem import PorterStemmer


In [None]:
# `Vectors` is not among the names imported at the top of this notebook, so it
# is imported here; without it this cell fails with a NameError.
from spacy.vectors import Vectors

# Vector table built from a shape alone: (10000, 300).
empty_vectors = Vectors(shape=(10000, 300))

# Vector table built from an explicit all-zero float32 array of shape (3, 300),
# with one string key per row.
data = np.zeros((3, 300), dtype='f')
keys = ["cat", "dog", "rat"]
vectors = Vectors(data=data, keys=keys)


## Getting data ready

In [None]:
# Load the raw training set, discard rows whose comment text is missing, and
# keep only the comment text and the 'target' score column.
data = pd.read_csv('data/train.csv')
df_cleaned = data[data['comment_text'].notna()]
df_train = df_cleaned.loc[:, ['comment_text', 'target']]


In [None]:
# Derive the binary label `toxic`: 1 when target >= 0.5, otherwise 0.
# `assign` returns a new DataFrame, so the frame sliced out of df_cleaned is
# never written to.
df_train = df_train.assign(toxic=np.where(df_train['target'] >= 0.50, 1, 0))

## Split Train and Test

In [None]:
# Split the comments and their binary labels into train and test sets.
# No test_size is given, so train_test_split uses its default proportions;
# random_state=42 makes the split reproducible.
# NOTE(review): the split is not stratified on `toxic` - consider `stratify=`
# if the classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['comment_text'],
    df_train['toxic'],
    random_state=42,
)

## Function to record the performance of different models

In [None]:
# Initialize the DataFrame that accumulates the results: it starts empty and
# each experiment cell below appends one row (the dict returned by
# evaluate_model) to it.
results_df = pd.DataFrame()


In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name="", parameters='', comments=''):
    """Fit ``model``, score it on the test split and return one result row.

    The metrics are computed with scikit-learn's default settings, and the
    positive-class score is taken from column 1 of ``predict_proba``, so the
    function is meant for binary classification.

    Args:
        model: Estimator exposing ``fit`` and ``predict``, plus either
            ``predict_proba`` or ``decision_function`` (used for AUC-ROC).
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Held-out features used for prediction.
        y_test: True labels of the held-out set.
        model_name: Value for the 'Name' field; when empty, the model's class
            name is used instead.
        parameters: Free-text description stored under 'Parameters'.
        comments: Free-text note stored under 'Comments'.

    Returns:
        dict with the keys 'Name', 'Parameters', 'F1-Score', 'AUC-ROC',
        'Precision', 'Recall', 'Accuracy', 'Confusion Matrix' (the matrix
        rendered as a string), 'Training Time' (formatted as
        "<m> minutes and <s> seconds") and 'Comments'.
    """
    # Time only the fit call so that 'Training Time' reports training rather
    # than training plus inference. perf_counter is the clock meant for
    # measuring elapsed intervals (it is not affected by system clock changes).
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    duration = time.perf_counter() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    predictions = model.predict(X_test)

    # AUC-ROC needs a continuous score for the positive class. Prefer the
    # probability of class index 1; fall back to decision_function for
    # estimators that do not implement predict_proba.
    if hasattr(model, "predict_proba"):
        positive_scores = model.predict_proba(X_test)[:, 1]
    else:
        positive_scores = model.decision_function(X_test)

    # Calculating all metrics
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, positive_scores)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    # Stored as text so the whole matrix fits in a single DataFrame cell.
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## Baseline Model (Bag of Words + LogisticRegression)

In [None]:
# Baseline features: binary bag of words (binary=True records presence of a
# term rather than how often it occurs).
vect = CountVectorizer(binary=True)

# Learn the vocabulary from the training comments and encode them as a sparse
# matrix in one step, then encode the test comments with that same vocabulary.
X_train_vectorized = vect.fit_transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Model under test
model = LogisticRegression(max_iter=1500)

# Fit, score, and collect the metrics as a dictionary
result = evaluate_model(
    model,
    X_train_vectorized,
    y_train,
    X_test_vectorized,
    y_test,
    parameters="binary",
    comments="Baseline",
)

# Turn the dictionary into a one-row DataFrame and append it to the results
new_row_df = pd.DataFrame([result])
results_df = pd.concat([results_df, new_row_df], ignore_index=True)

## TF-IDF + LogisticRegression

In [None]:
# TF-IDF features; min_df=30 drops terms that occur in fewer than 30 comments.
tfidf_vect = TfidfVectorizer(min_df=30)

# Fit on the training comments only, then reuse the fitted vectorizer for the
# test comments.
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

# Model under test
model = LogisticRegression(max_iter=1500)

# Fit, score, and collect the metrics as a dictionary
result = evaluate_model(
    model,
    X_train_tfidf,
    y_train,
    X_test_tfidf,
    y_test,
    parameters="min_df=30",
    comments="TfidfVectorizer",
)

# Turn the dictionary into a one-row DataFrame and append it to the results
new_row_df = pd.DataFrame([result])
results_df = pd.concat([results_df, new_row_df], ignore_index=True)

In [None]:
results_df

## Preprocessing techniques

### Stemming(Bag of words) + LogisticRegression

In [None]:

# Porter stemmer plus the default CountVectorizer analyzer, which turns a raw
# document into tokens.
stemmer = PorterStemmer()
cv_analyzer = CountVectorizer().build_analyzer()


def stemmed_words(doc):
    """Return a lazy iterator over the Porter stems of the tokens in ``doc``.

    The text is first run through the CountVectorizer analyzer; each resulting
    token is then reduced to its stem.
    """
    return map(stemmer.stem, cv_analyzer(doc))

# Bag of words whose analyzer is the stemming function
stem_vectorizer = CountVectorizer(analyzer=stemmed_words)

# Fit on the training comments, then encode the test comments with the same
# vocabulary.
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Model under test
model = LogisticRegression(max_iter=2500)

# Fit, score, and collect the metrics as a dictionary
result = evaluate_model(
    model,
    X_train_stem_vectorized,
    y_train,
    X_test_stem_vectorized,
    y_test,
    parameters="",
    comments="Stemming+LogisticRegression",
)

# Turn the dictionary into a one-row DataFrame and append it to the results
new_row_df = pd.DataFrame([result])
results_df = pd.concat([results_df, new_row_df], ignore_index=True)

In [None]:
results_df

### Stemming(Bag of words(stopwords)) + LogisticRegression

In [None]:
from nltk.corpus import stopwords

# English stop words from NLTK, held as a set. These are the words to remove
# from the comments (179 entries according to the original note; the exact
# count depends on the installed NLTK corpus).
stop_words = set(stopwords.words('english'))

# Porter stemmer plus a CountVectorizer analyzer that filters the stop words
stemmer = PorterStemmer()
cv_analyzer = CountVectorizer(stop_words=list(stop_words)).build_analyzer()


def stemmed_words(doc):
    """Return a lazy iterator over the Porter stems of the tokens in ``doc``.

    The text is first run through the stop-word-aware CountVectorizer
    analyzer; each token that remains is then reduced to its stem.
    """
    return map(stemmer.stem, cv_analyzer(doc))

# Bag of words whose analyzer is the stemming function
stem_vectorizer = CountVectorizer(analyzer=stemmed_words)

# Fit on the training comments, then encode the test comments with the same
# vocabulary.
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Model under test
model = LogisticRegression(max_iter=2500)

# Fit, score, and collect the metrics as a dictionary
result = evaluate_model(
    model,
    X_train_stem_vectorized,
    y_train,
    X_test_stem_vectorized,
    y_test,
    parameters="stopwords",
    comments="Stemming_cv",
)

# Turn the dictionary into a one-row DataFrame and append it to the results
new_row_df = pd.DataFrame([result])
results_df = pd.concat([results_df, new_row_df], ignore_index=True)

In [None]:
results_df

### Lemmatization

In [None]:
# WordNet lemmatizer plus the default CountVectorizer analyzer, which turns a
# raw document into tokens.
WNlemma = nltk.WordNetLemmatizer()
cv_analyzer = CountVectorizer().build_analyzer()


def lemmatize_word(doc):
    """Return a lazy iterator over the lemmas of the tokens in ``doc``.

    The text is first run through the CountVectorizer analyzer; each resulting
    token is then lemmatized. No part-of-speech argument is passed, so the
    lemmatizer's default is used.
    """
    return map(WNlemma.lemmatize, cv_analyzer(doc))

# Bag of words whose analyzer is the lemmatization function
lemm_vectorizer = CountVectorizer(analyzer=lemmatize_word)

# Fit on the training comments, then encode the test comments with the same
# vocabulary.
X_train_lemm_vectorized = lemm_vectorizer.fit_transform(X_train)
X_test_lemm_vectorized = lemm_vectorizer.transform(X_test)

# Model under test
model = LogisticRegression(max_iter=2500)

# Fit, score, and collect the metrics as a dictionary
result = evaluate_model(
    model,
    X_train_lemm_vectorized,
    y_train,
    X_test_lemm_vectorized,
    y_test,
    parameters="",
    comments="lemmatization_cv",
)

# Turn the dictionary into a one-row DataFrame and append it to the results
new_row_df = pd.DataFrame([result])
results_df = pd.concat([results_df, new_row_df], ignore_index=True)


In [None]:
results_df

## Word Vectors - Spacy

In [None]:
# Load spaCy's pre-trained small English pipeline ("en_core_web_sm").
# NOTE(review): the original note says this model builds word vectors with
# neural networks. spaCy's small pipelines reportedly ship without static word
# vectors - confirm whether a md/lg pipeline is intended for the word-vector
# experiments.
nlp = spacy.load("en_core_web_sm")