# Set up

In [50]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
import spacy
import nltk # natural language tool kit
from nltk.tokenize import word_tokenize
from nltk import ngrams
from collections import Counter
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages this notebook relies on: the English stop-word
# list (read through nltk.corpus.stopwords in the stop-word stemming section)
# and WordNet (needed by nltk.WordNetLemmatizer in the lemmatization section).
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ericmartinez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ericmartinez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Getting data ready

In [56]:
# Read the merged comments file, then keep just the comment text and its `toxic` label
data = pd.read_csv('data/merged_data.csv')
df_train = data.loc[:, ['comment_text', 'toxic']]

In [55]:
# NOTE: nothing in this cell executes. The triple-quoted string below is an inert
# expression that keeps an alternative loading recipe for reference: read
# data/train.csv, drop rows without comment_text, and derive `toxic` from
# `target` >= 0.50.
'''
This alternative data is to go with a bigger data set
data = pd.read_csv('data/train.csv')
df_cleaned = data.dropna(subset=['comment_text'])
df_train = df_cleaned[['comment_text','target']]
# Add new column toxic, toxicity >= 0.5 then toxic = 1 otherwise toxic = 0
df_train = df_train.copy()
df_train['toxic'] = np.where(df_train['target'] >= 0.50, 1, 0)
'''

"\nThis alternative data is to go with a bigger data set\ndata = pd.read_csv('data/train.csv')\ndf_cleaned = data.dropna(subset=['comment_text'])\ndf_train = df_cleaned[['comment_text','target']]\n"

## Split Train and Test

In [59]:
# Hold out part of the comments for testing; the fixed random_state keeps the split reproducible

X_train, X_test, y_train, y_test = train_test_split(
    df_train['comment_text'],
    df_train['toxic'],
    random_state=42,
)

## Function to record each model's performance

In [60]:
# Start with an empty results table; each experiment cell below appends one row
# (the dictionary returned by evaluate_model) to it with pd.concat.
results_df = pd.DataFrame()


In [62]:
def evaluate_model(model, X_train,y_train,X_test,y_test, model_name="", parameters='', comments=''):
    """Fit ``model``, score it on the test split and return one results row.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``
    X_train, y_train : training features and labels handed to ``model.fit``
    X_test, y_test : held-out features and labels used for every metric
    model_name : str, optional
        Value for the 'Name' entry; when empty, the model's class name is used.
    parameters : str, optional
        Free-text description stored under 'Parameters'.
    comments : str, optional
        Free-text note stored under 'Comments'.

    Returns
    -------
    dict
        Keys 'Name', 'Parameters', 'F1-Score', 'AUC-ROC', 'Precision', 'Recall',
        'Accuracy', 'Confusion Matrix' (the matrix converted to a string),
        'Training Time' (formatted as minutes and seconds) and 'Comments'.
    """
    # Time the fit only: the value is reported as 'Training Time', so the
    # prediction calls must stay outside the timed region. perf_counter is a
    # monotonic clock, so the interval is not affected by system clock changes.
    fit_started = time.perf_counter()
    model.fit(X_train, y_train)
    duration = time.perf_counter() - fit_started
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    # Hard class predictions plus the score of the second class (column 1),
    # which is what the ROC AUC is computed from
    predictions = model.predict(X_test)
    predict_probab = model.predict_proba(X_test)[:,1]

    # Calculating all metrics

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predict_probab)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    # Stored as a string so the matrix fits in a single results-table cell
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## Baseline Model (Bag of Words + LogisticRegression)

In [63]:
# Learn the binary bag-of-words vocabulary and vectorize the training comments in a
# single pass: fit_transform yields the same sparse matrix as fit followed by
# transform, without tokenizing the training data twice
vect = CountVectorizer(binary=True)
X_train_vectorized = vect.fit_transform(X_train)
# Vectorize the test comments with the vocabulary learned from the training data
X_test_vectorized = vect.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=1500)


# Fit and score the model; the metrics come back as a dictionary
result = evaluate_model(model, X_train_vectorized, y_train, X_test_vectorized, y_test, parameters="binary", comments="Baseline" )

# Convert the dictionary to a one-row DataFrame
new_row_df = pd.DataFrame([result])
# Append the row to the results table (new_row_df is already a DataFrame, so it is not wrapped again)
results_df = pd.concat([results_df, new_row_df], ignore_index=True)

## TF-IDF + LogisticRegression

In [64]:
# TF-IDF features, configured with min_df=30
tfidf_vect = TfidfVectorizer(min_df=30)

# Fit the vectorizer on the training comments and vectorize them in one step
X_train_tfidf = tfidf_vect.fit_transform(X_train)

# Vectorize the test comments with the already-fitted vectorizer
X_test_tfidf = tfidf_vect.transform(X_test)

# Logistic regression capped at 1500 iterations
model = LogisticRegression(max_iter=1500)

# Fit and score the model; the metrics come back as a dictionary
result = evaluate_model(
    model,
    X_train_tfidf,
    y_train,
    X_test_tfidf,
    y_test,
    parameters="min_df=30",
    comments="TfidfVectorizer",
)

# Turn the dictionary into a one-row frame and add it to the results table
new_row_df = pd.DataFrame([result])
results_df = pd.concat([results_df, new_row_df], ignore_index=True)

In [65]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,LogisticRegression,binary,0.539521,0.888785,0.679549,0.447341,0.913705,[[96673 2670]\n [ 6995 5662]],0 minutes and 27.88 seconds,Baseline
1,LogisticRegression,min_df=30,0.507491,0.916806,0.769095,0.378684,0.916937,[[97904 1439]\n [ 7864 4793]],0 minutes and 7.0 seconds,TfidfVectorizer


## Preprocessing techniques

### Stemming (Bag of Words) + LogisticRegression

In [66]:

# Porter stemmer plus the analyzer callable of a default CountVectorizer
stemmer = nltk.PorterStemmer()
cv_analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    '''
    Run the text through the CountVectorizer analyzer, then lazily yield the Porter stem of every token it produced
    '''
    tokens = cv_analyzer(doc)
    return (stemmer.stem(token) for token in tokens)

# CountVectorizer that uses the stemming function as its analyzer
stem_vectorizer = CountVectorizer(analyzer = stemmed_words)

# Fit on the training comments and vectorize them in one step
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)

# Vectorize the test comments with the already-fitted vectorizer
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Logistic regression capped at 2500 iterations
model = LogisticRegression(max_iter=2500)

# Fit and score the model; the metrics come back as a dictionary
result = evaluate_model(
    model,
    X_train_stem_vectorized,
    y_train,
    X_test_stem_vectorized,
    y_test,
    parameters="",
    comments="Stemming+LogisticRegression",
)

# Turn the dictionary into a one-row frame and add it to the results table
new_row_df = pd.DataFrame([result])
results_df = pd.concat([results_df, new_row_df], ignore_index=True)

In [67]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,LogisticRegression,binary,0.539521,0.888785,0.679549,0.447341,0.913705,[[96673 2670]\n [ 6995 5662]],0 minutes and 27.88 seconds,Baseline
1,LogisticRegression,min_df=30,0.507491,0.916806,0.769095,0.378684,0.916937,[[97904 1439]\n [ 7864 4793]],0 minutes and 7.0 seconds,TfidfVectorizer
2,LogisticRegression,,0.519429,0.88376,0.688127,0.41716,0.912768,[[96950 2393]\n [ 7377 5280]],0 minutes and 44.53 seconds,Stemming+LogisticRegression


### Stemming (Bag of Words, stop words removed) + LogisticRegression

In [68]:
from nltk.corpus import stopwords

# NLTK's English stop words, collected into a set; the analyzer below filters
# these tokens out of each comment before stemming
stop_words = set(stopwords.words('english'))

# Porter stemmer plus the analyzer callable of a CountVectorizer configured with the stop words
stemmer = nltk.PorterStemmer()
cv_analyzer = CountVectorizer(stop_words=list(stop_words)).build_analyzer()

def stemmed_words(doc):
    '''
    Run the text through the stop-word-aware CountVectorizer analyzer, then lazily yield the Porter stem of every token it produced
    '''
    tokens = cv_analyzer(doc)
    return (stemmer.stem(token) for token in tokens)

# CountVectorizer that uses the stemming function as its analyzer
stem_vectorizer = CountVectorizer(analyzer = stemmed_words)

# Fit on the training comments and vectorize them in one step
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)

# Vectorize the test comments with the already-fitted vectorizer
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Logistic regression capped at 2500 iterations
model = LogisticRegression(max_iter=2500)

# Fit and score the model; the metrics come back as a dictionary
result = evaluate_model(
    model,
    X_train_stem_vectorized,
    y_train,
    X_test_stem_vectorized,
    y_test,
    parameters="stopwords",
    comments="Stemming_cv",
)

# Turn the dictionary into a one-row frame and add it to the results table
new_row_df = pd.DataFrame([result])
results_df = pd.concat([results_df, new_row_df], ignore_index=True)

In [69]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,LogisticRegression,binary,0.539521,0.888785,0.679549,0.447341,0.913705,[[96673 2670]\n [ 6995 5662]],0 minutes and 27.88 seconds,Baseline
1,LogisticRegression,min_df=30,0.507491,0.916806,0.769095,0.378684,0.916937,[[97904 1439]\n [ 7864 4793]],0 minutes and 7.0 seconds,TfidfVectorizer
2,LogisticRegression,,0.519429,0.88376,0.688127,0.41716,0.912768,[[96950 2393]\n [ 7377 5280]],0 minutes and 44.53 seconds,Stemming+LogisticRegression
3,LogisticRegression,stopwords,0.513327,0.882161,0.683941,0.41084,0.911964,[[96940 2403]\n [ 7457 5200]],0 minutes and 14.16 seconds,Stemming_cv


### Lemmatization

In [70]:
# WordNet lemmatizer plus the analyzer callable of a default CountVectorizer
WNlemma = nltk.WordNetLemmatizer()
cv_analyzer = CountVectorizer().build_analyzer()

def lemmatize_word(doc):
    '''
    Run the text through the CountVectorizer analyzer, then lazily yield the WordNet lemma of every token it produced
    (lemmatize is called with its default arguments, so no part-of-speech tag is supplied)
    '''
    tokens = cv_analyzer(doc)
    return (WNlemma.lemmatize(token) for token in tokens)

# CountVectorizer that uses the lemmatizing function as its analyzer
lemm_vectorizer = CountVectorizer(analyzer = lemmatize_word)

# Fit on the training comments and vectorize them in one step
X_train_lemm_vectorized = lemm_vectorizer.fit_transform(X_train)
# Vectorize the test comments with the already-fitted vectorizer
X_test_lemm_vectorized  = lemm_vectorizer.transform(X_test)

# Logistic regression capped at 2500 iterations
model = LogisticRegression(max_iter=2500)

# Fit and score the model; the metrics come back as a dictionary
result = evaluate_model(
    model,
    X_train_lemm_vectorized,
    y_train,
    X_test_lemm_vectorized,
    y_test,
    parameters="",
    comments="lemmatization_cv",
)

# Turn the dictionary into a one-row frame and add it to the results table
new_row_df = pd.DataFrame([result])
results_df = pd.concat([results_df, new_row_df], ignore_index=True)


In [71]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,LogisticRegression,binary,0.539521,0.888785,0.679549,0.447341,0.913705,[[96673 2670]\n [ 6995 5662]],0 minutes and 27.88 seconds,Baseline
1,LogisticRegression,min_df=30,0.507491,0.916806,0.769095,0.378684,0.916937,[[97904 1439]\n [ 7864 4793]],0 minutes and 7.0 seconds,TfidfVectorizer
2,LogisticRegression,,0.519429,0.88376,0.688127,0.41716,0.912768,[[96950 2393]\n [ 7377 5280]],0 minutes and 44.53 seconds,Stemming+LogisticRegression
3,LogisticRegression,stopwords,0.513327,0.882161,0.683941,0.41084,0.911964,[[96940 2403]\n [ 7457 5200]],0 minutes and 14.16 seconds,Stemming_cv
4,LogisticRegression,,0.531599,0.883216,0.68507,0.434305,0.913509,[[96816 2527]\n [ 7160 5497]],0 minutes and 48.44 seconds,lemmatization_cv


## Word Vectors - Spacy

In [43]:
# Load spaCy's small pre-trained English pipeline.
# NOTE(review): the "sm" pipelines are documented as shipping without static word
# vectors; if real word vectors are the goal, en_core_web_md / en_core_web_lg is
# probably the model to load instead. Confirm before building features on doc.vector.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over every training comment. nlp.pipe streams the texts through
# in batches, which spaCy documents as much more efficient than calling nlp(text)
# once per text; the result is still one Doc per comment, in the same order.
docs = list(nlp.pipe(X_train))

TODO: Try the gensim library for word vectors; see https://www.youtube.com/watch?v=Q2NtCcqmIww&t=0s&ab_channel=codebasics