# Modelling RFC


## Imports

In [69]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time
import spacy
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
#import lightgbm as lgb

from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity


In [42]:
nlp = spacy.load('en_core_web_sm')

In [20]:
#this initialize tqdm which is useful to show a progress bar when applying operations in a pandas df
tqdm.pandas()

## Data ExtracT

In [4]:
df = pd.read_csv('data/undersampled_data_60_40.csv')


In [5]:
df.columns

Index(['comment_text', 'toxic', 'stopwords_punct_lemma', 'vector_spacy',
       'pos_tags', 'pos_tags_str'],
      dtype='object')

In [10]:
#drop NaNs from df['stopwords_punct_lemma']
df.dropna(subset=['stopwords_punct_lemma'], inplace=True)


## Function to Calculate

In [12]:
# initialize dataframe that will include the results
results_df = pd.DataFrame()

def evaluate_model(model, X_train,y_train,X_test,y_test, model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"
    predicted_probs = model.predict_proba(X_test)[:, 1]

    # Calculate metrics using probabilities
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predicted_probs)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## RFC

### 1. RFC with stopwords_punct_lemma vectorizer Tfidf

In [18]:
X_stop = df['stopwords_punct_lemma']
y_stop = df['toxic']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform text data into TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(X_stop)

# Split the data into training and testing sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_tfidf, y_stop, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier()

# Call the evaluate_model function
results = evaluate_model(rf_model, X_train_s, y_train_s, X_test_s, y_test_s, model_name="RandomForestClassifier",parameters="", comments="Using stopwords_punct_lemma, vec - TFIDf")
print(results)


{'Name': 'RandomForestClassifier', 'Parameters': '', 'F1-Score': 0.8148094400203167, 'AUC-ROC': 0.9291810270754639, 'Precision': 0.8596088337734911, 'Recall': 0.774448275862069, 'Accuracy': 0.8583283606944117, 'Confusion Matrix': '[[39393  3668]\n [ 6541 22459]]', 'Training Time': '43 minutes and 46.45 seconds', 'Comments': 'Using stopwords_punct_lemma, vec - TFIDf'}


In [21]:
# Convert the dictionary of results into a DataFrame
rfc_results_df = pd.DataFrame([results])

# Append the results to the main results DataFrame (results_df)
results_df = pd.concat([results_df, rfc_results_df], ignore_index=True)

In [22]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.814809,0.929181,0.859609,0.774448,0.858328,[[39393 3668]\n [ 6541 22459]],43 minutes and 46.45 seconds,"Using stopwords_punct_lemma, vec - TFIDf"


### 2. RFC with stopwords_punct_lemma vectorizer word2vec

In [23]:
sentences = [text.split() for text in df['stopwords_punct_lemma']]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Transform each text into an average Word2Vec vector
word2vec_vectors = []
for text in sentences:
    vectors = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]
    if vectors:
        text_vector = np.mean(vectors, axis=0)
        word2vec_vectors.append(text_vector)
    else:
        # Handle cases where there are no words found in the Word2Vec model
        word2vec_vectors.append(np.zeros(word2vec_model.vector_size))

# Convert the list of Word2Vec vectors into a matrix
X_word2vec = np.vstack(word2vec_vectors)

# Split the data into training and testing sets
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_word2vec, y_stop, test_size=0.2, random_state=42)



In [24]:
# Initialize the RandomForestClassifier for Word2Vec
rf_model_w2v = RandomForestClassifier()

# Call the evaluate_model function for Word2Vec
results_w2v = evaluate_model(rf_model_w2v, X_train_w2v, y_train_w2v, X_test_w2v, y_test_w2v, model_name="RandomForestClassifier",parameters="", comments="Using stopwords_punct_lemma, vec - Word2Vec")

# Convert the dictionary of results into a DataFrame for Word2Vec
word2vec_results_df = pd.DataFrame([results_w2v])

# Append the Word2Vec results to the main results DataFrame (results_df)
results_df = pd.concat([results_df, word2vec_results_df], ignore_index=True)

In [25]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.814809,0.929181,0.859609,0.774448,0.858328,[[39393 3668]\n [ 6541 22459]],43 minutes and 46.45 seconds,"Using stopwords_punct_lemma, vec - TFIDf"
1,RandomForestClassifier,,0.682158,0.836841,0.766639,0.614448,0.76957,[[37637 5424]\n [11181 17819]],4 minutes and 29.29 seconds,"Using stopwords_punct_lemma, vec - Word2Vec"


### 3. RFC with stopwords_punct_lemma + count vectorizer 

In [27]:
X_c = df['stopwords_punct_lemma']
y_c = df['toxic']

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Transform text data into CountVectorized features
X_count = count_vectorizer.fit_transform(X_c)

In [28]:
# Split the data into training and testing sets
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_count, y_c, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier for CountVectorizer
rf_model_c = RandomForestClassifier()

# Call the evaluate_model function for CountVectorizer
results_c = evaluate_model(rf_model_c, X_train_c, y_train_c, X_test_c, y_test_c, model_name="RandomForestClassifier", parameters="", comments="Using stopwords_punct_lemma, vec - CountVectorizer")

# Convert the dictionary of results into a DataFrame for CountVectorizer
count_vectorizer_results_df = pd.DataFrame([results_c])

# Append the CountVectorizer results to the main results DataFrame (results_df)
results_df = pd.concat([results_df, count_vectorizer_results_df], ignore_index=True)

In [29]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.814809,0.929181,0.859609,0.774448,0.858328,[[39393 3668]\n [ 6541 22459]],43 minutes and 46.45 seconds,"Using stopwords_punct_lemma, vec - TFIDf"
1,RandomForestClassifier,,0.682158,0.836841,0.766639,0.614448,0.76957,[[37637 5424]\n [11181 17819]],4 minutes and 29.29 seconds,"Using stopwords_punct_lemma, vec - Word2Vec"
2,RandomForestClassifier,,0.813292,0.923005,0.848289,0.781069,0.855678,[[39010 4051]\n [ 6349 22651]],50 minutes and 37.23 seconds,"Using stopwords_punct_lemma, vec - CountVector..."


### 4. RFC  + spacy vectorizer 

In [44]:
corpus_vect = df['vector_spacy'].str.strip('[]').str.split(expand=True)
corpus_vect = corpus_vect.astype('float')

In [46]:

df['new_vector_spacy'] = corpus_vect.values.tolist()


In [47]:

X_sp = np.array(df['new_vector_spacy'].tolist())
y_sp = df['toxic']

# Split the data into training and testing sets
X_train_sp, X_test_sp, y_train_sp, y_test_sp = train_test_split(X_sp, y_sp, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier()

# Fit the model
rf_model.fit(X_train_sp, y_train_sp)

In [48]:
# Call the evaluate_model
results_sp = evaluate_model(rf_model, X_train_sp, y_train_sp, X_test_sp, y_test_sp, model_name="RandomForestClassifier", parameters="", comments="Using SpaCy vectors")

# Convert the dictionary of results into a DataFrame for CountVectorizer
spacy_vectorizer_results_df = pd.DataFrame([results_sp])

# Append the CountVectorizer results to the main results DataFrame (results_df)
results_df = pd.concat([results_df, spacy_vectorizer_results_df], ignore_index=True)

In [49]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.814809,0.929181,0.859609,0.774448,0.858328,[[39393 3668]\n [ 6541 22459]],43 minutes and 46.45 seconds,"Using stopwords_punct_lemma, vec - TFIDf"
1,RandomForestClassifier,,0.682158,0.836841,0.766639,0.614448,0.76957,[[37637 5424]\n [11181 17819]],4 minutes and 29.29 seconds,"Using stopwords_punct_lemma, vec - Word2Vec"
2,RandomForestClassifier,,0.813292,0.923005,0.848289,0.781069,0.855678,[[39010 4051]\n [ 6349 22651]],50 minutes and 37.23 seconds,"Using stopwords_punct_lemma, vec - CountVector..."
3,RandomForestClassifier,,0.647204,0.812668,0.747471,0.570655,0.749629,[[37470 5591]\n [12451 16549]],8 minutes and 1.77 seconds,Using SpaCy vectors


### 5. RFC + POS + count vectorizer

In [51]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['pos_tags_str'])
y = df['toxic']

In [52]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, random_state=42)

# Use the evaluate_model function to assess the model
results = evaluate_model(rfc, X_train, y_train, X_test, y_test, model_name="RandomForestClassifier",parameters='n_estimators=10',comments="POS + RFC")

new_row_df = pd.DataFrame([results])
results_df = pd.concat([results_df, pd.DataFrame(new_row_df)], ignore_index=True)



In [53]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.814809,0.929181,0.859609,0.774448,0.858328,[[39393 3668]\n [ 6541 22459]],43 minutes and 46.45 seconds,"Using stopwords_punct_lemma, vec - TFIDf"
1,RandomForestClassifier,,0.682158,0.836841,0.766639,0.614448,0.76957,[[37637 5424]\n [11181 17819]],4 minutes and 29.29 seconds,"Using stopwords_punct_lemma, vec - Word2Vec"
2,RandomForestClassifier,,0.813292,0.923005,0.848289,0.781069,0.855678,[[39010 4051]\n [ 6349 22651]],50 minutes and 37.23 seconds,"Using stopwords_punct_lemma, vec - CountVector..."
3,RandomForestClassifier,,0.647204,0.812668,0.747471,0.570655,0.749629,[[37470 5591]\n [12451 16549]],8 minutes and 1.77 seconds,Using SpaCy vectors
4,Random Forest Classifier,n_estimators=10,0.35616,0.580746,0.494366,0.278345,0.59501,[[34805 8256]\n [20928 8072]],1 minutes and 21.09 seconds,POS + RFC


### 6. RFC + POS + TFIDF

In [54]:
vectorizer = TfidfVectorizer()
X_pos_tf = vectorizer.fit_transform(df['pos_tags_str'])
y_pos_tf = df['toxic']


In [55]:
# Split the data into training and testing sets
X_train_pos_tf, X_test_pos_tf, y_train_pos_tf, y_test_pos_tf = train_test_split(X_pos_tf, y_pos_tf, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, random_state=42)

# Use the evaluate_model function to assess the model
results = evaluate_model(rfc, X_train_pos_tf, y_train_pos_tf, X_test_pos_tf, y_test_pos_tf, model_name="RandomForestClassifier", parameters='n_estimators=10', comments="POS + using TF-IDF")




In [56]:
new_row_df = pd.DataFrame([results])
results_df = pd.concat([results_df, pd.DataFrame(new_row_df)], ignore_index=True)
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.814809,0.929181,0.859609,0.774448,0.858328,[[39393 3668]\n [ 6541 22459]],43 minutes and 46.45 seconds,"Using stopwords_punct_lemma, vec - TFIDf"
1,RandomForestClassifier,,0.682158,0.836841,0.766639,0.614448,0.76957,[[37637 5424]\n [11181 17819]],4 minutes and 29.29 seconds,"Using stopwords_punct_lemma, vec - Word2Vec"
2,RandomForestClassifier,,0.813292,0.923005,0.848289,0.781069,0.855678,[[39010 4051]\n [ 6349 22651]],50 minutes and 37.23 seconds,"Using stopwords_punct_lemma, vec - CountVector..."
3,RandomForestClassifier,,0.647204,0.812668,0.747471,0.570655,0.749629,[[37470 5591]\n [12451 16549]],8 minutes and 1.77 seconds,Using SpaCy vectors
4,Random Forest Classifier,n_estimators=10,0.35616,0.580746,0.494366,0.278345,0.59501,[[34805 8256]\n [20928 8072]],1 minutes and 21.09 seconds,POS + RFC
5,Random Forest Classifier,n_estimators=10,0.346721,0.576378,0.489195,0.268517,0.592789,[[34930 8131]\n [21213 7787]],1 minutes and 6.97 seconds,POS + using TF-IDF


In [58]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.814809,0.929181,0.859609,0.774448,0.858328,[[39393 3668]\n [ 6541 22459]],43 minutes and 46.45 seconds,"Using stopwords_punct_lemma, vec - TFIDf"
1,RandomForestClassifier,,0.682158,0.836841,0.766639,0.614448,0.76957,[[37637 5424]\n [11181 17819]],4 minutes and 29.29 seconds,"Using stopwords_punct_lemma, vec - Word2Vec"
2,RandomForestClassifier,,0.813292,0.923005,0.848289,0.781069,0.855678,[[39010 4051]\n [ 6349 22651]],50 minutes and 37.23 seconds,"Using stopwords_punct_lemma, vec - CountVector..."
3,RandomForestClassifier,,0.647204,0.812668,0.747471,0.570655,0.749629,[[37470 5591]\n [12451 16549]],8 minutes and 1.77 seconds,Using SpaCy vectors
4,RandomForestClassifier,n_estimators=10,0.35616,0.580746,0.494366,0.278345,0.59501,[[34805 8256]\n [20928 8072]],1 minutes and 21.09 seconds,POS + RFC
5,RandomForestClassifier,n_estimators=10,0.346721,0.576378,0.489195,0.268517,0.592789,[[34930 8131]\n [21213 7787]],1 minutes and 6.97 seconds,POS + using TF-IDF


## NB on new_vector_spacy using Multnomial 

In [76]:
X=np.array(df['new_vector_spacy'].tolist())
y = df['toxic']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Multinomial Naive Bayes model
nb_model = MultinomialNB()

# Evaluate the model using the provided function
results = evaluate_model(nb_model, X_train_scaled, y_train, X_test_scaled, y_test, model_name="Multinomial Naive Bayes", parameters="MinMaxScaler", comments="spay vector")





In [77]:
new_row_df = pd.DataFrame([results])
results_df = pd.concat([results_df, pd.DataFrame(new_row_df)], ignore_index=True)
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,RandomForestClassifier,,0.814809,0.929181,0.859609,0.774448,0.858328,[[39393 3668]\n [ 6541 22459]],43 minutes and 46.45 seconds,"Using stopwords_punct_lemma, vec - TFIDf"
1,RandomForestClassifier,,0.682158,0.836841,0.766639,0.614448,0.76957,[[37637 5424]\n [11181 17819]],4 minutes and 29.29 seconds,"Using stopwords_punct_lemma, vec - Word2Vec"
2,RandomForestClassifier,,0.813292,0.923005,0.848289,0.781069,0.855678,[[39010 4051]\n [ 6349 22651]],50 minutes and 37.23 seconds,"Using stopwords_punct_lemma, vec - CountVector..."
3,RandomForestClassifier,,0.647204,0.812668,0.747471,0.570655,0.749629,[[37470 5591]\n [12451 16549]],8 minutes and 1.77 seconds,Using SpaCy vectors
4,RandomForestClassifier,n_estimators=10,0.35616,0.580746,0.494366,0.278345,0.59501,[[34805 8256]\n [20928 8072]],1 minutes and 21.09 seconds,POS + RFC
5,RandomForestClassifier,n_estimators=10,0.346721,0.576378,0.489195,0.268517,0.592789,[[34930 8131]\n [21213 7787]],1 minutes and 6.97 seconds,POS + using TF-IDF
6,Multinomial Naive Bayes,MinMaxScaler,0.000207,0.735711,0.75,0.000103,0.597591,[[43060 1]\n [28997 3]],0 minutes and 1.71 seconds,spay vector


In [78]:
results_df.to_csv('data/results_purvi.csv', index=False)
