# Feature Engineering


## Imports

In [1]:
import pandas as pd
import re
import string
import nltk
import numpy as np
import matplotlib.pyplot as plt
import torch
import time
import ast


from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix





## Data Extract

In [2]:
# Load the full comment dataset from the CSV file in the data directory
csv_path = 'data/all_data.csv'
data = pd.read_csv(csv_path)

In [3]:
# Discard every row whose comment_text is missing
required_columns = ['comment_text']
data = data.dropna(subset=required_columns)

In [4]:
# Binary target: 1 when the toxicity score is at least 0.5, otherwise 0
meets_threshold = data['toxicity'] >= 0.5
data['toxic'] = meets_threshold.astype(int)

## Function for calculations

In [5]:
# Empty frame that will collect one row of metrics per evaluated model
results_df = pd.DataFrame(data=None)

def evaluate_model(model, X_train,y_train,X_test,y_test, model_name="", parameters='', comments=''):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict`` plus either ``predict_proba``
        or ``decision_function``. It is fitted in place.
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for every metric.
    model_name : str
        Label stored under 'Name'; when empty, the model's class name is used.
    parameters : str
        Free-text description stored under 'Parameters'.
    comments : str
        Free-text note stored under 'Comments'.

    Returns
    -------
    dict
        Keys: 'Name', 'Parameters', 'F1-Score', 'AUC-ROC', 'Precision',
        'Recall', 'Accuracy', 'Confusion Matrix' (as a string),
        'Training Time' (formatted string) and 'Comments'.
    """
    # Time the fit only, so that 'Training Time' does not include inference.
    start_time = time.time()
    model.fit(X_train, y_train)
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    predictions = model.predict(X_test)

    # AUC-ROC needs a continuous score: use the second predict_proba column
    # when available, otherwise fall back to the decision_function output.
    if hasattr(model, "predict_proba"):
        predicted_scores = model.predict_proba(X_test)[:, 1]
    else:
        predicted_scores = model.decision_function(X_test)

    # Label-based metrics use the hard predictions; AUC-ROC uses the scores
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predicted_scores)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    # Stored as text so the matrix fits in a single results-table cell
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## Word2Vec

-  The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence.

### Text Preprocessing

In [6]:
# Cache for the English stop-word set; filled on the first preprocess_text call
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


# Function for text preprocessing
def preprocess_text(text):
    """Turn a raw comment into a list of lowercase tokens without stop words.

    Steps, in order: lowercase, remove digit runs, remove every character that
    is neither a word character nor whitespace, tokenize with
    ``word_tokenize``, then drop English stop words.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters and numbers using regex
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters

    # Tokenize the text into individual words
    tokens = word_tokenize(text)

    # Remove stop words. The set is cached because rebuilding it for every
    # comment repeats the same corpus lookup once per row.
    # NOTE(review): punctuation is stripped before this filter, so apostrophe
    # forms such as "don't" arrive here as "dont" - confirm whether those
    # should be filtered as well.
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

# Tokenize every raw comment and keep the token lists in a new 'cleaned_text' column
raw_comments = data['comment_text']
data['cleaned_text'] = raw_comments.apply(preprocess_text)


### Train Word2Vec model

In [7]:
# Word2Vec settings used for training:
# - vector_size --> dimensionality of the word vectors
# - window --> max distance between the current and the predicted word within a sentence
# - min_count --> minimum count a word needs in order to be considered
# - sg --> 0 selects Continuous Bag of Words (CBOW), 1 selects Skip-gram
w2v_params = {'vector_size': 100, 'window': 5, 'min_count': 1, 'sg': 0}

# One token list per comment is the training corpus
tokenized_comments = data['cleaned_text'].tolist()
w2v_model = Word2Vec(tokenized_comments, **w2v_params)

# Mapping from each vocabulary word to its index in the trained model
vocabulary = w2v_model.wv.key_to_index
first_words = list(vocabulary)[:10]
print(first_words)  # Print first 10 words as an example



['would', 'people', 'one', 'like', 'dont', 'trump', 'get', 'us', 'time', 'think']


### Training Machine Learning Models


In [8]:
def get_doc_vector(tokens):
    """Return the average Word2Vec vector of the tokens known to ``w2v_model``.

    Tokens missing from the model vocabulary are ignored. When no token is
    known, a zero vector of length ``w2v_model.vector_size`` is returned.
    """
    keyed_vectors = w2v_model.wv
    known_vectors = []
    for token in tokens:
        if token in keyed_vectors:
            known_vectors.append(keyed_vectors[token])

    # Nothing to average: fall back to an all-zero document vector
    if not known_vectors:
        return np.zeros(w2v_model.vector_size)

    return sum(known_vectors) / len(known_vectors)

In [9]:
# Turn each token list into a single averaged document vector
token_lists = data['cleaned_text']
data['doc_vector'] = token_lists.apply(get_doc_vector)


In [10]:
# Feature matrix: stack the per-comment vectors row by row
doc_vectors = data['doc_vector'].to_numpy()
X = np.vstack(doc_vectors)

# Target variable
y = data['toxic'].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#### 1. Logistic Regression with W2V

In [11]:
# Logistic Regression with the library's default settings
lr = LogisticRegression()

# Fit on the training split and collect the test-set metrics
lr_results = evaluate_model(
    lr,
    X_train,
    y_train,
    X_test,
    y_test,
    model_name="Logistic Regression",
    parameters='W2v',
    comments='w2v+lr',
)

# Wrap the metrics dictionary in a one-row DataFrame
lr_results_df = pd.DataFrame([lr_results])

# Add that row to the main results DataFrame (results_df)
results_df = pd.concat(
    [results_df, lr_results_df],
    ignore_index=True,
)




In [12]:
# Display the metrics collected so far
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,Logistic Regression,W2v,0.152209,0.818287,0.515656,0.089281,0.919718,[[364916 2707]\n [ 29398 2882]],0 minutes and 27.85 seconds,w2v+lr


#### 2. Random Forest Classifier + W2V

In [13]:
# Only configure the forest here: evaluate_model() calls fit() on the model it
# receives, so fitting in this cell as well would train the same forest twice.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [14]:

# Fit the forest and collect its test-set metrics through evaluate_model
rfc_results = evaluate_model(
    rf_model,
    X_train,
    y_train,
    X_test,
    y_test,
    model_name="Random Forest Classifier",
    parameters='n_estimators=100',
    comments='',
)

# Wrap the metrics dictionary in a one-row DataFrame
rfc_results_df = pd.DataFrame([rfc_results])

# Add that row to the main results DataFrame (results_df)
results_df = pd.concat(
    [results_df, rfc_results_df],
    ignore_index=True,
)


In [15]:
# Predict once and reuse the labels for both accuracy and F1; calling
# rf_model.score() would run a second full prediction pass over X_test.
y_pred = rf_model.predict(X_test)
# Second predict_proba column, used as the score for AUC-ROC
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy with RFC: {accuracy}")
print(f"F1 Score with RFC: {f1}")
print(f"AUC-ROC Score with RFC: {auc_roc}")

Accuracy with RFC: 0.923008829641188
F1 Score with RFC: 0.11263221604173272
AUC-ROC Score with RFC: 0.7825726628983065


In [16]:
# Display the metrics of every model evaluated so far
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,Logistic Regression,W2v,0.152209,0.818287,0.515656,0.089281,0.919718,[[364916 2707]\n [ 29398 2882]],0 minutes and 27.85 seconds,w2v+lr
1,Random Forest Classifier,n_estimators=100,0.112632,0.782573,0.80844,0.060533,0.923009,[[367160 463]\n [ 30326 1954]],40 minutes and 48.28 seconds,
