# Feature Engineering


## Importss

In [1]:
import pandas as pd
import torch
import numpy as np
import torch
import torch.optim as optim
import time

from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split  # Import train_test_split function
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix




## Data Extract

In [2]:
# read from csv in data
data = pd.read_csv('data/merged_data.csv')

## Function for calculations

In [3]:
# initialize dataframe that will include the results
results_df = pd.DataFrame()

def evaluate_model(model, X_train,y_train,X_test,y_test, model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"
    predicted_probs = model.predict_proba(X_test)[:, 1]

    # Calculate metrics using probabilities
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predicted_probs)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## BERT

In [4]:
# Extract relevant columns
texts = data['comment_text'].tolist()  # 'comment_text' column as input text
labels = data['toxic'].tolist()  # 'toxic' column as binary labels (0 or 1)


In [5]:
# Tokenize text data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in texts]


In [6]:
test_encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')


In [7]:
# Padding and Truncation
max_len = max(len(text) for text in tokenized_texts)
padded_texts = [text + [0]*(max_len-len(text)) for text in tokenized_texts]


In [8]:
# Convert to tensors
input_ids = torch.tensor(padded_texts)
labels = torch.tensor(labels)


In [9]:
# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(input_ids, labels, test_size=0.2, random_state=42)


In [10]:
# Create DataLoader
dataset = TensorDataset(input_ids, labels)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [11]:
# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Assuming binary classification


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [None]:
'''
# Initialize and train Logistic Regression model
log_reg_model = LogisticRegression()

# Fit the Logistic Regression model on training data
log_reg_model.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg_model.predict(X_test)

# Get predicted probabilities for positive class (class 1)
y_pred_prob = log_reg_model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_prob)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"F1-score: {f1}")
'''

## 1. Logistic Regression

In [12]:
# Initialize and train Logistic Regression model
log_reg_model = LogisticRegression()

# Fit the Logistic Regression model on training data
log_reg_model.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg_model.predict(X_test)

# Get predicted probabilities for positive class (class 1)
y_pred_prob = log_reg_model.predict_proba(X_test)[:, 1]

lr_results = evaluate_model(log_reg_model, X_train, y_train, X_test, y_test, model_name="Logistic Regression", parameters='binary', comments='bert+lr')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
log_reg_results_df = pd.DataFrame([lr_results])
results_df = pd.concat([results_df, log_reg_results_df], ignore_index=True)
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,Logistic Regression,binary,0.002722,0.515277,0.130841,0.001375,0.885502,[[79327 93]\n [10166 14]],0 minutes and 20.44 seconds,bert+lr


## 2. RFC

In [14]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Flatten the input tensors
X_train_flatten = X_train.view(X_train.size(0), -1)
X_test_flatten = X_test.view(X_test.size(0), -1)

# Fit the Random Forest Classifier on training data
rfc.fit(X_train_flatten, y_train)

# Predict on the test set
y_pred = rfc.predict(X_test_flatten)

# Get predicted probabilities for positive class (class 1)
y_pred_prob = rfc.predict_proba(X_test_flatten)[:, 1]

rfc_results = evaluate_model(rfc, X_train_flatten, y_train, X_test_flatten, y_test, model_name="Random Forest Classifier", parameters='n_estimators=100', comments='bert + rfc')
rfc_results_df = pd.DataFrame([rfc_results])
results_df = pd.concat([results_df, rfc_results_df], ignore_index=True)



In [15]:
results_df

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,Logistic Regression,binary,0.002722,0.515277,0.130841,0.001375,0.885502,[[79327 93]\n [10166 14]],0 minutes and 20.44 seconds,bert+lr
1,Random Forest Classifier,n_estimators=100,0.022977,0.595609,0.668539,0.01169,0.887054,[[79361 59]\n [10061 119]],4 minutes and 24.31 seconds,bert + rfc
