# Feature Engineering


## Imports!

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split

## Data Extract

In [2]:
# read from csv in data
data = pd.read_csv('data/merged_data.csv',nrows=1000)

## BERT

In [3]:
# Extract relevant columns
texts = data['comment_text'].tolist()
labels = data['toxic'].tolist()

# Tokenize text data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(tokenized_texts.input_ids, labels, test_size=0.2, random_state=42)

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Train the model (skipped for brevity)

# Evaluate on the test set
with torch.no_grad():
    outputs = model(X_test)
    predicted_probs = outputs.logits.softmax(dim=1)
    predicted_labels = predicted_probs.argmax(dim=1)
    y_test_np = y_test

# Calculate metrics
accuracy = accuracy_score(y_test_np, predicted_labels.cpu().numpy())
auc = roc_auc_score(y_test_np, predicted_probs[:, 1].cpu().numpy())
f1 = f1_score(y_test_np, predicted_labels.cpu().numpy())

print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")
print(f"F1-score: {f1}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Accuracy: 0.245
AUC: 0.4428673835125448
F1-score: 0.12716763005780346


In [4]:
results = pd.DataFrame(columns=['Accuracy', 'F1', 'AUC', 'MethodName'])

In [7]:
lr_results = pd.DataFrame({
    'Accuracy': [accuracy],
    'F1': [f1],
    'AUC': [auc],
    'MethodName': ['Logistic Regression with W2V']
})
value=results.shape[0]

results.loc[value] = [accuracy, f1, auc, 'BERT + MERGED DATA with 1000 rows']
results

Unnamed: 0,Accuracy,F1,AUC,MethodName
0,0.245,0.127168,0.442867,BERT + MERGED DATA with 1000 rows
