In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re, string
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Location of the training CSV, given relative to the notebook's working directory.
file = "cleaned_train.csv"

# Read the whole file into a DataFrame; the cells below take the comment-text
# columns and the per-class label columns from `df`.
df = pd.read_csv(filepath_or_buffer=file)

In [3]:
# Substitute the placeholder "unknown" for missing values in both comment-text
# columns (the rows are kept, only the NaN cells are filled).
for text_col in ('processed_comment_text', 'cleaned_comment_text'):
    df[text_col] = df[text_col].fillna("unknown")

# Names of the target columns; one classifier is trained per column further down.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(df, random_state=42, test_size=0.2)

### TF-IDF + Logistic Regression

In [4]:
# Vectorization using TF-IDF.
# The vectorizer is fitted on the training split only and then applied
# unchanged to the held-out split, so no test text influences the features.
tfidf_vectorizer = TfidfVectorizer()
train_tfidf = tfidf_vectorizer.fit_transform(train_data['processed_comment_text'])
test_tfidf = tfidf_vectorizer.transform(test_data['processed_comment_text'])

# One independent Logistic Regression classifier per label.
lr_models = {}   # label -> fitted classifier, kept so every model stays inspectable
metrics_lr = {}  # label -> {'accuracy', 'f1', 'precision', 'recall'} on the test split

for label in labels:
    # A fresh estimator per label: refitting a single shared instance would
    # overwrite the previous label's coefficients and leave only the last model.
    # `lr_model` still ends up bound to the last fitted classifier, as before.
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(train_tfidf, train_data[label])
    lr_models[label] = lr_model

    y_true = test_data[label]
    preds = lr_model.predict(test_tfidf)

    # zero_division=0 makes the "no positive predictions" case explicit (score 0,
    # no UndefinedMetricWarning). float() stores plain Python floats so the
    # printed dict is readable instead of full of np.float64(...) wrappers.
    metrics_lr[label] = {
        'accuracy': float(accuracy_score(y_true, preds)),
        'f1': float(f1_score(y_true, preds, zero_division=0)),
        'precision': float(precision_score(y_true, preds, zero_division=0)),
        'recall': float(recall_score(y_true, preds, zero_division=0)),
    }

print("Baseline (TF-IDF + Logistic Regression) Results:", metrics_lr)

Baseline (TF-IDF + Logistic Regression) Results: {'toxic': {'accuracy': 0.9569481435061883, 'f1': np.float64(0.7306938455507644), 'precision': np.float64(0.9110459433040078), 'recall': np.float64(0.6099476439790575)}, 'severe_toxic': {'accuracy': 0.9906000313332289, 'f1': np.float64(0.375), 'precision': np.float64(0.5660377358490566), 'recall': np.float64(0.2803738317757009)}, 'obscene': {'accuracy': 0.9762180792730691, 'f1': np.float64(0.736), 'precision': np.float64(0.9120689655172414), 'recall': np.float64(0.6169096209912537)}, 'threat': {'accuracy': 0.9976813410621964, 'f1': np.float64(0.17777777777777778), 'precision': np.float64(0.5), 'recall': np.float64(0.10810810810810811)}, 'insult': {'accuracy': 0.9693247689174369, 'f1': np.float64(0.6264784433422358), 'precision': np.float64(0.8152929493545183), 'recall': np.float64(0.5086741016109045)}, 'identity_hate': {'accuracy': 0.9916340278865737, 'f1': np.float64(0.26038781163434904), 'precision': np.float64(0.7014925373134329), 'rec

In [5]:
# Display column name -> key under which the score is stored in `metrics_lr[label]`.
column_for_metric = {
    'Accuracy': 'accuracy',
    'F1 Score': 'f1',
    'Precision': 'precision',
    'Recall': 'recall',
}

# One record per label, in the order given by `labels`.
data = [
    {
        'Class': label,
        **{column: metrics_lr[label][key] for column, key in column_for_metric.items()},
    }
    for label in labels
]

# Tabulate the per-class scores; the bare name on the last line renders the table.
metrics_df_lr = pd.DataFrame(data)
metrics_df_lr

Unnamed: 0,Class,Accuracy,F1 Score,Precision,Recall
0,toxic,0.956948,0.730694,0.911046,0.609948
1,severe_toxic,0.9906,0.375,0.566038,0.280374
2,obscene,0.976218,0.736,0.912069,0.61691
3,threat,0.997681,0.177778,0.5,0.108108
4,insult,0.969325,0.626478,0.815293,0.508674
5,identity_hate,0.991634,0.260388,0.701493,0.159864
