In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re, string
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
# Later cells read the columns 'processed_comment_text', 'cleaned_comment_text'
# and the six label columns from this frame.
file = "cleaned_train.csv"
df = pd.read_csv(file)

In [3]:
# Replace missing comment text with the placeholder "unknown" in both text
# columns (rows are kept, not dropped).
for text_col in ('processed_comment_text', 'cleaned_comment_text'):
    df[text_col] = df[text_col].fillna("unknown")

# Names of the label columns; each one is used as a separate prediction target.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

### TF-IDF + Logistic Regression

In [None]:
# Vectorization using TF-IDF: the vocabulary and IDF weights are learned from
# the training text only and then applied unchanged to the held-out text.
tfidf_vectorizer = TfidfVectorizer()
train_tfidf = tfidf_vectorizer.fit_transform(train_data['processed_comment_text'])
test_tfidf = tfidf_vectorizer.transform(test_data['processed_comment_text'])

# One Logistic Regression fit per label column. The same estimator object is
# refit on every pass, so after the loop `lr_model` holds the fit for the last
# entry of `labels` only.
lr_model = LogisticRegression(max_iter=1000)
metrics_lr = {}

# Fit on train_data, score on test_data; results are keyed by label name.
for label in labels:
    lr_model.fit(train_tfidf, train_data[label])
    preds = lr_model.predict(test_tfidf)
    y_true = test_data[label]
    metrics_lr[label] = {
        'accuracy': accuracy_score(y_true, preds),
        # zero_division=0: when a label has no predicted (or no true) positives
        # in the held-out split, the score is reported as 0.0 explicitly
        # instead of triggering an UndefinedMetricWarning. The numeric result
        # matches what the default setting returns.
        'f1': f1_score(y_true, preds, zero_division=0),
        'precision': precision_score(y_true, preds, zero_division=0),
        'recall': recall_score(y_true, preds, zero_division=0)
    }

print("Baseline (TF-IDF + Logistic Regression) Results:", metrics_lr)

In [None]:
# Maps each key stored in metrics_lr[label] to its column title in the table.
column_for_metric = {
    'accuracy': 'Accuracy',
    'f1': 'F1 Score',
    'precision': 'Precision',
    'recall': 'Recall',
}

# One record per label, in the order given by `labels`.
data = [
    {
        'Class': label,
        **{column: metrics_lr[label][key] for key, column in column_for_metric.items()},
    }
    for label in labels
]

# Tabulate the records; the bare name on the last line renders the frame.
metrics_df_lr = pd.DataFrame(data)

metrics_df_lr