# Logistic Regression Model

### Install Libraries
* **`wordcloud`** A word cloud generator in Python that we used to visualize our dataset

In [1]:
pip install wordcloud

Note: you may need to restart the kernel to use updated packages.


### Importing Libraries

In [24]:
import numpy as np
import pandas as pd
import random
import re
from collections import defaultdict
from collections import Counter
from sklearn.model_selection import train_test_split 
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt

### Loading Dataset

In [3]:
FILENAME = './HateSpeechDatasetBalanced.csv'
df = pd.read_csv(FILENAME)

print(f"Dataset shape: {df.shape}")
print(f"\nClass distribution:\n{df['Label'].value_counts()}")
print(f"\nFirst few rows:\n{df.head()}")

Dataset shape: (726119, 2)

Class distribution:
Label
1    364525
0    361594
Name: count, dtype: int64

First few rows:
                                             Content  Label
0  denial of normal the con be asked to comment o...      1
1  just by being able to tweet this insufferable ...      1
2  that is retarded you too cute to be single tha...      1
3  thought of a real badass mongol style declarat...      1
4                                afro american basho      1


### Cleaning dataset
* Remove punctuation
* Remove links
* Remove user mentions and hashtags
* Remove numbers
* Convert text to lowercase

In [5]:
def clean_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    return text

df['Content'] = df['Content'].apply(clean_text)
print("Dataset text cleaned.")

Dataset text cleaned.


### Preprocessing the Data
* Get 5000 examples from each class for the model
* Get rid of words that have less than 100 occurrences in the training set and test (based on the training set in order to prevent cheating)
* Get rid of stopwords from wordcloud library
* Turn into a bag of words using one-hot encoding
* Create Train/Test split for data

In [6]:

# Only use 5000 samples from each class for modeling
df_class_0 = df.query('Label == 0').sample(5000, random_state=42)
df_class_1 = df.query('Label == 1').sample(5000, random_state=42)
data = pd.concat([df_class_0, df_class_1])

X_text = data['Content'].values
y = data['Label'].values   

X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.4, random_state=42)

# Create vocabulary from training data only
train_all_words = []
for text in X_train:
    words = str(text).lower().split()
    train_all_words.extend(words)

train_word_counts = Counter(train_all_words)
print(f"Total unique words in training data: {len(train_word_counts):,}")

MIN_WORD_FREQ = 100

# Identify frequent words (appear at least MIN_WORD_FREQ times)
freq_words = set([word for word, freq in train_word_counts.items() if freq >= MIN_WORD_FREQ ])


def filter_rare_words(text, frequent_words_set):
    words = str(text).split()
    # Keep only words that are in the frequent_words set and not in STOPWORDS
    filtered_words = [word for word in words if word in frequent_words_set and word not in STOPWORDS]
    return ' '.join(filtered_words)

# Apply filtering
X_train_filtered = np.array([filter_rare_words(text, freq_words) for text in X_train])
X_test_filtered = np.array([filter_rare_words(text, freq_words) for text in X_test])
print("Rare words removed from training and test set.")

# One-hot encoding function
def text_to_vector(text, word_to_index, vocab_size):
    vector = np.zeros(vocab_size, dtype=np.int8)
    
    words = str(text).split()
    for word in words:
        if word in word_to_index:
            idx = word_to_index[word]
            vector[idx] = 1  # Binary: word present or not
    
    return vector


# Create word for index mapping for one-hot encoding
train_all_text_filtered = ' '.join(X_train_filtered)
vocabulary = sorted(set(train_all_text_filtered.split()))
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
print(f"One hot encoding mapping: {word_to_index}")

# Convert train and test to one-hot vectors
print("Converting training texts to vectors...")
X_train_onehot = np.array([
    text_to_vector(text, word_to_index, len(vocabulary)) 
    for text in X_train_filtered
])
print("Done.")

print("Converting test texts to vectors...")
X_test_onehot = np.array([
    text_to_vector(text, word_to_index, len(vocabulary)) 
    for text in X_test_filtered
])
print("Done.")



Total unique words in training data: 17,378
Rare words removed from training and test set.
One hot encoding mapping: {'actually': 0, 'address': 1, 'already': 2, 'always': 3, 'another': 4, 'anyone': 5, 'anything': 6, 'around': 7, 'article': 8, 'articles': 9, 'ass': 10, 'back': 11, 'believe': 12, 'better': 13, 'big': 14, 'bitch': 15, 'black': 16, 'blocked': 17, 'bullshit': 18, 'call': 19, 'come': 20, 'country': 21, 'day': 22, 'deleted': 23, 'deletion': 24, 'discussion': 25, 'done': 26, 'dont': 27, 'edit': 28, 'editing': 29, 'edits': 30, 'even': 31, 'every': 32, 'everyone': 33, 'face': 34, 'fact': 35, 'feel': 36, 'find': 37, 'first': 38, 'fuck': 39, 'fucking': 40, 'getting': 41, 'give': 42, 'go': 43, 'god': 44, 'going': 45, 'good': 46, 'got': 47, 'help': 48, 'history': 49, 'hope': 50, 'image': 51, 'information': 52, 'keep': 53, 'kill': 54, 'know': 55, 'let': 56, 'life': 57, 'list': 58, 'little': 59, 'look': 60, 'love': 61, 'made': 62, 'make': 63, 'man': 64, 'many': 65, 'mean': 66, 'means'

### Logistic Regression with no Feature Transforamtion

In [25]:
# Implement KNN using Sklearn
print(f"Final Training Shape: {X_train_onehot.shape}")
print(f"Final Testing Shape: {X_test_onehot.shape}")

logreg = linear_model.LogisticRegression()

print("Fitting model...")
logreg.fit(X_train_onehot, y_train)

print("Predicting...")
y_pred_test = logreg.predict(X_test_onehot)
y_pred_train = logreg.predict(X_train_onehot)

# Evaluate
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred_test, average='binary')

print("\n--- Results ---")
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"f1 score: {f1:.4f}")

display(pd.DataFrame({"Predicted: Unhateful": confusion_matrix(y_test, y_pred_test)[:, 0], 
              "Predicted: Hateful": confusion_matrix(y_test, y_pred_test)[:, 1]},
             index=['Actual: Unhateful', 'Actual: Hateful']))

Final Training Shape: (6000, 142)
Final Testing Shape: (4000, 142)
Fitting model...
Predicting...

--- Results ---
Train Accuracy: 0.7143
Test Accuracy: 0.6885
Precision: 0.6426
Recall: 0.8074
f1 score: 0.7157


Unnamed: 0,Predicted: Unhateful,Predicted: Hateful
Actual: Unhateful,1186,872
Actual: Hateful,374,1568


### Logistic Regression with no Feature Transformation (Tuning K)

In [32]:
# Hyperparameter to consider
# Lambda for regularization in Logistic Regression

lambdas = [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
kf = model_selection.KFold(n_splits=5)

scores = []

# loop through and try all lambda values in lambdas array
for lam in lambdas:
    l2logreg = linear_model.LogisticRegression(penalty = 'l2', C=1/lam)
    l2logreg.fit(X_train_onehot, y_train)
    scores.append(np.mean(model_selection.cross_val_score(l2logreg, X_train_onehot, y_train, cv=kf, scoring='accuracy')))

best_lambda = lambdas[np.argmax(scores)]

print(f"The Best Lambda: {best_lambda}")
l2logreg = linear_model.LogisticRegression(penalty = 'l2', C=1/best_lambda)
l2logreg.fit(X_train_onehot, y_train)
y_hat_l2logreg = l2logreg.predict(X_test_onehot)
prec, recal, fscore, sup = precision_recall_fscore_support(y_test, y_hat_l2logreg)
print('L2 Regularization prec: ', prec)
print('L2 Regularization recal: ', recal)
print('L2 Regularization fscore: ', fscore)


The Best Lambda: 1
L2 Regularization prec:  [0.76025641 0.64262295]
L2 Regularization recal:  [0.57628766 0.80741504]
L2 Regularization fscore:  [0.65561083 0.71565495]


### Logistic Regression with TF-IDF
* Handles vocabulary selection
* Removes stopwords (built-in to function)
* Transforms raw text

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Implement Logistic Regression using Sklearn
print(f"Final Training Shape: {X_train_tfidf.shape}")
print(f"Final Testing Shape: {X_test_tfidf.shape}")

print("Fitting model...")
logreg.fit(X_train_onehot, y_train)

print("Predicting...")
y_pred_test = logreg.predict(X_test_onehot)
y_pred_train = logreg.predict(X_train_onehot)

# Evaluate
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred_test, average='binary')

print("\n--- Results ---")
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"f1 score: {f1:.4f}")

display(pd.DataFrame({"Predicted: Unhateful": confusion_matrix(y_test, y_pred_test)[:, 0], 
              "Predicted: Hateful": confusion_matrix(y_test, y_pred_test)[:, 1]},
             index=['Actual: Unhateful', 'Actual: Hateful']))

Final Training Shape: (6000, 1000)
Final Testing Shape: (4000, 1000)
Fitting model...
Predicting...

--- Results ---
Train Accuracy: 0.7143
Test Accuracy: 0.6885
Precision: 0.6426
Recall: 0.8074
f1 score: 0.7157


Unnamed: 0,Predicted: Unhateful,Predicted: Hateful
Actual: Unhateful,1186,872
Actual: Hateful,374,1568


### Logistic Regression with TF-IDF (Tuning lambda)

In [35]:
# Hyperparameter to consider
# Lambda for regularization in Logistic Regression
# max_iter

lambdas = [0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
kf = model_selection.KFold(n_splits=5)

scores = []

# loop through and try all lambda values in lambdas array
for lam in lambdas:
    l2logreg = linear_model.LogisticRegression(max_iter = 1000, penalty = 'l2', C=1/lam)
    l2logreg.fit(X_train_tfidf, y_train)
    scores.append(np.mean(model_selection.cross_val_score(l2logreg, X_train_tfidf, y_train, cv=kf, scoring='accuracy')))

best_lambda = lambdas[np.argmax(scores)]

print(f"The Best Lambda: {best_lambda}")
l2logreg = linear_model.LogisticRegression(penalty = 'l2', C=1/best_lambda)
l2logreg.fit(X_train_tfidf, y_train)
y_hat_l2logreg = l2logreg.predict(X_test_tfidf)
prec, recal, fscore, sup = precision_recall_fscore_support(y_test, y_hat_l2logreg)
print('L2 Regularization prec: ', prec)
print('L2 Regularization recal: ', recal)
print('L2 Regularization fscore: ', fscore)


The Best Lambda: 1
L2 Regularization prec:  [0.76957001 0.69716377]
L2 Regularization recal:  [0.67832847 0.78475798]
L2 Regularization fscore:  [0.72107438 0.73837209]
