In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import kagglehub
import os
from sklearn.feature_extraction.text import TfidfVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [48]:
class BinaryLogisticRegression:
	def __init__(self, input_size):
		# small random weights; scalar bias
		self.weights = np.random.normal(0, 0.01, input_size)
		self.bias = 0.0

	def predict_prob(self, X):
		z = np.dot(X, self.weights) + self.bias
		return 1 / (1 + np.exp(-z))

	def predict(self, X):
		return 1 if self.predict_prob(X) >= 0.5 else 0

	def train(self, X, y_true, lr = 0.01):
		y_pred = self.predict_prob(X)
		loss = y_pred - y_true
		loss = (y_true * np.log(y_pred + 1e-8) + (1-y_true) * np.log(1 - y_pred + 1e-8))
		self.weights -= lr * loss * X
		self.bias -= lr * loss

class MultiLogisticRegression:
		def __init__(self, input_size):
			# Initialize an np array with normal distribution
			self.weights = np.random.normal(0, 0.01, input_size)
			self.bias = np.zeros(input_size)
   		#self.bias = 0

		def train(self, X, y, lr = 0.001):
			probs_predict = self.predict(X)
			loss = - np.dot(np.log(probs_predict), y)
   
			self.weights -= lr * (loss * X)
			self.bias -= lr * loss

		def predict(self, X):
			z = np.dot(X, self.weights) + self.bias
			normalizer = sum(z)
			probs = z / normalizer
   
			return probs

In [36]:
path = kagglehub.dataset_download("jackksoncsie/spam-email-dataset")

csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
csv_path = os.path.join(path, csv_files[0])

dataset = pd.read_csv(csv_path, names=['text', 'spam'])
print(dataset.shape)
dataset = dataset.dropna()
dataset = dataset.sample(frac=1, random_state=42)
print(dataset.head())

texts = dataset['text'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
dataset_vectors = X.toarray()
dataset['vector'] = list(dataset_vectors)

dataset["spam"] = dataset["spam"].apply(lambda x: 1 if x == '1' else 0)

dataset_x, dataset_y = dataset.shape
train_cut = int(0.8 * dataset_x)

train_data, test_data = dataset[:train_cut], dataset[train_cut:]

(5729, 2)
                                                   text spam
3562  Subject: re : my son  vince ,  i left a messag...    0
4119  Subject: financial maths course , part 2  vinc...    0
4482  Subject: june 21 - 22 retail electricity confe...    0
4211  Subject: re : enron default swaps  darrell ,  ...    0
5604  Subject: re : power question  steve ,  elena c...    0


In [40]:
classifier = BinaryLogisticRegression(dataset_vectors.shape[1])

for index, row in tqdm(train_data.iterrows(), total=len(train_data), desc="Training"):
	classifier.train(row['vector'], row['spam'])

correct_predictions = 0
for index, row in tqdm(test_data.iterrows(), total=len(test_data), desc="Testing"):
	predicted_label = classifier.predict(row['vector'])
	#print(f"Predicted: {predicted_label}, Actual: {row['label']}")
	if predicted_label == row['spam']:
		correct_predictions += 1
print(f"Accuracy: {correct_predictions / len(test_data) * 100:.2f}%")

Training: 100%|██████████| 4583/4583 [00:00<00:00, 6691.56it/s]
Testing: 100%|██████████| 1146/1146 [00:00<00:00, 10502.92it/s]

Accuracy: 77.84%





## Tests with Balanced Dataset

In [44]:
# Balancing the dataset over spam
spam_count = dataset['spam'].value_counts()
min_count = spam_count.min()
balanced_dataset = pd.concat([
	dataset[dataset['spam'] == label].sample(min_count, random_state=42) for label in spam_count.index 
])
print(balanced_dataset['spam'].value_counts())

dataset_x, dataset_y = balanced_dataset.shape
train_cut = int(0.8 * dataset_x)
train_data, test_data = balanced_dataset[:train_cut], balanced_dataset[train_cut:]

spam
0    1368
1    1368
Name: count, dtype: int64


In [49]:
classifier = BinaryLogisticRegression(dataset_vectors.shape[1])

for index, row in tqdm(train_data.iterrows(), total=len(train_data), desc="Training"):
	classifier.train(row['vector'], row['spam'])

correct_predictions = 0
for index, row in tqdm(test_data.iterrows(), total=len(test_data), desc="Testing"):
	predicted_label = classifier.predict(row['vector'])
	#print(f"Predicted: {predicted_label}, Actual: {row['label']}")
	if predicted_label == row['spam']:
		correct_predictions += 1
print(f"Accuracy: {correct_predictions / len(test_data) * 100:.2f}%")

Training: 100%|██████████| 2188/2188 [00:00<00:00, 5944.34it/s]
Testing: 100%|██████████| 548/548 [00:00<00:00, 10151.35it/s]

Accuracy: 100.00%



