In [1]:
# Machine Learning Assignment - Exercise 02
# Prof. Klaus Berberich
# Students:
# Aaron Dassen (3871517)
# Jan Beckhausen (5000902)
# Germain Girndt (3872203)

import pandas as pd

# load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']] # considering only label and sms text columns
df.columns = ['classification', 'message']



In [2]:
# Task a - Cleaning up
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def remove_stop_words(text):
    text = text.split() # convert to list of words
    text = [word for word in text if word not in ENGLISH_STOP_WORDS] # stemming and removing stopwords
    text = ' '.join(text) # joining words back to form the cleaned text
    return text

def clean(text):
    text = re.sub('[^a-zA-Z\d]', ' ', text) # keeping only alphanumeric characters
    text = text.lower() # lowercasing
    text = remove_stop_words(text) # removing stop words as recommended in slide 57

    return text


# apply the function on message column
df['message'] = df['message'].apply(clean)


In [3]:
# Task b - Train Naïve Bayes - Compute Precision, Recall and F1 Score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['classification'], test_size=0.2, random_state=2023)

# convert text data into numerical data (bag of words)
cv = CountVectorizer()
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

# train the model
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# make predictions
y_pred = classifier.predict(X_test)

# compute precision, recall and f1 score
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
f1 = f1_score(y_test, y_pred, pos_label='spam')

print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)



Precision:  0.9647887323943662
Recall:  0.9013157894736842
F1 Score:  0.9319727891156462


In [4]:
# Task C - Calculate class priors, which means, probability of a class in the data, independent of the features
import numpy as np

# class priors
class_priors = np.exp(classifier.class_log_prior_)
print(f"Class Priors: Spam {class_priors[0]} | Ham: {class_priors[1]}")

# top 10 words indicative of spam and ham
spam_feature_log_probs = classifier.feature_log_prob_[0, :]
ham_feature_log_probs = classifier.feature_log_prob_[1, :]

# getting top 10 words indexes
top_spam_words_indexes = np.argsort(spam_feature_log_probs)[-10:][::-1]
top_ham_words_indexes = np.argsort(ham_feature_log_probs)[-10:][::-1]



print("\nTop 10 words indicative of spam: ")
print("Position\tWord\tIndex\tLog Probability")
for position, index in enumerate(top_spam_words_indexes):
    corrected_position = position + 1
    word = cv.get_feature_names_out()[index]
    probability = spam_feature_log_probs[index]
    print(f"{corrected_position}\t\t{word}\t{index}\t{probability}")
    
print("\nTop 10 words indicative of ham: ")
print("Position\tWord\tIndex\tLog Probability")
for position, index in enumerate(top_ham_words_indexes):
    corrected_position = position + 1
    word = cv.get_feature_names_out()[index]
    probability = ham_feature_log_probs[index]
    print(f"{corrected_position}\t\t{word}\t{index}\t{probability}")
    


Class Priors: Spam 0.8665021314785729 | Ham: 0.1334978685214271

Top 10 words indicative of spam: 
Position	Word	Index	Log Probability
1		gt	3098	-4.88168549035242
2		lt	4046	-4.889467630794475
3		just	3681	-4.9750595611298785
4		ok	4675	-4.983606621708337
5		ll	3962	-5.045573345457536
6		like	3915	-5.136327708726001
7		know	3766	-5.166786916210709
8		ur	6868	-5.166786916210709
9		come	1820	-5.198203112444088
10		good	3026	-5.225158922432616

Top 10 words indicative of ham: 
Position	Word	Index	Log Probability
1		free	2838	-4.588600922224773
2		txt	6765	-4.810283552899308
3		ur	6868	-5.011765607432339
4		mobile	4323	-5.115143961886173
5		text	6495	-5.125094292739341
6		stop	6202	-5.135144628592843
7		claim	1740	-5.219404972210583
8		www	7265	-5.335815324054994
9		reply	5451	-5.360816626260411
10		prize	5161	-5.412776365191123


In [5]:
# Task d - kNN, SMS data points encoding, distance measure, precision, recall and F1

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score


# Training a k-NN model with k = 7
knn = KNeighborsClassifier(n_neighbors=7, metric='euclidean') # Using Euclidean distance as distance beasure
knn.fit(X_train, y_train) # SMS data points already encoded as bag of words for kNN

# Making predictions
y_predicted = knn.predict(X_test)

# Computing Precision, Recall and F1 Score
precision = precision_score(y_true=y_test, y_pred=y_predicted, pos_label='spam')
recall = recall_score(y_true=y_test, y_pred=y_predicted, pos_label='spam')
f1 = f1_score(y_true=y_test, y_pred=y_predicted, pos_label='spam')


print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)


Precision:  1.0
Recall:  0.28289473684210525
F1 Score:  0.441025641025641
