In [8]:
import pandas as pd
import nltk

from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Dataset preparation

In [59]:
DATASET_PATH = 'data/spam_ham_dataset.csv'

df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,id,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [60]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text: str):
    """
    Preprocess the text by removing stopwords and stemming the words
    """
    tokens = word_tokenize(text.lower())
    tokens = [ps.stem(token) for token in tokens if token.isalpha() and token not in stop_words]
    return " ".join(tokens)

df['processed_text'] = df['text'].apply(preprocess_text)

In [61]:
df.head()

Unnamed: 0,id,label,text,label_num,processed_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter follow note gave ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom januari see attach file hplnol...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonder ti...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop window offic cheap main tren...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject indian spring deal book teco pvr reven...


## Vectorize to TFIDF

The TF-IDF vectorization process involves assigning weights to words in a document sequence. A high TF-IDF weight is assigned to a word that appears frequently in a specific document but infrequently across the entire collection, indicating its importance in that particular document. This method helps capture the significance of words in representing the content of documents while downweighting common terms. The result is a numerical vector representation for each document, reflecting the importance of individual words in the context of the entire collection.

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfid = TfidfVectorizer()
X = tfid.fit_transform(df['processed_text']).toarray()
y = df['label_num'].values

## Splitting the data into train / test datasets

In [77]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

## MODELS

## LogisticRegression

Logistic Regression is a statistical model used for binary classification, predicting the probability of an instance belonging to a particular class. It employs the logistic function to map a linear combination of input features to a probability score, making it widely used in machine learning for tasks such as spam detection or medical diagnosis.

In [78]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1,solver='liblinear',penalty='l2', max_iter=50)
lr.fit(X_train,y_train)

In [90]:
input_text = """Subject: Call me as soon as you can!
A brand new offer for sale!
"""
input_tfidf = tfid.transform([input_text])
predicted_class = lr.predict(input_tfidf)
print(f"The predicted class for the input is: {predicted_class[0]}")

The predicted class for the input is: 1


# MODELS EVALUATION

In [80]:
from sklearn.metrics import r2_score, accuracy_score

models = [lr]

In [81]:
for model in models:
  y_pred = model.predict(X_test)
  print(f"{model}: R2: {r2_score(y_test,y_pred)} | Accuracy: {accuracy_score(y_test,y_pred)}")

LogisticRegression(C=1, max_iter=50, solver='liblinear'): R2: 0.9135875766487008 | Accuracy: 0.9826086956521739
