In [15]:
import pandas as pd
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [16]:
# Make sure the NLTK data packages this notebook relies on are available locally.
nltk.download("stopwords")  # corpus read through stopwords.words('english')
nltk.download("punkt")      # tokenizer data; presumably what word_tokenize needs

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bartosz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bartosz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# Shared seed passed as `random_state` further down (train/test split and
# random forest) so that reruns give the same results.
SEED: int = 42

# Dataset preparation

In [18]:
# Relative path of the labelled spam/ham CSV.
DATASET_PATH = '../../data/spam_ham_dataset.csv'

# Read the whole file into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
df.head(n=5)

Unnamed: 0,id,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [19]:
# Built once at cell level so that preprocess_text can reuse them for every row.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text: str):
    """
    Normalise one e-mail into a space-separated string of word stems.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that appear in `stop_words` are dropped, and every
    remaining token is stemmed with the Porter stemmer.
    """
    kept_stems = []
    for token in word_tokenize(text.lower()):
        # Skip numbers, punctuation and English stop words.
        if not token.isalpha() or token in stop_words:
            continue
        kept_stems.append(ps.stem(token))
    return " ".join(kept_stems)


# Store the cleaned version of every message next to the raw text.
df['processed_text'] = df['text'].apply(preprocess_text)

In [20]:
# Preview the frame again, now including the `processed_text` column.
df.head(n=5)

Unnamed: 0,id,label,text,label_num,processed_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject enron methanol meter follow note gave ...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject hpl nom januari see attach file hplnol...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject neon retreat ho ho ho around wonder ti...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop window offic cheap main tren...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject indian spring deal book teco pvr reven...


## Vectorize to TF-IDF

The TF-IDF vectorization process assigns a weight to every word in each document of a collection. A word receives a high TF-IDF weight when it appears frequently in a specific document but infrequently across the entire collection, indicating its importance to that particular document. This captures the significance of words in representing a document's content while down-weighting common terms. The result is a numerical vector for each document, reflecting the importance of individual words in the context of the entire collection.

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Target vector: the numeric label column as a NumPy array.
y = df['label_num'].values

# Feature matrix: TF-IDF weights of the cleaned messages, converted to a dense array.
# NOTE(review): the vectorizer is fitted on the full corpus here, before the
# train/test split further down, so test documents contribute to the
# vocabulary and IDF statistics.
tfid = TfidfVectorizer()
X = (
    tfid
    .fit_transform(df['processed_text'])
    .toarray()
)

## Splitting the data into train / test datasets

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; SEED keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

## MODELS

## Common functions

In [23]:
input_text = """Subject: Call me as soon as you can!
A brand new offer for sale! Win 100000$!!
"""


def get_test_prediction(model, use_array=False, text=None):
    """
    Classify one raw e-mail with `model` and return a human-readable message.

    The message is passed through `preprocess_text` before vectorization,
    because `tfid` was fitted on the `processed_text` column. Feeding it raw
    text would look up unstemmed words in a vocabulary of stems.

    Parameters
    ----------
    model
        Fitted classifier exposing `predict`.
    use_array : bool, default False
        Convert the vectorizer output with `.toarray()` before predicting,
        for models that cannot consume the vectorizer's native output.
    text : str or None, default None
        Raw e-mail to classify. `None` falls back to the notebook-level
        `input_text` sample.

    Returns
    -------
    str
        "The predicted class for the input is: <label>".
    """
    raw_text = input_text if text is None else text
    # Apply the same cleaning that was used on the training data.
    input_tfidf = tfid.transform([preprocess_text(raw_text)])
    if use_array:
        input_tfidf = input_tfidf.toarray()
    predicted_class = model.predict(input_tfidf)
    return f"The predicted class for the input is: {predicted_class[0]}"

## LogisticRegression

Logistic Regression is a statistical model used for binary classification, predicting the probability of an instance belonging to a particular class. It employs the logistic function to map a linear combination of input features to a probability score, making it widely used in machine learning for tasks such as spam detection or medical diagnosis.

In [24]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression solved with liblinear.
lr = LogisticRegression(
    penalty='l2',
    C=1,
    solver='liblinear',
    max_iter=50,
)
lr.fit(X_train, y_train)

In [25]:
# Try the logistic regression on the sample e-mail.
print(get_test_prediction(model=lr))

The predicted class for the input is: 1


## Gaussian Naive Bayes

Naive Bayes is a probabilistic machine learning algorithm based on Bayes' theorem that assumes independence between features given the class label. It is commonly used for classification tasks and is particularly efficient with high-dimensional data, making it suitable for applications such as text categorization and spam filtering.

Gaussian Naive Bayes is a variant of the Naive Bayes algorithm specifically designed for continuous data. It assumes that the features follow a Gaussian (normal) distribution and estimates the mean and variance for each class, making it suitable for problems where the features are real-valued and can be modeled as continuous random variables.

In [26]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the training split.
naive_bayes = GaussianNB()
naive_bayes.fit(X=X_train, y=y_train)

In [27]:
# Try Gaussian Naive Bayes on the sample e-mail, passing it a dense array.
print(get_test_prediction(model=naive_bayes, use_array=True))

The predicted class for the input is: 0


## Random Forest


Random Forest is an ensemble machine learning algorithm that constructs multiple decision trees during training and outputs the mode of the classes (classification) or the average prediction (regression) of the individual trees. It enhances predictive accuracy and generalization by combining the results of diverse and independently trained trees.

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 500 trees; SEED makes the fitted forest repeatable.
random_forest = RandomForestClassifier(
    n_estimators=500,
    random_state=SEED,
)
random_forest.fit(X_train, y_train)

In [29]:
# Try the random forest on the sample e-mail.
print(get_test_prediction(model=random_forest))

The predicted class for the input is: 1


## XGBoost


XGBoost (Extreme Gradient Boosting) is a powerful and efficient gradient boosting algorithm that sequentially builds a series of decision trees to minimize a predefined loss function, providing high predictive performance. It incorporates regularization techniques and parallel computing to handle large datasets, making it widely used for both classification and regression tasks in machine learning.

In [30]:
import xgboost as xgb

# Boosted ensemble of shallow trees (max_depth=3) with 500 estimators.
xclf = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=500,
)
xclf.fit(X_train, y_train)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Try the XGBoost classifier on the sample e-mail.
print(get_test_prediction(model=xclf))

The predicted class for the input is: 0


## Naive Model (random)

A baseline introduced for comparison purposes: a basic random model that returns either 0 or 1 for each test sample.

In [None]:
import numpy as np


class NaiveModel:
    """
    Baseline classifier that ignores its input and guesses 0 or 1 at random.

    Parameters
    ----------
    random_state : int or None, default None
        Seed for a private random generator, making predictions repeatable.
        `None` draws from NumPy's global random state.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        # The np.random module itself exposes `choice`, so it serves as the
        # unseeded generator when no seed is given.
        if random_state is None:
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(random_state)

    def fit(self, X, y):
        """Accept training data for API compatibility; nothing is learned."""
        pass

    def predict(self, X):
        """
        Return one random label (0 or 1) per row of `X`.

        The row count is read from the first dimension of `X`, so any
        array-like works, including objects that expose `.shape` but do
        not support `len()`.
        """
        n_samples = np.shape(X)[0]
        return self._rng.choice([0, 1], size=n_samples)

    def __str__(self):
        return self.__class__.__name__


In [None]:
# The random baseline; fit is called the same way as for the other models.
naive_clf = NaiveModel()
naive_clf.fit(X=X_train, y=y_train)

In [None]:
# Try the random baseline on the sample e-mail, passing it a dense array.
print(get_test_prediction(model=naive_clf, use_array=True))

The predicted class for the input is: 0


# MODELS EVALUATION

In [None]:
from sklearn.metrics import r2_score, accuracy_score, f1_score

# Every fitted classifier, in the order it is reported below.
models = [
    lr,
    naive_bayes,
    random_forest,
    xclf,
    naive_clf,
]

In [None]:
# Score every model on the held-out split and print one summary line each.
# NOTE(review): R2 is a regression metric; for 0/1 labels accuracy and F1 are
# the figures to compare.
for model in models:
    y_pred = model.predict(X_test)
    # The text before the first "(" of the model's string form is used as its name.
    model_name = str(model).split('(')[0]
    r2 = r2_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{model_name}: R2: {r2} | Accuracy: {accuracy} | F1 score: {f1}")

LogisticRegression: R2: 0.9135875766487008 | Accuracy: 0.9826086956521739 | F1 score: 0.9696969696969697
GaussianNB: R2: 0.7695668710632021 | Accuracy: 0.9536231884057971 | F1 score: 0.9178082191780822
RandomForestClassifier: R2: 0.9183882668348841 | Accuracy: 0.9835748792270531 | F1 score: 0.9713322091062394
XGBClassifier: R2: 0.8703813649730512 | Accuracy: 0.9739130434782609 | F1 score: 0.9543147208121827
NaiveModel: R2: -1.4579533753258445 | Accuracy: 0.5053140096618357 | F1 score: 0.3801452784503632


## Save the best model

In [None]:
import joblib

# The forest only understands vectors produced by the fitted `tfid`, so store
# the vectorizer next to the model; without it the saved model cannot score
# new text.
joblib.dump(tfid, '../static/tfidf_vectorizer.joblib')
# Persist the random forest itself (kept last so the cell output is unchanged).
joblib.dump(random_forest, '../static/random_forest_model.joblib')