<h2><strong><u>Data Preparation</u></strong></h2>

In [4]:
import matplotlib.pyplot as plt            
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, learning_curve, validation_curve
from sklearn.metrics import ConfusionMatrixDisplay, recall_score, precision_score, PrecisionRecallDisplay, accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbLearn_Pipeline

In [5]:
# One-off conversion: read the raw tab-separated, header-less SMS file and
# re-save it as a comma-separated CSV with named "target" / "message" columns.
pd.read_csv(
    "./SMSSpamCollection.txt",
    header=None,
    sep="\t",
).to_csv(
    "sms_spam_collection.csv",
    header=["target", "message"],
    index=False,
)

In [6]:
# Load the converted CSV; its first row supplies the column names.
data = pd.read_csv(filepath_or_buffer="sms_spam_collection.csv")
# Bare trailing expression so the notebook renders the frame.
data

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
# Features are the raw message texts; labels are the ham/spam targets.
X = data["message"]
y = data["target"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4, test_size=0.2
)

# Show the size of each split, one output per object.
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4457,)

(1115,)

(4457,)

(1115,)

In [8]:
# Display message counts
hams = data[data['target'] == 'ham']
spams = data[data['target'] == 'spam']

# Count rows with len(): it tallies every row of each subset, including any
# row with a missing value, and does not need to group duplicate rows first.
hams_count = len(hams)
spams_count = len(spams)
total_messages = hams_count + spams_count  # ham + spam rows only
spam_fraction = spams_count / total_messages

print("Number of ham messages: ", hams_count)
print("Number of spam messages: ", spams_count)
# spam_fraction is a 0-1 ratio; it is scaled to a percentage for display.
print("Fraction of spam messages: {:.2f}%".format(spam_fraction * 100))

Number of ham messages:  4825
Number of spam messages:  747
Fraction of spam messages: 13.41%


In [9]:
def custom_tokenizer(text):
    """Turn a raw message string into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase the text, strip every character that is
    neither a word character nor whitespace, tokenize with NLTK's
    ``word_tokenize``, drop English stopwords, and lemmatize what remains
    with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        The message to tokenize.

    Returns
    -------
    list of str
        Lemmatized, non-stopword tokens in their original order. Empty when
        nothing survives the filtering.

    Notes
    -----
    The imports are kept inside the function so it stays self-contained
    (presumably so it can be shipped to worker processes together with a
    vectorizer -- confirm before hoisting them to the import cell). The NLTK
    tokenizer, stopword and WordNet resources must already be available
    locally.
    """
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    import re

    # A set gives constant-time membership checks for the stopword filter.
    en_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Normalize: lowercase, then remove punctuation.
    new_text = text.lower()
    new_text = re.sub(r"([^\w\s])", "", new_text)
    tokens = word_tokenize(new_text)

    # Build a new list instead of calling .remove() on the list being
    # iterated: removing during iteration shifts the remaining items left, so
    # the token right after each removed stopword is never examined and runs
    # of consecutive stopwords are only partially removed.
    kept_tokens = [token for token in tokens if token not in en_stopwords]

    return [lemmatizer.lemmatize(token) for token in kept_tokens]

In [10]:
# Per-class F1 scorers: each one treats a different label as the positive
# class, so both classes can be scored separately.
custom_f1_scorer_spam = make_scorer(
    f1_score,
    pos_label='spam',
    greater_is_better=True,
)
custom_f1_scorer_ham = make_scorer(
    f1_score,
    pos_label='ham',
    greater_is_better=True,
)

In [11]:
# Distinct label values present in each split, in order of first appearance.
train_labels, test_labels = y_train.unique(), y_test.unique()

In [12]:
# Display name -> scorer: either a built-in scoring string or one of the
# custom per-class F1 scorers.
clf_metrics = {
    "Accuracy": 'accuracy',
    "F1 Score (Spam)": custom_f1_scorer_spam,
    "F1 Score (Ham)": custom_f1_scorer_ham,
}
# Starts empty; presumably filled with validation-curve plot data by later
# cells -- confirm against its first use.
vc_plot_dict = dict()

In [13]:
# Starts empty; presumably filled with learning-curve results by later
# cells -- confirm against its first use.
lc_dict = dict()