Edited from https://www.kaggle.com/code/scratchpad/notebook40ff58d30a/edit

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!curl -L -o sms-spam-collection-dataset.zip https://www.kaggle.com/api/v1/datasets/download/uciml/sms-spam-collection-dataset

Data Cleaning

In [None]:
# Load the raw SMS dataset. The file is decoded as Latin-1 instead of the
# pandas default (UTF-8) -- presumably because it contains non-UTF-8 bytes.
data = pd.read_csv('spam.csv', encoding='latin')
# Preview the first rows to check which columns were read.
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
# Remove the three "Unnamed" columns in place; the cells below only work
# with v1 and v2.
data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
data.head()

In [None]:
# Give the two remaining columns descriptive names: v1 becomes `target`
# and v2 becomes `text`. The rename is applied in place.
data.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)
data.head()

This is just a mapping between the string labels and numerical labels.

Integer labels are more efficient to store and compare than strings.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels in `target` with integer codes. The fitted
# encoder is kept in `le` so the mapping can be looked up later.
le = LabelEncoder()
data['target'] = le.fit_transform(data['target'])
# The word-cloud cells further down treat code 0 as ham and code 1 as spam.
data.head()

In [None]:
data.isnull().sum()

In [None]:
data.duplicated().sum()

In [None]:
# Drop exact duplicate rows in place, keeping the first occurrence of each.
# The index is not reset here, so gaps may appear in the row labels.
data.drop_duplicates(keep='first', inplace=True)

In [None]:
data.shape

In [None]:
%pip install nltk
import nltk

In [None]:
# Resources needed by transform_text below: the stop-word corpus read via
# nltk.corpus.stopwords, and (presumably) the tokenizer data that
# nltk.word_tokenize relies on.
nltk.download('stopwords')
nltk.download("punkt_tab")

You can create a new column by applying a function to the values of another column.

Data Transformation

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

Simplify the text input by keeping only the significant terms, reduced to their stems.

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(nltk.corpus.stopwords.words('english'))


def transform_text(text):
    """Normalize a message into a space-separated string of word stems.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``.
    Tokens that are not purely alphanumeric, or that are English stop words
    such as "the" or "to", are dropped. Each remaining token is reduced to
    its Porter stem (e.g. "eating" -> "eat").

    Args:
        text: Raw message string.

    Returns:
        The surviving stems joined by single spaces; an empty string when
        no token survives the filtering.
    """
    # Loading the stop-word list once (instead of on every call) and using
    # a set keeps the per-token membership test O(1).
    stop_words = _english_stopwords()
    stems = [
        ps.stem(token)
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(stems)

In [None]:
transform_text('I am learning Python and I am learning Machine Learning')

In [None]:
# Try the transform on the first message of the dataset (selected by position).
transform_text(data['text'].iloc[0])

This is how you can create a new column from another column. It is useful when applying a cleaning or transformation function to the input data.

In [None]:
# Run transform_text on every message and store the result in a new column.
data['transformed_text'] = data['text'].apply(transform_text)
data.head()

In [None]:
from wordcloud import WordCloud
# Only configure the renderer here. The cells below generate their own
# clouds from class-specific text, so a cloud built from the whole corpus
# at this point would be computed and then discarded without being shown.
wc = WordCloud(width=800, height=400, max_words=100, background_color='white')

In [None]:
# Word cloud built from the ham messages only (target == 0).
ham_text = ' '.join(data.loc[data['target'] == 0, 'transformed_text'])
ham_wc = wc.generate(ham_text)
plt.figure(figsize=(10, 6))
plt.imshow(ham_wc)
plt.title('Most common words in ham messages')

In [None]:
# Word cloud built from the spam messages only (target == 1).
# A dedicated WordCloud instance is used: generate() returns the object it
# was called on, so reusing `wc` would leave `ham_wc` and `spam_wc` pointing
# at the same cloud.
spam_text = ' '.join(data[data['target'] == 1]['transformed_text'])
spam_wc = WordCloud(width=800, height=400, max_words=100, background_color='white').generate(spam_text)
plt.figure(figsize=(10, 6))
plt.imshow(spam_wc)
# Title added for parity with the ham plot.
plt.title('Most common words in spam messages')

Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# NOTE(review): `knn` is not referenced by any other cell visible here; the
# classifier that actually gets trained is the separate `model` instance
# created in the training cell.
knn = KNeighborsClassifier()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): `tfidf` is not used by the visible cells; the training cell
# creates its own instance named `vectorizer`.
tfidf = TfidfVectorizer() # default

In [None]:
# Model input is the cleaned text column; the label is the integer-encoded
# `target` column.
X = data["transformed_text"]
y = data["target"]

In [None]:
# Split the raw text first so the TF-IDF vocabulary and IDF weights are
# learned from the training messages only. Fitting the vectorizer on all of
# X would let test-set statistics leak into the training features.
text_train, text_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=2024,
)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
# The test messages are encoded with the vocabulary learned from training.
X_test = vectorizer.transform(text_test)

# choose highest freq label in neighbors
model = KNeighborsClassifier()
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Accuracy on the data the model was fitted on versus the held-out split.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(f"train accuracy {train_accuracy}")
print(f"test accuracy {test_accuracy}")
print(f"classifying {model}")

# Classification report for the predictions on the held-out split.
pred = model.predict(X_test)
report = classification_report(y_test, pred, zero_division=True)
print(report)