In [33]:
import re
import numpy as np
import pandas as pd
import charset_normalizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [11]:
# Sniff the character encoding from the leading 10,000 bytes of the raw file
with open("/Users/ismathakit/Downloads/spam.csv", 'rb') as rawdata:
    head_bytes = rawdata.read(10000)
    result = charset_normalizer.detect(head_bytes)

# Show the detector's best guess (encoding, language, confidence)
print(result)

{'encoding': 'windows-1250', 'language': 'English', 'confidence': 0.9966}


In [12]:
# Load the CSV using the encoding reported by the detection step
df = pd.read_csv(
    '/Users/ismathakit/Downloads/spam.csv',
    encoding='windows-1250',
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [13]:
# Keep only the label (v1) and the message text (v2); the remaining "Unnamed"
# columns are empty in the rows previewed above. Selecting by name rather than
# by position does not depend on column order, and .copy() makes the result an
# independent frame so the later column assignments on `df` do not trigger
# pandas' SettingWithCopyWarning.
df = df[['v1', 'v2']].copy()
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
def preprocess_text(text):
    """Strip non-letter characters from a message and normalise its whitespace.

    Args:
        text: Raw message text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as an empty message.

    Returns:
        str: The text reduced to ASCII letters, with words separated by a
        single space and no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ""
    # Drop everything except ASCII letters and whitespace first ...
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    # ... and only then collapse whitespace. The order matters: deleting a
    # standalone token such as "2" or "-" leaves two adjacent spaces behind,
    # which an earlier whitespace pass would not clean up.
    return " ".join(letters_only.split())

# Store the cleaned version of every message next to the raw text
df['text2'] = df['v2'].map(preprocess_text)

In [17]:
# 2. Count the whitespace-separated words in each cleaned message and keep
#    the count as a new column
df['length'] = df['text2'].str.split().str.len()

In [21]:
# 3. Learn the distinct class labels in v1, then map each row to its integer code
label_encoder = LabelEncoder().fit(df['v1'])
df['label_encoded'] = label_encoder.transform(df['v1'])

In [22]:
# 4. CountVectorizer for Bag of Words (BoW) representation:
#    learn the vocabulary from the cleaned text and build the document-term
#    count matrix in a single call.
# NOTE(review): the vocabulary is fitted on every row, before the train/test
# split performed further down — confirm that is acceptable for the evaluation.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['text2'])

In [24]:
# 5. Hold out 20% of the rows for testing (seed fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow,
    df['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# 6. Fit a 5-nearest-neighbours classifier on the training portion
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score the held-out rows: predictions, overall accuracy, confusion matrix
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [25]:
# Preview the first five rows, now including the derived columns
df.head(n=5)

Unnamed: 0,v1,v2,text2,length,label_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,20,0
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,6,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in a wkly comp to win FA Cup final...,25,1
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,11,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,13,0


In [31]:
# Report the accuracy computed on the held-out test rows
print(accuracy)

0.9246636771300448
