In [1]:
import pandas as pd
df = pd.read_csv(r"C:\Users\ynikh\OneDrive\python practice\spam.csv", encoding='latin-1')
df=df[['v1', 'v2']].rename(columns={'v1':'label', 'v2':'message'})
print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [2]:
import string
import re
def text_cleaning(text):
    text = text.lower()
    text = re.sub(r'\d+',' ',text)
    text = re.sub(r'[^\w\s!?$]',' ',text)
    text = text.strip()
    return text
df['cleaned_msg'] = df['message'].apply(text_cleaning)
df.head()

Unnamed: 0,label,message,cleaned_msg
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...


In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label']) 
df[['label', 'label_encoded']].head()

Unnamed: 0,label,label_encoded
0,ham,0
1,ham,0
2,spam,1
3,ham,0
4,ham,0


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X=tfidf.fit_transform(df['cleaned_msg'])
y=df['label_encoded']
print("TF-IDF matrix shape:", X.shape)

TF-IDF matrix shape: (5572, 3000)


In [5]:
from sklearn.model_selection import train_test_split
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_train, y_train)

print("Before:", np.bincount(y_train))
print("After:", np.bincount(y_train_bal))

Before: [3859  598]
After: [3859 3859]


In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine', weights='distance')
knn.fit(X_train_bal, y_train_bal)
y_test_pred = knn.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
cm_df = pd.DataFrame(cm,
                     index=["Actual_Ham", "Actual_Spam"],
                     columns=["Pred_Ham", "Pred_Spam"])

print(cm_df)
test_msgs = [
    "Congratulations! You won a $1000 Walmart gift card. Click to claim now!",
    "Your OTP is 48291. Do not share with anyone.",
    "Limited time offer!!! Get 70% OFF on all electronics. BUY NOW!!!",
    "Hey, are we still meeting at 4 PM today for the project discussion?",
    "Urgent: We detected unusual login activity on your bank account. Verify immediately.",
    "Mom said she will come to pick you up around 6. Call her when free.",
    "FREE entry into our contest. Reply YES to participate!",
    "Thanks for your payment. Your order has been shipped and will arrive Monday.",
    "Final notice: Your car insurance has expired. Renew now before penalty.",
    "Bro send yesterday's notes please, need to study tonight."
]

# Clean and transform
cleaned = [text_cleaning(msg) for msg in test_msgs]
msg_vec = tfidf.transform(cleaned)

pred_labels = knn.predict(msg_vec)
print("\nPredictions:")
for msg, pred in zip(test_msgs, pred_labels):
    print(f"{msg[:60]:60s} → {'SPAM' if pred==1 else 'HAM'}")

Accuracy: 0.9551569506726457
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.97       966
           1       0.79      0.91      0.84       149

    accuracy                           0.96      1115
   macro avg       0.89      0.93      0.91      1115
weighted avg       0.96      0.96      0.96      1115

             Pred_Ham  Pred_Spam
Actual_Ham        930         36
Actual_Spam        14        135

Predictions:
Congratulations! You won a $1000 Walmart gift card. Click to → SPAM
Your OTP is 48291. Do not share with anyone.                 → HAM
Limited time offer!!! Get 70% OFF on all electronics. BUY NO → HAM
Hey, are we still meeting at 4 PM today for the project disc → HAM
Urgent: We detected unusual login activity on your bank acco → SPAM
Mom said she will come to pick you up around 6. Call her whe → HAM
FREE entry into our contest. Reply YES to participate!       → HAM
Thanks for your payment. Your order 

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
scorer = make_scorer(f1_score, pos_label=1)  # focus on spam class

param_grid = {
    'n_neighbors': [3,5,7,9,11,15],
    'weights': ['uniform', 'distance'],
    'metric': ['cosine', 'euclidean']
}

knn = KNeighborsClassifier()

grid = GridSearchCV(
    knn,
    param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV f1(spam):", grid.best_score_)

best_knn = grid.best_estimator_
y_test_pred = best_knn.predict(X_test)
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
cm_df = pd.DataFrame(cm,
                     index=["Actual_Ham", "Actual_Spam"],
                     columns=["Pred_Ham", "Pred_Spam"])

print(cm_df)


Best params: {'metric': 'cosine', 'n_neighbors': 3, 'weights': 'distance'}
Best CV f1(spam): 0.8835984368480556
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       966
           1       0.86      0.79      0.83       149

    accuracy                           0.96      1115
   macro avg       0.91      0.89      0.90      1115
weighted avg       0.95      0.96      0.95      1115

             Pred_Ham  Pred_Spam
Actual_Ham        947         19
Actual_Spam        31        118
