In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the e-mail corpus: one text column (`email`) and one integer column (`label`).
df = pd.read_csv('/content/spam_or_not_spam.csv')

print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" line to the output.
df.info()


                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   object
 1   label   3000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB
None


In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
email,1
label,0


In [None]:
# Count rows that repeat an earlier row across all columns
# (keep='first' is the default: the first occurrence is not counted).
df.duplicated(keep='first').sum()

np.int64(127)

In [None]:
# Remove exact duplicate rows and renumber the index 0..n-1 in a single chain.
df = df.drop_duplicates().reset_index(drop=True)


In [None]:
# Re-check after de-duplication: flag repeated rows, then count the flags.
remaining_duplicates = df.duplicated()
remaining_duplicates.sum()

np.int64(0)

In [None]:
# Drop incomplete rows *before* casting to str: astype(str) turns a missing
# value into the literal text 'nan', which dropna() no longer recognises, so
# casting first would keep the empty e-mail in the data as the document "nan".
df = df.dropna()
# .assign builds a new frame instead of writing into the result of dropna().
df = df.assign(email=df['email'].astype(str))

In [None]:
# Per-column count of missing values after the cleaning step.
null_counts = df.isna().sum()
null_counts

Unnamed: 0,0
email,0
label,0


In [None]:

# Text labels (e.g. spam/ham) are mapped to integers; numeric labels are left as-is.
labels_are_text = df['label'].dtype == object
if labels_are_text:
    # LabelEncoder numbers the sorted class names, so ham --> 0 and spam --> 1.
    le = LabelEncoder().fit(df['label'])
    df['label'] = le.transform(df['label'])

In [None]:
# -----------------------------
# 4. Train-Test Split
# -----------------------------
# Features are the raw e-mail text; the target is the label column.
X, y = df['email'], df['label']

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Vectoriser settings, gathered in one place so they are easy to tune.
vectorizer_settings = {
    'stop_words': 'english',
    'max_features': 1000,    # vocabulary size reduced a lot
    'ngram_range': (1, 1),   # most important setting: unigrams only, no bigrams
    'min_df': 5,             # ignore words that occur in fewer than 5 documents
    'max_df': 0.7,           # ignore words that occur in more than 70% of documents
}
tfidf = TfidfVectorizer(**vectorizer_settings)

# Learn vocabulary and IDF weights from the training text only, then apply the
# fitted vectoriser to the test text.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:

# The TF-IDF matrices are already built in the previous cell; fitting the
# vectoriser a second time here only repeated identical work. Instead, verify
# that the matrices line up with the labels and share one feature space.
assert X_train_tfidf.shape[0] == len(y_train), "train rows and labels differ"
assert X_test_tfidf.shape[0] == len(y_test), "test rows and labels differ"
assert X_train_tfidf.shape[1] == X_test_tfidf.shape[1], "train/test feature counts differ"

In [None]:
# Multinomial Naive Bayes with increased additive smoothing (alpha=3.0).
# fit() returns the estimator itself, so construction and training can be chained.
nb = MultinomialNB(alpha=3.0).fit(X_train_tfidf, y_train)

# Score the held-out e-mails and report overall and per-class metrics.
nb_pred = nb.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, nb_pred)
print("Naive Bayes Accuracy:", nb_accuracy)
print(classification_report(y_test, nb_pred))


Naive Bayes Accuracy: 0.9669565217391304
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       489
           1       0.97      0.80      0.88        86

    accuracy                           0.97       575
   macro avg       0.97      0.90      0.93       575
weighted avg       0.97      0.97      0.97       575



In [None]:
# Logistic Regression on the TF-IDF features.
#
# The previous setting C=0.05 regularised so strongly that, on this imbalanced
# data, the model predicted class 0 for every test e-mail (recall 0.00 for
# class 1 and undefined-precision warnings in the report). Two changes:
#   * C=1.0 (scikit-learn's default) relaxes the regularisation so the
#     coefficients can move away from zero;
#   * class_weight='balanced' re-weights samples inversely to class frequency
#     so the minority class is not drowned out by the majority class.
# The L2 penalty is scikit-learn's default and is therefore not spelled out.
lr = LogisticRegression(
    max_iter=2000,
    C=1.0,
    class_weight='balanced',
)

lr.fit(X_train_tfidf, y_train)
lr_pred = lr.predict(X_test_tfidf)

print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))


Logistic Regression Accuracy: 0.8504347826086956
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       489
           1       0.00      0.00      0.00        86

    accuracy                           0.85       575
   macro avg       0.43      0.50      0.46       575
weighted avg       0.72      0.85      0.78       575



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:

# -----------------------------
# 8. Confusion Matrix Display
# -----------------------------
# One (heading, predictions) pair per model; rows of each printed matrix are
# the true classes and columns are the predicted classes.
matrix_reports = [
    ("Confusion Matrix (Naive Bayes):", nb_pred),
    ("\nConfusion Matrix (Logistic Regression):", lr_pred),
]
for heading, predictions in matrix_reports:
    print(heading)
    print(confusion_matrix(y_test, predictions))


Confusion Matrix (Naive Bayes):
[[487   2]
 [ 17  69]]

Confusion Matrix (Logistic Regression):
[[489   0]
 [ 86   0]]
