In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix




In [4]:
# Load the mail dataset from a CSV file given by a relative path.
DATA_FILE = "mail_l7_dataset.csv"
df = pd.read_csv(DATA_FILE)

In [5]:
# Count the missing values in each column before any preprocessing.
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [6]:
# Encode the text labels as integers: spam -> 0, ham -> 1.
# The whole column is mapped in one pass. Overwriting it label by label leaves
# a mixed int/str column, and once every value is an int the `.str` accessor
# raises, so the cell could not be run a second time. The "0"/"1" entries make
# this cell safe to re-run on an already-encoded column.
label_codes = {"spam": 0, "ham": 1, "0": 0, "1": 1}
normalized_labels = df["Category"].astype(str).str.strip().str.lower()

# Fail loudly on anything that is not a known label instead of letting it
# slip through to the integer cast performed later.
unknown_labels = sorted(set(normalized_labels) - set(label_codes))
if unknown_labels:
    raise ValueError(f"Unexpected Category values: {unknown_labels}")

df["Category"] = normalized_labels.map(label_codes).astype(int)

In [7]:
# Preview the first rows to check the label encoding (0 = spam, 1 = ham).
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Features are the message text (as str); targets are the integer label codes.
X, Y = df["Message"].astype(str), df["Category"].astype(int)

In [9]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [10]:
# Print the shape of every split to confirm the train/test split was successful.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(f"{split_name} shape:", split_data.shape)

X_train shape: (4457,)
X_test shape: (1115,)
Y_train shape: (4457,)
Y_test shape: (1115,)


In [11]:
# Text transformation: turn the messages into TF-IDF feature vectors.
# The vectorizer is fitted on the training messages only, and the fitted
# vectorizer is then applied unchanged to the test messages.
tfid = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)
X_train_features = tfid.fit_transform(X_train)
X_test_features = tfid.transform(X_test)

In [12]:
# Fit logistic regression on the TF-IDF training features, then label the test set.
lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_features, Y_train)
lr_predict = lr.predict(X_test_features)


In [13]:
# Print the labels logistic regression predicted for the test set (0 = spam, 1 = ham).
print(lr_predict)

[1 1 1 ... 1 1 1]


In [14]:
# Train a 200-tree random forest on the same TF-IDF features and label the test set.
rf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train_features, Y_train)
rf_predict = rf.predict(X_test_features)

In [15]:
# Print the labels the random forest predicted for the test set (0 = spam, 1 = ham).
print(rf_predict)

[1 1 1 ... 1 1 1]


In [16]:
# Train a multinomial naive Bayes classifier on the TF-IDF features.
# MultinomialNB accepts scipy sparse matrices directly, so the features are
# passed as-is; densifying them with .toarray() only inflates memory use.
# NOTE: the name `gnb` is kept because later cells refer to it, even though
# the estimator is MultinomialNB rather than GaussianNB.
gnb = MultinomialNB()
gnb.fit(X_train_features, Y_train)
gnb_predict = gnb.predict(X_test_features)


In [17]:
# Print the labels naive Bayes predicted for the test set (0 = spam, 1 = ham).
print(gnb_predict)

[1 1 1 ... 1 1 1]


In [18]:
#Evaluation metric
def evaluate_matrix(name, y_true, y_predict, pos_label=0):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Parameters
    ----------
    name : str
        Model name shown in the report header.
    y_true : array-like
        True labels.
    y_predict : array-like
        Labels predicted by the model, aligned with ``y_true``.
    pos_label : int, default 0
        Label treated as the positive class for precision, recall and F1.
        In this notebook 0 encodes spam and 1 encodes ham.

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    acc = accuracy_score(y_true, y_predict)
    prec = precision_score(y_true, y_predict, pos_label=pos_label)
    rec = recall_score(y_true, y_predict, pos_label=pos_label)
    f1 = f1_score(y_true, y_predict, pos_label=pos_label)

    # Build the annotation from pos_label so the report never claims
    # "spam=0" when the caller evaluated a different positive class.
    class_names = {0: "spam", 1: "ham"}
    positive_name = class_names.get(pos_label, "class")
    positive_note = f"(positive = {positive_name}={pos_label})"

    print(f"\n{name} Performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}  {positive_note}")
    print(f"  Recall   : {rec:.3f}  {positive_note}")
    print(f"  F1-Score : {f1:.3f}  {positive_note}")

In [19]:
# Report the same metrics for every model on the held-out test labels.
# One loop replaces the copy-pasted calls; the naive Bayes header is spelled correctly.
for model_name, model_predictions in (
    ("Logistic Regression", lr_predict),
    ("Random Forest", rf_predict),
    ("Naive Bayes", gnb_predict),
):
    evaluate_matrix(model_name, Y_test, model_predictions)


Logistic Regression Performance:
  Accuracy : 0.968
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.758  (positive = spam=0)
  F1-Score : 0.863  (positive = spam=0)

Random Forest Performance:
  Accuracy : 0.983
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.872  (positive = spam=0)
  F1-Score : 0.932  (positive = spam=0)

Navie Bayes Performance:
  Accuracy : 0.977
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.826  (positive = spam=0)
  F1-Score : 0.904  (positive = spam=0)


In [20]:
# Show the confusion matrix
def print_confmat(name, Y_true, Y_predict):
    """Print a labelled confusion matrix for one model's predictions.

    Rows are the actual classes and columns the predicted classes, both in
    the order ham (1) then spam (0), as fixed by ``labels=[1, 0]``.

    Parameters
    ----------
    name : str
        Model name shown in front of the matrix.
    Y_true : array-like
        True labels (0 = spam, 1 = ham).
    Y_predict : array-like
        Labels predicted by the model, aligned with ``Y_true``.

    Returns
    -------
    None
        The matrix is printed; nothing is returned.
    """
    cm = confusion_matrix(Y_true, Y_predict, labels=[1, 0])
    # The class encoded as 1 is ham, so the axis labels read "Ham".
    cm_df = pd.DataFrame(
        cm,
        index=["Actual Ham(1)", "Actual Spam(0)"],
        columns=["Predicted Ham(1)", "Predicted Spam(0)"],
    )
    print(f"{name} -Confusion Matrix : \n{cm_df}")

# Print the confusion matrix of every model, each under its correctly spelled name.
for model_name, model_predictions in (
    ("Logistic Regression", lr_predict),
    ("Random Forest", rf_predict),
    ("Naive Bayes", gnb_predict),
):
    print_confmat(model_name, Y_test, model_predictions)

Logostics Regression -Confusion Matrix : 
                Predicted Harm(1)  Predicted Spam(0)
Actual Harm(1)                966                  0
Actual Spam(0)                 36                113
Random RegreForestssion -Confusion Matrix : 
                Predicted Harm(1)  Predicted Spam(0)
Actual Harm(1)                966                  0
Actual Spam(0)                 19                130
Navie Bayes  -Confusion Matrix : 
                Predicted Harm(1)  Predicted Spam(0)
Actual Harm(1)                966                  0
Actual Spam(0)                 26                123


In [29]:
# Single-message sanity check (like L5 single-row check):
# pick one held-out message together with its true label.
i = 14  # position within X_test / Y_test; change it to inspect a different message
sample_text, true_label = X_test.iloc[i], Y_test.iloc[i]

In [30]:
# Predict with three models.
# The message is vectorized once and the resulting sparse row is shared by all
# three models; each of them accepts sparse input, so no .toarray() is needed.
sample_features = tfid.transform([sample_text])
lr_predict_one = int(lr.predict(sample_features)[0])
rf_predict_one = int(rf.predict(sample_features)[0])
gnb_predict_one = int(gnb.predict(sample_features)[0])

# For Readable Output 
def lab2str(r):
    """Return the display string for a label code: 0 is spam, any other value is ham."""
    if r == 0:
        return "Spam (0)"
    return "Ham (1)"

# Report the message preview, its true label and each model's verdict.
print("\n=== SINGLE MESSAGE CHECK ===")
# Messages longer than 160 characters are cut there and marked with "...".
if len(sample_text) > 160:
    snippet = sample_text[:160] + "..."
else:
    snippet = sample_text
print(f"Text snippet: {snippet}")
print(f"Actual      : {lab2str(true_label)}")
for model_tag, predicted_code in (
    ("LR", lr_predict_one),
    ("RF", rf_predict_one),
    ("NB", gnb_predict_one),
):
    print(f"{model_tag} Prediction     : {lab2str(predicted_code)}")


=== SINGLE MESSAGE CHECK ===
Text snippet: FREE RINGTONE text FIRST to 87131 for a poly or text GET to 87131 for a true tone! Help? 0845 2814032 16 after 1st free, tones are 3x£150pw to e£nd txt stop
Actual      : Spam (0)
LR Prediction     : Spam (0)
RF Prediction     : Spam (0)
NB Prediction     : Spam (0)
