In [1]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
def preprocess(text):
    
    text=text.lower()
    text=re.sub(r'\d+','',text)
    text=text.translate(str.maketrans('','',string.punctuation))
    text=text.split()
    
    return text

In [3]:
def sigmoid(z):
    
    return 1 / (1 + np.exp(-z))

In [4]:
def initialize_weight(n_feature):
    
    weight=np.zeros(n_feature)
    bias=0
    
    return weight,bias

In [5]:
def compute_gradient(x,y,y_pred):
    
    m=x.shape[1]
    dw=(1/m)*np.dot(x.T,(y_pred-y))
    db=(1/m)*np.sum(y_pred-y)
    
    return dw,db

In [6]:
def compute_loss(y, y_pred):

    m = len(y)
    loss = -(1 / m) * np.sum(y * np.log(y_pred + 1e-9) + (1 - y) * np.log(1 - y_pred + 1e-9))
    
    return loss

In [7]:
def train(x,y,learning_rate=0.01,iteration=1000):
    
    n_feature=x.shape[1]
    weight,bias=initialize_weight(n_feature)
    
    for i in range(iteration):
        linear_model=np.dot(x,weight)+bias
        y_pred=sigmoid(linear_model)
        
        dw,db=compute_gradient(x,y,y_pred)
        
        weight-=learning_rate*dw
        bias-=learning_rate*db
        
        
    return weight,bias
        

In [8]:
def predict(x,weight,bias,threshold=0.3):
    
    linear_model=np.dot(x,weight)+bias
    y_pred=sigmoid(linear_model)
    
    return (y_pred>=threshold).astype(int)
    

In [None]:
df=pd.read_csv("C:/Users/kamal/Downloads/email.csv")

In [None]:
df["Category"]=df["Category"].map({'ham':0,'spam':1})
df["Message"]=df["Message"].apply(preprocess)
df["Message"] = df["Message"].apply(lambda x: " ".join(x) if isinstance(x, list) else x) #converting list into string

In [11]:
vectorizer=TfidfVectorizer(ngram_range=(1,2),stop_words='english',max_features=2000)
x=vectorizer.fit_transform(df["Message"]).toarray()
y=df["Category"].values

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
print("NaN values in X_train:", np.isnan(x_train).sum())
print("NaN values in y_train:", np.isnan(y_train).sum())

print("Infinite values in X_train:", np.isinf(x_train).sum())
print("Infinite values in y_train:", np.isinf(y_train).sum())


NaN values in X_train: 0
NaN values in y_train: 1
Infinite values in X_train: 0
Infinite values in y_train: 0


In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")  
y_train = imputer.fit_transform(y_train.reshape(-1, 1)) 

y_train = y_train.ravel()
y_train = y_train.astype(int)  



In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy=0.5, random_state=42)  
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [16]:
from collections import Counter
print("Class distribution in y_test:", Counter(y_train_resampled))


Class distribution in y_test: Counter({0: 3868, 1: 1934})


In [None]:
weight,bias=train(x_train_resampled,y_train_resampled,learning_rate=0.1,iteration=2000)

In [None]:
y_pred=predict(x_test,weight,bias)

In [None]:
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy:{accuracy:.4f}")


Accuracy:0.9480


In [None]:
your_message = ["Congratulations! You won a free iPhone. Click here to claim."]  
processed=preprocess(your_message[0])
vectorized=vectorizer.transform([" ".join(processed)])
prediction=predict(vectorized.toarray(),weight,bias)
print("spam" if prediction[0]==1 else "ham")

spam


In [None]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.99      0.95      0.97       958
         1.0       0.75      0.96      0.84       157

    accuracy                           0.95      1115
   macro avg       0.87      0.95      0.90      1115
weighted avg       0.96      0.95      0.95      1115

