In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
# NOTE(review): this silences every warning for the whole kernel session, which
# also hides numerical RuntimeWarnings (overflow, divide-by-zero) emitted by
# NumPy. Consider narrowing the filter to a specific category or module.
warnings.filterwarnings('ignore')

# Sigmoid function
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    The naive formula evaluates exp(-z), which overflows for large negative z.
    Here the exponential is only ever taken of a non-positive number, so it
    cannot overflow, and the two algebraically equivalent forms are selected
    by the sign of z.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); converted to a float NumPy array.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid, in [0, 1]. A scalar input yields a scalar;
        an array input yields an array of the same shape.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Unwrap 0-d results so scalar callers still get a scalar back.
    return result[()] if result.ndim == 0 else result

# Loss function (binary cross-entropy)
def loss_function(y, p, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    p : array-like
        Predicted probabilities of the positive class.
    eps : float, optional
        Probabilities are clipped to [eps, 1 - eps] before taking logs, so a
        prediction of exactly 0 or 1 yields a large finite loss instead of
        inf/nan.

    Returns
    -------
    float
        The average of -(y*log(p) + (1-y)*log(1-p)) over all samples.
    """
    # Guard against log(0), which would otherwise produce -inf (or nan once
    # multiplied by a zero label).
    p = np.clip(p, eps, 1 - eps)
    return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
 
# Gradient Descent
def gradient_descent(X, y, learning_rate, n_iterations):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (NumPy array or pandas DataFrame).
    y : array-like of length n_samples
        Binary labels (NumPy array, list or pandas Series).
    learning_rate : float
        Step size applied to every update.
    n_iterations : int
        Number of full-batch update steps.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1, 1)
        w[0] is the bias term and w[1:] are the per-feature weights.

    Raises
    ------
    ValueError
        If X and y do not contain the same number of samples.
    """
    # Convert once up front: this accepts plain arrays as well as pandas
    # objects and avoids re-converting / re-transposing a DataFrame on every
    # iteration.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    n_samples, n_features = X.shape
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X has {n_samples} samples but y has {y.shape[0]} labels"
        )

    w = np.zeros((n_features + 1, 1))
    for _ in range(n_iterations):
        # Residual between predicted probabilities and labels, shape (n_samples, 1).
        error = sigmoid(X @ w[1:] + w[0]) - y
        # Both updates use the same residual, i.e. one simultaneous step.
        w[1:] -= learning_rate * (X.T @ error) / n_samples
        w[0] -= learning_rate * error.mean()
    return w

# Prediction function
def predict(X, w, threshold=0.5):
    """Classify samples with a fitted logistic-regression weight vector.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (NumPy array or pandas DataFrame).
    w : numpy.ndarray
        Weights as returned by gradient_descent: w[0] is the bias and w[1:]
        are the per-feature weights.
    threshold : float, optional
        Probability at or above which a sample is labelled 1. Defaults to 0.5.

    Returns
    -------
    numpy.ndarray of int
        1 where the predicted probability is >= threshold, otherwise 0.
        The shape is that of X @ w[1:].
    """
    X = np.asarray(X, dtype=float)
    p = sigmoid(X @ w[1:] + w[0])
    return (p >= threshold).astype(int)



In [16]:
import pandas as pd

# Load the e-mail dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('emails.csv')

In [17]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [18]:
# Target labels: the 'Prediction' column of the loaded frame.
y = data['Prediction']

In [19]:

# Keep only the feature columns by removing the e-mail identifier and the label.
# `data` is rebound in place, so errors='ignore' keeps this cell safe to re-run
# once the two columns are already gone (it would otherwise raise KeyError).
data = data.drop(columns=['Email No.', 'Prediction'], errors='ignore')

In [20]:

# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and every metric computed from it, is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

In [21]:
# Fit the logistic-regression weights on the training split; w[0] is the bias
# and w[1:] holds one weight per feature column.
w = gradient_descent(x_train, y_train, learning_rate=0.01, n_iterations=1000)

In [26]:
# Classify the held-out test split and report the fraction of correct predictions.
y_pred = predict(x_test, w)
print("Accuracy score:", accuracy_score(y_test, y_pred))

Accuracy score: 0.8975845410628019


In [27]:
def split_email(email_input, columns=None):
    """Turn raw e-mail text into a bag-of-words count vector.

    The text is lower-cased and split on any whitespace; each token that
    exactly matches a vocabulary column increments that column's count.
    Tokens are matched verbatim, so a word with attached punctuation
    (e.g. "here!") does not match the column "here".

    Parameters
    ----------
    email_input : str
        Raw e-mail text.
    columns : iterable of str, optional
        Vocabulary, in feature order. Defaults to the columns of the
        module-level ``data`` frame.

    Returns
    -------
    numpy.ndarray of shape (1, n_columns)
        Float counts, one per vocabulary column, ready to pass to predict().
    """
    if columns is None:
        # Default to the feature columns of the module-level training frame.
        columns = data.columns
    column_names = list(columns)

    # Map each word to its column position (first occurrence wins).
    position = {}
    for idx, name in enumerate(column_names):
        position.setdefault(name, idx)

    # Size the vector from the vocabulary instead of assuming 3000 features.
    counts = np.zeros(len(column_names))
    # split() with no argument handles tabs, newlines and repeated spaces.
    for token in email_input.lower().split():
        idx = position.get(token)
        if idx is not None:
            counts[idx] += 1
    return counts.reshape(1, -1)

In [28]:
# Classify a sample message and report the verdict.
email = 'I think Donk is insanely talented for his age but he needs to be more mature'
y_predict_spam = predict(split_email(email), w)
# predict() returns a 2-D array; the single prediction sits at [0, 0].
print('Email Spam' if y_predict_spam[0, 0] == 1 else 'Email Not Spam')

Email Not Spam


In [29]:
# Classify a second sample message and report the verdict.
email_spam = 'Win one million dollars here'
y_predict_spam = predict(split_email(email_spam), w)
# predict() returns a 2-D array; the single prediction sits at [0, 0].
print('Email Spam' if y_predict_spam[0, 0] == 1 else 'Email Not Spam')

Email Spam


In [13]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true test labels against the model's predictions.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[719  22]
 [110 184]]


In [14]:
from sklearn.metrics import classification_report
 
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92       741
           1       0.89      0.63      0.74       294

    accuracy                           0.87      1035
   macro avg       0.88      0.80      0.83      1035
weighted avg       0.87      0.87      0.86      1035

