In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/classify_emails/emails.csv',sep=';' )

In [7]:
import nltk
from  nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
def preprocess_text(text):

  tokens = word_tokenize(text.lower())
  tokens = [token for token in tokens if token not in string.punctuation]

  stop_words = stopwords.words('english')
  tokens = [token for token in tokens if token not in stop_words]

  stemmer = PorterStemmer()
  tokens = [stemmer.stem(token) for token in tokens]

  preprocessed_text = ' '.join(tokens)
  preprocessed_text = re.sub(r'\d+', '', preprocessed_text)
  preprocessed_text = re.sub(r'http\S+|www\S+', '', preprocessed_text)

  return preprocessed_text

In [8]:
df['processed_Message']=df['Message'].apply(preprocess_text)
df

Unnamed: 0,Spam,Message,processed_Message
0,0,Please call me at 8,pleas call
1,1,Free money is available for you,free money avail
2,0,I study he studies they are students I studied...,studi studi student studi yesterday
3,1,I am working at office now to 9 evening,work offic even
4,0,U dun say so early hor... U c already then say...,u dun say earli hor ... u c alreadi say ...
...,...,...,...
5222,0,"It‘s reassuring, in this crazy world.",‘ reassur crazi world
5223,0,Oh... Okie lor...We go on sat...,oh ... oki lor ... go sat ...
5224,1,You are awarded a SiPix Digital Camera! call 0...,award sipix digit camera call landlin deliver...
5225,0,"Hey chief, can you give me a bell when you get...",hey chief give bell get need talk royal visit ...


In [9]:
df[df['Spam']==1].count()

Unnamed: 0,0
Spam,675
Message,675
processed_Message,675


In [10]:
spam_df = df[df['Spam']==1]
not_spam_df = df[df['Spam']==0]

max_class_count = len(not_spam_df)
resampled_spam_df = spam_df.sample(n=max_class_count, replace=True, random_state=42)

balanced_df = pd.concat([not_spam_df, resampled_spam_df])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

max_features = 100
vectorizer = TfidfVectorizer(max_features=max_features)

corpus = balanced_df['processed_Message']

vectors = vectorizer.fit_transform(corpus)

In [14]:
from sklearn.model_selection import train_test_split

x = vectors
y = balanced_df['Spam']

x = x.toarray()

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [15]:
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x_train, y_train, epochs=10, batch_size=32)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8200 - loss: 0.5308
Epoch 2/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9415 - loss: 0.1599
Epoch 3/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9558 - loss: 0.1240
Epoch 4/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9604 - loss: 0.1168
Epoch 5/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9656 - loss: 0.1035
Epoch 6/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9694 - loss: 0.0907
Epoch 7/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9767 - loss: 0.0790
Epoch 8/10
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9716 - loss: 0.0820
Epoch 9/10
[1m214/214[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7a57af4fa790>

In [19]:
import numpy as np
prob = model.predict(x_test)
y_pred = np.round(prob)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(accuracy*100,2))

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 97.23


In [22]:
message="call to get free prize one million dollars"

processed_message=preprocess_text(message)
vector=vectorizer.transform([processed_message])
vector_dense=vector.toarray()

prob = model.predict(vector_dense)

y_pred = np.round(prob)
if y_pred>0.5:
    print("Spam")
else:
    print("Not Spam")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Spam


In [23]:
message2 = "there is an urgent meeting today at 8 pm at the head office"

processed_massage = preprocess_text (message2)

vectors = vectorizer.transform([processed_massage])

vectors.toarray()

prob = model.predict(vectors)

y_pred = np.round(prob)
if y_pred > 0.5:
  print ("spam")
else:
  print ("not spam")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step
not spam
