<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/email_spam_classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [82]:
# Install opendatasets library
!pip install opendatasets

In [None]:
# Import necessary libraries and functions
import opendatasets as od
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from tensorflow import keras
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow.keras.layers import Dense,Embedding,Bidirectional,GRU
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.metrics import Precision,Recall
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Fetch the Kaggle dataset into the working directory
dataset_url = "https://www.kaggle.com/datasets/purusinghvi/email-spam-classification-dataset"
od.download(dataset_url)

In [None]:
# Load the downloaded CSV into a pandas DataFrame and preview its first rows
path_to_data = "/content/email-spam-classification-dataset/combined_data.csv"
data = pd.read_csv(filepath_or_buffer=path_to_data)
data.head(n=5)

In [None]:
# Report how many rows and columns the data has
n_rows, n_columns = data.shape
print(f"This data contains: {n_rows} rows and {n_columns} columns.")

In [None]:
# Count how many rows carry each value of the 'label' column
label_counts = data['label'].value_counts()
label_counts

In [None]:
# Count missing values per column (isna is the same check as isnull)
data.isna().sum()

In [None]:
# Download the Punkt sentence tokenizer models used by word_tokenize
nltk.download('punkt')
# NOTE(review): recent NLTK releases (3.9+) look up the 'punkt_tab' resource
# instead of 'punkt'; fetching both keeps word_tokenize working on either version.
nltk.download('punkt_tab')
# Download the stopword lists
nltk.download('stopwords')

In [None]:
# Create the Porter stemmer and load NLTK's English stopword list;
# both are module-level objects read by clean_text.
stemmer = PorterStemmer()
english_stopwords = stopwords.words("english")

In [9]:
# define cleaning function
def clean_text(text):
  """Normalize a raw text into a space-separated string of stemmed tokens.

  Steps: lower-case, replace every character outside a-z with a space,
  tokenize with word_tokenize, drop English stopwords, then Porter-stem the
  remaining tokens.

  Stopwords are removed BEFORE stemming: the stopword list holds unstemmed
  words, and the Porter stemmer rewrites many of them (e.g. "this" -> "thi",
  "was" -> "wa"), so filtering after stemming would let them slip through.

  Args:
    text: the raw text. Any non-string value (e.g. None, or the float NaN
      pandas uses for missing cells) is treated as an empty text.

  Returns:
    The cleaned text as a single string; '' when nothing is left.
  """
  # Missing / non-text values contain no words
  if not isinstance(text, str):
    return ''

  # lower-case, then blank out everything that is not a letter a-z
  text = re.sub(r'[^a-z]', ' ', text.lower())

  # split into word tokens
  tokens = word_tokenize(text)

  # drop stopwords (set membership is O(1) per token)
  stop_set = set(english_stopwords)
  kept = [word for word in tokens if word not in stop_set]

  # stem what is left and rebuild the text
  return ' '.join(stemmer.stem(word) for word in kept)

In [10]:
# Define count_words function to find the count of tokens in a sentence
def count_words(text):
  """Return the number of whitespace-separated tokens in `text`."""
  tokens = text.split()
  return len(tokens)

In [13]:
# Store the token count of every raw text in a new 'text_length' column
data['text_length'] = data['text'].map(count_words)

In [None]:
# Print the maximum length (number of whitespace-separated tokens) of a text in the data
print("The maximum length of a text is: "+str(data['text_length'].max()))

In [15]:
# Run clean_text over every raw text and keep the result in 'clean_text'
data['clean_text'] = data['text'].map(clean_text)

In [16]:
# Learn the vocabulary from the training rows only, so that no information from
# the held-out test rows reaches the model (same practice as the CountVectorizer
# cell, which is fitted on x_train only).
# NOTE(review): test_size and random_state here must stay in sync with the
# train_test_split applied to x and y; with identical settings and the same
# number of rows, train_test_split is expected to select the same rows.
train_rows = train_test_split(np.arange(len(data)), test_size=0.2, random_state=42)[0]

# Define the tokenizer and fit it on the cleaned training texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['clean_text'].iloc[train_rows])
# Convert ALL texts to sequences (words outside the vocabulary are skipped)
x = tokenizer.texts_to_sequences(data['clean_text'])
# Pad / truncate every sequence at the end to exactly 1000 ids
x = pad_sequences(x, maxlen=1000, padding="post", truncating="post")
# Set the labels
y = data['label'].values
# Check the input vocab size (+1 because index 0 is used for padding)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

In [None]:
# Define the model: embedding -> bidirectional GRU -> two ReLU dense layers ->
# one sigmoid output unit.
model = Sequential()
# Declare the input shape with an explicit Input layer rather than Embedding's
# `input_length` argument, which newer Keras versions deprecate and ignore.
# 1000 is the padded sequence length produced by pad_sequences.
model.add(keras.Input(shape=(1000,)))
model.add(Embedding(vocab_size, 100))
model.add(Bidirectional(GRU(65, return_sequences=False)))
model.add(Dense(65, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Check the summary of the model
model.summary()

In [22]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Compile the model, then train it on the training data
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall()],
)
# validation_split=0.2 sets aside a fifth of the training data for validation
model.fit(
    x=x_train,
    y=y_train,
    validation_split=0.2,
    epochs=3,
    batch_size=64,
)

In [None]:
# Score the trained model on the held-out testing data
model.evaluate(x=x_test, y=y_test)

In [72]:
# Define a spam_predictor function
def spam_predictor(input_text, model, tokenizer, maxlen=1000):
    """Run `model` on a single raw text and return its prediction.

    The text is cleaned with clean_text, converted to a sequence of ids with
    `tokenizer`, and padded/truncated at the end to `maxlen` ids before being
    passed to `model.predict`.

    Args:
        input_text: the raw text to classify.
        model: object exposing `predict`, e.g. the trained network.
        tokenizer: fitted tokenizer exposing `texts_to_sequences`.
        maxlen: padded sequence length; it has to match the length the model
            was trained with (1000 in this notebook, hence the default).

    Returns:
        Whatever `model.predict` returns for the one-row batch.
        NOTE(review): for the sigmoid model defined in this notebook this
        should be an array holding one score between 0 and 1; which label
        value means "spam" depends on the dataset's 'label' column.
    """
    # Clean, tokenize and pad the input text
    cleaned = clean_text(input_text)
    sequences = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(sequences, maxlen=maxlen, padding="post", truncating="post")

    # Make predictions using the pre-trained model
    return model.predict(padded)

In [None]:
# Try the predictor on a sample message
input_text = (
    'You have been selected to win the  grand prize winner of our exclusive lottery! '
    'Claim your prize by contacting us with your personal information'
)
spam_predictor(input_text=input_text, model=model, tokenizer=tokenizer)

In [None]:
# Save the trained model to model.h5
model.save("model.h5")

# Pickle the predictor function and the tokenizer, each into its own file.
# NOTE(review): pickle stores a function as a reference to its name, not its
# code, so spam_predictor.pkl can presumably only be loaded where spam_predictor
# (and the helpers it calls) are already defined. Verify before relying on it.
for pickle_path, artifact in (
    ("spam_predictor.pkl", spam_predictor),
    ("tokenizer.pkl", tokenizer),
):
    with open(pickle_path, "wb") as file:
        pickle.dump(artifact, file)


In [None]:
# Split the data again, this time on the cleaned text strings.
# This rebinds x, y and the train/test names that held the padded sequences.
y = data['label'].values
x = data['clean_text'].values
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Define a CountVectorizer (with binary=True and max_features=10000)
vectorizer = CountVectorizer(binary=True, max_features=10000)

# Learn the vocabulary from the training texts and turn them into a bag of
# words in one pass (avoids transforming the training data a second time)
x_train_bow = vectorizer.fit_transform(x_train)
# Encode the test texts with the vocabulary learned from the training texts
x_test_bow = vectorizer.transform(x_test)

print(x_train_bow.shape, y_train.shape)
print(x_test_bow.shape, y_test.shape)

In [None]:
# Define the LogisticRegression model and fit it on the bag-of-words training data
LR = LogisticRegression()
LR.fit(X=x_train_bow, y=y_train)

In [None]:
# Make predictions using LR on training and testing data
y_hat_test = LR.predict(x_test_bow)
y_hat_train = LR.predict(x_train_bow)


def _percent_scores(y_true, y_pred):
    """Return (accuracy, precision, recall) for binary labels, each multiplied by 100."""
    return (
        accuracy_score(y_true, y_pred) * 100,
        precision_score(y_true, y_pred, average='binary', zero_division=0) * 100,
        recall_score(y_true, y_pred, average='binary', zero_division=0) * 100,
    )


# Calculate accuracy, precision and recall scores on training and testing datasets
accuracy_train, precision_train, recall_train = _percent_scores(y_train, y_hat_train)
accuracy_test, precision_test, recall_test = _percent_scores(y_test, y_hat_test)

print("Scores on training data:")
print(f"Accuracy: {accuracy_train:.2f}%")
print(f"Precision: {precision_train:.2f}%")
print(f"Recall: {recall_train:.2f}%")
print("----------------------------")
print("Scores on testing data:")
print(f"Accuracy: {accuracy_test:.2f}%")
print(f"Precision: {precision_test:.2f}%")
print(f"Recall: {recall_test:.2f}%")
