In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs loaded below are read from there.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import string
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Load the two source datasets from Google Drive.
emails_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/emails.csv')
email_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/email.csv')

# Bring email.csv onto the same schema as emails.csv: a `text` column and a
# numeric `spam` label (1 = spam, 0 = ham). Any category other than
# 'spam'/'ham' maps to NaN and is filtered out here.
email_df = email_df.rename(columns={'Message': 'text', 'Category': 'spam'})
email_df['spam'] = email_df['spam'].map({'spam': 1, 'ham': 0})
email_df = email_df[email_df['spam'].notna()]

# Remove "Subject:" from the beginning of the emails.csv messages.
emails_df['text'] = emails_df['text'].str.replace(r'^Subject:\s*', '', regex=True).str.strip()

# Merge both datasets, then:
#   * drop rows missing a message or a label in EITHER source -- a NaN label
#     would make the integer cast fail and a NaN message would break the
#     string cleaning applied later;
#   * store the label as an integer;
#   * shuffle with a fixed seed so rows from the two files are interleaved
#     reproducibly, and renumber the index.
merged_df = (
    pd.concat([emails_df[['text', 'spam']], email_df[['text', 'spam']]], ignore_index=True)
    .dropna(subset=['text', 'spam'])
    .astype({'spam': int})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Final cleaned + merged dataset shape: {merged_df.shape}")
print(merged_df.tail())


Final cleaned + merged dataset shape: (11300, 2)
                                                    text  spam
11295  Yes i have. So that's why u texted. Pshew...mi...     0
11296  the national forum on corporate finance  mr . ...     0
11297  why johan dahl and the mri energy staffing gro...     0
11298  perfect visual solution for your business now ...     1
11299  Do u konw waht is rael FRIENDSHIP Im gving yuo...     0


In [None]:
# Show the column dtypes and non-null counts of the merged dataset.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11300 entries, 0 to 11299
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11300 non-null  object
 1   spam    11300 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 176.7+ KB


# Preprocessing


In [None]:
# Download NLTK's stopwords corpus; stopwords.words('english') is read from it below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character. It is the
# same for all messages, so it is built once instead of once per message.
punctuation_table = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip HTML tags, collapse line breaks into a
    space, remove http(s) URLs, remove digits, remove punctuation, split on
    whitespace, drop English stopwords and Porter-stem the remaining words.

    Args:
        text: The raw message as a string.

    Returns:
        The cleaned message; an empty string if no token survives.
    """
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)                 # Remove HTML tags
    text = re.sub(r'\n+', ' ', text)                  # Remove line breaks
    text = re.sub(r'https?://\S+', '', text)          # Remove URLs
    text = re.sub(r'\d+', '', text)                   # Remove numbers
    text = text.translate(punctuation_table)          # Remove punctuation
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stopwords_set)


# Clean every message, keeping the row order of merged_df.
cleaned = [clean_text(message) for message in merged_df['text']]

In [None]:
# Raw text of the first message, for comparison with its cleaned version.
merged_df.text.iloc[0]

'year end 2000 performance feedback  note : you will receive this message each time you are selected as a reviewer .  you have been selected to participate in the year end 2000 performance  management process by providing meaningful feedback on specific employee ( s ) .  your feedback plays an important role in the process , and your participation  is critical to the success of enron \' s performance management goals .  to complete requests for feedback , access pep at http : / / pep . corp . enron . com  and select perform review under performance review services . you may begin  providing feedback immediately and are requested to have all feedback forms  completed by friday , november 17 , 2000 .  if you have any questions regarding pep or your responsibility in the  process , please contact the pep help desk at :  houston : 1 . 713 . 853 . 4777 , option 4  london : 44 . 207 . 783 . 4040 , option 4  email : perfmgmt @ enron . com  thank you for your participation in this important pr

In [None]:
# The same first message after the preprocessing step.
cleaned[0]

'year end perform feedback note receiv messag time select review select particip year end perform manag process provid meaning feedback specif employe feedback play import role process particip critic success enron perform manag goal complet request feedback access pep http pep corp enron com select perform review perform review servic may begin provid feedback immedi request feedback form complet friday novemb question regard pep respons process pleas contact pep help desk houston option london option email perfmgmt enron com thank particip import process follow cumul list employe feedback request statu open submit declin employe request feedback name longer appear list review group enron feedback due date nov employe name supervisor name date select crenshaw shirley j wincenti j kaminski oct kindal kevin vasant shanbhogu oct lama vieira pinto rodrigo david port oct supatgiat chonawe peyton gibner oct tamarchenko tanya v vasant shanbhogu oct villarr norma e sheila h walton oct walton 

In [None]:
# Turn the cleaned messages into TF-IDF features, limiting the vocabulary to
# 5000 terms, and convert the sparse result into a dense array for Keras.
# NOTE(review): the vectorizer is fit on the whole corpus, before any
# train/test split, so test messages influence the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(cleaned)
x = tfidf_matrix.toarray()

# Integer spam labels, in the same row order as the feature matrix.
y = merged_df['spam'].astype(int)

# Train Model


In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Split off the test set (20%).
x_temp, x_test, y_temp, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Split the remaining 80% into 70% train and 10% validation (0.125 * 0.8 = 0.1).
x_train, x_val, y_train, y_val = train_test_split(x_temp, y_temp, test_size=0.125, random_state=42)

# Standardize the TF-IDF features. The scaler is fit on the training split only,
# so no validation/test statistics leak into it, and the model is trained and
# evaluated on the scaled values -- the same transform that the prediction cell
# applies to new emails via scaler.transform().
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

print(f"Train shape: {x_train.shape}, Val shape: {x_val.shape}, Test shape: {x_test.shape}")

# Feed-forward network: two hidden ReLU layers with dropout and a sigmoid
# output giving the spam probability. The input size is declared with an
# explicit Input object instead of `input_shape=` on the first Dense layer,
# which Keras warns against.
model = Sequential([
    tf.keras.Input(shape=(x_train.shape[1],)),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(x_train, y_train, epochs=10, batch_size=32,
                    validation_data=(x_val, y_val))

test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"\n Test Accuracy: {test_acc:.4f}")

# Classification report: threshold the predicted probabilities at 0.5 and
# flatten the (n, 1) prediction column so it lines up with the 1-D labels.
y_pred_probs = model.predict(x_test)
y_pred = (y_pred_probs > 0.5).astype(int).ravel()

print("\n Classification Report:")
print(classification_report(y_test, y_pred, digits=4))

Train shape: (7910, 5000), Val shape: (1130, 5000), Test shape: (2260, 5000)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.8690 - loss: 0.3108 - val_accuracy: 0.9752 - val_loss: 0.0772
Epoch 2/10
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9915 - loss: 0.0329 - val_accuracy: 0.9770 - val_loss: 0.0790
Epoch 3/10
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9955 - loss: 0.0152 - val_accuracy: 0.9779 - val_loss: 0.0888
Epoch 4/10
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9984 - loss: 0.0074 - val_accuracy: 0.9743 - val_loss: 0.0975
Epoch 5/10
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9985 - loss: 0.0060 - val_accuracy: 0.9735 - val_loss: 0.1089
Epoch 6/10
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9987 - loss: 0.0054 - val_accuracy: 0.9761 - val_loss: 0.1182
Epoch 7/10
[1m248/248[0m 

# Real-Life Application

In [None]:
import joblib

# Persist the fitted preprocessing objects with joblib, then the trained
# network in HDF5 format, so the prediction cell can reload all three.
for artifact, filename in ((vectorizer, 'vectorizer.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, filename)

model.save('spam_classifier_model.h5')




In [None]:
import joblib
from tensorflow.keras.models import load_model
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
import numpy as np
import re

# Load the artifacts written by the save cell. joblib files are pickles, so
# only load files produced by this notebook or another trusted source.
vectorizer = joblib.load('vectorizer.pkl')
scaler = joblib.load('scaler.pkl')
model = load_model('spam_classifier_model.h5')

# Text preprocessing function
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

def preprocess(text):
    """Clean one email exactly the way the training messages were cleaned.

    The steps mirror the cleaning applied before the vectorizer was fit:
    lowercase, strip HTML tags, collapse line breaks, remove http(s) URLs,
    remove digits, remove punctuation, drop English stopwords and Porter-stem
    the remaining words. Skipping any step would hand the vectorizer tokens
    that are distributed differently from what it was fit on.

    Args:
        text: The raw email text as a string.

    Returns:
        The cleaned text as a single space-joined string (possibly empty).
    """
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)          # remove HTML tags
    text = re.sub(r'\n+', ' ', text)           # remove line breaks
    text = re.sub(r'https?://\S+', '', text)   # remove URLs
    text = re.sub(r'\d+', '', text)            # remove numbers
    tokens = text.translate(str.maketrans('', '', string.punctuation)).split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stopwords_set]
    return ' '.join(tokens)

#  Paste email here:
sample_email = """  """
# Preprocess: clean the text, then apply the saved vectorizer and scaler.
processed = preprocess(sample_email)
vectorized = vectorizer.transform([processed]).toarray()
normalized = scaler.transform(vectorized)

# Predict: the model outputs one sigmoid probability per input row.
prediction = model.predict(normalized)

# Output: probabilities above 0.5 are reported as spam.
if prediction[0][0] > 0.5:
    print("🟥 SPAM")
else:
    print("🟩 NOT SPAM")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 291ms/step
🟩 NOT SPAM
