In [4]:
import tensorflow
import matplotlib
import pandas as pd
import re
import nltk

In [5]:
data = pd.read_csv('/content/emails.csv') #data frame
print(data.columns)
print(data.info)

Index(['text', 'spam'], dtype='object')
<bound method DataFrame.info of                                                    text  spam
0     Subject: naturally irresistible your corporate...     1
1     Subject: the stock trading gunslinger  fanny i...     1
2     Subject: unbelievable new homes made easy  im ...     1
3     Subject: 4 color printing special  request add...     1
4     Subject: do not have money , get software cds ...     1
...                                                 ...   ...
5723  Subject: re : research and development charges...     0
5724  Subject: re : receipts from visit  jim ,  than...     0
5725  Subject: re : enron case study update  wow ! a...     0
5726  Subject: re : interest  david ,  please , call...     0
5727  Subject: news : aurora 5 . 2 update  aurora ve...     0

[5728 rows x 2 columns]>


In [6]:
duplicates = data[data.duplicated()]
print(f"Number of duplicates: {data.duplicated().sum()}")


Number of duplicates: 33


In [7]:
data=data.drop_duplicates()

In [8]:
data.to_csv('cleaned_emails.csv', index=False)

In [9]:
print("Number of duplicates =",data.duplicated().sum())

Number of duplicates = 0


In [10]:
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Removes any character that is NOT A WORD
    text = re.sub(r'\s+', ' ', text)  # Removes extra spaces and replaces them with ONE SPACE
    return text.lower()
data['cleaned_text'] = data['text'].apply(clean_text) #applies the clean_text function to each email in the 'text' column
#and stores the cleaned result in a NEW COLUMN, 'cleaned_text'
print("Original Text:", data['text'].iloc[0]) #first row
print("Cleaned Text:", data['cleaned_text'].iloc[0])

Original Text: Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordabil

In [11]:
#tokenization and stopwords removal (words that add little meaning to the text like this, and ,the ...)
#natural language toolkit
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt') #punkt is a tokenizer model
nltk.download('stopwords')
nltk.download('punkt_tab') #part of punkt tokenizer model
stop_words = set(stopwords.words('english')) #creates a set of english stopwords

data['tokens'] = data['cleaned_text'].apply(lambda x: [word for word in word_tokenize(x) if word not in stop_words])
#print(data.info)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [12]:
#Tokenization and stopwords exemple
print("Original Text:", data['cleaned_text'].iloc[0])
print("Tokens after stopword removal:", data['tokens'].iloc[0])

Original Text: subject naturally irresistible your corporate identity lt is really hard to recollect a company the market is full of suqgestions and the information isoverwhelminq but a good catchy logo stylish statlonery and outstanding website will make the task much easier we do not promise that havinq ordered a iogo your company will automaticaily become a world ieader it isguite ciear that without good products effective business organization and practicable aim it will be hotat nowadays market but we do promise that your marketing efforts will become much more effective here is the list of clear benefits creativeness hand made original logos specially done to reflect your distinctive company image convenience logo and stationery are provided in all formats easy to use content management system letsyou change your website content and even its structure promptness you will see logo drafts within three business days affordability your marketing break through shouldn t make gaps in y

In [13]:
#preparing data for model
from tensorflow.keras.preprocessing.text import Tokenizer #convert text into a sequence of integers by assigning
# a unique index to each word in the dataset.(not like regular tokenizer)
from tensorflow.keras.preprocessing.sequence import pad_sequences #ensures all sequences (lists of tokenized words) have
#the same length by padding them with zeros.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['tokens']) #trains tokenizer on toknized data (tokens)
X = tokenizer.texts_to_sequences(data['tokens']) #converts tokenized text to a SEQUENCE of integers
X = pad_sequences(X, padding='post') #adds padding to the end

y = data['spam'].values
print(len(y))
print("Original Text:", data['text'].iloc[0])
print("Tokenized Text (Integer Sequence):", X[0])  # After converting to sequences


5695
Original Text: Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affor

In [14]:
# Data split (train + crossvalidation + test)
from sklearn.model_selection import train_test_split

# split into 80% training+validation and 20% test
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, split the remaining 80% into 60% training and 20% cross-validation
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Check the shapes of the resulting splits
print("Training data shape:", X_train.shape)
print("Validation data shape:", X_val.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (3417, 4341)
Validation data shape: (1139, 4341)
Testing data shape: (1139, 4341)


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# Define the model
model = Sequential()

# Embedding layer
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128
max_length = X.shape[1]
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))

# Bidirectional LSTM layer
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))

# Dense layers
model.add(Dense(64, activation='relu'))  # Additional Dense layer
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(32, activation='relu'))  # Additional Dense layer

# Output layer for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# View model summary
model.summary()




In [17]:
# Train the model
# Train the model
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),  # Use the validation set here (cross validation not test set)
                    epochs=10,
                    batch_size=32,
                    callbacks=[early_stopping])

#Batch size refers to the number of training samples processed before the model's internal parameters are updated.
#A smaller batch size means more updates but slower processing, while a larger batch size speeds up processing but may lead
#to less frequent updates.
# An epoch is one complete pass through the entire training dataset. After each epoch, the model's parameters are updated based on the data.
# More epochs mean the model gets more chances to learn.

Epoch 1/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1178s[0m 11s/step - accuracy: 0.9984 - loss: 0.0071 - val_accuracy: 0.9877 - val_loss: 0.0637
Epoch 2/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1140s[0m 11s/step - accuracy: 1.0000 - loss: 7.1840e-04 - val_accuracy: 0.9877 - val_loss: 0.0714
Epoch 3/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1172s[0m 11s/step - accuracy: 1.0000 - loss: 2.1735e-04 - val_accuracy: 0.9895 - val_loss: 0.0752
Epoch 4/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1157s[0m 11s/step - accuracy: 1.0000 - loss: 1.3321e-04 - val_accuracy: 0.9868 - val_loss: 0.0764


In [18]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, batch_size=32)
print("Test Accuracy:", test_accuracy)



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1s/step - accuracy: 0.9837 - loss: 0.0959
Test Accuracy: 0.9868305325508118
