In [117]:
#Import important libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, LSTM, Bidirectional

In [None]:
#Loading Dataset
data = pd.read_csv('spam.csv',encoding='latin-1')
#Removing unnecessary data
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
#Renaming data
data = data.rename(columns={"v1":'label', "v2":'text'})
print(data.head())
tags = data["label"]
texts = data["text"]

In [None]:
predict_msg = pd.read_csv('output_spam.csv',encoding='latin-1')
predict_msg.head()

In [None]:
#Summary Statistics
data.describe()

In [None]:
#Looking for duplicate messages
duplicatedRow = data[data.duplicated()]
print(duplicatedRow[:5])

In [None]:
data.groupby('label').describe().T

In [170]:
#Get all the ham and spam emails
ham_msg = data[data.label =='ham']
spam_msg = data[data.label=='spam']
#Create numpy list to visualize using wordcloud
ham_msg_text = " ".join(ham_msg.text.to_numpy().tolist())
spam_msg_text = " ".join(spam_msg.text.to_numpy().tolist())

In [None]:
#Wordcloud of ham messages
ham_msg_cloud = WordCloud(width =520, height =260, stopwords=STOPWORDS,max_font_size=50, background_color ="black", colormap='Blues').generate(ham_msg_text)
plt.figure(figsize=(16,10))
plt.imshow(ham_msg_cloud, interpolation='bilinear')
plt.axis('off') 
plt.show()

In [None]:
#Wordcloud of spam messages
spam_msg_cloud = WordCloud(width =520, height =260, stopwords=STOPWORDS,max_font_size=50, background_color ="black", colormap='Blues').generate(spam_msg_text)
plt.figure(figsize=(16,10))
plt.imshow(spam_msg_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
#Plotting the data to display imbalance data
plt.figure(figsize=(8,6))
sns.countplot(data.label)
#Percentage of spam messages
(len(spam_msg)/len(ham_msg))*100 # 15.48%

In [None]:
#Fixing the imbalance by downsampling
ham_msg_df = ham_msg.sample(n = len(spam_msg), random_state = 44)
spam_msg_df = spam_msg
print(ham_msg_df.shape, spam_msg_df.shape)

In [None]:
#Creating a dataframe
msg_df = ham_msg_df.append(spam_msg_df).reset_index(drop=True)
plt.figure(figsize=(8,6))
sns.countplot(msg_df.label)
plt.title('Distribution of ham and spam email messages (after downsampling)')
plt.xlabel('Message types')

In [None]:
# Get the text length
msg_df['text_length'] = msg_df['text'].apply(len)
#Calculate average length by label types
labels = msg_df.groupby('label').mean()
labels

In [126]:
#Preparing train/test data and pre-process text
msg_df['msg_type']= msg_df['label'].map({'ham': 0, 'spam': 1})
msg_label = msg_df['msg_type'].values
#Splitting data into train and test
train_msg, test_msg, train_labels, test_labels = train_test_split(msg_df['text'], msg_label, test_size=0.2, random_state=434)

In [127]:
#Defining hyperparameters
max_len = 50 
trunc_type = "post" 
padding_type = "post" 
oov_tok = "<OOV>" 
vocab_size = 500

In [128]:
#Convert text into numerical representation by using tokenization
tokenizer = Tokenizer(num_words = vocab_size, char_level=False, oov_token = oov_tok)
tokenizer.fit_on_texts(train_msg)

In [None]:
#Using word_index 
word_index = tokenizer.word_index
word_index

In [None]:
# Checking how many words 
tot_words = len(word_index)
print('There are %s unique tokens in training data. ' % tot_words)

In [131]:
#Sequencing and padding
training_sequences = tokenizer.texts_to_sequences(train_msg)
training_padded = pad_sequences (training_sequences, maxlen = max_len, padding = padding_type, truncating = trunc_type )
testing_sequences = tokenizer.texts_to_sequences(test_msg)
testing_padded = pad_sequences(testing_sequences, maxlen = max_len,
padding = padding_type, truncating = trunc_type)

In [None]:
#Shape of train tensor
print('Shape of training tensor: ', training_padded.shape)
print('Shape of testing tensor: ', testing_padded.shape)

In [None]:
#Before padding
len(training_sequences[0]), len(training_sequences[1])

In [None]:
#After padding
len(training_padded[0]), len(training_padded[1])

In [None]:
print(training_padded[0])

In [135]:
#Defining hyperparameters for the Dense Model
vocab_size = 500 
embeding_dim = 16
drop_value = 0.2
n_dense = 24

In [83]:
#Dense model architecture
model = Sequential()
model.add(Embedding(vocab_size, embeding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dropout(drop_value))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.summary()

In [136]:
#Compiling the Dense model
model.compile(loss='binary_crossentropy',optimizer='adam' ,metrics=['accuracy'])

In [None]:
#Fitting the dense spam detector model
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(training_padded, train_labels, epochs=num_epochs, validation_data=(testing_padded, test_labels),callbacks =[early_stop], verbose=2)

In [None]:
#Model performance on test data 
model.evaluate(testing_padded, test_labels)

In [139]:
#Read as a dataframe 
metrics = pd.DataFrame(history.history)
#Renaming column
metrics.rename(columns = {'loss': 'Training_Loss', 'accuracy': 'Training_Accuracy', 'val_loss': 'Validation_Loss', 'val_accuracy': 'Validation_Accuracy'}, inplace = True)
def plot_graphs1(var1, var2, string):
    metrics[[var1, var2]].plot()
    plt.title('Training and Validation ' + string)
    plt.xlabel ('Number of epochs')
    plt.ylabel(string)
    plt.legend([var1, var2])

In [None]:
#Display long string 
pd.options.display.max_colwidth=100
predict_msg[:20]

In [160]:
# Defining prediction function
def predict_spam(predict_msg):
    new_seq = tokenizer.texts_to_sequences(predict_msg)
    padded = pad_sequences(new_seq, maxlen =max_len,
                      padding = padding_type,
                      truncating=trunc_type)
    return (model.predict(padded))
predict_spam(predict_msg)

array([[0.01467556],
       [0.09514403]], dtype=float32)

In [174]:

predict_spam('output_spam.csv')

array([[0.01467556],
       [0.01094931],
       [0.04646036],
       [0.01467556],
       [0.01094931],
       [0.04646036],
       [0.01183754],
       [0.00651857],
       [0.01467556],
       [0.01850682],
       [0.02487409],
       [0.01183754],
       [0.02569854],
       [0.00651857],
       [0.00542918]], dtype=float32)