# Spam Mail Detection


### Data Collection


In [10]:
# Mukul Jain
# Kaggle Full Dataset

import os
import numpy as np

# Folders enron1 ... enron6, resolved relative to the current working directory.
dirs = ["enron" + str(n) for n in range(1, 7)]


def _list_mails(folders, category):
    """Return the path of every entry in ``<folder>/<category>`` for each folder.

    Folders are visited in the order given. Within a folder the entries are
    sorted by name: ``os.listdir`` returns names in arbitrary order, and
    without sorting the order of the collected paths (and therefore any
    seeded shuffle/split applied to them later) would vary between machines.
    """
    paths = []
    for folder in folders:
        category_dir = os.path.join(folder, category)
        paths.extend(
            os.path.join(category_dir, name)
            for name in sorted(os.listdir(category_dir))
        )
    return paths


ham_mails = np.array(_list_mails(dirs, "ham"))
spam_mails = np.array(_list_mails(dirs, "spam"))
# Ham paths first, then spam paths; the label array below uses the same layout.
all_mails = np.concatenate((ham_mails, spam_mails))

# Label convention: 0 = ham, 1 = spam.
ham_labels = np.zeros((len(ham_mails),), dtype=int)
spam_labels = np.ones((len(spam_mails),), dtype=int)
all_labels = np.concatenate((ham_labels, spam_labels))


### Splitting Data


In [11]:
from sklearn.model_selection import train_test_split

# Hold out a fifth of the mails for testing; the fixed seed makes the shuffle
# repeatable for a given ordering of all_mails.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

train_mails, test_mails, train_labels, test_labels = train_test_split(
    all_mails,
    all_labels,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)


### Data Preparation


In [12]:
from collections import Counter

# Every whitespace-separated token found in the training mails, in reading order.
all_words = []

for mail in train_mails:
    with open(mail) as m:
        try:
            for line in m:
                all_words.extend(line.split())
        except UnicodeDecodeError:
            # The mail contains bytes that are invalid in the default text
            # encoding: keep the tokens read so far and skip the rest of it.
            # Only this error is suppressed, so a kernel interrupt or a real
            # I/O failure is no longer silently swallowed.
            pass

# Number of occurrences of each token across the training mails.
word_counter = Counter(all_words)


### Data Cleaning


In [13]:
# Discard tokens that are not purely alphabetic or that are a single character.
# The tokens to drop are collected first because the counter cannot be resized
# while it is being iterated.
for rejected in [
    token for token in word_counter if not token.isalpha() or len(token) == 1
]:
    del word_counter[rejected]

# From here on word_counter is a list of (word, count) pairs for the 20000 most
# frequent remaining tokens, most frequent first.
word_counter = word_counter.most_common(20000)

# Map each retained word to its rank, which is used as its feature column index.
word_index = {
    word: position for position, (word, _count) in enumerate(word_counter)
}


### Feature Extraction


In [14]:
def extract_features(train_mails, word_index):
    docId = 0
    matrix = np.zeros((len(train_mails), len(word_counter)), dtype=int)
    for file in train_mails:
        with open(file) as m:
            try:
                for i, line in enumerate(m):
                    words = line.split()
                    for word in words:
                        if word not in word_index.keys():
                            continue
                        wordId = word_index[word]
                        matrix[docId, wordId] = words.count(word)
            except:
                pass
            docId += 1

    return matrix


# Word-count feature matrix for the training mails (one row per mail).
train_word_matrix = extract_features(train_mails, word_index)


### Training


In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

# Multinomial Naive Bayes over the per-mail word counts; labels are 0 = ham, 1 = spam.
model = MultinomialNB()
# Left as the cell's last expression so the notebook displays the fitted estimator.
model.fit(train_word_matrix, train_labels)


MultinomialNB()

### Testing


In [16]:
# Featurize the held-out mails with the vocabulary built from the training set.
test_word_matrix = extract_features(test_mails, word_index)
result = model.predict(test_word_matrix)

# Mean accuracy on the held-out mails.
accuracy = model.score(test_word_matrix, test_labels)
print(accuracy)

# Rows are true labels, columns are predicted labels (0 = ham, 1 = spam).
confusion = confusion_matrix(test_labels, result)
print(confusion)


0.9682633842503336
[[3253  105]
 [ 109 3276]]
