<a href="https://colab.research.google.com/github/Moridi/Spam-Detector/blob/master/Spam_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Getting the Dataset

In [0]:
import os
import tarfile
from six.moves import urllib

def fetch_data(kind):
  spam_url = "http://spamassassin.apache.org/old/publiccorpus/" + \
      "20030228_" + kind + ".tar.bz2"

  spam_path = os.path.join(kind + ".tar.bz2")

  urllib.request.urlretrieve(spam_url, spam_path)

  spam_tgz = tarfile.open(kind + ".tar.bz2")  
  spam_tgz.extractall()
  spam_tgz.close()

In [0]:
fetch_data("spam")
fetch_data("easy_ham")

In [0]:
import email

def parseEmail(dirName, fileName):
  with open(os.path.join(dirName, fileName), "rb") as f:
    return email.parser.BytesParser().parse(f)

In [0]:
spamList = [parseEmail("spam", fileName)\
    for fileName in os.listdir("spam")]
hamList = [parseEmail("easy_ham", fileName)\
    for fileName in os.listdir("easy_ham")]

In [0]:
def initiateDataset(mailList):
  global dataset

  for i, x in enumerate(mailList):
    for y in x.keys():
      dataset[y] = []

In [0]:
def addToDataset(mailList, isSpam):
  for x in mailList:
    for y in dataset.keys():
      if (y in x.keys()):
        dataset[y].append(x[y])
      else:
        dataset[y].append("")
    
    dataset["Is-Spam"][-1] = isSpam
    dataset["Payload"][-1] = x.get_payload()

In [0]:
def removeInappropriateAttributes():
  
  upperBound = 1000
  fieldException = ["In-Reply-To"]
  inappropriateAttr = []    

  for x in dataset.keys():
    counter = 0
    for i, y in enumerate(dataset[x]):
      if (y == ""):
        dataset[x][i] = np.nan
        counter += 1

    if (counter > upperBound \
        and (x not in fieldException)):
      inappropriateAttr.append(x)

  for x in inappropriateAttr:
    del dataset[x]


In [8]:
import pandas as pd
import numpy as np

dataset = {"Is-Spam" : [], "Payload" : []}

initiateDataset(spamList)
initiateDataset(hamList)
addToDataset(spamList, isSpam=True)
addToDataset(hamList, isSpam=False)

removeInappropriateAttributes()

pandaDataset = pd.DataFrame(dataset, columns=dataset.keys())
pandaDataset.head()

pandaDataset.isna().sum()

Is-Spam            0
Payload            0
Return-Path        8
Delivered-To     163
Received         137
From               2
To               158
Subject            4
Content-Type     422
Date               2
Message-Id       214
In-Reply-To     1983
dtype: int64

## Prepare Train Dataset

In [0]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for trainIndex, testIndex in split.split(\
    pandaDataset, pandaDataset["Is-Spam"]):
  
  stratTrainSet = pandaDataset.loc[trainIndex]
  stratTestSet = pandaDataset.loc[testIndex]

trainSet = stratTrainSet.drop("Is-Spam", axis=1)
trainLabels = stratTrainSet["Is-Spam"]

testSet = stratTestSet.drop("Is-Spam", axis=1)
testLabels = stratTestSet["Is-Spam"]

In [10]:
trainSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2401 entries, 1712 to 2212
Data columns (total 11 columns):
Payload         2401 non-null object
Return-Path     2395 non-null object
Delivered-To    2265 non-null object
Received        2285 non-null object
From            2399 non-null object
To              2267 non-null object
Subject         2397 non-null object
Content-Type    2062 non-null object
Date            2399 non-null object
Message-Id      2228 non-null object
In-Reply-To     821 non-null object
dtypes: object(11)
memory usage: 225.1+ KB


In [11]:
droppedTrainSet = pd.DataFrame(trainSet)
droppedTrainSet["In-Reply-To"] = droppedTrainSet["In-Reply-To"].fillna(0)

for x in droppedTrainSet.loc[:, "In-Reply-To"].index:
  if (droppedTrainSet.loc[x, "In-Reply-To"] != 0):
    droppedTrainSet.loc[x, "In-Reply-To"] = 1

droppedTrainSet["In-Reply-To"].head()

1712    1
364     0
2890    0
1733    1
178     0
Name: In-Reply-To, dtype: object

In [0]:
droppedTrainSet = droppedTrainSet.drop(["To", "From",\
    "Date", "Message-Id", "Received", "Content-Type"], axis=1)

In [13]:
droppedTrainSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2401 entries, 1712 to 2212
Data columns (total 5 columns):
Payload         2401 non-null object
Return-Path     2395 non-null object
Delivered-To    2265 non-null object
Subject         2397 non-null object
In-Reply-To     2401 non-null object
dtypes: object(5)
memory usage: 192.5+ KB


In [0]:
deliveredTo = list(set(droppedTrainSet["Delivered-To"]))

if (np.nan in deliveredTo):
  nanIndex = deliveredTo.index(np.nan)
  del deliveredTo[nanIndex]

dummies = pd.get_dummies(droppedTrainSet[["Delivered-To"]])
droppedTrainSet = pd.concat([droppedTrainSet, dummies], axis=1)
droppedTrainSet = droppedTrainSet.drop("Delivered-To", axis=1)

In [0]:
dummies = pd.get_dummies(droppedTrainSet[["In-Reply-To"]])
droppedTrainSet = pd.concat([droppedTrainSet, dummies], axis=1)
droppedTrainSet = droppedTrainSet.drop("In-Reply-To", axis=1)

### Without Considering the Payload Part in the Train Dataset

In [0]:
# @TODO: Remove it.
trainSetWithoutPayload = droppedTrainSet.drop(["Payload", "Subject",\
    "Return-Path"], axis=1)

In [17]:
trainSetWithoutPayload.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2401 entries, 1712 to 2212
Data columns (total 11 columns):
Delivered-To_freebsd-bugs@freebsd.org                   2401 non-null uint8
Delivered-To_freebsd-ports@freebsd.org                  2401 non-null uint8
Delivered-To_freebsd-questions@freebsd.org              2401 non-null uint8
Delivered-To_yyyy@localhost.netnoteinc.com              2401 non-null uint8
Delivered-To_yyyy@localhost.spamassassin.taint.org      2401 non-null uint8
Delivered-To_yyyy@spamassassin.taint.org                2401 non-null uint8
Delivered-To_zzzz-spamtrap@sonic.spamtraps.taint.org    2401 non-null uint8
Delivered-To_zzzz@localhost.netnoteinc.com              2401 non-null uint8
Delivered-To_zzzz@localhost.spamassassin.taint.org      2401 non-null uint8
In-Reply-To_0                                           2401 non-null uint8
In-Reply-To_1                                           2401 non-null uint8
dtypes: uint8(11)
memory usage: 124.5 KB


In [18]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy',
                              random_state = 0)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X=trainSetWithoutPayload,
                              y=trainLabels , cv = 10, scoring="recall")
print("Random Forest:\n Accuracy:", accuracies.mean(), "+/-", accuracies.std())

classifier.fit(trainSetWithoutPayload, trainLabels)

Random Forest:
 Accuracy: 0.9400609756097561 +/- 0.049033679801823896


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Prepare Test Dataset

In [0]:
droppedTestSet = testSet.drop(["To", "From",\
    "Date", "Message-Id", "Received", "Content-Type"], axis=1)

In [20]:
droppedTestSet = droppedTestSet.drop(["Payload", "Subject",\
    "Return-Path"], axis=1)

droppedTestSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 601 entries, 148 to 2104
Data columns (total 2 columns):
Delivered-To    574 non-null object
In-Reply-To     198 non-null object
dtypes: object(2)
memory usage: 14.1+ KB


In [0]:
shouldBeConcat = {}

for x in deliveredTo:
  shouldBeConcat["Delivered-To_" + x] = []

shouldBeConcat["In-Reply-To_0"] = []
shouldBeConcat["In-Reply-To_1"] = []

In [0]:
for x in droppedTestSet["Delivered-To"]:
  for y in deliveredTo:
    if (x == y):
      shouldBeConcat["Delivered-To_" + y].append(1)
    else:
      shouldBeConcat["Delivered-To_" + y].append(0)

In [0]:
droppedTestSet["In-Reply-To"] = droppedTestSet["In-Reply-To"].fillna('0')

for x in droppedTestSet.loc[:, "In-Reply-To"].index:
  if (droppedTestSet.loc[x, "In-Reply-To"] == '0'):
    shouldBeConcat["In-Reply-To_0"].append(1)
    shouldBeConcat["In-Reply-To_1"].append(0)
  else:
    shouldBeConcat["In-Reply-To_0"].append(0)
    shouldBeConcat["In-Reply-To_1"].append(1)

droppedTestSet = pd.DataFrame(shouldBeConcat)

In [24]:
droppedTestSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 11 columns):
Delivered-To_freebsd-bugs@freebsd.org                   601 non-null int64
Delivered-To_zzzz-spamtrap@sonic.spamtraps.taint.org    601 non-null int64
Delivered-To_zzzz@localhost.netnoteinc.com              601 non-null int64
Delivered-To_yyyy@spamassassin.taint.org                601 non-null int64
Delivered-To_yyyy@localhost.netnoteinc.com              601 non-null int64
Delivered-To_freebsd-ports@freebsd.org                  601 non-null int64
Delivered-To_zzzz@localhost.spamassassin.taint.org      601 non-null int64
Delivered-To_yyyy@localhost.spamassassin.taint.org      601 non-null int64
Delivered-To_freebsd-questions@freebsd.org              601 non-null int64
In-Reply-To_0                                           601 non-null int64
In-Reply-To_1                                           601 non-null int64
dtypes: int64(11)
memory usage: 51.7 KB


### Without Considering the Payload in the Test Dataset

In [25]:
y_pred = classifier.predict(droppedTestSet)
n_correct = sum(y_pred == testLabels)
print(n_correct / len(y_pred))

0.9351081530782029


## Considering the Payload Words

In [26]:
droppedTrainSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2401 entries, 1712 to 2212
Data columns (total 14 columns):
Payload                                                 2401 non-null object
Return-Path                                             2395 non-null object
Subject                                                 2397 non-null object
Delivered-To_freebsd-bugs@freebsd.org                   2401 non-null uint8
Delivered-To_freebsd-ports@freebsd.org                  2401 non-null uint8
Delivered-To_freebsd-questions@freebsd.org              2401 non-null uint8
Delivered-To_yyyy@localhost.netnoteinc.com              2401 non-null uint8
Delivered-To_yyyy@localhost.spamassassin.taint.org      2401 non-null uint8
Delivered-To_yyyy@spamassassin.taint.org                2401 non-null uint8
Delivered-To_zzzz-spamtrap@sonic.spamtraps.taint.org    2401 non-null uint8
Delivered-To_zzzz@localhost.netnoteinc.com              2401 non-null uint8
Delivered-To_zzzz@localhost.spamassassin.taint.org 

In [27]:
droppedTestSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 11 columns):
Delivered-To_freebsd-bugs@freebsd.org                   601 non-null int64
Delivered-To_zzzz-spamtrap@sonic.spamtraps.taint.org    601 non-null int64
Delivered-To_zzzz@localhost.netnoteinc.com              601 non-null int64
Delivered-To_yyyy@spamassassin.taint.org                601 non-null int64
Delivered-To_yyyy@localhost.netnoteinc.com              601 non-null int64
Delivered-To_freebsd-ports@freebsd.org                  601 non-null int64
Delivered-To_zzzz@localhost.spamassassin.taint.org      601 non-null int64
Delivered-To_yyyy@localhost.spamassassin.taint.org      601 non-null int64
Delivered-To_freebsd-questions@freebsd.org              601 non-null int64
In-Reply-To_0                                           601 non-null int64
In-Reply-To_1                                           601 non-null int64
dtypes: int64(11)
memory usage: 51.7 KB
