In [1]:
### This notebook uses Python 3.x ###
### Author: GTKlondike            ###


# Download data from Github
! git clone https://github.com/NetsecExplained/Machine-Learning-for-Security-Analysts.git
data_dir = "Machine-Learning-for-Security-Analysts"

Cloning into 'Machine-Learning-for-Security-Analysts'...
remote: Enumerating objects: 4134, done.[K
remote: Counting objects: 100% (4134/4134), done.[K
remote: Compressing objects: 100% (4102/4102), done.[K
remote: Total 4134 (delta 30), reused 4114 (delta 23), pack-reused 0
Receiving objects: 100% (4134/4134), 14.46 MiB | 14.05 MiB/s, done.
Resolving deltas: 100% (30/30), done.


In [2]:
import re, os, math, nltk, string, json

nltk.download('stopwords')
nltk.download('punkt')

# ======= New Imports =======
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

# ======= /New Imports =======

print("Libraries imported")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Libraries imported


In [3]:
# Straight copy/paste
test_email = """
Re: Re: East Asian fonts in Lenny. Thanks for your support.  Installing unifonts did it well for me. ;)
Nima
--
To UNSUBSCRIBE, email to debian-user-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org
"""
print(test_email)


Re: Re: East Asian fonts in Lenny. Thanks for your support.  Installing unifonts did it well for me. ;)
Nima
--
To UNSUBSCRIBE, email to debian-user-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org



In [4]:
# Straight copy/paste
def tokenizer(text):
    punctuations = list(string.punctuation)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    # the commented line is for python 2.7
    #tokens = nltk.word_tokenize(text.decode('latin1').lower())
    tokens = nltk.word_tokenize(text.lower())
    # Strip out the punctuations
    tokens = [i.strip(''.join(punctuations)) 
              for i in tokens 
              if i not in punctuations]
    # User Porter Stemmer on each token
    tokens = [stemmer.stem(i)
              for i in tokens]
    return [w for w in tokens if w not in stopwords and w != ""]

t = tokenizer(test_email)
print(t)

['east', 'asian', 'font', 'lenni', 'thank', 'support', 'instal', 'unifont', 'well', 'nima', 'unsubscrib', 'email', 'debian-user-request', 'lists.debian.org', 'subject', 'unsubscrib', 'troubl', 'contact', 'listmast', 'lists.debian.org']


In [5]:
# Reworked version of train() function
# Takes a while per training session (5-10 minutes per class)

corpus = []
y = []
print("Training ham")
for each in os.listdir(data_dir + '/ham'):
        with open(data_dir + '/ham/' + each, 'r', encoding='latin-1') as f:
            corpus.append(f.read())
            y.append("ham")
print("Training spam")
for each in os.listdir(data_dir + '/spam'):
        with open(data_dir + '/spam/'+each, 'r', encoding='latin-1') as f:
            corpus.append(f.read())
            y.append("spam")
print("Training Complete!")

Training ham
Training spam
Training Complete!


In [9]:
print(corpus[5])

[Spambayes] spambayes package?
    Guido> Why would we care about installing a few extra files, as long as
    Guido> they're inside a package?

I guess you needn't worry about that.  It just doesn't seem "clean" to me.

S




In [11]:
# Vectorizes the training inputs. Takes about 90 seconds to complete
print("Vectorizing...")
vectorizer = TfidfVectorizer(tokenizer=tokenizer)#get a vector for each url but use our customized tokenizer
countVectorizer = CountVectorizer(tokenizer=tokenizer)
X = vectorizer.fit_transform(corpus) #get the X vector
count_X = countVectorizer.fit_transform(corpus)
print("Vectorizing Complete!")


print("Printing input")
print(X[0])
print(y[:10])

Vectorizing...
Vectorizing Complete!
Printing input
  (0, 30378)	0.16969928153062816
  (0, 37449)	0.05735377143169219
  (0, 23130)	0.13013684858723798
  (0, 59276)	0.09470453015104684
  (0, 27547)	0.06049926027811394
  (0, 5761)	0.0747348427251385
  (0, 56049)	0.05287641957283723
  (0, 24819)	0.11415211743023412
  (0, 14756)	0.08678999959458103
  (0, 54336)	0.06214010348985103
  (0, 746)	0.12024844188400292
  (0, 38032)	0.05630817976401394
  (0, 3890)	0.06244475852447297
  (0, 2892)	0.19032136382434178
  (0, 336)	0.09689786452860939
  (0, 38470)	0.11129035707218164
  (0, 36888)	0.10998529312505823
  (0, 61104)	0.04727050596127583
  (0, 15484)	0.21240239131504268
  (0, 61080)	0.07666906709107443
  (0, 16491)	0.1490742618762242
  (0, 10270)	0.08229426774385218
  (0, 9481)	0.3796707932729973
  (0, 25498)	0.22907369110232906
  (0, 45038)	0.10183151126768646
  :	:
  (0, 53193)	0.20879950143034606
  (0, 46026)	0.12286531830448028
  (0, 45111)	0.12405368171760191
  (0, 60269)	0.06888103659892

In [12]:
# Reworked version of testModel()
test_corpus = []
y_test = []
print("Reading test set")
for each in os.listdir(data_dir + '/test'):
        with open(data_dir + '/test/' + each, 'r', encoding='latin-1') as f:
            test_corpus.append(f.read())
            label = ''.join(x for x in each[-4:] if x.isalpha())
            y_test.append(label)
print("Test set read complete!")

Reading test set
Test set read complete!


In [13]:
# Vectorizes the test inputs. Takes about 30 seconds to complete

print("Vectorizing...")
X_test = vectorizer.transform(test_corpus) #get the X vector
count_X_test = countVectorizer.transform(test_corpus)
print("Vectorizing complete!")
print("")
print(X_test[0])
print(y_test[:10])

Vectorizing...
Vectorizing complete!

  (0, 57944)	0.26295947598887165
  (0, 57612)	0.561548532844958
  (0, 56874)	0.280774266422479
  (0, 56409)	0.1848158362029182
  (0, 56317)	0.11220371491036285
  (0, 54331)	0.24246070739206158
  (0, 53948)	0.15070277706082222
  (0, 34765)	0.1833625215713884
  (0, 29879)	0.04759366991605391
  (0, 29671)	0.1364695074879047
  (0, 29455)	0.1561252359799158
  (0, 24653)	0.24656833238037618
  (0, 19625)	0.16027362602762493
  (0, 19595)	0.1061861873671019
  (0, 18657)	0.42582322896345
  (0, 13573)	0.22464591695845426
['ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham']


In [15]:
# MNB with TF-IDF
# === MNB ===

mnb = MultinomialNB()
mnb.fit(X, y)

print("multiNB predictor")
print(mnb.score(X_test, y_test))
mnb_pred = mnb.predict(X_test)
print(confusion_matrix(mnb_pred, y_test))

# === /MNB ===

multiNB predictor
0.8799076212471132
[[590 104]
 [  0 172]]


In [16]:
# MNB with Count Vectorizer
# === MNB ===

mnb = MultinomialNB()
mnb.fit(count_X, y)

print("multiNB predictor")
print(mnb.score(count_X_test, y_test))
mnb_pred = mnb.predict(count_X_test)
print(confusion_matrix(mnb_pred, y_test))

# === /MNB ===

multiNB predictor
0.9422632794457275
[[564  24]
 [ 26 252]]


In [17]:
# LGS with TF-IDF
# === /LGS ===

lgs = LogisticRegression(solver='lbfgs', max_iter=1000)    #using logistic regression
lgs.fit(X, y)

print("logistic predictor")
print(lgs.score(X_test, y_test)) #pring the score. It comes out to be 98%
lgs_pred = lgs.predict(X_test)
print(confusion_matrix(lgs_pred, y_test))

# === /LGS ===

logistic predictor
0.9584295612009238
[[585  31]
 [  5 245]]


In [18]:
# LGS with Count Vectorizer
# === /LGS ===

lgs = LogisticRegression(solver='lbfgs', max_iter=1000)    #using logistic regression
lgs.fit(count_X, y)

print("logistic predictor")
print(lgs.score(count_X_test, y_test)) #pring the score. It comes out to be 98%
lgs_pred = lgs.predict(count_X_test)
print(confusion_matrix(lgs_pred, y_test))

# === /LGS ===

logistic predictor
0.9780600461893765
[[583  12]
 [  7 264]]


In [19]:
# Enter your email here to predict

working_email = test_email
#working_email = test_corpus[10]

test_email_vector = vectorizer.transform([working_email])
test_email_pred = mnb.predict(test_email_vector)

print(test_email_pred)
print('')
print(working_email)



['ham']


Re: Re: East Asian fonts in Lenny. Thanks for your support.  Installing unifonts did it well for me. ;)
Nima
--
To UNSUBSCRIBE, email to debian-user-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org

