In [0]:
### This notebook uses Python 3.x ###
### Author: GTKlondike            ###


# Download data from Github
! git clone https://github.com/NetsecExplained/Machine-Learning-for-Security-Analysts.git
data_dir = "Machine-Learning-for-Security-Analysts"

fatal: destination path 'Machine-Learning-for-Security-Analysts' already exists and is not an empty directory.


In [0]:
import re, os, math, nltk, string, json

nltk.download('stopwords')
nltk.download('punkt')

print("Libraries imported")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Libraries imported


In [0]:
test_email = """
Re: Re: East Asian fonts in Lenny. Thanks for your support.  Installing unifonts did it well for me. ;)
Nima
--
To UNSUBSCRIBE, email to debian-user-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org
"""
print(test_email)


Re: Re: East Asian fonts in Lenny. Thanks for your support.  Installing unifonts did it well for me. ;)
Nima
--
To UNSUBSCRIBE, email to debian-user-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org



In [0]:
spam_table = dict() # Holds spammy words
spam_table_len = 0  # The total number of words in all spam messages
total_spam = 0      # The total number of spam emails parsed

ham_table = dict()  # Holds hammy words
ham_table_len = 0   # The total number of words in all ham messages
total_ham = 0       # The total number of ham emails parsed

bum_words = "the;and;that;have;for;not;with;you;this;but;his;from;they;we;say;her;she;will;one;all;would;there;their;what;out;about;who;get;which;when;make;can;like;time;just;him;know;take;people;into;year;your;good;some;could;them;see;other;than;then;now;look;only;come;its;over;think;also;back;after;use;two;how;our;work;first;well;way;even;new;want;because;any;these;give;day;most;ever;among;stand;yet;often;hour;talk;might;start;turn;help;big;small;keep;old;out;high;low;ask;should;down;thing;aaron;adam;alan;albert;alice;amanda;amy;andrea;andrew;angela;ann;anna;anne;annie;anthony;antonio;arthur;ashley;barbara;benjamin;betty;beverly;billy;bobby;bonnie;brandon;brenda;brian;bruce;carl;carlos;carol;carolyn;catherine;charles;cheryl;chris;christina;christine;christopher;clarence;craig;cynthia;daniel;david;deborah;debra;denise;dennis;diana;diane;donald;donna;doris;dorothy;douglas;earl;edward;elizabeth;emily;eric;ernest;eugene;evelyn;frances;frank;fred;gary;george;gerald;gloria;gregory;harold;harry;heather;helen;henry;howard;irene;jack;jacqueline;james;jane;janet;janice;jason;jean;jeffrey;jennifer;jeremy;jerry;jesse;jessica;jimmy;joan;joe;john;johnny;jonathan;jose;joseph;joshua;joyce;juan;judith;judy;julia;julie;justin;karen;katherine;kathleen;kathryn;kathy;keith;kelly;kenneth;kevin;kimberly;larry;laura;lawrence;lillian;linda;lisa;lois;lori;louis;louise;margaret;maria;marie;marilyn;mark;martha;martin;mary;matthew;melissa;michael;michelle;mildred;nancy;nicholas;nicole;norma;pamela;patricia;patrick;paul;paula;peter;philip;phillip;phyllis;rachel;ralph;randy;raymond;rebecca;richard;robert;robin;roger;ronald;rose;roy;ruby;russell;ruth;ryan;samuel;sandra;sara;sarah;scott;sean;sharon;shawn;shirley;stephanie;stephen;steve;steven;susan;tammy;teresa;terry;theresa;thomas;timothy;tina;todd;victor;virginia;walter;wanda;wayne;william;willie"
bum_words = bum_words.split(';')

print("Initial counters set to 0")

Initial counters set to 0


In [0]:
def tokenizer(text):
    punctuations = list(string.punctuation)
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    # the commented line is for python 2.7
    #tokens = nltk.word_tokenize(text.decode('latin1').lower())
    tokens = nltk.word_tokenize(text.lower())
    # Strip out the punctuations
    tokens = [i.strip(''.join(punctuations)) 
              for i in tokens 
              if i not in punctuations]
    # User Porter Stemmer on each token
    tokens = [stemmer.stem(i)
              for i in tokens]
    return [w for w in tokens if w not in stopwords and w != ""]

t = tokenizer(test_email)
print(t)

['east', 'asian', 'font', 'lenni', 'thank', 'support', 'instal', 'unifont', 'well', 'nima', 'unsubscrib', 'email', 'debian-user-request', 'lists.debian.org', 'subject', 'unsubscrib', 'troubl', 'contact', 'listmast', 'lists.debian.org']


In [0]:
stopwords = set(nltk.corpus.stopwords.words('english'))
print(str(stopwords)[:100])

{'we', 'from', 'in', 'ain', 'now', "wouldn't", 'shouldn', 're', 'for', 'wouldn', "needn't", 'y', 'it


In [0]:
def readEmail(email, method='tokenize'):
    table = dict()
    word_count = 0
    #
    if method == 're':
        words = re.findall(r'\b(?:[a-z]{2,}-)*[a-z]{3,}', email.lower())
    elif method == 'tokenize':
        words = tokenizer(email)
    #    
    for word in words:
        if word not in bum_words:
            word_count += 1
            if word in table:
                table[word] += 1
            else:
                table[word] = 1
    #
    return table, word_count

t = readEmail(test_email)
print(t)

({'east': 1, 'asian': 1, 'font': 1, 'lenni': 1, 'thank': 1, 'support': 1, 'instal': 1, 'unifont': 1, 'nima': 1, 'unsubscrib': 2, 'email': 1, 'debian-user-request': 1, 'lists.debian.org': 2, 'subject': 1, 'troubl': 1, 'contact': 1, 'listmast': 1}, 19)


In [0]:
def learnSpam(email):
    global spam_table
    global total_spam
    global spam_table_len
    
    table, word_count = readEmail(email)
    
    spam_table_len += word_count
    total = total_spam
    old_spam = spam_table
    
    for word in old_spam:
        if word in table:
            # Add to the word count
            spam_table[word] += table[word]
            
            # Quietly delete the word (key pop)
            nul = table.pop(word, None)
    #
    # Add new words to the spam_table
    for word in table:
        spam_table[word] = table[word]
    
    # I read a new spam email
    total_spam = total + 1

In [0]:
def learnHam(email):
    global ham_table
    global total_ham
    global ham_table_len
    
    table, word_count = readEmail(email)
    
    ham_table_len += word_count
    total = total_ham
    old_ham = ham_table
    
    for word in old_ham:
        if word in table:
            # Add to the word count
            ham_table[word] += table[word]
            
            # Quietly delete the word (key pop)
            nul = table.pop(word, None)
    #
    # Add new words to the ham_table
    for word in table:
        ham_table[word] = table[word]
    
    # I read a new ham email
    total_ham = total + 1

In [0]:
def train():
    print("training ham")
    for each in os.listdir(data_dir + '/ham'):
        with open(data_dir + '/ham/' + each, 'r', encoding='latin-1') as f:
            learnHam(f.read())
    print("training spam")
    for each in os.listdir(data_dir + '/spam'):
        with open(data_dir + '/spam/' + each, 'r', encoding='latin-1') as f:
            learnSpam(f.read())
    print("Training complete!")

In [0]:
def calcN():
    # Calculates the total number of unique words
    z = spam_table.copy()
    z.update(ham_table)
    return len(z)

In [0]:
# Time to train the tables
# Takes a while per training session (5-10 minutes per class)

train()

training ham
training spam
Training complete!


In [0]:
# Read the contents of the spam_table
print(json.dumps(spam_table,indent=4)[:500])

{
    "question": 122,
    "irelandfrom": 1,
    "nobodi": 238,
    "sun": 257,
    "sep": 239,
    "18": 289,
    "20:45:33": 19,
    "2016": 231,
    "content-typ": 262,
    "text/html": 39,
    "content-transfer-encod": 239,
    "base64": 28,
    "pehutuw+dqoncjxirufepg0kpe1fveegahr0cc1lcxvpdj0iq29udgvudc1uexbliibjb250": 1,
    "zw50psj0zxh0l2h0bww7ignoyxjzzxq9d2luzg93cy0xmjuyij4ncjxnrvrbig5hbwu9ikdf": 1,
    "tkvsqvrpuiigy29udgvudd0itwljcm9zb2z0iezyb250ugfnzsa0ljaipg0kpe1fveegbmft": 1,
    "


In [0]:
# Read the contents of the ham_table
print(json.dumps(ham_table,indent=4)[:500])

{
    "ilug": 302,
    "pppd": 15,
    "disconnect": 11,
    "hello": 88,
    "folk": 73,
    "linux": 669,
    "goe": 88,
    "tri": 840,
    "connect": 301,
    "outsid": 59,
    "world": 366,
    "modem": 68,
    "got": 344,
    "debian": 685,
    "kernel": 449,
    "2.4.18": 5,
    "thi": 4031,
    "win-modem": 2,
    "ye": 256,
    "manag": 328,
    "locat": 89,
    "proper": 42,
    "driver": 246,
    "minicom": 6,
    "veri": 461,
    "much": 462,
    "abl": 191,
    "dial": 7,
    "seem"


In [0]:
def predict(email, alpha=1, debug=False):
    table, word_count = readEmail(email)
    
    num_h = alpha                             # P(h_theta) = a
    num_s = alpha                             # P(s_theta) = a
    
    N = calcN()
    denom_s = spam_table_len + N*alpha        # P(s_theta) = a / [(spam_num) + Na]
    denom_h = ham_table_len + N*alpha         # P(h_theata) = a / [(ham_num) + Na]
    
    # P(x|C)
    # Numerator: For each word, that is in the class, count the number\
    #    of occurances in the email. Add together and add 1 (alpha)
    # Denominator: The total number of ALL features (words) in a class
    #    plus the total number of unique words
    #
    #                                         # P(s_theta) = (Xi + a) / [(spam_num) + Na]
    #                                         # P(h_theta) = (Xi + a) / [(ham_num) + Na]
    #
    # Now to sum the P(theta)'s
    # We are using math.log to prevent overflows
    
    for word in table:
        if word in spam_table:
            num_s += math.log(table[word]+1) # add number of times word occurs
        if word in ham_table:
            num_h += math.log(table[word]+1)
    p_spam = float(num_s)/math.log(denom_s)
    p_ham = float(num_h)/math.log(denom_h)
    
    # Which has the greatest probability?
    if debug:
        print("Spam Probability:", p_spam)
        print("Ham Probability:", p_ham)
    if (p_spam > p_ham):
        ret = 'spam'
    else:
        ret = 'ham'
    return ret

# Predict our test email

t = predict(test_email, debug=True)
print(t)
print('')
print(test_email)

Spam Probability: 0.8259554584442061
Ham Probability: 1.051372586916997
ham


Re: Re: East Asian fonts in Lenny. Thanks for your support.  Installing unifonts did it well for me. ;)
Nima
--
To UNSUBSCRIBE, email to debian-user-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org



In [0]:
def testModel(alpha=1):
    total = 0
    correct = 0
    print("reading emails")
    for each in os.listdir(data_dir + '/test'):
        with open(data_dir + "/test/" + each, 'r', encoding='latin-1') as f:
            prediction = predict(f.read(), alpha)
            actual = ''.join(x for x in each[-4:] if x.isalpha())
            total += 1
            if prediction == actual:
                correct += 1
    print("Total Emails: ", total)
    print("Correctly classified: ", correct)
    print("Accuracy: ", float(correct)/total)
    #return (float(correct)/total)

In [0]:
testModel()

reading emails
Total Emails:  866
Correctly classified:  827
Accuracy:  0.9549653579676675
