In [6]:
from glob import glob

In [7]:
glob("./dataset/*")

['./dataset/6',
 './dataset/1',
 './dataset/4',
 './dataset/3',
 './dataset/2',
 './dataset/5']

In [33]:
len(glob("./dataset/6/Train/*"))

100

In [77]:
from nltk import SnowballStemmer, PorterStemmer

def extract_words_in_dir(dir, stemmer=None):
    with open(dir, errors="ignore") as f:
        return extract_words(f.readlines(), stemmer=stemmer)

def extract_words(lines, stemmer=None):
    if isinstance(lines, str):
        lines = [lines]
        
    words = []
    for l in lines:
        l = l.replace("\n", "")
        if l == "":
            continue
        splitted = l.split(" ")
        splitted = [s for s in splitted if s != ""]
        if len(splitted) == 0:
            continue

        if stemmer is not None:
            splitted = [stemmer.stem(s) for s in splitted]
            
        words.extend(splitted)

    return words

sb_stemmer = SnowballStemmer("english")
pt_stemmer = PorterStemmer()
words = extract_words_in_dir("./dataset/2/Train/74770")
words[:10]

['Path:',
 'cantaloupe.srv.cs.cmu.edu!rochester!news.crd.ge.com!rpi!uwm.edu!caen!malgudi.oar.net!ucbeh.san.uc.edu!ucunix.san.uc.edu!uceng.uc.edu!cccbbs!sam.halperin',
 'Newsgroups:',
 'misc.forsale',
 'Subject:',
 'Price',
 'quote',
 'wanted',
 'Message-ID:',
 '<30431.93.uupcb@cccbbs.UUCP>']

In [78]:
len(words), len(set(words))

(244, 201)

In [79]:
ws1 = extract_words_in_dir("./dataset/2/Train/74770", sb_stemmer)
ws2 = extract_words_in_dir("./dataset/2/Train/74770", pt_stemmer)
for w1, w2 in zip(ws1, ws2):
    if w1 != w2:
        print(w1, w2)

bus bu
70ns 70n
bus bu
dos do
this thi
high highli


In [80]:
def get_num_occurences(words):
    w2no = {}
    for w in words:
        w2no[w] = w2no.get(w, 0) + 1

    return w2no
    
oc = get_num_occurences(words)
oc

{'Path:': 1,
 'cantaloupe.srv.cs.cmu.edu!rochester!news.crd.ge.com!rpi!uwm.edu!caen!malgudi.oar.net!ucbeh.san.uc.edu!ucunix.san.uc.edu!uceng.uc.edu!cccbbs!sam.halperin': 1,
 'Newsgroups:': 1,
 'misc.forsale': 1,
 'Subject:': 1,
 'Price': 1,
 'quote': 1,
 'wanted': 1,
 'Message-ID:': 1,
 '<30431.93.uupcb@cccbbs.UUCP>': 1,
 'From:': 2,
 'sam.halperin@cccbbs.UUCP': 2,
 '(Sam': 2,
 'Halperin)': 2,
 'Date:': 1,
 '5': 1,
 'Apr': 1,
 '93': 1,
 '22:31:00': 1,
 'GMT': 1,
 'Reply-To:': 1,
 'Distribution:': 1,
 'world': 1,
 'Organization:': 1,
 'Cincinnati': 1,
 'Computer': 1,
 'Connection': 1,
 '-': 2,
 'Cincinnati,': 1,
 'OH': 1,
 '513-752-1055': 1,
 'Lines:': 1,
 '46': 1,
 'sam.halperin@cccbbs.uceng': 2,
 '486': 1,
 'DX': 1,
 '50': 1,
 'mHz': 1,
 'in': 2,
 'Zero': 1,
 'Insertion': 1,
 'Force': 1,
 'Socket': 1,
 'Empty': 1,
 'over-drive': 1,
 'socket': 1,
 'EISA': 3,
 'motherboard': 1,
 'with': 2,
 '256k': 1,
 'cache': 2,
 '-->2': 4,
 '32': 3,
 'bit': 4,
 'Slots': 1,
 'VESA': 2,
 'local': 2,
 '

In [81]:
from pathlib import Path

def load_train(base_dir, stemmer=None):
    """
    load train files and returns `class_to_words` = {
        # class 
        1: {
            "random": 10, # <word>: <number_of_occurences>
            ...
        },
        ...
    }
    """
    class_to_words = {}
    for i in range(1, 7):
        dir = base_dir.joinpath(f"{i}/Train/*")
        file_paths = glob(str(dir))
        words = []
        for p in file_paths:
            words.extend(extract_words_in_dir(p, stemmer))

        word_with_num_occur = get_num_occurences(words)
        
        class_to_words[i] = word_with_num_occur
            
    return class_to_words

base_dir = Path("./dataset")
c2w = load_train(base_dir, sb_stemmer)

In [82]:
c2w.keys()

dict_keys([1, 2, 3, 4, 5, 6])

In [83]:
c2w[2]

{'path:': 100,
 'cantaloupe.srv.cs.cmu.edu!rochester!news.crd.ge.com!rpi!uwm.edu!caen!malgudi.oar.net!ucbeh.san.uc.edu!ucunix.san.uc.edu!uceng.uc.edu!cccbbs!sam.halperin': 1,
 'newsgroups:': 100,
 'misc.forsal': 51,
 'subject:': 100,
 'price': 17,
 'quot': 2,
 'want': 17,
 'message-id:': 100,
 '<30431.93.uupcb@cccbbs.uucp>': 1,
 'from:': 101,
 'sam.halperin@cccbbs.uucp': 2,
 '(sam': 2,
 'halperin)': 2,
 'date:': 100,
 '5': 51,
 'apr': 106,
 '93': 56,
 '22:31:00': 1,
 'gmt': 96,
 'reply-to:': 10,
 'distribution:': 37,
 'world': 10,
 'organization:': 98,
 'cincinnati': 1,
 'comput': 35,
 'connect': 3,
 '-': 71,
 'cincinnati,': 1,
 'oh': 4,
 '513-752-1055': 1,
 'lines:': 99,
 '46': 2,
 'sam.halperin@cccbbs.uceng': 2,
 '486': 2,
 'dx': 3,
 '50': 1,
 'mhz': 3,
 'in': 129,
 'zero': 1,
 'insert': 1,
 'forc': 3,
 'socket': 4,
 'empti': 1,
 'over-dr': 1,
 'eisa': 3,
 'motherboard': 5,
 'with': 76,
 '256k': 2,
 'cach': 2,
 '-->2': 4,
 '32': 4,
 'bit': 8,
 'slot': 3,
 'vesa': 3,
 'local': 10,
 'b

In [115]:
def nb_classifier(class_to_words, target_text, prior, stemmer=None):
    target_words = extract_words(target_text, stemmer=stemmer)
    max_prob = 0
    target_class = -1 # not yet found
    for c, class_words in class_to_words.items():
        c_prob = get_class_prob(class_words, target_words, prior)
        print(f"prob for class {c} is: {c_prob}")
        if c_prob > max_prob:
            max_prob = c_prob
            target_class = c

    return target_class

def get_class_prob(class_words, target_words, prior):
    # add 1 to number of occurences of each word in order to normalize non-occuring words impact
    num_all_class_words = sum([n+1 for _, n in class_words.items()])

    prob = prior
    for w in target_words:
        word_prob = (class_words.get(w, 0) + 1) / num_all_class_words
        prob *= word_prob

    return prob
    

In [116]:
c2w[1]

{'newsgroups:': 100,
 'comp.graph': 86,
 'path:': 100,
 'cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!news.ans.net!malgudi.oar.net!zaphod.mps.ohio-state.edu!uunet.ca!canrem!dosgate!dosgate![danny.hawrysio@canrem.com]': 1,
 'from:': 103,
 '"danni': 4,
 'hawrysio"': 4,
 '<danny.hawrysio@canrem.com>': 4,
 'subject:': 101,
 'window': 7,
 'imagine??!!': 3,
 'message-id:': 100,
 '<199314.4387.24074@dosgate>': 1,
 'reply-to:': 21,
 'organization:': 100,
 'canada': 12,
 'remot': 4,
 'system': 33,
 'distribution:': 26,
 'comp': 2,
 'date:': 101,
 '14': 46,
 'apr': 110,
 '93': 30,
 '14:45:43': 1,
 'est': 2,
 'lines:': 100,
 '12': 5,
 '->': 6,
 'i': 229,
 'have': 98,
 'been': 18,
 'on': 116,
 'the': 649,
 'phone': 6,
 'with': 98,
 'impuls': 3,
 'for': 185,
 'about': 60,
 '3': 14,
 'month': 3,
 'wait': 5,
 'my': 42,
 'cross': 2,
 '-': 62,
 'platform': 7,
 'upgrad': 17,
 '(amiga': 2,
 'to': 393,
 'ibm).': 2,
 'they': 39,
 'told': 5,
 'me': 41,
 'everi': 1

In [86]:
sum([n+1 for _, n in c2w[1].items()])

26284

In [121]:
target_words = extract_words("from past to the future", sb_stemmer)

In [124]:
for c in range(1, 7):
    s = sorted(c2w[c].items(), key = lambda x: x[1], reverse=True)
    print(s[:5])
    p = get_class_prob(c2w[c], target_words, 1/6)
    print(f"prob for class {c} is: {p}")

[('the', 649), ('to', 393), ('a', 381), ('of', 357), ('and', 305)]
prob for class 1 is: 7.213348193805997e-16
[('the', 293), ('for', 248), ('of', 183), ('and', 182), ('to', 174)]
prob for class 2 is: 1.1116364811964218e-16
[('the', 835), ('a', 452), ('to', 340), ('of', 291), ('and', 289)]
prob for class 3 is: 7.260208327183447e-16
[('the', 2603), ('to', 1356), ('of', 1271), ('and', 959), ('a', 877)]
prob for class 4 is: 3.972405640267088e-16
[('the', 1780), ('of', 1068), ('to', 1047), ('and', 748), ('is', 696)]
prob for class 5 is: 2.977897414124212e-15
[('the', 1813), ('of', 1025), ('to', 957), ('a', 886), ('and', 703)]
prob for class 6 is: 8.970454817092327e-16


In [123]:
target_class = nb_classifier(c2w, "from past to the future", 1/6, sb_stemmer)
target_class

prob for class 1 is: 7.213348193805997e-16
prob for class 2 is: 1.1116364811964218e-16
prob for class 3 is: 7.260208327183447e-16
prob for class 4 is: 3.972405640267088e-16
prob for class 5 is: 2.977897414124212e-15
prob for class 6 is: 8.970454817092327e-16


5

['from', 'path']

In [104]:
sorted(list(c2w[c].items()), 

[('path:', 100),
 ('cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!noc.near.net!transfer.stratus.com!sw.stratus.com!cdt',
  1),
 ('from:', 101),
 ('cdt@sw.stratus.com', 6),
 ('(c.', 6),
 ('d.', 9),
 ('tavares)', 6),
 ('newsgroups:', 100),
 ('talk.politics.gun', 82),
 ('subject:', 100),
 ('a', 886),
 ('scoop', 1),
 ('of', 1025),
 ('waco', 8),
 ('road,', 1),
 ('pleas', 14),
 ('date:', 100),
 ('15', 37),
 ('apr', 110),
 ('1993', 100),
 ('14:26:31', 1),
 ('gmt', 89),
 ('organization:', 98),
 ('stratus', 2),
 ('computer,', 2),
 ('inc.', 10),
 ('lines:', 100),
 ('13', 1),
 ('distribution:', 17),
 ('world', 23),
 ('message-id:', 100),
 ('<1qjran$9sh@transfer.stratus.com>', 1),
 ('references:', 81),
 ('<s539.2adf@looking.on.ca>', 1),
 ('nntp-posting-host:', 41),
 ('rocket.sw.stratus.com', 2),
 ('keywords:', 5),
 ('topical,', 1),
 ('smirk', 1),
 ('your', 158),
 ('"lite"', 1),
 ('post', 22),
 ('for', 317),
 ('the', 1813),
 ('day,',

In [None]:
def load_test(base_dir):
    pass