In [1]:
import os
import math
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter, defaultdict
import numpy as np

In [2]:
def read_data(path):
    """Load every article under ``path``, grouped by the directory holding it.

    The tree is traversed with ``os.walk``.  For each visited directory the
    directory path is printed and every file whose name does not start with
    ``'.'`` is read and passed to ``parse_article``.

    Args:
        path: root directory; each sub-directory is treated as one class.

    Returns:
        A list with one entry per visited sub-directory (the entry for the
        root directory itself is dropped).  Each entry is the list of values
        returned by ``parse_article`` for the files of that directory.

    Raises:
        FileNotFoundError: if ``path`` is not an existing directory.
    """
    if not os.path.isdir(path):
        # os.walk() yields nothing for a missing directory, which would
        # otherwise surface much later as an empty data set.
        raise FileNotFoundError("data directory not found: " + str(path))

    classes = []
    for dir_path, dirs, files in os.walk(path):   # each class
        # Sorting in place fixes the traversal order, so class indices do not
        # depend on the order in which the file system lists directories.
        dirs.sort()
        print(dir_path)
        data = []
        for filename in sorted(files):      # each file
            if filename[0] == '.':
                continue  # skip hidden files
            file_path = os.path.join(dir_path, filename)
            # The context manager closes the handle even if read() fails.
            with open(file_path, 'r') as handle:
                text = handle.read()
            data.append(parse_article(text))
        classes.append(data)
    # The first directory yielded by os.walk() is the root itself.
    return classes[1:]

def parse_article(doc):
    r"""Turn raw article text into a list of padded, lower-cased sentences.

    The text is lower-cased, split into sentences with ``sent_tokenize`` and
    each sentence is split into tokens with ``word_tokenize``.

    Args:
        doc: the article text as a single string.

    Returns:
        A list of sentences; each sentence is a list of tokens that starts
        with the marker ``<s>`` and ends with the marker ``<\s>``.
    """
    doc = doc.lower()
    sen_list = []
    for sen in sent_tokenize(doc):
        words = word_tokenize(sen)
        # '<\\s>' spells the same four characters the unescaped literal
        # produced, without relying on an invalid escape sequence.
        sen_list.append(['<s>'] + words + ['<\\s>'])
    return sen_list

def to_bi(data):
    """Count every pair of adjacent items in ``data``.

    Args:
        data: a sequence of strings.

    Returns:
        A ``defaultdict(int)`` mapping ``"first second"`` (the two items
        joined by one space) to the number of times that pair occurs.
    """
    pair_counts = defaultdict(int)
    # Pair each item with its successor; the last item has none.
    for left, right in zip(data, data[1:]):
        pair_counts[left + ' ' + right] += 1
    return pair_counts

def wi_1_type(bigram_count):
    """Aggregate bigram statistics by the second word of each bigram.

    Example: for the bigram "San Francisco" the statistics are filed under
    "Francisco".

    Args:
        bigram_count: mapping from ``"first second"`` to its count.

    Returns:
        ``(type_count, total_count)``; both default to 0 for unknown words.
        ``type_count[w]`` is the number of distinct bigrams ending in ``w``,
        ``total_count[w]`` is the sum of the counts of those bigrams.
    """
    type_count = defaultdict(lambda: 0)
    total_count = defaultdict(lambda: 0)
    for pair, occurrences in bigram_count.items():
        second = pair.split()[1]
        type_count[second] = type_count[second] + 1
        total_count[second] = total_count[second] + occurrences
    return type_count, total_count

# Small positive constant; wi_type() uses it as the starting value of every
# entry in the two tables it returns.
cons = 1e-4
def wi_type(bigram_count):
    """Aggregate bigram statistics by the first word of each bigram.

    Example: for the bigram "San Francisco" the statistics are filed under
    "San".

    Args:
        bigram_count: mapping from ``"first second"`` to its count.

    Returns:
        ``(type_count, total_count)``.  Every entry starts from ``cons``
        instead of 0, so a stored value is ``cons`` plus the tally and an
        unknown word reads as ``cons``.  ``type_count[w]`` tallies the
        distinct bigrams starting with ``w``; ``total_count[w]`` tallies the
        summed counts of those bigrams.
    """
    type_count = defaultdict(lambda: cons)
    total_count = defaultdict(lambda: cons)
    for pair, occurrences in bigram_count.items():
        first = pair.split()[0]
        type_count[first] = type_count[first] + 1
        total_count[first] = total_count[first] + occurrences
    return type_count, total_count

def count_prob(data, discount=None):
    """Build the count tables of one class's bigram language model.

    All tokens of all sentences of all documents are concatenated into one
    sequence before counting, so the last token of one sentence and the first
    token of the next are also counted as a pair.

    Args:
        data: the documents of one class; each document is a list of
            sentences and each sentence a list of tokens.
        discount: value subtracted from each word's preceding-type count when
            computing its continuation score.  When omitted, the module-level
            name ``d`` is used, which is what this function read before the
            parameter existed.

    Returns:
        ``(word_count, bigram_count, Pcontinue_dict, wi_type_count,
        wi_total_count)`` in the order ``KNLM`` unpacks them:
        unigram counts, bigram counts, continuation scores (default and
        lower bound ``cons``), and the two tables returned by ``wi_type``.
    """
    if discount is None:
        # Backward compatibility: callers that pass only ``data`` keep
        # getting the notebook-level discount.
        discount = d

    tokens = [word for doc in data for sen in doc for word in sen]

    word_count = Counter(tokens)
    bigram_count = to_bi(tokens)
    bigram_len = len(bigram_count)   # number of distinct bigrams

    # Only the type counts of the "second word" view are needed here.
    wi_1_type_count, _ = wi_1_type(bigram_count)
    wi_type_count, wi_total_count = wi_type(bigram_count)

    # bigram_len is non-zero whenever this comprehension has items to visit,
    # because every key of wi_1_type_count comes from a bigram.
    Pcontinue_dict = defaultdict(
        lambda: cons,
        {
            wi: max((n_types - discount) / bigram_len, cons)
            for wi, n_types in wi_1_type_count.items()
        },
    )
    return word_count, bigram_count, Pcontinue_dict, wi_type_count, wi_total_count

def KNLM(two_word, model, d):
    """Return the base-10 log score of one bigram under ``model``.

    The score is an absolute-discounted bigram estimate interpolated with a
    continuation score (the name suggests Kneser-Ney smoothing):

        max(count(bigram) - d, 0) / total(prev)
            + (types(prev) * d / total(prev)) * Pcontinue(word)

    Args:
        two_word: the bigram as ``"prev word"`` (split on whitespace).
        model: the five tables ``(word_count, bigram_count, Pcontinue_dict,
            wi_type_count, wi_total_count)``; the first is not used here.
        d: the discount.

    Returns:
        ``log10`` of the interpolated estimate.

    Note:
        Tables are read with ``[]``, so when they are ``defaultdict``s a
        lookup of an unknown key stores that key with its default value.
    """
    _, bigram_count, Pcontinue_dict, wi_type_count, wi_total_count = model

    tokens = two_word.split()
    prev_word = tokens[0]
    cur_word = tokens[1]

    discounted_count = max(bigram_count[two_word] - d, 0)
    context_total = wi_total_count[prev_word]
    continuation = Pcontinue_dict[cur_word]
    backoff_weight = wi_type_count[prev_word] * d / context_total

    return math.log(discounted_count / context_total + backoff_weight * continuation, 10)

def score_article(doc, models, d):
    """Return the index of the model that gives ``doc`` the highest score.

    A document's score under a model is the sum of ``score_sentence`` over
    its sentences.

    Args:
        doc: a list of sentences, each a list of tokens.
        models: the candidate models, one per class; any number is accepted.
        d: the discount forwarded to ``score_sentence``.

    Returns:
        The position in ``models`` of the best-scoring model, as returned by
        ``np.argmax`` (the first one on ties).

    Raises:
        ValueError: if ``models`` is empty.
    """
    # One score per model actually supplied.  A fixed-size list would leave
    # unused slots at 0, and those would beat every negative log-score.
    doc_score = [
        sum(score_sentence(sen, model, d) for sen in doc)
        for model in models
    ]
    if not doc_score:
        raise ValueError("score_article needs at least one model")

    prediction = np.argmax(doc_score)
    return prediction

def score_sentence(sen, model, d, max_len=90):
    """Sum the ``KNLM`` log scores of all adjacent token pairs in ``sen``.

    Args:
        sen: one sentence as a list of tokens.
        model: the model forwarded to ``KNLM``.
        d: the discount forwarded to ``KNLM``.
        max_len: sentences with more tokens than this are not scored.

    Returns:
        The summed score, or 0 when the sentence is longer than ``max_len``
        or has fewer than two tokens.  A skipped sentence contributes the
        same 0 under every model.
    """
    # Check the length first so nothing is built for a sentence that is
    # going to be skipped anyway.
    if len(sen) > max_len:
        return 0

    score = 0
    for prev_word, word in zip(sen, sen[1:]):
        score += KNLM(prev_word + ' ' + word, model, d)
    return score

In [3]:
""" main """
# Root directories of the training and test trees, relative to the working
# directory; read_data() treats each sub-directory as one class.
train_path = "reuters.tar/r_train"
test_path = "reuters.tar/r_test"
# Discount value: passed to score_article() in the evaluation cell and also
# read as a module-level name inside count_prob().
d = 0.75

# read_data() prints every directory it visits, which produces the listing
# shown below each message.
print("load and preprocess training data")
train_data = read_data(train_path) # class -> document -> sentence -> word
print("load and preprocess testing data")
test_data = read_data(test_path) # class -> document -> sentence -> word

load and preprocess training data
reuters.tar/r_train
reuters.tar/r_train\acq
reuters.tar/r_train\corn
reuters.tar/r_train\crude
reuters.tar/r_train\earn
reuters.tar/r_train\grain
reuters.tar/r_train\interest
reuters.tar/r_train\money-fx
reuters.tar/r_train\oilseed
reuters.tar/r_train\ship
reuters.tar/r_train\soybean
reuters.tar/r_train\trade
reuters.tar/r_train\wheat
load and preprocess testing data
reuters.tar/r_test
reuters.tar/r_test\acq
reuters.tar/r_test\corn
reuters.tar/r_test\crude
reuters.tar/r_test\earn
reuters.tar/r_test\grain
reuters.tar/r_test\interest
reuters.tar/r_test\money-fx
reuters.tar/r_test\oilseed
reuters.tar/r_test\ship
reuters.tar/r_test\soybean
reuters.tar/r_test\trade
reuters.tar/r_test\wheat


In [4]:
# Build one language model per class; models[i] belongs to train_data[i].
models = []
print('start training')
for i, class_ in enumerate(train_data):
    # count_prob() returns the five count tables that make up a model.
    word_count, bigram_count, Pcontinue_dict, wi_type_count, wi_total_count = count_prob(class_)    
    # The tables are stored as-is; probabilities are computed on demand by
    # KNLM() while scoring, not here.
    models.append([word_count, bigram_count, Pcontinue_dict, wi_type_count, wi_total_count])
    print("model",i,"OK")  

start training
model 0 OK
model 1 OK
model 2 OK
model 3 OK
model 4 OK
model 5 OK
model 6 OK
model 7 OK
model 8 OK
model 9 OK
model 10 OK
model 11 OK


In [5]:
# Classify every test document and report per-class hits and overall accuracy.
print("start testing")
# Total number of test documents over all classes.
test_data_len = sum(len(t) for t in test_data)
total_hit = 0

for i, class_ in enumerate(test_data):    
    hit = 0
    for doc in class_:
        # prediction is an index into `models`; comparing it with `i` assumes
        # train_data and test_data list their classes in the same order.
        prediction = score_article(doc, models, d)
        if prediction == i:
            hit += 1
            total_hit += 1
    print("category_", i, "hit count:", hit, "/", len(class_))
# NOTE: raises ZeroDivisionError when test_data contains no documents.
print('Accuracy: ', total_hit/test_data_len)

start testing
category_ 0 hit count: 89 / 100
category_ 1 hit count: 6 / 33
category_ 2 hit count: 71 / 96
category_ 3 hit count: 77 / 95
category_ 4 hit count: 27 / 59
category_ 5 hit count: 19 / 50
category_ 6 hit count: 52 / 63
category_ 7 hit count: 6 / 28
category_ 8 hit count: 26 / 49
category_ 9 hit count: 0 / 22
category_ 10 hit count: 44 / 55
category_ 11 hit count: 7 / 33
Accuracy:  0.6207906295754027
