In [18]:
#!/usr/bin/python

import tensorflow as tf
import numpy as np

import sys, getopt
import collections
import math
import os
import random

# The quoted literal '__file__' is a placeholder file name, not the module
# attribute: abspath() resolves it against the current working directory, so
# its parent directory is the working directory itself.
curr_path = os.path.abspath('__file__')
root_path = os.path.dirname(curr_path)
print(root_path)
# Extend the import search path with that directory (the `models` package
# imported further down is presumably expected to live there).
sys.path.append(str(root_path))

import numpy as np
import tensorflow as tf
from nltk import word_tokenize
from random import shuffle


from collections import namedtuple
from models.FastText import FastText

# Simple (sentences, labels) record type.
Dataset = namedtuple('Dataset', ['sentences', 'labels'])

# Label inventory: textual class name -> integer class id.
label_to_id = dict(World=0, Entertainment=1, Sports=2)
num_classes = 3

# Id returned for words that are missing from the vocabulary; build_vocab()
# inserts the '$UNK$' placeholder first, i.e. while the mapping is still empty.
unknown_word_id = 0

# Training hyper-parameters.
learning_rate = 0.05
num_epochs = 2
embedding_dim = 10


def create_label_vec(label):
    """Return the one-hot encoding of a textual class label.

    Args:
        label: class name as a string; surrounding whitespace (such as the
            newline of a line read from a file) is stripped before lookup.

    Returns:
        A list of ``num_classes`` integers, all 0 except for a 1 at the
        index that ``label_to_id`` assigns to the label.

    Raises:
        KeyError: if the stripped label is not a key of ``label_to_id``.
    """
    hot_index = label_to_id[label.strip()]
    one_hot = [0 for _ in range(num_classes)]
    one_hot[hot_index] = 1
    return one_hot


def tokenize(sens):
    """Split the string ``sens`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(sens)
    return tokens

def map_token_seq_to_word_id_seq(token_seq, word_to_id):
    """Translate a sequence of tokens into the list of their word ids.

    Each token is converted with ``map_word_to_id``, so tokens missing from
    ``word_to_id`` come back as the unknown-word id.
    """
    id_seq = []
    for token in token_seq:
        id_seq.append(map_word_to_id(word_to_id, token))
    return id_seq


def map_word_to_id(word_to_id, word):
    """Look up the id of ``word``.

    Returns ``word_to_id[word]`` when the word is a key of the mapping and
    the module-level ``unknown_word_id`` otherwise.
    """
    return word_to_id[word] if word in word_to_id else unknown_word_id


def build_vocab(sens_file_name):
    """Build the word -> id vocabulary from a sentence file.

    Every line of the file is tokenized with ``tokenize``. The placeholder
    '$UNK$' is inserted first, followed by the observed tokens in the order
    returned by ``Counter.most_common()`` (most frequent first). Each word
    is assigned the size of the mapping at the moment it is inserted.

    Args:
        sens_file_name: path of the text file to read, one sentence per line.

    Returns:
        dict mapping each word to its integer id.
    """
    all_tokens = []
    with open(sens_file_name) as sens_file:
        for line in sens_file:
            all_tokens += tokenize(line)
    print('size of token sequence is %s. ' % len(all_tokens))

    # Placeholder first, then the corpus words by descending frequency.
    vocab_words = ['$UNK$']
    vocab_words.extend(
        word for word, _ in collections.Counter(all_tokens).most_common())

    word_to_id = {}
    for word in vocab_words:
        word_to_id[word] = len(word_to_id)

    print("Unknown word id is %s ." % word_to_id['$UNK$'])
    print('size of vocabulary is %s. ' % len(word_to_id))
    return word_to_id


def read_labeled_dataset(sens_file_name, label_file_name, word_to_id):
    """Read a sentence file and its parallel label file into training pairs.

    The label file drives the iteration: for every label line, the next line
    of the sentence file is read, tokenized and mapped to word ids. If the
    sentence file is shorter than the label file, the missing sentences are
    read as empty strings.

    Args:
        sens_file_name: path of the sentence file (one sentence per line).
        label_file_name: path of the label file (one label per line).
        word_to_id: vocabulary mapping words to integer ids.

    Returns:
        A list of ``(word_id_seq, label_vec)`` tuples, where ``label_vec`` is
        the one-hot list produced by ``create_label_vec``.

    Raises:
        KeyError: propagated from ``create_label_vec`` for an unknown label.
    """
    data = []
    # Context managers guarantee both files are closed even when tokenizing
    # or the label lookup raises part-way through.
    with open(sens_file_name) as sens_file, open(label_file_name) as label_file:
        for label in label_file:
            sens = sens_file.readline()
            word_id_seq = map_token_seq_to_word_id_seq(tokenize(sens), word_to_id)
            data.append((word_id_seq, create_label_vec(label)))
    print("read %d sentences from %s ." % (len(data), sens_file_name))
    return data

def read_dataset(sens_file_name, word_to_id):
    """Read an unlabeled sentence file as a list of word-id sequences.

    Args:
        sens_file_name: path of the sentence file (one sentence per line).
        word_to_id: vocabulary mapping words to integer ids.

    Returns:
        A list with one word-id list per line of the file.
    """
    data = []
    # The context manager closes the file even if tokenizing a line raises.
    with open(sens_file_name) as sens_file:
        for sens in sens_file:
            word_id_seq = map_token_seq_to_word_id_seq(tokenize(sens), word_to_id)
            data.append(word_id_seq)
    print("read %d sentences from %s ." % (len(data), sens_file_name))
    return data


def eval(word_to_id, train_dataset, dev_dataset, test_dataset):
    """Train a FastText model and evaluate it on the dev and test sets.

    The model is trained for ``num_epochs`` epochs, one example per training
    step. After each epoch the dev-set accuracy is printed; after training
    the test-set accuracy is printed and the test set is run through
    ``predict``.

    Note: this function shadows the built-in ``eval`` inside this module, and
    ``train_dataset`` is shuffled in place at the start of every epoch.

    Args:
        word_to_id: vocabulary; only its size is used here.
        train_dataset: list of ``(word_id_seq, label_vec)`` pairs.
        dev_dataset: labeled pairs used for the per-epoch accuracy report.
        test_dataset: labeled pairs used for the final accuracy and predictions.

    Returns:
        The list returned by ``predict`` for ``test_dataset``.
    """
    vocab_size = len(word_to_id)
    model = FastText(num_classes, embedding_dim, vocab_size, learning_rate)
    model.build_graph()
    init_op = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init_op)

        for epoch in range(num_epochs):
            shuffle(train_dataset)
            for example in train_dataset:
                feed = {
                    model.input_sens: example[0],
                    model.correct_label: example[1],
                }
                model.train_step.run(feed_dict=feed)
            dev_accuracy = compute_accuracy(model, dev_dataset)
            print('Epoch %d : %s .' % (epoch, dev_accuracy))

        test_accuracy = compute_accuracy(model, test_dataset)
        print('Accuracy on the test set : %s.' % test_accuracy)
        # Predictions must be computed while the session is still open.
        return predict(model, test_dataset)


def compute_accuracy(fast_text, eval_dataset):
    """Return the fraction of correctly classified examples in a dataset.

    ``fast_text.accuracy`` is evaluated once per example and the results are
    summed; the sum is printed and then divided by the number of examples.

    NOTE(review): in the recorded run the printed sum is a 3-element array,
    so ``fast_text.accuracy`` apparently evaluates to a vector rather than a
    scalar — confirm against the model definition.

    Args:
        fast_text: model exposing ``accuracy``, ``input_sens`` and
            ``correct_label``.
        eval_dataset: sized iterable of ``(word_id_seq, label_vec)`` pairs.

    Returns:
        ``sum / len(eval_dataset)``, or ``0.0`` for an empty dataset.
    """
    num_correct = 0
    for (sens, label) in eval_dataset:
        num_correct += fast_text.accuracy.eval(
            feed_dict={fast_text.input_sens: sens, fast_text.correct_label: label})
    print('#correct sentences is %s ' % num_correct)
    # An empty dataset used to raise ZeroDivisionError; report 0.0 instead.
    if len(eval_dataset) == 0:
        return 0.0
    return num_correct / len(eval_dataset)


def predict(fast_text, test_dataset):
    """Evaluate ``fast_text.predict`` on every sentence of a labeled dataset.

    Each item of ``test_dataset`` must unpack into ``(sentence, label)``; the
    label is ignored.

    NOTE(review): the recorded notebook run ends with
    "AttributeError: 'function' object has no attribute 'eval'", which most
    likely originates here — ``fast_text.predict`` appears to be a plain
    method rather than a graph tensor. Confirm against the FastText class.

    Returns:
        A list with one evaluation result per dataset item.
    """
    return [
        fast_text.predict.eval(feed_dict={fast_text.input_sens: sens})
        for sens, _label in test_dataset
    ]


def write_result_file(test_results, result_file):
    """Write each result to ``result_file`` as an integer on its own line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(result_file, mode='w') as out:
        out.writelines("%d\n" % result for result in test_results)


def main(argv):
    """Command-line style entry point: train, evaluate and save predictions.

    Recognised options:
        -h                      print the usage line and exit.
        -d / --dataFolder DIR   folder holding sentences_{train,dev,test}.txt
                                and labels_{train,dev,test}.txt; predictions
                                are written to DIR/test_results_2.txt.

    If the option is given several times, the last value wins.

    Args:
        argv: list of argument strings, without the program name.

    Raises:
        SystemExit: with code 2 on an unparsable command line or when no data
            folder is given; with no code after ``-h``.
    """
    usage = 'fastText.py -d <dataFolder>'
    data_folder = None

    try:
        # Positional arguments are not used.
        opts, _ = getopt.getopt(argv, "hd:", ["dataFolder="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    # getopt only returns the options declared above, so no fallback branch
    # for unknown options is needed.
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-d", "--dataFolder"):
            data_folder = arg

    # Without a data folder every path would be '' and the run would fail
    # later inside open(); report the usage error up front instead.
    if data_folder is None:
        print(usage)
        sys.exit(2)

    trainSensFile = os.path.join(data_folder, 'sentences_train.txt')
    devSensFile = os.path.join(data_folder, 'sentences_dev.txt')
    testSensFile = os.path.join(data_folder, 'sentences_test.txt')
    trainLabelFile = os.path.join(data_folder, 'labels_train.txt')
    devLabelFile = os.path.join(data_folder, 'labels_dev.txt')
    testLabelFile = os.path.join(data_folder, 'labels_test.txt')
    testResultFile = os.path.join(data_folder, 'test_results_2.txt')

    # The vocabulary is built from the training sentences only.
    word_to_id = build_vocab(trainSensFile)
    train_dataset = read_labeled_dataset(trainSensFile, trainLabelFile, word_to_id)
    dev_dataset = read_labeled_dataset(devSensFile, devLabelFile, word_to_id)
    test_dataset = read_labeled_dataset(testSensFile, testLabelFile, word_to_id)
    test_results = eval(word_to_id, train_dataset, dev_dataset, test_dataset)
    write_result_file(test_results, testResultFile)


#if __name__ == "__main__":
#   main(sys.argv[1:])

/Users/zhaolongfei/Documents/ANU/COMP6490-Document Analysis/Lab/Lab4_nlp_assignment


In [19]:
# Run the whole pipeline on the files in the relative 'data' folder; the
# sentence, label and result file names are derived from it inside main().
main(['-d','data'])

size of token sequence is 1215500. 
Unknown word id is 0 .
size of vocabulary is 56493. 
read 150158 sentences from data/sentences_train.txt .
read 21451 sentences from data/sentences_dev.txt .
read 42902 sentences from data/sentences_test.txt .
#correct sentences is [ 8159.  8159.  8159.] 
Epoch 0 : [ 0.38035524  0.38035524  0.38035524] .
#correct sentences is [ 8159.  8159.  8159.] 
Epoch 1 : [ 0.38035524  0.38035524  0.38035524] .
#correct sentences is [ 16381.  16381.  16381.] 
Accuracy on the test set : [ 0.38182369  0.38182369  0.38182369].


AttributeError: 'function' object has no attribute 'eval'

In [None]:
# Ad-hoc model instance. Going by the positional order used in eval()
# (num_classes, embedding_dim, vocabulary size, learning_rate) this is
# presumably 3 classes, 5-dim embeddings, a 10-word vocabulary and a learning
# rate of 0.1 — TODO confirm against FastText.__init__.
model = FastText(3, 5, 10, 0.1)