In [1]:
import xml.etree.ElementTree as ET
from nltk import word_tokenize, RegexpTokenizer, FreqDist
import nltk.classify
from nltk.classify import maxent
import random
import math
from nltk.corpus import stopwords
from pprint import pprint
import numpy as np
stpwords = stopwords.words('english')
import re
import pickle
import sys
from nltk import ngrams


## extra output for debugging
debug = 1






def first_pass(train_xml, test_xml, num_features=100):
    train, Ytrain, test, Ytest = revs_labels(train_xml, test_xml)

    all_words, all_bigrams, all_trigrams, feat_words, feat_bigrams, feat_trigrams = get_vocab(train, num_features)

    x_feat_list = features(train, feat_words, feat_bigrams, feat_trigrams)
    y_feat_list = features(test, feat_words, feat_bigrams, feat_trigrams)
    test_zip = list(zip (y_feat_list, Ytest))
    train_zip = list(zip (x_feat_list, Ytrain))

    model, pred = run_model(train_zip, test_zip)
    return model

def second_pass(train_xml, test_xml, feat_words):
    train, Ytrain, test, Ytest = revs_labels(train_xml, test_xml)

    x_feat_list = features(train, feat_words, feat_bigrams, feat_trigrams)
    y_feat_list = features(test, feat_words, feat_bigrams, feat_trigrams)
    test_zip = list(zip (y_feat_list, Ytest))
    train_zip = list(zip (x_feat_list, Ytrain))

    model, pred = run_model(train_zip, test_zip)
    return model





In [2]:

# if len(sys.argv) > 1:
#     trainfile = sys.argv[1]
# elif len(sys.argv) > 2:
#     testfile = sys.argv[2]
# else:
trainfile = "TrainingSet.xml"


tree = ET.parse(trainfile)
root = tree.getroot()
root = [rev for rev in root]
random.shuffle(root, lambda: 0.424125125437)

train_xml = root[:1800]
test_xml = root[1800:]
# train_xml = root[:20]
# test_xml = root[1890:]
if debug: print(len(root), len(train_xml), len(test_xml))


1900 1800 100


In [3]:
def revs_labels(train_xml, test_xml):
    train  = [0] * len(train_xml)
    Ytrain = [0] * len(train_xml)
    train_ID = [0] * len(train_xml)
    for i in range(len(train_xml)):
        train[i] = train_xml[i].find("review_text").text.lower()
        y = float(train_xml[i].find("rating").text)
        if (y < 3):
            Ytrain[i] = 'neg'
        else:
            Ytrain[i] = 'pos'
        train_ID[i] = train_xml[i].find("unique_id").text


    # train[0].find("review_text").text
    test  = [0] * len(test_xml)
    Ytest = [0] * len(test_xml)
    test_ID  = [0] * len(test_xml)
    for i in range(len(test_xml)):
        test[i] = test_xml[i].find("review_text").text.lower()
        Ytest[i] = str(int(float(test_xml[i].find("rating").text)))
        y = float(test_xml[i].find("rating").text)
        if (y < 3):
            Ytest[i] = 'neg'
        else:
            Ytest[i] = 'pos'
        test_ID[i] = test_xml[i].find("unique_id").text
    return train, Ytrain, train_ID, test, Ytest, test_ID


def get_vocab(train, num_words=500, num_bigrams=50, num_trigrams=10):
    tokenizer = RegexpTokenizer(r'[a-z][a-z\']*').tokenize
    tokenized = [tokenizer(rev) for rev in train]

    all_words = [x for rev in tokenized for x in rev]
    all_words = [x for x in all_words if x not in stpwords]
    all_bigrams = list(ngrams(all_words, 2))
    all_trigrams = list(ngrams(all_words, 3))

    word_fdist = FreqDist(all_words)
    bigram_fdist = FreqDist(all_bigrams)
    trigram_fdist = FreqDist(all_trigrams)

    feat_words = word_fdist.most_common(num_words)
    feat_bigrams = bigram_fdist.most_common(num_bigrams)
    feat_trigrams = trigram_fdist.most_common(num_trigrams)

    feat_words = [a for a,b in feat_words]
    feat_bigrams = [a for a,b in feat_bigrams]
    feat_trigrams = [a for a,b in feat_trigrams]


    return all_words, all_bigrams, all_trigrams, feat_words, feat_bigrams, feat_trigrams
#     return all_words, all_bigrams, feat_words, feat_bigrams



def features (revs, words, bigrams, trigrams):
    feat_list = []
    tokenizer = RegexpTokenizer(r'[a-z][a-z\']*').tokenize
    for text in revs:
        feat = {}
        tokens = [x for x in tokenizer(text) if x not in stpwords]
        text_bigrams = list(ngrams(tokens, 2))
        text_trigrams = list(ngrams(tokens, 3))
#         print(text_trigrams)
        for bg in bigrams:
            feat[bg] = int(bg in text_bigrams)
        for tg in trigrams:
            feat[tg] = int(tg in text_trigrams)
        for word in words:
            feat[word] = int(word in text)
        feat_list.append(feat)

    return feat_list

def find_best_feats(model, num_features=100):
    from contextlib import redirect_stdout
    ## nltk's most informative features prints to std out
    #  extract labels from output

    with open('help.txt', 'w') as f:
        with redirect_stdout(f):
            model.show_most_informative_features(300)
    with open('help.txt', 'r') as f:
        idk = re.findall(u'\s[a-z][a-z\']*=', f.read())

        idk = [re.findall(u'[a-z\']+', s) for s in idk]
        idk = [x for y in idk for x in y]
    with open('most_helpful.txt', 'wb') as f:
        pickle.dump(idk, f)
    return idk



[1, 2, 3, 4]
def check_acc(pred, Ytest):
    thing = list(zip(pred, Ytest))

    correct = 0
    total = 0
    for (p,r) in thing:
        if p == r:
            correct += 1
    #         print("correct", p,r)
        else:
    #         print("incorrect", p,r)
            total += 1
    #     print(p,r)
    print(correct, total)
    # pprint(thing)

if debug > 1:
    print(Ytrain.count('pos'),Ytrain.count('neg'), len(Ytrain))
    print(Ytest.count('pos'),Ytest.count('neg'), len(Ytest))

    print(Ytrain[:150])
     #pprint(list(zip(Ytrain[250:255],train[250:255])))

def run_model(train_zip, test_zip):
    encoding = maxent.TypedMaxentFeatureEncoding.train(train_zip)

    v = .01
    model = maxent.MaxentClassifier.train(train_zip, encoding=encoding, trace=4, min_lldelta=v)
    print(nltk.classify.accuracy(model, test_zip))
    # pred = model.classify_many(y_feat_list)
    pred = []
    return model, pred

In [4]:
train, Ytrain, train_ID, test, Ytest, test_ID = revs_labels(train_xml, test_xml)



In [5]:
num_features = 55
num_bigrams = 90
num_trigrams = 8
all_words, all_bigrams, all_trigrams, feat_words, feat_bigrams, feat_trigrams = get_vocab(train, num_features, num_bigrams, num_trigrams)
# all_words, all_bigrams, feat_words, feat_bigrams = get_vocab(train, num_features, num_bigrams)





In [6]:
x_feat_list = features(train, feat_words, feat_bigrams, feat_trigrams)
y_feat_list = features(test, feat_words, feat_bigrams, feat_trigrams)
test_zip = list(zip (y_feat_list, Ytest))
train_zip = list(zip (x_feat_list, Ytrain))

# model, pred = run_model(train_zip, test_zip)
#     return model
# x_feat_list

In [None]:
model.show_most_informative_features(300)



In [None]:

# model1 = first_pass(train_xml, test_xml, 200)
# with open("model" + str(round(random.random(),4)) + ".txt", "wb") as fp:
#     pickle.dump(model1, fp)
# best_feats = find_best_feats(model1, 400)

# model2 = second_pass(train_xml, test_xml, best_feats)

In [None]:
test_ID

In [7]:
pred = Ytest
for ID,pred in list(zip(test_ID, pred)):
        print("%s\t%s"% (ID.strip(), pred))

1594837937:a_good_man's_long_journey:wantz_upon_a_time_reviews_"www.wantzuponatime.com"	pos
0071463097:i_can't_really_rate_this_item...:catalina_sanchez_"tatatiu"	neg
0671027034:outstanding:brandon_michael	pos
0743200926:big_book_of_grilling,_bbq_and_rotisserie_cookbook:tweety_"tweety"	neg
0394536487:the_egotistical_master:	neg
061318114X:not_innate,_but_geographical_differences:luc_reynaert	pos
0962855057:yuck!!!!_just_a_bad_book!:bookeee	neg
0743254562:5.0_stars:jason_frost_"rubicon"	pos
1416509690:unbelievable:shirlee_lerner	neg
0061234001:freakonomics:ronald_l._rushton	pos
0670032506:ironic_--_an_unproductive_book_on_productivity:rundhc	neg
0826415717:an_opinion_on_hendrix_-_far_from_anything_new:t._walker	neg
0195045785:statecraft:christian_schlect	pos
0415921139:abstractionist_in_disguise:terrance_shock	neg
0316010294:made_my_son_a_reader!!:m._connelly_"bookie"	pos
1583483985:i_thought_i'd_come_away_with_useful_information...:m._ollila	neg
0316082597:will_there_be_pi_in_the_sky_b