In [None]:
import nltk
import numpy as np
import sklearn
from collections import defaultdict
import string
import gzip
from nltk.corpus import stopwords
import random
import math
import tensorflow as tf

In [None]:
def readGz(f):
    """Yield one parsed record per line of the gzip file at path ``f``.

    Every line is evaluated as a Python expression and the resulting object
    is yielded. The file is opened in a ``with`` block so the handle is
    released once the generator is exhausted or closed.
    """
    with gzip.open(f) as handle:
        for l in handle:
            # SECURITY: eval() executes whatever expression the line holds.
            # Only use this on trusted data files.
            yield eval(l)

In [None]:
# Load the training file, keeping only records that carry a 'categoryID'.
data = []
for record in readGz('train.json.gz'):
    if 'categoryID' in record:
        data.append(record)

In [None]:
# Number of labelled records loaded. Written as a call so it matches the
# print(...) usage in the training cell and parses on Python 2 and 3 alike.
print(len(data))

In [None]:
# Characters removed from every review before it is split into tokens.
punctuation = set(string.punctuation)
# Porter stemmer applied to every token that survives stop-word filtering.
stemmer = nltk.stem.porter.PorterStemmer()
#first try use stemmer
# English stop words held as a set: the tokenising loops test
# `w not in stopword` once per token, so membership should be a hash lookup
# rather than a scan of the whole list.
stopword = set(stopwords.words('english'))

In [None]:
# Tokenise every review (lower-case, punctuation stripped, stop words removed,
# remaining tokens stemmed), store the stems on the record as 'wordList', and
# tally how often each stem occurs over the whole data set.
wordCount = defaultdict(int)
for d in data:
    lowered = d['reviewText'].lower()
    cleaned = ''.join(ch for ch in lowered if ch not in punctuation)

    stems = []
    for token in cleaned.split():
        if token not in stopword:
            stems.append(stemmer.stem(token))
    d['wordList'] = stems

    for stem in stems:
        wordCount[stem] += 1
#use stemmer

In [None]:
# Number of distinct stems seen. Written as a call so it matches the
# print(...) usage in the training cell and parses on Python 2 and 3 alike.
print(len(wordCount))

In [None]:
# Vocabulary size: how many of the most frequent stems are kept in
# commonWords, and therefore the length of the tf / idf vectors built later.
lenWords = 2000

In [None]:
# (frequency, stem) pairs ordered from most to least frequent. Stems are
# unique, so sorting in reverse gives the same order as sort() + reverse().
count = sorted(((wordCount[w], w) for w in wordCount), reverse=True)

# Vocabulary: the lenWords most frequent stems (fewer if the corpus has
# fewer distinct stems than lenWords).
commonWords = [word for _, word in count[:lenWords]]

# Map each vocabulary stem to its column index in the tf-idf vector.
# Enumerating commonWords (instead of range(lenWords)) avoids an IndexError
# when the corpus yields fewer than lenWords distinct stems.
wordDict = defaultdict(int)
for index, word in enumerate(commonWords):
    wordDict[word] = index

#The lenWords most common words

In [None]:
# Shuffle the records in place, then take the first 60000 for training and
# records 60000..70194 for validation. Labels are the 'categoryID' fields.
random.shuffle(data)
train_data = data[:60000]
validation_data = data[60000:70195]
train_label = np.array([d['categoryID'] for d in train_data])
validation_label = np.array([d['categoryID'] for d in validation_data])

In [None]:
#calculate idf
#tf calculated when creating feature
# After this cell idf[k] holds the document frequency of the k-th vocabulary
# stem: the number of training reviews that contain it at least once. A later
# cell turns these counts into log weights.
idf = [0 for i in range(lenWords)]

# Intersect each review's distinct stems with the vocabulary instead of
# probing all lenWords vocabulary entries for every review.
vocabulary = set(commonWords)
for d in train_data:
    for w in vocabulary.intersection(d['wordList']):
        idf[wordDict[w]] += 1.0


In [None]:
# Convert document frequencies into idf weights log(N / df), where N is the
# number of training reviews (taken from train_data rather than hard-coded).
# A stem that never occurs in the training split gets weight 0.0 instead of
# raising ZeroDivisionError.
# NOTE: this overwrites idf in place, so re-running this cell without first
# re-running the document-frequency cell applies the log a second time.
numTrainDocs = float(len(train_data))
idf = np.array([math.log(numTrainDocs / f) if f > 0 else 0.0 for f in idf])

In [None]:
# Mean 'rating' over the training split; used as the baseline that per-user,
# per-category averages are measured against.
trainRatings = [d['rating'] for d in train_data]
avgRating = np.mean(trainRatings)

In [None]:
# Distinct training user IDs in order of first appearance, plus the reverse
# map userID -> row index. Membership is tested against the dict, so building
# the list is linear instead of re-scanning the growing list for every record.
# (`in` on a defaultdict does not insert a key.)
userList = []
userDict = defaultdict(int)
for d in train_data:
    uid = d['userID']
    if uid not in userDict:
        userDict[uid] = len(userList)
        userList.append(uid)

# One empty list per user; not read anywhere in the visible notebook, kept
# in case other cells rely on it.
userAvg = [[] for u in userList]
# Per-user tables with 10 category slots each: review counts and raw ratings.
userHistory = [[0 for i in range(10)] for j in range(len(userList))]
userCatRating = [[[] for i in range(10)]for u in userList]

In [None]:
# Fill the per-user tables from the training split: collect every rating
# under (user, category) and count the reviews in the same cell.
for d in train_data:
    row = userDict[d['userID']]
    cat = d['categoryID']
    userCatRating[row][cat].append(d['rating'])
    userHistory[row][cat] += 1.0

# For each user and category: mean rating minus the global training mean,
# or 0 when the user has no rating in that category.
userCatAvg = []
for perCategory in userCatRating:
    offsets = []
    for ratings in perCategory:
        if len(ratings) != 0:
            offsets.append(np.mean(ratings) - avgRating)
        else:
            offsets.append(0)
    userCatAvg.append(offsets)

In [None]:
# Scale each user's category-count vector by its Euclidean norm.
normalisedHistory = []
for counts in userHistory:
    normalisedHistory.append(np.divide(counts, np.linalg.norm(counts)))
userHistory = normalisedHistory

In [None]:
def feature(datum):
    """Build the feature vector for one review record.

    The vector is the concatenation of:
      * lenWords tf-idf values: vocabulary term counts from
        datum['wordList'], divided by the largest count in the review
        (when non-zero), then multiplied element-wise by ``idf``;
      * 20 user values: ``userHistory[u]`` followed by ``userCatAvg[u]``
        when datum['userID'] is a known training user, otherwise 20 zeros.

    Returns a 1-D numpy array.
    """
    # Named termFreq (not `tf`) so the tensorflow module alias is not
    # shadowed inside this function.
    termFreq = np.zeros(lenWords)
    for w in datum['wordList']:
        # wordDict's keys are the vocabulary stems, so this is the same test
        # as `w in commonWords` without scanning the list for every token.
        # (`in` on a defaultdict does not insert a key.)
        if w in wordDict:
            termFreq[wordDict[w]] += 1.0

    peak = np.max(termFreq)
    if peak != 0:
        termFreq = np.divide(termFreq, peak)
    tfidf = np.multiply(termFreq, idf)

    # userDict's keys are the entries of userList; use the hash lookup.
    userID = datum['userID']
    if userID in userDict:
        u = userDict[userID]
        userPart = np.concatenate((userHistory[u], userCatAvg[u]))
    else:
        # Unknown user: no history and no rating offsets.
        userPart = np.zeros(20)
    return np.concatenate((tfidf, userPart))

In [None]:
# Feature matrices for the training and validation splits, one row per review.
train_feature = np.array(list(map(feature, train_data)))
validation_feature = np.array(list(map(feature, validation_data)))

In [None]:
# Load the test records and tokenise them the same way as the training
# reviews (lower-case, strip punctuation, drop stop words, stem).
test_data = list(readGz("test_Category.json.gz"))
for d in test_data:
    text = ''.join(ch for ch in d['reviewText'].lower() if ch not in punctuation)
    d['wordList'] = [stemmer.stem(tok) for tok in text.split() if tok not in stopword]

# One feature row per test review.
test_feature = np.array(list(map(feature, test_data)))

In [None]:
# Width of the hidden fully connected layer built in calc().
fc_size = 500
# Length of one feature vector: feature() returns lenWords tf-idf values plus
# 20 user values, i.e. 2000 + 20 with the current lenWords. Keep in sync with
# lenWords if that constant changes.
input_size = 2020
# Number of logits produced by the network (one per category slot).
output_size = 10
# Scale handed to the L2 regularizer applied to both weight matrices.
regularization_rate = 0.001
# Learning rate handed to the Adam optimizer.
learning_rate = 0.1
# Number of training rows sampled per optimisation step.
batch_size = 200
# Number of optimisation steps.
max_iter = 60000
#tensorflow learning hyperparameters

In [None]:
def calc(X, regularizer):
    """Build the network and return its logits tensor.

    Two fully connected layers are created under the variable scopes 'fc1'
    (input_size -> fc_size, ReLU) and 'fc2' (fc_size -> output_size, linear).
    ``regularizer`` is applied to each weight matrix and the resulting term
    is added to the 'losses' collection.
    """
    def dense(scope, inputs, n_in, n_out, activation=None):
        # One fully connected layer; registers its weight penalty in 'losses'.
        with tf.variable_scope(scope):
            weight = tf.get_variable(
                name='weight',
                shape=[n_in, n_out],
                initializer=tf.truncated_normal_initializer(stddev=0.1))
            bias = tf.get_variable(
                name='bias',
                shape=[n_out],
                initializer=tf.constant_initializer(0.1))
            out = tf.matmul(inputs, weight) + bias
            if activation is not None:
                out = activation(out)
            tf.add_to_collection('losses', regularizer(weight))
        return out

    hidden = dense('fc1', X, input_size, fc_size, activation=tf.nn.relu)
    return dense('fc2', hidden, fc_size, output_size)
#A neural network with one hidden layer

In [None]:
# Graph inputs: a batch of feature rows and the matching integer labels.
X = tf.placeholder(dtype=tf.float32, shape=[None, input_size], name='input-X')
y = tf.placeholder(dtype=tf.int64, shape=[None], name='input-Y')

# L2 penalty applied to the weight matrices inside calc().
regularizer = tf.contrib.layers.l2_regularizer(regularization_rate)

# Logits and the predicted class (index of the largest logit per row).
y_ = calc(X, regularizer)
y_predict = tf.argmax(y_, 1)

# Accuracy: fraction of rows whose predicted class equals the label.
correct_prediction = tf.cast(tf.equal(y_predict, y), tf.float32)
accuracy = tf.reduce_mean(correct_prediction)

# Loss: mean softmax cross-entropy plus the sum of the collected penalties.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=y_)
data_loss = tf.reduce_mean(cross_entropy)
penalty = tf.add_n(tf.get_collection('losses'))
loss = data_loss + penalty
train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

In [None]:
# Train for max_iter steps on random mini-batches, reporting the training
# loss and validation accuracy every 500 steps, then write the test-set
# predictions to predictions_Category.txt.
with tf.Session() as sess:
    tf.initialize_all_variables().run()

    # Sample batch indices from the rows that actually exist instead of a
    # hard-coded 60000.
    num_train = len(train_feature)
    for i in range(max_iter):
        sample = np.random.randint(0, num_train, batch_size)
        x_batch = train_feature[sample]
        y_batch = train_label[sample]

        _, loss_value = sess.run([train_step, loss], feed_dict = {X:x_batch,y:y_batch})
        if i % 500 == 0:
            print("After %d iters, loss on training is %f."%(i, loss_value))
            acc = sess.run(accuracy, feed_dict = {X:validation_feature, y:validation_label})
            print("After %d iters, accuracy on validation is %f"%(i, acc))

    # The graph has no dropout placeholder; feeding the undefined name
    # `dropout_r` raised NameError here, after the whole training run.
    y_p = sess.run(y_predict, feed_dict = {X : test_feature})

    # Context manager so the predictions file is flushed and closed.
    with open("predictions_Category.txt", 'w') as predictions:
        predictions.write("userID-reviewHash,category\n")
        for d, label in zip(test_data, y_p):
            predictions.write(d['userID'] + '-' + d['reviewHash'] + ',' + str(label) + '\n')