In [1]:
import numpy
import gzip
from collections import defaultdict
import scipy.optimize
import random
from math import exp
from math import log
from sklearn import svm

In [2]:
def readGz(f):
  for l in gzip.open(f):
    yield eval(l)

In [3]:
data = list(readGz("train.json.gz"))
train_data = data[:len(data)/2]
validation_data = data[len(data)/2:]

In [4]:
allHelpful = []
for l in train_data:
    allHelpful.append(l['helpful'])

In [5]:
trainHelpful = []
trainWords = []
trainRatings = []
trainTimes = []
trainMonths = []
trainSummary = []
trainCatnumbers = []
trainCatwords = []
trainCattypes = []


catDict = {'Women': [1,0,0,0,0], 'Men': [0,1,0,0,0], 'Girls': [0,0,1,0,0], 'Boys': [0,0,0,1,0], 'Baby': [0,0,0,0,1]}

for u in train_data:
    if u['helpful']['outOf'] > 0:
        trainHelpful.append(u['helpful']['nHelpful']*1.0/u['helpful']['outOf'])
        trainWords.append(len(u['reviewText'].split()))
        trainRatings.append(u['rating'])
        trainTimes.append(u['reviewTime'].split(','))
        trainSummary.append(len(u['summary'].split()))
        trainCatnumbers.append(len(u['categories']))
        trainCatwords.append(u['categories'])
for i in range(len(trainTimes)):
    trainMonths.append((2017 - float(trainTimes[i][1])) * 365 - float(trainTimes[i][0][0:2]) * 30 + 2.0)
    cat = catDict['Women']
    for j in range(len(trainCatwords[i])):
        if 'Men' in trainCatwords[i][j] or 'Man' in trainCatwords[i][j]:
            cat = catDict['Men']
        if 'Girls' in trainCatwords[i][j] or 'Girl' in trainCatwords[i][j]:
            cat = catDict['Girls']
        if 'Boys' in trainCatwords[i][j] or 'Boy' in trainCatwords[i][j]:
            cat = catDict['Boys']
        if 'Babies' in trainCatwords[i][j] or 'Baby' in trainCatwords[i][j]:
            cat = catDict['Baby']
    trainCattypes.append(cat)

In [6]:
def feature(words, ratings, months, cat):
    feat = [[1] for row in range(len(words))]
    feat = numpy.column_stack((feat, words, ratings, months, cat))
    return feat

In [7]:
X_train = feature(trainWords, trainRatings, trainMonths, trainCatnumbers)

In [8]:
helpfulRatings = defaultdict(list)
reviewWords = defaultdict(list)
reviewTimes = defaultdict(list)
monthPassed = defaultdict(list)
summaryWords = defaultdict(list)
catNumbers = defaultdict(list)
catWords = defaultdict(list)
catTypes = defaultdict(list)


for l in readGz("test_Helpful.json.gz"):
    user,item = l['reviewerID'],l['itemID']
    wordCount = defaultdict(int)
    reviewWords[user + item] = (len(l['reviewText'].split()))
    helpfulRatings[user + item] = (l['rating'])
    reviewTimes[user + item] = (l['reviewTime'].split(','))
    monthPassed[user + item] = (2017 - float(reviewTimes[user + item][1])) * 12 + float(reviewTimes[user + item][0][0:2])
    summaryWords[user + item] = (len(l['summary'].split()))
    catNumbers[user + item] = (len(l['categories']))
    catWords[user + item] = (l['categories'])
    cat = catDict['Women']
    for j in range(len(catWords[user + item])):
        if 'Men' in catWords[user + item][j] or 'Man' in catWords[user + item][j]:
            cat = catDict['Men']
        if 'Girls' in catWords[user + item][j] or 'Girl' in catWords[user + item][j]:
            cat = catDict['Girls']
        if 'Boys' in catWords[user + item][j] or 'Boy' in catWords[user + item][j]:
            cat = catDict['Boys']
        if 'Babies' in catWords[user + item][j] or 'Baby' in catWords[user + item][j]:
            cat = catDict['Baby']
    catTypes[user + item] = cat


In [9]:
def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

In [10]:
def sigmoid(x):
  return 1.0 / (1 + exp(-x))

In [11]:
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    loglikelihood -= log(1 + exp(-logit))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  return -loglikelihood

In [12]:
# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
  dl = [0]*len(theta)
  for i in range(len(X)):
    logit = inner(X[i], theta)
    for k in range(len(theta)):
      dl[k] += X[i][k] * (1 - sigmoid(logit))
      if not y[i]:
        dl[k] -= X[i][k]
  for k in range(len(theta)):
    dl[k] -= lam*2*theta[k]
  return numpy.array([-x for x in dl])

In [13]:
# Obtain the evaluation
y_train=[]
for quality in trainHelpful:
  if quality > 0.75:
    y_train.append(1)
  else:
    y_train.append(0)
    


In [14]:
# Use a library function to run gradient descent (or you can implement yourself!)
theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X_train[0]), fprime, args = (X_train, y_train, 4.0))

In [19]:
y_predictions = defaultdict(list)
for u in reviewWords:
    X_feature = [1.0, reviewWords[u], helpfulRatings[u], monthPassed[u], catTypes[u]]
    y_predictions[u] = sum(theta[i] * X_feature[i] for i in range(len(theta)))



In [20]:
minimum = 0
maximum = 0
for u in y_predictions:
    minimum = min(y_predictions[u], minimum)
    maximum = max(y_predictions[u], maximum)

In [21]:
userRate = defaultdict(list)
for u in y_predictions:
    if y_predictions > 0:
        userRate[u] = 0.75 + 0.25 * (y_predictions[u] / maximum)
    else:
        userRate[u] = 0.75 - 0.75 * (y_predictions[u] / minimum)

In [22]:
predictions = open("predictions_Helpful.txt", 'w')
for l in open("pairs_Helpful.txt"):
  if l.startswith("userID"):
    #header
    predictions.write(l)
    continue
  u,i,outOf = l.strip().split('-')
  outOf = int(outOf)
  result = round(outOf * userRate[u + i],0)
  predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(result) + '\n')

predictions.close()

TypeError: only length-1 arrays can be converted to Python scalars

In [23]:
theta

array([ -5.08729381e-01,   4.45243435e-04,   3.15081874e-01,
         3.52317809e-05,  -7.65316102e-03])