# NLP - Review Data
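In this project I use beer review data to train a regularized logistic-regression classifier that predicts whether a beer's ABV is above 7 from its style, its review scores, and the length of the review text, evaluated with the balanced error rate (BER). Additionally, I build user and item rating averages from Amazon Musical Instruments reviews and use Jaccard similarity between items to find similar products and to predict star ratings; the final question (a temporal-dynamics variant) is still a placeholder.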

In [1]:
import random
from sklearn import linear_model
from matplotlib import pyplot as plt
from collections import defaultdict
import gzip



In [2]:
def assertFloat(x):
    """Assert that ``x`` converts to a built-in ``float``.

    Any exception raised by ``float(x)`` itself (for example ``ValueError``
    for a non-numeric string) propagates unchanged.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Assert that ``items`` has exactly ``N`` entries, each convertible to ``float``.

    The length is checked first; conversion errors from ``float(x)`` propagate.
    """
    assert len(items) == N
    for x in items:
        assert type(float(x)) is float

In [3]:
# Collects the answer for each question, keyed 'Q1', 'Q2', ...
answers = dict()

In [4]:
def parseData(fname):
    """Yield one parsed record per line of ``fname``.

    Each line is expected to be a Python literal (the beer file stores one
    dict per line). The file is opened in a ``with`` block so the handle is
    closed once the generator is exhausted or closed.
    """
    with open(fname) as fh:
        for l in fh:
            # SECURITY: eval() executes arbitrary code found in the file.
            # Only use this on trusted data; ast.literal_eval is the safe
            # alternative when every line is a plain literal.
            yield eval(l)

In [5]:
# Read every review into memory. NOTE(review): the path is absolute (filesystem root).
data = [review for review in parseData("/beer_50000.json")]

In [6]:
# Seed the global RNG so the in-place shuffle (and the splits taken from it) is reproducible.
random.seed(0)
random.shuffle(data)

In [7]:
# 25,000 training records, the next 12,500 for validation, the remainder for testing.
dataTrain, dataValid, dataTest = data[:25000], data[25000:37500], data[37500:]

In [8]:
# Binary target for each split: True when the beer's ABV is above 7.
yTrain, yValid, yTest = (
    [d['beer/ABV'] > 7 for d in split]
    for split in (dataTrain, dataValid, dataTest)
)

In [9]:
# Longest training review (in characters); used to scale the length feature.
maxLength = max(len(d['review/text']) for d in dataTrain)

In [10]:
# Inspect one record to see which fields are available.
data[0]

{'review/appearance': 4.0,
 'beer/style': 'Belgian Pale Ale',
 'review/palate': 4.0,
 'review/taste': 4.0,
 'beer/name': 'La Binchoise Blonde Tradition',
 'review/timeUnix': 1210043435,
 'beer/ABV': 6.5,
 'beer/beerId': '7693',
 'beer/brewerId': '3282',
 'review/timeStruct': {'isdst': 0,
  'mday': 6,
  'hour': 3,
  'min': 10,
  'sec': 35,
  'mon': 5,
  'year': 2008,
  'yday': 127,
  'wday': 1},
 'review/overall': 4.0,
 'review/text': 'From the 11.2oz stubby.\tPours a straw/golden color with a nice head that soon settles to a nice thin crown. Pleasing aroma of yeast and fruit. Taste melds fruit notes with a nice carbonation and peppery hop profile. Slightly dry finish, but satisfying leaving you wanting another drink. Recommended.',
 'user/profileName': 'sinistermadman',
 'review/aroma': 4.0}

In [11]:
# Number of reviews per beer style, counted over the whole dataset.
categoryCounts = defaultdict(int)
for d in data:
    categoryCounts[d['beer/style']] = categoryCounts[d['beer/style']] + 1

In [12]:
# Keep only the styles that appear in more than 1000 reviews.
categories = [c for c, count in categoryCounts.items() if count > 1000]

In [13]:
# Map each retained style to its position in the one-hot encoding.
catID = {c: idx for idx, c in enumerate(categories)}

In [14]:
# Number of styles that get their own one-hot slot.
len(catID)

13

In [15]:
def feat(d, includeCat = True, includeReview = True, includeLength = True):
    """Build the feature vector for one beer review ``d``.

    Optional groups, in order: a one-hot encoding of the beer style over
    ``catID`` (all zeros when the style is not in ``catID``), the five review
    sub-scores, and the review length divided by ``maxLength``. A constant 1
    is always appended last.
    """
    vec = []
    if includeCat:
        oneHot = [0] * len(catID)
        idx = catID.get(d['beer/style'])
        if idx is not None:
            oneHot[idx] = 1
        vec.extend(oneHot)
    if includeReview:
        for key in ('review/appearance',
                    'review/aroma',
                    'review/overall',
                    'review/palate',
                    'review/taste'):
            vec.append(d[key])
    if includeLength:
        vec.append(len(d['review/text']) / maxLength)
    vec.append(1)
    return vec

In [16]:
def _balancedErrorRate(yTrue, yPred):
    """Return the balanced error rate, ``1 - 0.5 * (TPR + TNR)``.

    ``yTrue`` and ``yPred`` are parallel sequences of truthy/falsy labels.
    Counts are accumulated as plain Python ints, so the result is a built-in
    float. Raises ``ZeroDivisionError`` when ``yTrue`` has no positives or no
    negatives.
    """
    TP = TN = FP = FN = 0
    for a, b in zip(yTrue, yPred):
        if a and b:
            TP += 1
        elif a:
            FN += 1
        elif b:
            FP += 1
        else:
            TN += 1
    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)
    return 1 - 0.5*(TPR + TNR)

def pipeline(reg, includeCat = True, includeReview = True, includeLength = True, maxIter = 100):
    """Fit a class-balanced logistic regression and report validation/test BER.

    Parameters:
        reg: value passed as ``C`` to ``LogisticRegression``.
        includeCat, includeReview, includeLength: feature groups forwarded to ``feat``.
        maxIter: solver iteration limit. The default of 100 matches
            scikit-learn's own default, so existing calls behave as before;
            raise it if the solver reports that it hit the iteration limit.

    Reads the module-level splits ``dataTrain``/``dataValid``/``dataTest`` and
    labels ``yTrain``/``yValid``/``yTest``. Prints both error rates and returns
    ``(model, validationBER, testBER)``.
    """
    mod = linear_model.LogisticRegression(C=reg, class_weight='balanced', max_iter=maxIter)

    Xtrain, Xvalid, Xtest = (
        [feat(d, includeCat, includeReview, includeLength) for d in split]
        for split in (dataTrain, dataValid, dataTest)
    )

    mod.fit(Xtrain, yTrain)
    ypredValid = mod.predict(Xvalid)
    ypredTest = mod.predict(Xtest)

    # The same metric is applied to both held-out splits.
    vBER = _balancedErrorRate(yValid, ypredValid)
    print("C = " + str(reg) + "; validation BER = " + str(vBER))

    tBER = _balancedErrorRate(yTest, ypredTest)
    print("C = " + str(reg) + "; test BER = " + str(tBER))

    return mod, vBER, tBER

In [17]:
### Question 1

In [18]:
# Style one-hot features only.
mod, validBER, testBER = pipeline(10, includeCat=True, includeReview=False, includeLength=False)

C = 10; validation BER = 0.16130237168160533
C = 10; test BER = 0.1607838024608832


In [19]:
# Record the validation and test BER returned by the run above.
answers['Q1'] = [validBER, testBER]

In [20]:
# Sanity check: exactly two float-convertible values were stored.
assertFloatList(answers['Q1'], 2)

In [21]:
### Question 2

In [22]:
# All three feature groups enabled.
mod, validBER, testBER = pipeline(10, includeCat=True, includeReview=True, includeLength=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 10; validation BER = 0.14199939829008612
C = 10; test BER = 0.1430334541462761


In [23]:
# Record the validation and test BER of the full-feature model.
answers['Q2'] = [validBER, testBER]

In [24]:
# Sanity check: exactly two float-convertible values were stored.
assertFloatList(answers['Q2'], 2)

In [25]:
### Question 3

In [26]:
# Sweep the regularisation constant C with every feature group enabled.
for c in (0.001, 0.01, 0.1, 1, 10):
    pipeline(c, includeCat=True, includeReview=True, includeLength=True)

C = 0.001; validation BER = 0.18963590685390597
C = 0.001; test BER = 0.1948467442774623
C = 0.01; validation BER = 0.14215569058816835
C = 0.01; test BER = 0.14364649970318144


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 0.1; validation BER = 0.14163189531729137
C = 0.1; test BER = 0.14221436006381616


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 1; validation BER = 0.14116813727875877
C = 1; test BER = 0.14318322215075607


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 10; validation BER = 0.14199939829008612
C = 10; test BER = 0.1430334541462761


In [27]:
# Chosen by hand from the sweep output: C = 1 printed the lowest validation BER.
bestC = 1

In [28]:
# Refit at the selected C to capture its validation and test BER.
mod, validBER, testBER = pipeline(bestC, includeCat=True, includeReview=True, includeLength=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 1; validation BER = 0.14116813727875877
C = 1; test BER = 0.14318322215075607


In [29]:
# [selected C, validation BER, test BER]
answers['Q3'] = [bestC, validBER, testBER]

In [30]:
# Sanity check: exactly three float-convertible values were stored.
assertFloatList(answers['Q3'], 3)

In [31]:
### Question 4

In [32]:
# Ablation: drop the style one-hot features.
mod, validBER, testBER_noCat = pipeline(bestC, includeCat=False, includeReview=True, includeLength=True)

C = 1; validation BER = 0.300682433496804
C = 1; test BER = 0.31378822023708963


In [33]:
# Ablation: drop the review sub-scores.
mod, validBER, testBER_noReview = pipeline(bestC, includeCat=True, includeReview=False, includeLength=True)

C = 1; validation BER = 0.1605845486285633
C = 1; test BER = 0.16109632033831978


In [34]:
# Ablation: drop the review-length feature.
mod, validBER, testBER_noLength = pipeline(bestC, includeCat=True, includeReview=True, includeLength=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 1; validation BER = 0.14384635345580388
C = 1; test BER = 0.14739679150544838


In [35]:
# Test BER of each ablation: without style, without review scores, without length.
answers['Q4'] = [testBER_noCat, testBER_noReview, testBER_noLength]

In [36]:
# Sanity check: exactly three float-convertible values were stored.
assertFloatList(answers['Q4'], 3)

In [37]:
### Question 5

In [44]:
path = "/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
# Text-mode gzip handle; it stays open because the parsing cell iterates over it.
f = gzip.open(path, mode='rt', encoding="utf8")

# The first line holds the tab-separated column names.
header = f.readline().strip().split('\t')

In [45]:
# Show the column names read from the header line.
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [46]:
# Parse the remaining lines of the open TSV handle `f` into dicts keyed by `header`,
# keeping only the first review seen for each (customer, product) pair.
dataset = []

pairsSeen = set()

try:
    for line in f:
        fields = line.strip().split('\t')
        d = dict(zip(header, fields))
        ui = (d['customer_id'], d['product_id'])
        if ui in pairsSeen:
            print("Skipping duplicate user/item:", ui)
            continue
        pairsSeen.add(ui)
        # Numeric columns arrive as text; convert the ones used later.
        d['star_rating'] = int(d['star_rating'])
        d['helpful_votes'] = int(d['helpful_votes'])
        d['total_votes'] = int(d['total_votes'])
        dataset.append(d)
finally:
    # Release the gzip handle; it was previously left open. Re-running this
    # cell without reopening `f` now raises instead of silently producing an
    # empty dataset from the exhausted handle.
    f.close()

Skipping duplicate user/item: ('46953315', 'B00QM3CNN6')
Skipping duplicate user/item: ('31616428', 'B0026RB0G8')
Skipping duplicate user/item: ('47240912', 'B008I653SC')
Skipping duplicate user/item: ('14503091', 'B003FRMRC4')
Skipping duplicate user/item: ('38538360', 'B00HVLUR86')
Skipping duplicate user/item: ('43448024', 'B00HVLUR86')
Skipping duplicate user/item: ('51525270', 'B00HVLUR86')
Skipping duplicate user/item: ('20652160', 'B004OU2IQG')
Skipping duplicate user/item: ('10964440', 'B00HVLUR86')
Skipping duplicate user/item: ('20043677', 'B00HVLUR86')
Skipping duplicate user/item: ('44796499', 'B00HVLUSGM')
Skipping duplicate user/item: ('29066899', 'B0002CZSYO')
Skipping duplicate user/item: ('10385056', 'B004OU2IQG')
Skipping duplicate user/item: ('1658551', 'B00HVLURL8')
Skipping duplicate user/item: ('907433', 'B00N9Q2E5G')
Skipping duplicate user/item: ('39412969', 'B00HVLUR86')
Skipping duplicate user/item: ('4901688', 'B00HVLUR86')
Skipping duplicate user/item: ('234

In [47]:
# First 90% of the records (in file order) for training, the rest held out.
# NOTE(review): this rebinds dataTrain/dataTest, which pipeline() reads as globals,
# so pipeline() must not be called again after this cell has run.
dataTrain, dataTest = dataset[:int(len(dataset)*0.9)], dataset[int(len(dataset)*0.9):]

In [48]:
# Lookup tables used by the similarity and rating-prediction cells.
usersPerItem = defaultdict(set)     # item -> users who rated it (training split)
itemsPerUser = defaultdict(set)     # user -> items they rated (training split)
itemNames = {}                      # item -> product title
ratingDict = {}                     # (user, item) -> star rating
reviewsPerUser = defaultdict(list)  # user -> their training reviews

for d in dataTrain:
    user = d['customer_id']
    item = d['product_id']
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)
    reviewsPerUser[user].append(d)

# Ratings and titles are indexed over the full dataset, held-out reviews included.
for d in dataset:
    user = d['customer_id']
    item = d['product_id']
    itemNames[item] = d['product_title']
    ratingDict[(user, item)] = d['star_rating']

In [49]:
# Mean training rating given by each user.
userAverages = {
    u: sum(ratingDict[(u, i)] for i in itemsPerUser[u]) / len(itemsPerUser[u])
    for u in itemsPerUser
}

# Mean training rating received by each item.
itemAverages = {
    i: sum(ratingDict[(u, i)] for u in usersPerItem[i]) / len(usersPerItem[i])
    for i in usersPerItem
}

# Global mean rating over the training split.
ratingMean = sum(d['star_rating'] for d in dataTrain) / len(dataTrain)

In [50]:
def Jaccard(s1, s2):
    """Return |s1 ∩ s2| / |s1 ∪ s2|, or 0 when both sets are empty."""
    shared = len(s1.intersection(s2))
    combined = len(s1.union(s2))
    return shared / combined if combined != 0 else 0

In [51]:
def mostSimilar(i, N):
    """Return the ``N`` items most similar to item ``i``.

    Similarity is the Jaccard overlap of the user sets in ``usersPerItem``.
    The result is a list of ``(similarity, item)`` tuples sorted in descending
    order (ties broken by item id, also descending). Item ``i`` itself is
    excluded.

    Previously the result was always cut to 10 entries regardless of ``N``.
    """
    # .get avoids inserting an empty entry when `i` is unknown and
    # usersPerItem is a defaultdict.
    users = usersPerItem.get(i, set())
    similarities = [
        (Jaccard(users, usersPerItem[i2]), i2)  # another metric can be swapped in here
        for i2 in usersPerItem
        if i2 != i
    ]
    similarities.sort(reverse=True)
    return similarities[:N]

In [52]:
# Use the product reviewed in record 80 as the similarity query.
query = dataset[80]['product_id']

In [53]:
# Show the query product id.
query

'B00KCHRKD6'

In [54]:
# Ten nearest items to the query.
ms = mostSimilar(query, N=10)

In [55]:
# Title of the query product.
itemNames[query]

'SUPERNIGHT 5050 16.4ft 5M RGBWW LED Strip'

In [56]:
# Attach each neighbour's title to its (similarity, product id) pair.
[(sim, item, itemNames[item]) for sim, item in ms]

[(0.015228426395939087,
  'B00H7NFDKA',
  '5pcs Pack 10mm L-shape 4-conductor Quick Splitter Right Angle Corner Connector for 5050 RGB LED Strip Lights, Strip to Strip'),
 (0.014492753623188406, 'B00QKVV3HC', 'HitLights RGB LED Tape Light Strips'),
 (0.014492753623188406,
  'B00GXRMD7W',
  'Audio2000\'s 1/4" TS To XLR Female Microphone Cable (2 Pack)'),
 (0.014084507042253521, 'B00H7ILRRI', 'HitLights RGB LED Tape Light Strips'),
 (0.014084507042253521,
  'B0057RUMPO',
  'Crank Up Dj Light Stands (2 Pack) Stage Lighting Truss System by Griffin | Portable Speaker Tripod | Heavy Duty Standing Rig | Adjustable Height Trussing|Holds 6 Can Lights|Music Performance Equipment'),
 (0.014084507042253521,
  'B000B6DTYW',
  'American Dj S-Hook S Clamp Hang And Tighten'),
 (0.013888888888888888,
  'B00L2708TI',
  'Donner 8pcs DMX512 DMX Dfi DJ 2.4G Wireless 6 Receiver & 2 Transmitter Lighting Control'),
 (0.013513513513513514,
  'B009Z1KKWI',
  'Unbreakable Rubber Mic Clip For Extra Large Micropho

In [57]:
# Store the (similarity, product id) pairs returned for the query.
answers['Q5'] = ms

In [58]:
# Sanity check: ten results, each with a float-convertible similarity.
assertFloatList([m[0] for m in ms], 10)

In [59]:
### Question 6

In [60]:
def MSE(y, ypred):
    """Return the mean squared difference between paired values of ``y`` and ``ypred``.

    Pairs are formed with ``zip``, so extra entries in the longer sequence are
    ignored. Raises ``ZeroDivisionError`` when there are no pairs.
    """
    squaredErrors = []
    for actual, predicted in zip(y, ypred):
        squaredErrors.append((actual - predicted)**2)
    return sum(squaredErrors) / len(squaredErrors)

In [61]:
def predictRating(user,item):
    """Predict the rating ``user`` would give ``item``.

    Starts from the item's average rating and adds the similarity-weighted
    mean of the user's deviations (rating minus item average) on the other
    items they reviewed, weighting by Jaccard similarity between the items'
    user sets. When no reviewed item has positive similarity, falls back to
    the item's average, or to ``ratingMean`` if the item has no average.

    Lookups use ``.get`` so that querying an unseen user or item does not add
    empty entries to the ``usersPerItem`` / ``reviewsPerUser`` defaultdicts.
    Such entries would make a later recomputation of the averages divide by
    zero.
    """
    noUsers = set()
    itemUsers = usersPerItem.get(item, noUsers)
    ratings = []
    similarities = []
    for d in reviewsPerUser.get(user, ()):
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['star_rating'] - itemAverages[i2])
        similarities.append(Jaccard(itemUsers, usersPerItem.get(i2, noUsers)))
    simTotal = sum(similarities)
    if simTotal > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / simTotal
    # User hasn't rated any similar items
    if item in itemAverages:
        return itemAverages[item]
    return ratingMean

In [62]:
# Baseline: predict the global training mean for every held-out review.
alwaysPredictMean = [ratingMean] * len(dataTest)

In [63]:
# Similarity-based prediction for every held-out (customer, product) pair.
simPredictions = [predictRating(d['customer_id'], d['product_id']) for d in dataTest]

In [64]:
# Star ratings of the held-out reviews, in the same order as the predictions.
labels = [d['star_rating'] for d in dataTest]

In [65]:
# Baseline error. Predictions are passed first; MSE squares the difference, so argument order does not matter.
MSE(alwaysPredictMean, labels)

1.6236571809194997

In [66]:
# Error of the similarity-based predictor, for comparison with the mean baseline.
MSE(simPredictions, labels)

1.7165666373341593

In [67]:
# Recomputes the similarity-based MSE and stores it as the answer.
answers['Q6'] = MSE(simPredictions, labels)

In [68]:
# Sanity check: the stored answer is float-convertible.
assertFloat(answers['Q6'])

In [69]:
### Question 7

In [70]:
# NOTE(review): hard-coded value; no cell in this notebook computes it.
itsMSE = 1.7

In [71]:
# NOTE(review): the description string is still the template placeholder.
answers['Q7'] = ["Description of your solution", itsMSE]

In [72]:
# Sanity check: the stored MSE is float-convertible.
assertFloat(answers['Q7'][1])

In [73]:
# Persist the collected answers. The context manager closes the file even if
# building or writing the string raises.
with open("answers_hw2.txt", 'w') as f:
    f.write(str(answers) + '\n')