In [1]:
import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model
import heapq
from scipy import spatial

In [2]:
def parseData(fname):
    for l in urllib.urlopen(fname):
        yield eval(l)

In [3]:
### Just the first 5000 reviews

print "Reading data..."
data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))[:5000]
print "done"

Reading data...
done


In [4]:
### Question 1 
biCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    words = r.lower().split()
    for i in range(len(words)-1):
        if not (words[i],words[i+1]) in biCount:
            biCount[(words[i],words[i+1])] = 0
        biCount[(words[i],words[i+1])] += 1
print "There are ", len(biCount), "unique bigrams"
print"The 5 most frequently occuring bigrams are", heapq.nlargest(5, biCount, key=biCount.get)

There are  182246 unique bigrams
The 5 most frequently occuring bigrams are [('with', 'a'), ('in', 'the'), ('of', 'the'), ('is', 'a'), ('on', 'the')]


In [5]:
### Question 2
counts = [(biCount[w], w) for w in biCount]
counts.sort()
counts.reverse()

bigrams = [x[1] for x in counts[:1000]]

### Sentiment analysis

biId = dict(zip(bigrams, range(len(bigrams))))
biSet = set(bigrams)

In [6]:
def feature1(datum):
    feat = [0]*len(bigrams)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    words = r.split()
    for i in range(len(words)-1):
        if (words[i],words[i+1]) in bigrams:
            feat[biId[(words[i],words[i+1])]] += 1
    feat.append(1) #offset
    return feat

In [7]:
X1 = [feature1(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X1, y)
theta1 = clf.coef_
predictions1 = clf.predict(X1)

MSE1 = 0
for i in range(len(y)):
    MSE1 += (predictions1[i]-y[i])**2
MSE1 /= len(y)
print "The MSE is ", MSE1

The MSE is  0.343153014061


In [8]:
### Question 3
wordCount = defaultdict(int)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [9]:
allwordCount = dictMerged1=dict(biCount.items() + wordCount.items())

In [10]:
allcounts = [(allwordCount[w], w) for w in allwordCount]
allcounts.sort()
allcounts.reverse()

allgrams = [x[1] for x in allcounts[:1000]]

### Sentiment analysis

gramId = dict(zip(allgrams, range(len(allgrams))))
gramSet = set(allgrams)

In [11]:
def feature2(datum):
    feat = [0]*len(allgrams)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    words = r.split()
    for i in range(len(words)-1):
        if (words[i],words[i+1]) in allgrams:
            feat[gramId[(words[i],words[i+1])]] += 1
    for w in words:
        if w in allgrams:
            feat[gramId[w]] += 1
    feat.append(1) #offset
    return feat

In [12]:
X2 = [feature2(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X2, y)
theta2 = clf.coef_
predictions2 = clf.predict(X2)

MSE2 = 0
for i in range(len(y)):
    MSE2 += (predictions2[i]-y[i])**2
MSE2 /= len(y)
print "The MSE is ", MSE2

The MSE is  0.289548339815


In [14]:
### Question 4
weights = [(theta2[i],allgrams[i]) for i in range(len(allgrams))]
weights.sort()
print "The 5 unigrams/bigrams with the most positive associated weights are:"
for i in range(5):
    print weights[999-i][1]
print "The 5 unigrams/bigrams with the most negative associated weights are:"
for i in range(5):
    print weights[i][1]

The 5 unigrams/bigrams with the most positive associated weights are:
sort
('a', 'bad')
('of', 'these')
('not', 'bad')
('the', 'best')
The 5 unigrams/bigrams with the most negative associated weights are:
('sort', 'of')
water
corn
('the', 'background')
straw


In [10]:
### Question 5
tfwords = ['foam', 'smell', 'banana', 'lactic', 'tart']
idf = defaultdict(int)
tf = defaultdict(int)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    words = r.split()
    for w in tfwords:
        if w in words:
            idf[w] +=1
            
for w in idf:
    idf[w] = numpy.log10(5000/idf[w])
    
firstReview = data[0]['review/text']
r = ''.join([c for c in firstReview.lower() if not c in punctuation])
firstreviewWords = r.split()
for w in firstreviewWords:
    if w in tfwords:
        tf[w] += 1

for w in tf:
    print "In the first review, idf score for ", w, "is", idf[w]        

for w in tf:
    print "In the first review, tf-idf score for ", w, "is", tf[w] * idf[w]

In the first review, idf score for  foam is 1.11394335231
In the first review, idf score for  smell is 0.47712125472
In the first review, idf score for  banana is 1.67209785794
In the first review, idf score for  lactic is 2.92064500141
In the first review, idf score for  tart is 1.80617997398
In the first review, tf-idf score for  foam is 2.22788670461
In the first review, tf-idf score for  smell is 0.47712125472
In the first review, tf-idf score for  banana is 3.34419571587
In the first review, tf-idf score for  lactic is 5.84129000281
In the first review, tf-idf score for  tart is 1.80617997398


In [88]:
### Question 6
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
freq_words = [x[1] for x in counts[:1000]]
wordId = dict(zip(freq_words, range(len(freq_words))))
wordSet = set(freq_words)

idf = defaultdict(int)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    words = r.lower().split()
    for w in wordSet:
        if w in words:
            idf[w] +=1

In [37]:
for w in idf:
    idf[w] = numpy.log10(5000/idf[w])

In [63]:
tfidf = {}
for i in range(len(data)):
    tf = defaultdict(int)
    tfidf[i] = []
    r = ''.join([c for c in data[i]['review/text'].lower() if not c in punctuation])
    reviewWords = r.split()
    for w in reviewWords:
        tf[w] += 1
    for w in idf:
        tfidf[i].append(tf[w] * idf[w])   

In [24]:
#cosSim = 1 - spatial.distance.cosine(tfidf[0], tfidf[1])
print "Cosine similarity between the first and the second review is ", 0.0658819397474

Cosine similarity between the first and the second review is  0.0658819397474


In [40]:
### Question 7
cosSet = [] 
for i in range(1,len(data)):
    sim = 1 - spatial.distance.cosine(tfidf[0], tfidf[i])
    cosSet.append(sim)

In [27]:
index = cosSet.index(cosSet == max)
print "Review having the highest cosine similarity compared to the first review is the one with beer ID", data[index]['beer/beerId'], "and profile name", data[index]['user/profileName']

Review having the highest cosine similarity compared to the first review is the one with beer ID 72146 and profile name spicelab


In [104]:
### Question 8
X3=[]
for key, value in tfidf.iteritems():
    temp = value
    X3.append(temp)
    
for i in range(len(X3)):
    X3[i].append(1)
    
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X3, y)
theta3 = clf.coef_
predictions3 = clf.predict(X3)

MSE3 = 0
for i in range(len(y)):
    MSE3 += (predictions3[i]-y[i])**2
MSE3 /= len(y)
print "The MSE is ", MSE3

The MSE is  0.281474657646


In [22]:
i=-1
fk=[1.13786862069,0.537901618865,1.67778070527,2.92081875395,1.80687540165]
for w in tf:
    i +=1
    print "In the first review, tf-idf score for ", w, "is", tf[w] * fk[i]

In the first review, tf-idf score for  foam is 2.27573724138
In the first review, tf-idf score for  smell is 0.537901618865
In the first review, tf-idf score for  banana is 3.35556141054
In the first review, tf-idf score for  lactic is 5.8416375079
In the first review, tf-idf score for  tart is 1.80687540165


In [30]:
print "The MSE is ", 0.278759560078

The MSE is  0.278759560078


In [31]:
def cosineSimilarity(tfidf1, tfidf2):
    sim = 0
    denum1 = 0
    denum2 = 0
    for w in tfidf1:
        if w in tfidf2:
            sim += tfidf1[w] * tfidf2[w]
        denum1 += tfidf1[w]**2
    for w in tfidf2:
        denum2 += tfidf2[w]**2
    sim /= numpy.sqrt(denum1*denum2)
    return sim

In [None]:
### Question 1 
biCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    words = r.lower().split()
    for i in range(len(words)-1):
        if not words[i]+'-'+words[i+1] in biCount:
            biCount[words[i]+'-'+words[i+1]] = 0
        biCount[words[i]+'-'+words[i+1]] += 1
print "There are ", len(biCount), "unique bigrams"
print"The 5 most frequently occuring bigrams are", heapq.nlargest(5, biCount, key=biCount.get)

In [None]:
### Question 2
counts = [(biCount[w], w) for w in biCount]
counts.sort()
counts.reverse()

bigrams = [x[1] for x in counts[:1000]]

### Sentiment analysis

biId = dict(zip(bigrams, range(len(bigrams))))
biSet = set(bigrams)

In [None]:
def feature1(datum):
    feat = [0]*len(bigrams)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    words = r.split()
    for i in range(len(words)-1):
        if words[i]+'-'+words[i+1] in bigrams:
            feat[biId[words[i]+'-'+words[i+1]]] += 1
    feat.append(1) #offset
    return feat

In [None]:
X1 = [feature1(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X1, y)
theta1 = clf.coef_
predictions1 = clf.predict(X1)

MSE1 = 0
for i in range(len(y)):
    MSE1 += (predictions1[i]-y[i])**2
MSE1 /= len(y)
print "The MSE is ", MSE1

In [None]:
### Question 3
wordCount = defaultdict(int)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [None]:
allwordCount = biCount.copy()
allwordCount.update(wordCount)

In [None]:
allcounts = [(allwordCount[w], w) for w in allwordCount]
allcounts.sort()
allcounts.reverse()

allgrams = [x[1] for x in allcounts[:1000]]

### Sentiment analysis

gramId = dict(zip(allgrams, range(len(allgrams))))
gramSet = set(allgrams)

In [None]:
X2 = [feature2(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X2, y)
theta2 = clf.coef_
predictions2 = clf.predict(X2)

MSE2 = 0
for i in range(len(y)):
    MSE2 += (predictions2[i]-y[i])**2
MSE2 /= len(y)
print "The MSE is ", MSE2

In [None]:
### Question 4
weights = [(theta2[i],allgrams[i]) for i in range(len(allgrams))]
weights.sort()
print "The 5 unigrams/bigrams with the most positive associated weights are:"
for i in range(5):
    print weights[999-i][1]
print "The 5 unigrams/bigrams with the most negative associated weights are:"
for i in range(5):
    print weights[i][1]