In [29]:
# imports
import gzip
import math
import random
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import string
from scipy.sparse import csr_matrix
from sklearn.metrics import confusion_matrix
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [6]:
# converts sentences to tokens
punct = string.punctuation
def word2token(sent):
        t = sent.lower() # lowercase string
        t = "".join([c for c in t if not (c in punct)]) # non-punct characters
        words = word_tokenize(t)# tokenizes
        return words

In [7]:
# reading in first 500,001 entries of DS
file = 'beer.json.gz'
def readData(path):
    x = gzip.open(path)
    for l in x:
        yield eval(l)

In [8]:
count = 0
X_train = []
y_train = []
X_test = []
y_test = []
amount_in_train = 400000
for d in readData(file):
    if count > 500000:
        break
    if count < amount_in_train:
        X_train.append(d['review/text'])
        y_train.append(d["review/overall"])
    if count > amount_in_train:
        X_test.append(d['review/text'])
        y_test.append(d["review/overall"])
    count = count + 1

In [9]:
def word_counts(train):
    wordCount = defaultdict(int)
    for t in train:
        words = word2token(t)
        for w in words:
            wordCount[w] += 1
    return wordCount

In [10]:
# creating word counts dictionary
wordCount = word_counts(X_train)

In [11]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [w[1] for w in counts[:200]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [12]:
def feature(data):
    feat = np.zeros(len(wordSet)+1)
    r = word2token(data)
    for w in r:
        if w in words:
            feat[wordId[w]] += 1
    feat[-1] = 1 # offset
    return feat

In [13]:
train_x = [feature(d) for d in X_train]

In [18]:
train_x = np.array(train_x).astype(np.uint8)
sparse_train = csr_matrix(train_x)
sparse_train

<375000x201 sparse matrix of type '<class 'numpy.uint8'>'
	with 17941988 stored elements in Compressed Sparse Row format>

In [19]:
test_x = [feature(d) for d in X_test]

In [20]:
test_x = np.array(test_x).astype(np.uint8)
sparse_test = csr_matrix(test_x)
sparse_test

<125000x201 sparse matrix of type '<class 'numpy.uint8'>'
	with 6003815 stored elements in Compressed Sparse Row format>

In [22]:
# results of ridge regression, no hyper tuning
clf = linear_model.Ridge()
clf.fit(sparse_train, y_train)
predictions = clf.predict(sparse_test)
mse = mean_absolute_error(y_test, predictions)
mse

0.47252587274477115

In [23]:
# first optimize lambda

In [24]:
# optimizing lambda
lambs = [10,100,500]
arr = []
for x in lambs:
    clf = linear_model.Ridge(x)
    clf.fit(sparse_train, y_train)
    predictions = clf.predict(sparse_test)
    mse = mean_absolute_error(y_test, predictions)
    arr.append((x,mse))
arr

[(10, 0.472523879856653),
 (100, 0.47251526007655803),
 (500, 0.47245607140926665)]

In [25]:
# optimizing lambda
lambs = [20000,21000]
arr = []
for x in lambs:
    clf = linear_model.Ridge(x)
    clf.fit(sparse_train, y_train)
    predictions = clf.predict(sparse_test)
    mse = mean_absolute_error(y_test, predictions)
    arr.append((x,mse))
arr

[(20000, 0.4718509769979457), (21000, 0.4718790960389748)]

In [17]:
# extremely large regualizer of 20,000 resulted in best mae of .4707
# means that we may be over fitting data

In [26]:
# top 5 most negative and most positive words
theta = clf.coef_
lc ={ w:theta[wordId[w]] for w in wordId}
pd.Series(lc).sort_values()

bad         -0.124806
no          -0.082371
thin        -0.070540
not         -0.062409
decent      -0.055044
               ...   
balanced     0.085017
smooth       0.100752
easy         0.104984
great        0.117258
drinkable    0.118000
Length: 200, dtype: float64

In [27]:
# remove values over 5
predictions = clf.predict(sparse_test)
less_5 = [5 if p>5 else p for p in predictions]
mean_squared_error(y_test, predictions)
mean_absolute_error(y_test, predictions)
#mean_squared_error(y_test, less_5)
mean_absolute_error(y_test, less_5)

0.4717042654255452