In [1]:
import gzip
import numpy as np
import pandas as pd
import scipy.optimize
import random
from collections import defaultdict
import nltk
from sklearn import svm
import string
from sklearn import linear_model
import pickle

In [2]:
def parseData(fname):
  for l in open(fname):
    yield eval(l)
data = [d for d in parseData("train.json")]

In [3]:
wordCount = defaultdict(int)
idf = defaultdict(int)
punctuation = set(string.punctuation)
avglen = 0
for d in data:
    r = ''.join([c for c in d['reviewText'].lower() if not c in punctuation])
    avglen += len(r.split())
    seen = []
    for w in r.split():
        wordCount[w] += 1
        if w not in seen:
            idf[w] += 1
            seen.append(w)
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
avglen /= len(data)



In [None]:
twowordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['reviewText'].lower() if not c in punctuation])
    split = r.split()
    for i in range(len(split)-1):
        w = split[i] + ' ' + split[i + 1]
        twowordCount[w] += 1

twocounts = [(twowordCount[w], w) for w in twowordCount]
twocounts.sort()
twocounts.reverse()

In [None]:
twowords = [x[1] for x in twocounts[:1000]]
twowordId = dict(zip(twowords, range(len(twowords))))
twowordSet = set(twowords)

In [4]:
words = [x[1] for x in counts[:2000]]

In [5]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [10]:
def feature(datum):
    feat = [0]*(len(words))
    r = ''.join([c for c in datum['reviewText'].lower() if not c in punctuation])
    split = r.split()
    #1grams
    for w in split:
        if w in words:
            feat[wordId[w]] += 1
#     #2grams
#     for i in range(len(split) - 1):
#         w = split[i] + ' ' + split[i + 1]
#         if w in twowords:
#             feat[twowordId[w]] += 1
    time = datum['reviewTime'].split()
    #year
#     feat.append(int(time[2]))
    #month
    for i in range(1,13):
        if int(time[0]) == i:
            feat.append(1)
        else:
            feat.append(0)
#     feat.append(len(split)/avglen)
    return feat


In [None]:
test_data = data[:10000]
X = [feature(d) for d in test_data]
y = [(d['categoryID']==0) for d in test_data]

In [None]:
clf = svm.SVC(C=1000, kernel='rbf')
clf.fit(X, y)

In [None]:
X_test = [feature(d) for d in data[10000:20000]]
y_test = [(d['categoryID'] == 0) for d in data[10000:20000]]

In [None]:
predictions = clf.predict(X)
predictions.sort()

In [None]:
score = [i > 0 for i in predictions]

In [None]:
correct = [(a==b) for (a,b) in zip(score,y)]

In [None]:
np.sum(correct)

In [None]:
test_predictions = clf.predict(X_test)
test_score = [i > 0 for i in test_predictions]
correct = [(a==b) for (a,b) in zip(test_score,y_test)]
np.sum(correct)/len(correct)

In [11]:
test_data = data[:100000]
X = [feature(d) for d in test_data]
y = [d['categoryID'] for d in test_data]
clf2 = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [12]:
clf2.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [13]:
print("Train:" , clf2.score(X,y))
x2 = [feature(d) for d in data[100000:150000]]
y2 = [d['categoryID'] for d in data[100000:150000]]
print("Test: " , clf2.score(x2,y2))

Train: 0.83943
Test:  0.82286


[-35.832642759263557,
 -47.819815649541759,
 -35.915206284725777,
 -23.654683933049238,
 -11.660275932820131,
 -34.470904383996853,
 -46.153710023930287,
 0,
 -11.310319564873566,
 -11.401233838950162,
 -11.31648469961539,
 0,
 0,
 -11.249949013154295,
 -11.334038004910557,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -11.039556178248191,
 0,
 0,
 0,
 0,
 -32.102308493484244,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -10.520698167006133,
 0,
 0,
 0,
 -10.40250371834262,
 0,
 0,
 0,
 -10.274292200380668,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -10.241743910950843,
 0,
 -20.401175271932335,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -9.8816508579269531,
 0,
 0,
 0,
 0,
 0,
 -9.8323137330816035,
 0,
 0,
 -9.8336550429374476,
 0,
 -19.676831111584825,
 0,
 0,
 0,
 -9.7283600563224137,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -9.0631156522196576,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -9.3540077217885962

In [None]:
pred2 = clf2.predict(X_test)

In [None]:
dates =[int(d['reviewTime'].split()[0]) for d in data]

In [None]:
file = open("tfidf.pkl",'rb')

def readGz(f):
    for l in gzip.open(f):
        yield eval(l)

clf2 = pickle.load(file)
predictions = open("predictions_Category3.txt", 'w')
predictions.write("reviewerID-reviewHash,category\n")
for l in readGz("test_Category.json.gz"):
    feat = feature(l)
    feat = np.array(feat)
    feat = np.reshape(feat,(1,2024))
    cat = clf2.predict(feat)[0]
    predictions.write(l['reviewerID'] + '-' + l['reviewHash'] + "," + str(cat) + "\n")

predictions.close()

In [None]:
import pickle
file = open("clf2.pkl",'wb')
pickle.dump(clf2, file)

In [None]:
file = open("clf2.pkl",'rb')
clf2 = pickle.load(file)

In [None]:
np.random.shuffle(data)
batch = 200000
clf2 = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs', C=.1)
for i in range(1):   
    X = [feature(d) for d in data]
    y = [d['categoryID'] for d in data]
    clf2.fit(X,y)
    print(1)

In [None]:
import pickle
file = open("orig200.pkl",'wb')
pickle.dump(clf2, file)

In [None]:
file = open("orig.pkl",'rb')
clf2 = pickle.load(file)
file.close()
X = [feature(d) for d in data[150000:]]
y = [d['categoryID'] for d in data[150000:]]
print(clf2.score(X,y))
file = open("orig.pkl",'wb')
pickle.dump(clf2, file)

In [None]:
file.close()

In [None]:
data[1]

In [None]:
file = open("orig200.pkl",'rb')

def readGz(f):
    for l in gzip.open(f):
        yield eval(l)

clf2 = pickle.load(file)
predictions = open("predictions_Category5.txt", 'w')
predictions.write("reviewerID-reviewHash,category\n")
for l in readGz("test_Category.json.gz"):
    feat = feature(l)
    feat = np.array(feat)
    feat = np.reshape(feat,(1,2024))
    cat = clf2.predict(feat)[0]
    predictions.write(l['reviewerID'] + '-' + l['reviewHash'] + "," + str(cat) + "\n")

predictions.close()

In [None]:
iss = [(idf[w], w) for w in idf]
iss.sort()
iss.reverse()

In [None]:
avglen

In [None]:

from sklearn.naive_bayes import MultinomialNB
X = [feature(d) for d in data[:75000]]
y = [d['categoryID'] for d in data[:75000]]
clf = MultinomialNB().fit(X, y)


In [None]:
import pickle
file = open("bayes.pkl",'wb')
pickle.dump(clf, file)
X = [feature(d) for d in data[150000:]]
y = [d['categoryID'] for d in data[150000:]]
print(clf.score(X,y))

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {
                  'solver':('lbfgs','liblinear'),
                  'class_weight': ('balanced', None)}
clf2 = linear_model.LogisticRegression(multi_class='ovr', C=.1)
X = [feature(d) for d in data[:1000]]
y = [d['categoryID'] for d in data[:1000]]
gs_clf = GridSearchCV(clf2, parameters, cv=5, iid=False, n_jobs=-1)
gs_clf = gs_clf.fit(X, y)

In [None]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

In [None]:
len(feature(data[1]))

In [None]:
import pickle
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier

In [None]:
np.random.shuffle(data)
text = []
for d in data:
    text.append(d['reviewText'])

In [None]:
x_train = [feature(d) for d in data[:50000]]
y_train = [d['categoryID'] for d in data[:50000]]
x_test = [feature(d) for d in data[150000:]]
y_test = [d['categoryID'] for d in data[150000:]]

In [None]:
vectorizer = TfidfVectorizer(max_df=0.5, max_features=20000,
                             stop_words='english',
                             use_idf=True, analyzer='word')

In [None]:
X_train_tfidf = vectorizer.fit_transform(x_train)


In [None]:
svd = TruncatedSVD(1000)
lsa = make_pipeline(svd, Normalizer(copy=False))

In [None]:
_train_lsa = lsa.fit_transform(x_train)


In [None]:
# X_test_tfidf = vectorizer.transform(x_test)
X_test_lsa = lsa.transform(x_test)

In [None]:
clf2 = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs', C=.1)
clf2.fit(_train_lsa, y_train)

In [None]:
# Classify the test vectors.
p = clf2.predict(X_test_lsa[:100000])

# Measure accuracy
numRight = 0;
for i in range(0,len(p)):
    if p[i] == y_test[i]:
        numRight += 1

print("  (%d / %d) correct - %.2f%%" % (numRight, len(p), float(numRight) / float(len(p)) * 100.0))


In [None]:
p

In [None]:

def readGz(f):
    for l in gzip.open(f):
        yield eval(l)

predictions = open("predictions_Category4.txt", 'w')
predictions.write("reviewerID-reviewHash,category\n")
for l in readGz("test_Category.json.gz"):
    feat = l['reviewText']
    t = []
    t.append(feat)
    feat = vectorizer.transform(t)
    feat = lsa.transform(feat)
    cat = clf2.predict(feat)[0]
    predictions.write(l['reviewerID'] + '-' + l['reviewHash'] + "," + str(cat) + "\n")

predictions.close()

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  
np.random.shuffle(data)
X_train = [feature(d) for d in data[:150000]]
y_train = [d['categoryID'] for d in data[:150000]]
X_test = [feature(d) for d in data[150000:]]
y_test = [d['categoryID'] for d in data[150000:]]


In [None]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)  
# apply same transformation to test data
X_test = scaler.transform(X_test) 

In [None]:
clf2 = MLPClassifier(solver='adam', alpha=1e-5,
                     hidden_layer_sizes=(1500))
clf2.fit(X_train,y_train)

In [None]:
import pickle
file = open("neural.pkl",'wb')
pickle.dump(clf2, file)
file = open("neural.pkl",'rb')
clf2 = pickle.load(file)
file.close()
print(clf2.score(X_test,y_test))
file = open("neural.pkl",'wb')
pickle.dump(clf2, file)

In [None]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')
clf.fit(X_train, y_train) 
print(clf.score(X_test, y_test))
file = open("svm.pkl",'wb')
pickle.dump(clf2, file)
file.close()


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
np.random.shuffle(data)
X_train = [d['reviewText'] for d in data[:50000]]
y_train = [d['categoryID'] for d in data[:50000]]
svc_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("linear svc", SVC(kernel="linear"))])
svc_tfidf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf_vectorizer', TfidfVectorizer(analyzer=<function <lambda> at 0x00000198C4F90F28>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), no...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [7]:
X_test = [d['reviewText'] for d in data[50000:100000]]
y_test = [d['categoryID'] for d in data[50000:100000]]
print(svc_tfidf.score(X_test,y_test))

0.70754


In [3]:
products = [d['itemID'] for d in data[:100000]]

In [4]:
users = [d['reviewerID'] for d in data[:100000]]

In [5]:
ratings = [1 for d in data[:100000]]

In [6]:
df = pd.DataFrame({'item' : products,'user' : users,'rating' : ratings})

In [7]:
R_df = df.pivot(index = 'user', columns ='item', values = 'rating')

In [8]:
R_demeaned=R_df.fillna(0).as_matrix()

MemoryError: 

In [10]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_df.as_matrix(), k = 500)
sigma = np.diag(sigma)

ArpackError: ARPACK error 3: No shifts could be applied during a cycle of the Implicitly restarted Arnoldi iteration. One possibility is to increase the size of NCV relative to NEV. 

In [None]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)

In [None]:
users = range(len(R_df))

In [None]:
user_id_dict = dict(zip(user_id, users))

In [38]:
predictions = open("purchase_predictions_test1.txt", 'w')
for l in open("pairs_Purchase.txt"):
    if l.startswith("reviewerID"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if u not in user_id_dict:
        predictions.write(u + '-' + i + ",0\n")
        continue
    if i not in products:
        predictions.write(u + '-' + i + ",0\n")
        continue
    rating = preds_df.iloc[user_id_dict[u]][i]
    if rating > 0:
        predictions.write(u + '-' + i + ",1\n")
    else:
        predictions.write(u + '-' + i + ",0\n")
predictions.close()

In [None]:
user_id = []
for d in R_df.iterrows():
    user_id.append(d[0])
    

In [11]:
len(R_df)

NameError: name 'R_df' is not defined

In [14]:
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

In [16]:
from keras.models import load_model

model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'

Using TensorFlow backend.


NameError: name 'model' is not defined