### Spark Initialization

In [1]:
import findspark
findspark.init()

In [2]:
import sys
import copy
import csv

from string import atoi
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
import numpy as np

In [4]:
conf = SparkConf().setAppName("ContentBased")
conf = conf.setMaster("local[*]")

In [5]:
sc  = SparkContext(conf=conf)

### Load Train and Test Data

In [6]:
trainData = sc.textFile("/Users/lakshya/Desktop/INF-553/Project/pittsburgh_review_with_text_20_res_lemma_data_train.txt",use_unicode=False)
testData = sc.textFile("/Users/lakshya/Desktop/INF-553/Project/pittsburgh_review_with_text_20_res_lemma_data_test.txt",use_unicode=False)

In [7]:
train_rdd = trainData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), float(x[2])))
test_rdd = testData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), float(x[2])))

In [8]:
avg_rating = train_rdd.map(lambda x: (x[0][0], x[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).map(lambda x: (x[0], sum(x[1])/len(x[1])))
prod_rating = train_rdd.map(lambda x: (x[0][1], x[1])).groupByKey().map(lambda x: (x[0], list(x[1]))).map(lambda x: (x[0], sum(x[1])/len(x[1])))

In [9]:
train_temp = trainData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), 1))

### Load review data

In [10]:
data = sc.textFile("/Users/lakshya/Desktop/INF-553/Project/pittsburgh_review_with_text_20_res_lemma_data_train.txt",use_unicode=False)

In [11]:
data.take(6)

['1VVHf1BvtGC0aSCCIjQyiA,K5jY2W5Q3eNnwssV5UZtow,4,2016-11-16,2,2,2,past sunday one several time ive spirit its always eccentric fun time first music performance second hang recent visit sundays bingo bango spirit know pizza good drink also awesome last time get chard margarita time get tomatillo margarita hot ciders its always pleasant surprise see whats menu food drink drink little pricey drawback one coolest things spirit atmosphere its always super strange positive way really never know expect bingo bango definitely family appropriate its fun activity friends maybe even date doesnt mind something ordinary',
 'QYKexxaOJQlseGWmc6soRg,rzByiKaj-bLeLz-zKNBQdg,2,2015-04-13,0,0,0,old cramp build lot enough employees staff keep demand cause long wait time',
 '-ARdx8hOcEWlMDjzwLYZ_g,3cbsPfoUUrysf-M8FI_0IA,4,2014-03-24,6,4,3,live long world without donut menu dont know group nine din three varieties donuts include lemon lavender chocolate espresso zeppolli amaze pepper donut concoction ever m

In [12]:
train_data = data.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), (x[2],x[7]))).join(train_temp)

In [13]:
train_data.take(3)

[(('3egcdazws_x1wW35jgXfNw', 'gae9LAyt7Qvf_OgAkWASxA'),
  (('4',
    'family always love buca di beppo location food great service always excellent however wait time food delivery final bill keep us remember come return experience buca di beppo wonderful love family style din make reservations years occasion price isnt bad receive well discount coupons everywhere look know say family return'),
   1)),
 (('4wp4XI9AxKNqJima-xahlg', '7O_mNtg37-1sMvQ5xmA8Dw'),
  (('4',
    'appreciate whole vibe old reclaim wood chocolate color wall beautiful crystal chandeliers almost like shouldnt go together go well hear serve lunch want check along something sweet course go roast beef sandwich fresh load good stuff definitely need mayo thats totally give know condiment obsession love bun wasnt hard bite downstairs like sweet treat wonderland glitter lot glitter know theres edible glitter its home base im totally okay past glitter delicious look treat every nook corner cake cookies bon bons even ice cre

### Collect user data from train data (User, Product, Rating)

In [14]:
userReview = train_data.map(lambda x: (x[0][0], x[0][1], x[1][0][0]))

In [15]:
userReviewCollected = userReview.map(lambda x: (x[0], x[1], x[2]))

In [16]:
userReviewCollected.take(5)

[('3egcdazws_x1wW35jgXfNw', 'gae9LAyt7Qvf_OgAkWASxA', '4'),
 ('4wp4XI9AxKNqJima-xahlg', '7O_mNtg37-1sMvQ5xmA8Dw', '4'),
 ('BxDsaVNeWxc5mNyA1HtSHQ', 'oeW0vIYd3rUnAPgmD4fEFg', '5'),
 ('BBg_86FEejn3dNzj0JOR9Q', 'XItYW5ul3OW_AqpT2nDbBQ', '4'),
 ('z2YVGKKcup6mjQmDQ6arEg', 'RqmORv3974ZDC6Zh4nSQwg', '5')]

### Collect product data (Product, Review Text)

In [17]:
prodReview = train_data.map(lambda x: (x[0][1], x[1][0][1])).groupByKey().mapValues(list)

In [18]:
prodReviewCollected = prodReview.map(lambda x: (x[0], x[1][0].split(" ")))

In [19]:
prodReviewCollected.take(5)

[('5REYrZfsX3m4E3FTwovp5Q',
  ['try',
   'first',
   'time',
   'last',
   'night',
   'pretty',
   'good',
   'one',
   'thing',
   'really',
   'annoy',
   'restaurant',
   'see',
   'review',
   'verde',
   'entire',
   'restaurant',
   'fill',
   'reservations',
   'can',
   'not',
   'even',
   'seat',
   'walk',
   '2',
   'people',
   'restaurant',
   'empty',
   'really',
   'avocado',
   'corn',
   'ceviche',
   'mojo',
   'criollo',
   'nigiri',
   'avocado',
   'crab',
   'delicious',
   'mojo',
   'criollo',
   'favorite',
   'everything',
   'taste',
   'super',
   'fresh',
   'flavorful',
   'serve',
   'good',
   'size',
   'date',
   'share',
   'wine',
   'meh',
   'would',
   'think',
   'place',
   'thats',
   'like',
   'tapas',
   'wine',
   'bar',
   'wine',
   'would',
   'better',
   'order',
   'frontera',
   'sauvignon',
   'blanc',
   'often',
   'buy',
   'liquor',
   'store',
   'slo',
   'wine',
   'hm',
   'lame',
   'hate',
   'pay',
   '9',
   'glass',


### Doc2Vec model creation

In [20]:
from collections import namedtuple
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

  % self._get_c_name())


Convert user and product rdd to pandas dataframe

In [21]:
spark = SparkSession(sc)

userPandas = userReviewCollected.toDF().toPandas()
prodPandas = prodReviewCollected.toDF().toPandas()

Create Doc2Vec model on product category text

In [22]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(prodPandas['_2'])]

In [23]:
documents

[TaggedDocument(words=[u'try', u'first', u'time', u'last', u'night', u'pretty', u'good', u'one', u'thing', u'really', u'annoy', u'restaurant', u'see', u'review', u'verde', u'entire', u'restaurant', u'fill', u'reservations', u'can', u'not', u'even', u'seat', u'walk', u'2', u'people', u'restaurant', u'empty', u'really', u'avocado', u'corn', u'ceviche', u'mojo', u'criollo', u'nigiri', u'avocado', u'crab', u'delicious', u'mojo', u'criollo', u'favorite', u'everything', u'taste', u'super', u'fresh', u'flavorful', u'serve', u'good', u'size', u'date', u'share', u'wine', u'meh', u'would', u'think', u'place', u'thats', u'like', u'tapas', u'wine', u'bar', u'wine', u'would', u'better', u'order', u'frontera', u'sauvignon', u'blanc', u'often', u'buy', u'liquor', u'store', u'slo', u'wine', u'hm', u'lame', u'hate', u'pay', u'9', u'glass', u'wine', u'restaurant', u'easily', u'buy', u'10', u'liquor', u'store', u'seem', u'justify', u'least', u'slo', u'bottle', u'service', u'good', u'nice', u'time', u'wou

In [26]:
model = Doc2Vec(documents, size = 3000, min_count = 1, workers=4)

In [27]:
feature_vectors = []
for i in range(0, len(model.docvecs)):
    feature_vectors.append(model.docvecs[i])

Add Doc2Vec vectors to product dataframe

In [28]:
prodPandas['Vector'] = feature_vectors

In [29]:
prodPandas

Unnamed: 0,_1,_2,Vector
0,5REYrZfsX3m4E3FTwovp5Q,"[try, first, time, last, night, pretty, good, ...","[0.0205016, -0.00644905, -0.0154693, -0.008487..."
1,HWrbZS1mxVRj2Y2VwMmDMg,"[oh, man, word, can, not, describe, excite, ba...","[0.0250453, -0.00734597, -0.0195707, -0.010632..."
2,MvlQo4bev1eqp1q0HYOLHg,"[first, dance, class, probably, decade, probab...","[0.00621634, -0.00210196, -0.00474029, -0.0025..."
3,X9Bql7RrPU5Mab5-hJsI8A,"[2nd, visit, promise, first, time, order, feat...","[0.00829221, -0.00161553, -0.00683834, -0.0037..."
4,owO2UkNKk9qrWWd_PTYLDA,"[feel, like, ive, random, experience, place, r...","[0.0334839, -0.00856758, -0.0266369, -0.014285..."
5,rc9SfYli96cnlPDMyJ7hyA,"[go, lunch, friends, group, 7, round, table, b...","[0.00490008, -0.000952024, -0.00394094, -0.002..."
6,94VxE6XLTrXmz4yXWEJbAg,"[come, instead, ds, need, buy, case, beer, the...","[0.00617516, -0.00202224, -0.00465858, -0.0025..."
7,anfgvbiobw4M0Wnvx_os3Q,"[its, good, place, try, ive, never, wow, prett...","[0.0115594, -0.0034133, -0.00886636, -0.004637..."
8,vLGYQ96AjKV5zpW0q3795g,"[breakfast, pizza, checkgood, pizza, accent, e...","[0.00988138, -0.002731, -0.00778469, -0.004043..."
9,7l-CCyNnvT86GMR5EtYjog,"[jamaican, tacos, tonight, die, cant, speak, a...","[0.00716874, -0.00213457, -0.00553517, -0.0029..."


Product category text not needed

In [30]:
del prodPandas['_2']

### Weighted Linear Combination of product vectors for users

In [31]:
userPandas['Vector'] = [[] for _ in range(len(userPandas))]

In [32]:
userPandas['_3'] = userPandas['_3'].astype(float)

In [33]:
for index, row in userPandas.iterrows():
    vector = np.array(prodPandas.loc[prodPandas['_1'] == row['_2'], 'Vector'].values[0])
    rating = row['_3']
    userPandas.at[index,'Vector'] = rating*vector

In [34]:
userPandas

Unnamed: 0,_1,_2,_3,Vector
0,3egcdazws_x1wW35jgXfNw,gae9LAyt7Qvf_OgAkWASxA,4.0,"[0.0349865, -0.0106697, -0.0254736, -0.014482,..."
1,4wp4XI9AxKNqJima-xahlg,7O_mNtg37-1sMvQ5xmA8Dw,4.0,"[0.138538, -0.0387583, -0.109732, -0.0595934, ..."
2,BxDsaVNeWxc5mNyA1HtSHQ,oeW0vIYd3rUnAPgmD4fEFg,5.0,"[0.0379545, -0.0111202, -0.0294376, -0.0158777..."
3,BBg_86FEejn3dNzj0JOR9Q,XItYW5ul3OW_AqpT2nDbBQ,4.0,"[0.00327241, -0.00115248, -0.00201679, -0.0014..."
4,z2YVGKKcup6mjQmDQ6arEg,RqmORv3974ZDC6Zh4nSQwg,5.0,"[0.0394794, -0.010021, -0.029873, -0.0161998, ..."
5,kb36ZHxGLdPU5p63Mj5u4w,mBycrsguRPhI0xtFBzsz_w,4.0,"[0.100866, -0.0314715, -0.0761508, -0.0424768,..."
6,TkFcPIHDPx43OqnqZ_ElPQ,SmkYLXEYhzwUZdS6TAevHg,4.0,"[0.0292667, -0.0106207, -0.0210769, -0.0114245..."
7,cUUCMncwWZm57s95iUX1Gg,dW79jPJVpzlTKGF-1JAsaw,3.0,"[0.0240211, -0.00689577, -0.0188331, -0.010171..."
8,JVJOfFqPmZEvkRznzF8w-w,yZ_wpRr4Svw8YRZ9voCgIw,4.0,"[0.0852398, -0.0250814, -0.0657066, -0.035448,..."
9,Q0Bs5ObN4w2U5WHs2gLsPA,MsFj3NTvmhYNYS8Ef8HwFA,2.0,"[0.0129259, -0.00435273, -0.00969217, -0.00527..."


Product and rating column not needed

In [35]:
del userPandas['_2']
del userPandas['_3']

Linear combination of feature vectors

In [36]:
userPandas = userPandas.groupby(['_1']).sum()

Normalize the user feature vectors

In [37]:
from sklearn.preprocessing import Normalizer

  _nan_object_mask = _nan_object_array != _nan_object_array


In [38]:
for index, row in userPandas.iterrows():
    vector = np.array(row['Vector']).reshape(1, -1)
    transformer = Normalizer().fit(vector)
    userPandas.at[index,'Vector'] = transformer.transform(vector)

In [39]:
userPandas

Unnamed: 0_level_0,Vector
_1,Unnamed: 1_level_1
-0-hVEpwWEcJLJoGq3rE3g,"[[0.0361696, -0.0106016, -0.0278118, -0.015141..."
-2OB54nQ6FsGLUM-R1KXnA,"[[0.0362862, -0.0106815, -0.0278889, -0.015098..."
-ARdx8hOcEWlMDjzwLYZ_g,"[[0.0361009, -0.0106476, -0.0278246, -0.015111..."
-Pk25bOBsvemFaWKDBVBzA,"[[0.0363492, -0.0108974, -0.0277772, -0.015049..."
-Q2wBtscwW6JOqlBndji4A,"[[0.0361955, -0.0107154, -0.0277945, -0.015060..."
-Q4bjWlbxmb1yKP4U7OODg,"[[0.0363478, -0.0107216, -0.0278263, -0.015063..."
-SDx-d5jppC4OBBosLVpYw,"[[0.0360146, -0.0103094, -0.0279749, -0.015155..."
-XgVXGJnOnW0kQEol6O3Pg,"[[0.0362197, -0.0106756, -0.027751, -0.0151491..."
-Y6tXYPYqeVy37-L5p0rMw,"[[0.0360319, -0.0105734, -0.0278866, -0.01509,..."
-a873HRQxWRRobMNT4xOKg,"[[0.0359095, -0.0102367, -0.0279689, -0.015127..."


### Create user numpy matrix from feature vectors

In [41]:
user_matrix = np.zeros((len(userPandas), len(feature_vectors[0])))
idx = 0
for index, row in userPandas.iterrows():
    vector = np.array(row['Vector'])[0]
    user_matrix[idx] = vector
    idx += 1
    

In [42]:
user_matrix

array([[  3.61696221e-02,  -1.06016286e-02,  -2.78117526e-02, ...,
         -3.76913790e-03,  -1.75396490e-05,   3.06606255e-02],
       [  3.62862274e-02,  -1.06814979e-02,  -2.78889313e-02, ...,
         -3.69135523e-03,  -7.47819286e-05,   3.06509361e-02],
       [  3.61008830e-02,  -1.06476303e-02,  -2.78246421e-02, ...,
         -3.72381089e-03,   2.14886095e-05,   3.06694750e-02],
       ..., 
       [  3.59692872e-02,  -1.03257420e-02,  -2.79108584e-02, ...,
         -3.65439383e-03,   4.39439231e-04,   3.08899973e-02],
       [  3.61240320e-02,  -1.06435344e-02,  -2.79051680e-02, ...,
         -3.73188732e-03,   8.95461108e-06,   3.05845160e-02],
       [  3.59661616e-02,  -1.01692686e-02,  -2.79367883e-02, ...,
         -3.79038579e-03,   7.52814580e-04,   3.10796779e-02]])

### Create product numpy matrix from feature vectors

In [43]:
prod_matrix = np.zeros((len(prodPandas), len(feature_vectors[0])))
idx = 0
for index, row in prodPandas.iterrows():
    vector = np.array(row['Vector'])
    prod_matrix[idx] = vector
    idx += 1

In [44]:
prod_matrix

array([[  2.05016192e-02,  -6.44904654e-03,  -1.54693164e-02, ...,
         -2.04841513e-03,  -5.43475966e-04,   1.67470761e-02],
       [  2.50453204e-02,  -7.34597305e-03,  -1.95707120e-02, ...,
         -2.71955272e-03,   6.84636107e-05,   2.13790480e-02],
       [  6.21634116e-03,  -2.10195687e-03,  -4.74029174e-03, ...,
         -6.91560621e-04,  -2.74897699e-04,   5.17081236e-03],
       ..., 
       [  9.24581010e-03,  -2.92712334e-03,  -6.88633136e-03, ...,
         -1.01360527e-03,  -3.39465652e-04,   7.64718931e-03],
       [  5.85718034e-03,  -1.65172527e-03,  -4.48580924e-03, ...,
         -7.36568589e-04,   2.33735365e-04,   5.13249310e-03],
       [  1.12336557e-02,  -3.15764849e-03,  -8.67991522e-03, ...,
         -1.27841369e-03,  -2.52891594e-04,   9.25785489e-03]])

### Compute cosine similarity by taking dot product

In [45]:
similarity_matrix = np.dot(user_matrix, prod_matrix.T)

In [46]:
similarity_matrix.shape

(987, 3098)

### Flatten similarity matrix to related with user and products

In [47]:
prod = prodPandas['_1'].values
user = userPandas.index.values

In [48]:
zf = similarity_matrix.flatten()
xr = np.repeat(user, prod.size)
yt = np.tile(prod, user.size)
d = np.stack((xr, yt, zf), axis=-1)

In [49]:
d.shape

(3057726, 3)

### Convert similarity matrix to RDD

In [50]:
similarity_rdd = sc.parallelize(d)

In [51]:
similarity_rdd.take(5)

[array([u'-0-hVEpwWEcJLJoGq3rE3g', u'5REYrZfsX3m4E3FTwovp5Q',
        0.5584845063387178], dtype=object),
 array([u'-0-hVEpwWEcJLJoGq3rE3g', u'HWrbZS1mxVRj2Y2VwMmDMg',
        0.6976263497454933], dtype=object),
 array([u'-0-hVEpwWEcJLJoGq3rE3g', u'MvlQo4bev1eqp1q0HYOLHg',
        0.16895888751936397], dtype=object),
 array([u'-0-hVEpwWEcJLJoGq3rE3g', u'X9Bql7RrPU5Mab5-hJsI8A',
        0.2425728577085518], dtype=object),
 array([u'-0-hVEpwWEcJLJoGq3rE3g', u'owO2UkNKk9qrWWd_PTYLDA',
        0.9567172341532354], dtype=object)]

### Use and Test Data for predictions

In [52]:
trainData.take(5)

['1VVHf1BvtGC0aSCCIjQyiA,K5jY2W5Q3eNnwssV5UZtow,4,2016-11-16,2,2,2,past sunday one several time ive spirit its always eccentric fun time first music performance second hang recent visit sundays bingo bango spirit know pizza good drink also awesome last time get chard margarita time get tomatillo margarita hot ciders its always pleasant surprise see whats menu food drink drink little pricey drawback one coolest things spirit atmosphere its always super strange positive way really never know expect bingo bango definitely family appropriate its fun activity friends maybe even date doesnt mind something ordinary',
 'QYKexxaOJQlseGWmc6soRg,rzByiKaj-bLeLz-zKNBQdg,2,2015-04-13,0,0,0,old cramp build lot enough employees staff keep demand cause long wait time',
 '-ARdx8hOcEWlMDjzwLYZ_g,3cbsPfoUUrysf-M8FI_0IA,4,2014-03-24,6,4,3,live long world without donut menu dont know group nine din three varieties donuts include lemon lavender chocolate espresso zeppolli amaze pepper donut concoction ever m

Create key on (user, product)

In [53]:
train_rdd = trainData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), float(x[2])))
test_rdd = testData.mapPartitions(lambda x: csv.reader(x)).map(lambda x: ((x[0], x[1]), float(x[2])))

In [54]:
similarity_rdd = similarity_rdd.map(lambda x: ((x[0], x[1]), float(x[2])))

In [55]:
similarity_rdd.take(5)

[((u'-0-hVEpwWEcJLJoGq3rE3g', u'5REYrZfsX3m4E3FTwovp5Q'), 0.5584845063387178),
 ((u'-0-hVEpwWEcJLJoGq3rE3g', u'HWrbZS1mxVRj2Y2VwMmDMg'), 0.6976263497454933),
 ((u'-0-hVEpwWEcJLJoGq3rE3g', u'MvlQo4bev1eqp1q0HYOLHg'), 0.16895888751936397),
 ((u'-0-hVEpwWEcJLJoGq3rE3g', u'X9Bql7RrPU5Mab5-hJsI8A'), 0.2425728577085518),
 ((u'-0-hVEpwWEcJLJoGq3rE3g', u'owO2UkNKk9qrWWd_PTYLDA'), 0.9567172341532354)]

Join similarity matrix with train and test RDD to take only similarity values for training and testing

In [56]:
train = similarity_rdd.join(train_rdd)
test = similarity_rdd.join(test_rdd)

In [57]:
train.take(5)

[((u'hHqH_E9FCI_B6WubV0jPYA', u'ZNdV9ytExuxPTXSN8i2xhw'),
  (0.5929494120589855, 3.0)),
 ((u'V0pP_PQnWdtyKpF-pifiaw', u'Fpm3WvqtrAg2ueh_4pz7iA'),
  (0.4222360846859405, 4.0)),
 ((u'IKnLl7SbuP0u6HS34jwHhw', u'guQww9yGHP7rRTea6zTnDg'),
  (0.7562312611012413, 3.0)),
 ((u'4r33dXcE1oYZxjONrhxTiA', u'9gNko6cFCMZbvy1zhJ7-Xg'),
  (0.503761942458243, 5.0)),
 ((u'Rem81Xoev05aqeA-mFbM4A', u'1LUaZFVMEjodl1tbAGF3sQ'),
  (0.6223480746300942, 4.0))]

Convert RDD to List

In [58]:
train_ratings = train.collect()
test_ratings = test.collect()

In [59]:
test_rdd.take(5)

[(('JiPMk9WmbJu-VfTRAKpZpw', 'PdDpIGwBZoTYzOVasT-WuA'), 4.0),
 (('2wKnvn68eWybc7ID-7UQmQ', 'khRo2a5OaIjumox-tkg3GA'), 4.0),
 (('LsWpfxWjLQcazDqnZ_A62g', 'D_pwairtGGR0V_w2xx5XeA'), 2.0),
 (('0N9bSCmoJMoGmR0EldzjQg', '3iaOYhNoc6XL935MqnxJSQ'), 5.0),
 (('-hietrA8M58asfpyJkCyiA', 'O1ird5yRyuDFnOmYu90OoA'), 4.0)]

In [60]:
train_ratings

[((u'hHqH_E9FCI_B6WubV0jPYA', u'ZNdV9ytExuxPTXSN8i2xhw'),
  (0.5929494120589855, 3.0)),
 ((u'V0pP_PQnWdtyKpF-pifiaw', u'Fpm3WvqtrAg2ueh_4pz7iA'),
  (0.4222360846859405, 4.0)),
 ((u'IKnLl7SbuP0u6HS34jwHhw', u'guQww9yGHP7rRTea6zTnDg'),
  (0.7562312611012413, 3.0)),
 ((u'4r33dXcE1oYZxjONrhxTiA', u'9gNko6cFCMZbvy1zhJ7-Xg'),
  (0.503761942458243, 5.0)),
 ((u'Rem81Xoev05aqeA-mFbM4A', u'1LUaZFVMEjodl1tbAGF3sQ'),
  (0.6223480746300942, 4.0)),
 ((u'4wp4XI9AxKNqJima-xahlg', u'nc5uuDeM3EA9WJycGDeg1w'),
  (0.6111078184072443, 4.0)),
 ((u'BkMqpJikNc3r5itc-ui6ww', u'x8WI_GkeGHGJCXggDm8flg'),
  (0.42543171611221586, 3.0)),
 ((u'ftUrNSpxUr8eWCyOlsZVgg', u'-zu_7iWh5CnV9kEQyCDAQQ'),
  (2.184518067932159, 4.0)),
 ((u'vHHjaq9pSuwq8uCrQ85qlw', u'_R1jBQQieKpNGMBqmrLRyA'),
  (0.5298842491403585, 2.0)),
 ((u'BkMqpJikNc3r5itc-ui6ww', u'Tdc2v9DBRRYeG2P_h9OvRw'),
  (0.10282343753042394, 3.0)),
 ((u'1ZPnQs-tdvbX8ROjtnzcEg', u'OXrFWgoz533T8tMRemkiww'),
  (0.028542403685761365, 5.0)),
 ((u'tR12WeWha2DGhUrKcvTttQ', 

In [61]:
train.take(5)

[((u'hHqH_E9FCI_B6WubV0jPYA', u'ZNdV9ytExuxPTXSN8i2xhw'),
  (0.5929494120589855, 3.0)),
 ((u'V0pP_PQnWdtyKpF-pifiaw', u'Fpm3WvqtrAg2ueh_4pz7iA'),
  (0.4222360846859405, 4.0)),
 ((u'IKnLl7SbuP0u6HS34jwHhw', u'guQww9yGHP7rRTea6zTnDg'),
  (0.7562312611012413, 3.0)),
 ((u'4r33dXcE1oYZxjONrhxTiA', u'9gNko6cFCMZbvy1zhJ7-Xg'),
  (0.503761942458243, 5.0)),
 ((u'Rem81Xoev05aqeA-mFbM4A', u'1LUaZFVMEjodl1tbAGF3sQ'),
  (0.6223480746300942, 4.0))]

### Training Regressor on similarity values

In [62]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

Convert data to numpy array for regressor

In [63]:
X_train = []
y_train = []
for ratings in train_ratings:
    X_train.append(ratings[1][0])
    y_train.append(ratings[1][1])

X_train = np.array(X_train)
X_train = X_train.reshape(-1,1)

In [64]:
X_test = []
y_test = []
for ratings in test_ratings:
    X_test.append(ratings[1][0])
    y_test.append(ratings[1][1])

X_test = np.array(X_test)
X_test = X_test.reshape(-1,1)

In [65]:
X_test.shape

(9890, 1)

Train regressor on training data and make prediction on Test data<br>
Computed Mean Squared Error on predicted values

In [79]:
forest = RandomForestRegressor(max_depth=1, n_estimators=20)

rs = GradientBoostingRegressor(loss='ls', learning_rate=0.005, n_estimators=10000)

reg = LinearRegression()

forest.fit(X_train, y_train)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

expected = y_test
predicted = forest.predict(X_test)

train_expected = y_train
train_predicted = forest.predict(X_train)

print("Training:\n%s" % np.sqrt(mean_squared_error(train_expected, train_predicted)))
print("Result:\n%s" % np.sqrt(mean_squared_error(expected, predicted)))

KeyboardInterrupt: 

### Computed Ratings on missing values

In [67]:
missing_test = test_rdd.subtractByKey(test)

In [68]:
missing_ratings_user = missing_test.map(lambda x: ((x[0][0]), (x[0][1], x[1]))).join(avg_rating).map(lambda x: ((x[0], x[1][0][0]), (x[1][1], x[1][0][1])))

In [69]:
missing_ratings_user.take(5)

[(('70sSlkooEgL_TEjWDQbr3A', 'afXMX5llxcMFzbaPaBBm6A'), (3.12, 4.0)),
 (('pr8_C12oHakeNB4ZPp_dig', '3ZcxnR9YkDVRqqkDJMRWBg'), (4.2, 5.0)),
 (('dz8CFWEWuR_4S1zlZhWCMQ', 'rKh_Nl5edIB9AevqnDmO6g'),
  (3.7291666666666665, 3.0)),
 (('dz8CFWEWuR_4S1zlZhWCMQ', '_VYUU5HPLYasd-xdKLimNA'),
  (3.7291666666666665, 3.0)),
 (('z4MQzyewTRzSoStg0NwL-w', 'lvZOJWiwNymeBhOAgoy11w'),
  (3.911764705882353, 4.0))]

In [70]:
predictions = test.map(lambda x: ((x[0]), (rs.predict(np.array(x[1][0]).reshape(1,-1))[0], x[1][1])))

In [71]:
final_predictions = predictions.union(missing_ratings_user)

In [159]:
final_predictions.count()

12496

In [160]:
mse = final_predictions.map(lambda x: (x[1][0]-x[1][1])**2)

In [161]:
np.sqrt(mse.mean())

1.0299156039052781

In [72]:
final_predictions = final_predictions.collect()

### Save predictions file

In [73]:
with open('/Users/lakshya/Desktop/INF-553/Project/Doc2Vec_ReviewBasedPredictions_pittsburgh.txt', 'w') as f:
    for item in final_predictions:
        f.write(str(item[0][0])+","+str(item[0][1])+","+str(item[1][0])+"\n")

### Making predictions on training data for Hybrid model

In [74]:
train_predict = train.map(lambda x: ((x[0]), (rs.predict(np.array(x[1][0]).reshape(1,-1))[0], x[1][1])))

In [75]:
train_predict.take(5)

[((u'hHqH_E9FCI_B6WubV0jPYA', u'ZNdV9ytExuxPTXSN8i2xhw'),
  (3.8043756446421129, 3.0)),
 ((u'V0pP_PQnWdtyKpF-pifiaw', u'Fpm3WvqtrAg2ueh_4pz7iA'),
  (3.8790432749793768, 4.0)),
 ((u'IKnLl7SbuP0u6HS34jwHhw', u'guQww9yGHP7rRTea6zTnDg'),
  (3.8445359178337113, 3.0)),
 ((u'4r33dXcE1oYZxjONrhxTiA', u'9gNko6cFCMZbvy1zhJ7-Xg'),
  (3.9274325671063925, 5.0)),
 ((u'Rem81Xoev05aqeA-mFbM4A', u'1LUaZFVMEjodl1tbAGF3sQ'),
  (3.7981318019342316, 4.0))]

In [166]:
train_predictions = train_predict.map(lambda x: (x[1][0]-x[1][1])**2)

In [167]:
np.sqrt(train_predictions.mean())

1.03320310263514

In [76]:
train_predict = train_predict.collect()

In [77]:
with open('/Users/lakshya/Desktop/INF-553/Project/TrainDoc2Vec_ReviewBasedPredictions_pittsburgh.txt', 'w') as f:
    for item in train_predict:
        f.write(str(item[0][0])+","+str(item[0][1])+","+str(item[1][0])+"\n")