In [1]:
import numpy as np


import string
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, BooleanType
from pyspark.ml import Pipeline

from nlp_cl_start import data_tokenizer

In [2]:
import pyspark as ps
spark = (ps.sql.SparkSession.builder
        .master("local[4]")
        .appName("yelp_academic")
        .getOrCreate()
        )
sc = spark.sparkContext

In [3]:
import pandas as pd

In [4]:
from pipe_spar import api_f, cluster_biz_by_review , cluster_user_by_review

In [5]:
from nlp_cl_start import if_rest_udf

In [6]:
from my_metr import transform_to_score, my_scorer, transform_aggregated
from sklearn.metrics import accuracy_score, recall_score

## Load data
load all data, select bad reviews, define train and test

In [7]:
biz = spark.read.json('yelp_dataset/yelp_academic_dataset_business.json')
rev = spark.read.json('yelp_dataset/yelp_academic_dataset_review.json')

1. Filter restaraunts

In [8]:
rests = biz.filter(if_rest_udf(biz.categories))

In [9]:
rest_rev = rev.join(rests.select('business_id','stars').withColumnRenamed('stars','rating'),'business_id')
bad_reviews = rest_rev.filter('stars < 3')

In [31]:
bad_reviews.count()

768690

### Now I change train sample from 20000 to 100000

In [10]:
bad_sample = bad_reviews.sample(False, 0.127, seed =91)
#bad_sample = bad_reviews.sample(False, 0.027, seed =91)


In [89]:
%%time
bad_sample.count()

CPU times: user 575 µs, sys: 1.04 ms, total: 1.62 ms
Wall time: 158 ms


97353

In [11]:
bad_sample.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double]

2. Tokenize sample

In [12]:
sample_token= data_tokenizer(bad_sample)

In [13]:
splits = sample_token.randomSplit([0.8, 0.1, 0.1], seed = 91)


In [14]:
train = splits[0]
add_cl = splits[1]
test = splits[2]

In [16]:
train.first()

Row(business_id='--9e1ONYQuAa-CB_Rrw7Tw', cool=0, date='2012-05-11', funny=0, review_id='OaRMJKI6S7LAoa8xoEaqPg', stars=2, text="I feel like Twitter is pretty useless so far; I have an ok amount of followers I guess, but I feel like they're all following so many people that anything I say gets lost in the mess of their feed. I could tweet all day and get very little response to anything aside from a couple loyal folks on there. And it was one such reader that led me to Delmonico recently for what he claimed to be an excellent burger. He's a food writer in town so he should know what he's talking about, right?\n\nDelmonico is located in the small restaurant row of the Venetian right by another great burger spot, although most people would not come here for their burger. The interior is nothing too fancy in either the bar area or more traditional area in the back. I did notice while sitting at the bar that they have an amazing bourbon selection, but since I was there during a lunch break

In [15]:
train.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double, token: array<string>]

### Model to cluster all reviews

In [17]:
cv = CountVectorizer(minDF=5, vocabSize=6000, inputCol='token', outputCol='vectors')
km1 = KMeans(k = 10, featuresCol='vectors', maxIter= 22)


pipe_count = Pipeline(stages=[cv, km1])

### idf 22.33

In [95]:
cv = CountVectorizer(minDF=5, vocabSize=5000, inputCol='token', outputCol='vectors')
idf = IDF(minDocFreq=7, inputCol="vectors", outputCol="features")
km2 = KMeans(k = 18, featuresCol='features', maxIter= 30)
pipe_idf = Pipeline(stages = [cv, idf, km2])

## idf 4/12/18 20:24 

In [16]:
cv = CountVectorizer(minDF=5, vocabSize=5000, inputCol='token', outputCol='vectors')
idf = IDF(minDocFreq=10, inputCol="vectors", outputCol="features")
km2 = KMeans(k = 10, featuresCol='features', maxIter= 30)
pipe_idf = Pipeline(stages = [cv, idf, km2])

In [148]:
IDF?

In [17]:
%%time
pipe_cv_model = pipe_count.fit(train)

NameError: name 'pipe_count' is not defined

In [18]:
%%time
###!!!!!!!!IDF
pipe_idf_model = pipe_idf.fit(train)

CPU times: user 150 ms, sys: 69 ms, total: 219 ms
Wall time: 1min 29s


3. Clustering user and bussiness

In [19]:
both = train.union(add_cl)
both.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double, token: array<string>]

In [20]:
user_with_cl = cluster_user_by_review(both, pipe_cv_model)

NameError: name 'pipe_cv_model' is not defined

In [118]:
biz_with_cl = cluster_biz_by_review(both, pipe_cv_model)

### idf 22.36

In [21]:
user_with_cl = cluster_user_by_review(both, pipe_idf_model)
biz_with_cl = cluster_biz_by_review(both, pipe_idf_model)

In [22]:
user_with_cl.cache()
biz_with_cl.cache()

DataFrame[business_id: string, biz_cl: int]

In [177]:
biz_with_cl.count()

87551

## Create test set 

I need reviews that I didn't use for clusterization restorants and user. But in same time that contains clusterized restaraunts and users

In [23]:
train_rev_id =both.select('review_id')
train_rev_id.take(2)

[Row(review_id='OaRMJKI6S7LAoa8xoEaqPg'),
 Row(review_id='syUWGlVaWBkr9iI7sMJGcw')]

In [120]:
%%time
user_with_cl.count()

CPU times: user 5.28 ms, sys: 3.46 ms, total: 8.74 ms
Wall time: 225 ms


87551

In [24]:
#### a little bit effective
known_rev2 = user_with_cl.join(rest_rev.select('business_id',
 'review_id',
 'stars',
 'user_id',
 'rating'), 'user_id').join(biz_with_cl, 'business_id')

In [25]:
%%time
known_rev2.count()

CPU times: user 98.2 ms, sys: 39.4 ms, total: 138 ms
Wall time: 1min 53s


3044361

In [26]:
new_t = known_rev2.join(train_rev_id, 'review_id','left_anti' )

In [190]:
%%time
new_t.count()

CPU times: user 77.3 ms, sys: 32.1 ms, total: 109 ms
Wall time: 1min 1s


12438216

In [27]:
####???
new_t.cache()

DataFrame[review_id: string, business_id: string, user_id: string, user_cl: int, stars: bigint, rating: double, biz_cl: int]

In [191]:
new_t.columns

['review_id', 'business_id', 'user_id', 'user_cl', 'stars', 'rating', 'biz_cl']

In [28]:
new_t = new_t.withColumn('similar', (new_t.user_cl == new_t.biz_cl).cast("int"))

In [29]:
regroup = new_t.groupBy('review_id').agg({'rating': 'mean', 'stars':'mean', 'similar':'sum' })
regroup.columns

['review_id', 'sum(similar)', 'avg(rating)', 'avg(stars)']

In [157]:
%%time
regroup.count()

CPU times: user 79.1 ms, sys: 32 ms, total: 111 ms
Wall time: 1min


735060

In [30]:
%%time
new_grf = regroup.toPandas()
new_grf.head()

CPU times: user 4.45 s, sys: 412 ms, total: 4.86 s
Wall time: 37.1 s


In [31]:
new_grf = transform_aggregated(new_grf)
new_grf.head()

Unnamed: 0,review_id,sum(similar),avg(rating),avg(stars),pred,act,base,base_3.5
0,-01ePuKPxMw1bhR4ISbIlw,1,4.0,4.0,True,False,False,False
1,-0XptEAda6qaK7QrkGF-IQ,1,3.0,4.0,True,False,False,True
2,-0u6BAh47_WiKXcjYcq8vQ,1,3.0,3.0,True,False,False,True
3,-1GR5fgGizpSfTMtWFQRUw,2,3.0,3.0,True,False,False,True
4,-1kQanNhit-7B9RBjY6p_A,2,4.0,3.0,True,False,False,False


In [32]:
my_scorer(new_grf)

Unnamed: 0,base,base_3.5,combo_35,combo_base,pred
accuracy,0.799355,0.748716,0.353616,0.367422,0.367484
recall,0.181595,0.406068,0.861614,0.808663,0.762169
prec,0.512817,0.385574,0.220051,0.216137,0.20892


In [144]:
act = new_grf.act
act.mean()


0.20248279052050172

In [161]:
new_grf['pred'].mean(), new_grf['base_3.5'].mean()

(0.7080836938481212, 0.213245177264441)

### 0.11 5,7,18 best

In [33]:
bad_sample2 = bad_reviews.sample(False, 0.5, seed =94)


In [34]:
sample_token2= data_tokenizer(bad_sample2)

In [35]:

biz_with_cl = cluster_biz_by_review(sample_token2, pipe_idf_model)

In [36]:
biz_with_cl = biz_with_cl.dropDuplicates()

In [37]:
biz_with_cl.cache()

DataFrame[business_id: string, biz_cl: int]

In [230]:
%%time
biz_with_cl.count()

CPU times: user 19 ms, sys: 9.26 ms, total: 28.3 ms
Wall time: 3.17 s


113815

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'neighborhood',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [206]:
biz.select('business_id').count()

188593

In [38]:
rests_id = rests.select('business_id')

In [223]:
rests_id.count()

57173

In [40]:
b0 = rests_id.join(biz_with_cl.filter('biz_cl ==0'), 'business_id', how = 'left')

In [41]:
b0.first()

Row(business_id='--9e1ONYQuAa-CB_Rrw7Tw', biz_cl=None)

In [234]:
%%time
b0.count()

CPU times: user 13.2 ms, sys: 7.83 ms, total: 21 ms
Wall time: 887 ms


57173

In [42]:
b0 = b0.withColumnRenamed('biz_cl', 'cl_0')
b0.columns

['business_id', 'cl_0']

In [237]:
for i in range(1,18):
    cond = 'biz_cl =='+str(i)
    colName = 'cl_'+str(i)
    b0 = b0.join(biz_with_cl.filter(cond), 'business_id', how = 'left').withColumnRenamed('biz_cl', colName)

In [43]:
### 4.12.18
for i in range(1,10):
    cond = 'biz_cl =='+str(i)
    colName = 'cl_'+str(i)
    b0 = b0.join(biz_with_cl.filter(cond), 'business_id', how = 'left').withColumnRenamed('biz_cl', colName)

In [44]:
b0.columns

['business_id',
 'cl_0',
 'cl_1',
 'cl_2',
 'cl_3',
 'cl_4',
 'cl_5',
 'cl_6',
 'cl_7',
 'cl_8',
 'cl_9']

In [45]:
biz_df_cl = rests.select(api_f).join(b0, 'business_id').toPandas()

In [46]:
biz_df_cl.head()

Unnamed: 0,business_id,RestaurantsPriceRange2,stars,review_count,categories,cl_0,cl_1,cl_2,cl_3,cl_4,cl_5,cl_6,cl_7,cl_8,cl_9
0,--9e1ONYQuAa-CB_Rrw7Tw,4,4.0,1546,"Steakhouses, Restaurants, Cajun/Creole",,1.0,,3.0,,5.0,6.0,,8.0,
1,-VAsjhmAbKF3Pb_-8rh3xg,1,2.0,10,"Fast Food, Burgers, Restaurants",,1.0,,,,,,,,
2,-cxD1NimFldATDUsN-oa3A,2,2.0,23,"Mexican, Restaurants",,1.0,,,,,6.0,,,
3,-r8SvItXXG6_T3mP5GXRAw,2,4.0,10,"Restaurants, Noodles, Food, Cafes, Chinese, Co...",,,,,,,,,,
4,0859wfd1BQHG46Zpwhc0ZQ,2,4.5,245,"Nightlife, Pizza, Wine Bars, Bars, American (N...",,1.0,,,,,,,,


In [47]:
biz_df_cl.to_csv('biz_cluster10.csv')

## Dirty trial to test on real data

In [48]:
from pymongo import MongoClient
import time
from bs4 import BeautifulSoup

mc = MongoClient()
db  = mc['raw_restaurants']

In [24]:
db.collection_names()

  """Entry point for launching an IPython kernel.


['reviews', 'reviews_scrap', 'users', 'restaurants']

In [49]:
rv_s = db['reviews_scrap']

In [50]:
rv_s.find_one({'rating': '2.0'})

{'_id': ObjectId('5bfb5ed579884720729dd20e'),
 'id': 'qgpe9AFPEQ7nc5NAnuACeA',
 'rating': '2.0',
 'text': 'Service at this bakery has gone to the toilet. First, they don\'t ever answer the phone. Why have a phone and not answer it? We called for 2 HRs straight and no one picked up. When we get there and ask them why they didn\'t answer the phone, they said they were busy. So we drove across town to find out they don\'t have the cake we need. On a separate occasion (today), we got the cake we want but they refused to write "Happy Birthday" on the cake. They told us they can only write "Happy Thanksgiving"! Unbelievable! Never again will we give them our business. I only give them 2 stars because of the quality of the cake in the past.',
 'category': '\nBakeries,\n                    Desserts\n',
 'date': '11/22/2017',
 'alias': 'schuberts-bakery-san-francisco',
 'user_id': '_3KTfz0hcPl_yq4jOyOPiA'}

In [45]:
rv_s.count_documents( {'rating': '1.0'})

797

In [51]:
scr1 = pd.DataFrame(list(rv_s.find({'rating': '1.0'})))

In [52]:
scr2 = pd.DataFrame(list(rv_s.find({'rating': '2.0'})))

In [50]:
scr1.shape, scr2.shape

((797, 9), (983, 9))

In [53]:
scr_b2 =pd.concat([scr1, scr2] ,axis = 0)

In [54]:
scr_b = scr_b2[['text','user_id']].copy()

In [55]:
scr_sp = spark.createDataFrame(scr_b)

In [56]:
scr_token = data_tokenizer(scr_sp)
scr_token.first()

Row(text="Rude employees, expensive, and not a lot of flavorsWe order 7 different small cups and the total came out to $45. When we asked for a receipt, the employee snapped at us. They don't have coffee flavor or coconut ice cream. Much better shaved ice else where near by like Loco Coco. Wouldn't come back or do not recommend. The ice cream place right next door would have been a better.", user_id='_3KTfz0hcPl_yq4jOyOPiA', token=['rude', 'employe', 'expens', 'lot', 'flavorsw', 'order', 'differ', 'small', 'cup', 'total', 'came', 'ask', 'receipt', 'employe', 'snap', 'us', 'coffe', 'flavor', 'coconut', 'ice', 'cream', 'much', 'better', 'shave', 'ice', 'els', 'near', 'like', 'loco', 'coco', 'come', 'back', 'recommend', 'ice', 'cream', 'place', 'right', 'next', 'door', 'would', 'better'])

In [57]:
user_scr= cluster_user_by_review(scr_token, pipe_idf_model)

In [58]:
user_scr.take(3)

[Row(user_id='SqPqSd7ql_UD5cYxRg_prA', user_cl=8),
 Row(user_id='zriuvFjGFSjvER9OgFJCew', user_cl=6),
 Row(user_id='havBggvuzfE6IPfkT-njbg', user_cl=1)]

In [59]:
user_df = user_scr.toPandas()

In [60]:
user_df.head()

Unnamed: 0,user_id,user_cl
0,SqPqSd7ql_UD5cYxRg_prA,8
1,zriuvFjGFSjvER9OgFJCew,6
2,havBggvuzfE6IPfkT-njbg,1
3,Y5b286tescycns--SAZ-yg,6
4,zxn9ZlFJKTTddjF6Du7rRA,0


In [61]:
user_df.to_csv('user_from_scr_clidf10.csv')

In [62]:
len(pd.unique(user_df.user_cl))

8

[CountVectorizer_48f9b7c8d6430072c4b0,
 IDF_4ad1b3e8bbd33e98b427,
 KMeans_44cc8d441b418e8af8a6]

### Finding center reviews

In [63]:
cvm = cv.fit(train)

In [64]:
cvm.save('center18idf/cv')

Py4JJavaError: An error occurred while calling o665.save.
: java.io.IOException: Path center18idf/cv already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:503)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:102)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [65]:
cv_tr = cvm.transform(train)

In [66]:
cv_tr.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double, token: array<string>, vectors: vector]

In [67]:
idfm = idf.fit(cv_tr)

In [133]:
idfm.save('center18idf/idf')

In [68]:
idf_tr = idfm.transform(cv_tr)

In [70]:
km10_mdl = km2.fit(idf_tr)

In [134]:
km2_mdl.save('center18idf/km')

In [72]:
cluster_C = km10_mdl.clusterCenters()

In [74]:
train_with_cl = km10_mdl.transform(idf_tr)

In [116]:
'''cv = CountVectorizer(minDF=5, vocabSize=5000, inputCol='token', outputCol='vectors')
idf = IDF(minDocFreq=7, inputCol="vectors", outputCol="features")
km2 = KMeans(k = 18, featuresCol='features', maxIter= 30)
'''
pipe_idf_model.save('pipe-idf-5-7-18')

In [85]:
%%time
train_with_cl.filter("prediction = 1").count()

CPU times: user 3.94 ms, sys: 2.31 ms, total: 6.26 ms
Wall time: 229 ms


44554

In [135]:
train_with_cl.filter("prediction = 17").select('text').first()

Row(text='There are many great choices for Italian in Las Vegas.  I probably won\'t be coming back here.\n\nThe antipasto tray was nice.  I thought the wine  list was a little pricey.  I am not enough of a wine aficionado to wash my $20 plate of pasta down with $75+ worth of wine.\n\nSalads were very good.  I had the arugula salad and my husband had the tricolore salad. Unfortunately, it only set us up for disapppointment with our entrees.\n\nMy husband\'s lasagna was decent -- I snuck a bite. and lasagna is one of my all-time favorite foods.  However, my risotto primavera was shockingly mediocre.  The "primavera" consisted of peas and cubed carrots, and the sauce was quite bland.  The dish was eerily similar to the Lipton Rice \'n Sauce mix  I make when I am out of ideas of what to serve as a side dish at home.\n\nThe service was not up to par, either.  The wait time  between courses was not paced weil.  Our server put our check on the table before she asked us if we wanted dessert or

In [137]:
train_with_cl.filter("prediction = 13").select('text').first()['text']

"Arrived just before 8pm on a Friday night. Came with my 8 year old son for a late dinner. Had to wait nearly 20 minutes for a table meanwhile they had a few tables open that I could see. The waitress promptly arrived, took our drink order and returned minutes later. We then placed our food order and went to the restroom to wash up. The ladies room smells like an outhouse. No joke. So gross. So, back to waiting for our food. It took quite a while. Long enough to deter me from coming back. When you're hungry, it's hard to be patient and I realize it isn't a fast food establishment. But when you're hungry, you're hungry. I ordered a medium well steak and fries and little man ordered chicken fingers and fries. Took a total of 25 minutes to arrive. My steak was a bit warmer than room temperature. Not impressed, again. Fries were hot so it was just poor timing by be kitchen staff. Food was flavourful. Steak was fairly tender but still too pink in the center for my liking. Will probably retu

In [86]:
for i in range(10):
    cond = 'prediction = '+str(i)
    text = train_with_cl.filter(cond).select('text').first()['text']
    print('\ncluster ', i, ':\n')
    print(text)


cluster  0 :

Overpriced and verrrrrrrrrry slow.  The pizza is good, the beer selection is fantastic, and the people are nice. Two big problems though:  1.  The pizza is much more expensive than comparable pizza elsewhere.  Beer prices are steep too.  2. Both times I've been here the pizza took far too long. I won't be coming back. If you value your time and money you'll stay away too.

cluster  1 :

My wife and I ate at Delmonico the night of July 4th. I had the bone-in ribeye and my wife had the filet. My wife's filet was tasty and what one would expect from a restaurant of this caliber. Unfortunately, my ribeye was tough and full of fat. I would say about half of it was edible. With al of the choices to enjoy a quality steak in Vegas, ignore the hype and go somewhere else.

cluster  2 :

I've appreciated Izakaya's around the world from the ultra-traditional in small-town Japan, where they are much better at sitting on the floor than me, to the ultra-hip Toronto version of Guu Izaka


cluster  7 :

Mom's out of town for Thanksgiving so what do two grown men do???  They go out to eat!  This choice brought us to Ben & Jacks in Old Town.  I've eaten at Peter Luger's in New York but never one of Ben & Jacks spin-offs so I was excited to try it.  

The dinner started off at the bar for a cocktail.  Guinness on draft is always a good idea but in a 10oz glass doesn't quite seem right.  The Stoli and Tonic I ordered was great but again the glassware made for a short pour. When the check came, almost 22 dollars for the beer and cocktail.  I don't care how high-end you are, that's absurd pricing.   

Continuing on with our drink pricing fiasco we were moved from the bar to our table where we ordered a bottle of Panna for our flat water...Always a good choice when a restaurant serves it, but as we came to find when the check came, 10 dollars was the going price for that liter of water. Did I mention the word absurd?!?  I don't know if they are trying to recoup all of their st