In [1]:
import numpy as np
import string
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import udf

from pyspark.sql import Row

PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))

In [116]:
from nlp_cl_start import print_cl

In [99]:
api_f = ['attributes.RestaurantsPriceRange2', 'business_id', 'stars', 'review_count', 'categories']

In [15]:
from pyspark.sql.types import ArrayType, StringType, BooleanType

In [70]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [16]:
def if_restaurant(text):
    if text is None:
        return False
    else:
        return 'Restaurants' in text

if_rest_udf = udf(if_restaurant, BooleanType())

In [3]:
import pyspark as ps
spark = (ps.sql.SparkSession.builder
        .master("local[4]")
        .appName("yelp_academic")
        .getOrCreate()
        )
sc = spark.sparkContext

In [4]:
import pandas as pd

In [12]:
biz = spark.read.json('yelp_dataset/yelp_academic_dataset_business.json')

In [5]:
rev = spark.read.json('yelp_dataset/yelp_academic_dataset_review.json')

In [6]:
bad = rev.filter()

Row(business_id='iCQpiavjjPzJ5_3gPD5Ebg', cool=0, date='2011-02-25', funny=0, review_id='x7mDIiDB3jEiPGPHOmDzyw', stars=2, text="The pizza was okay. Not the best I've had. I prefer Biaggio's on Flamingo / Fort Apache. The chef there can make a MUCH better NY style pizza. The pizzeria @ Cosmo was over priced for the quality and lack of personality in the food. Biaggio's is a much better pick if youre going for italian - family owned, home made recipes, people that actually CARE if you like their food. You dont get that at a pizzeria in a casino. I dont care what you say...", useful=0, user_id='msQe1u7Z_XuqjGoqhB0J5g')

In [8]:
bad = rev.filter('stars <= 3')

In [9]:
bad.take(2)

[Row(business_id='iCQpiavjjPzJ5_3gPD5Ebg', cool=0, date='2011-02-25', funny=0, review_id='x7mDIiDB3jEiPGPHOmDzyw', stars=2, text="The pizza was okay. Not the best I've had. I prefer Biaggio's on Flamingo / Fort Apache. The chef there can make a MUCH better NY style pizza. The pizzeria @ Cosmo was over priced for the quality and lack of personality in the food. Biaggio's is a much better pick if youre going for italian - family owned, home made recipes, people that actually CARE if you like their food. You dont get that at a pizzeria in a casino. I dont care what you say...", useful=0, user_id='msQe1u7Z_XuqjGoqhB0J5g'),
 Row(business_id='jtQARsP6P-LbkyjbO1qNGg', cool=1, date='2014-10-23', funny=1, review_id='LZp4UX5zK3e-c5ZGSeo3kA', stars=1, text='Terrible. Dry corn bread. Rib tips were all fat and mushy and had no flavor. If you want bbq in this neighborhood go to john mulls roadkill grill. Trust me.', useful=3, user_id='msQe1u7Z_XuqjGoqhB0J5g')]

In [10]:
bad.count()

2019159

In [11]:
bad.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string]

In [17]:
rests = biz.filter(if_rest_udf(biz.categories))

In [18]:
rests.join?

In [19]:
bad_rest_rev = bad.join(rests.select('business_id'),'business_id')

In [21]:
bad_rest_rev.count()

1265432

In [23]:
bad_rest_rev.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string]

In [25]:
bad_sample = bad_rest_rev.select('business_id','review_id', 'stars', 'text').sample(False, 0.001, seed =91)

In [26]:
%%time
bad_sample.count()

CPU times: user 1.96 ms, sys: 1.87 ms, total: 3.83 ms
Wall time: 17.2 s


1219

In [27]:
def tokenize(text):
    regex = re.compile('<.+?>|[^a-zA-Z]')
    clean_txt = regex.sub(' ', text)
    tokens = clean_txt.split()
    lowercased = [t.lower() for t in tokens]

    no_punctuation = []
    for word in lowercased:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION])
        no_punctuation.append(punct_removed)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]

    STEMMER = PorterStemmer()
    stemmed = [STEMMER.stem(w) for w in no_stopwords]
    return [w for w in stemmed if w]

In [29]:
udf_tokenize = udf(f=tokenize, returnType=ArrayType(StringType()))

In [31]:
bad_sample = bad_sample.withColumn('token', udf_tokenize('text'))

In [33]:
cv = CountVectorizer(minDF=10, vocabSize=5000, inputCol='token', outputCol='vectors')

In [35]:
model = cv.fit(bad_sample)

In [36]:
%%time
sample_vect = model.transform(bad_sample)

CPU times: user 2.56 ms, sys: 1.54 ms, total: 4.1 ms
Wall time: 124 ms


In [38]:
%%time
sample_vect.limit(3).toPandas()

CPU times: user 9.88 ms, sys: 4.2 ms, total: 14.1 ms
Wall time: 1.28 s


Unnamed: 0,business_id,review_id,stars,text,token,vectors
0,rPk01P-gHFZnBtRyfc7Wrg,GD0hHJCuCSkvPHCowIs5Tw,1,The food is not good. Enchilada sauce taste ve...,"[food, good, enchilada, sauc, tast, bland, str...","(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
1,aLcFhMe6DDJ430zelCpd2A,qp5HMCDxnXEmBudkZtskDg,2,Here is the best advice I can provide if you'r...,"[best, advic, provid, think, head, khao, san, ...","(6.0, 4.0, 2.0, 2.0, 1.0, 0.0, 0.0, 1.0, 4.0, ..."
2,FRIIEcW64yA_bW6sCBqBIA,XJs8cfkHZqFC88fKcEriww,3,Went for dinner with couple of friends. The re...,"[went, dinner, coupl, friend, restaur, dim, li...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [39]:
%%time
idf = IDF(inputCol= 'vectors', outputCol= 'features')

model2 = idf.fit(sample_vect)

sample_tfidf = model2.transform(sample_vect)

sample_tfidf.cache()

CPU times: user 9.96 ms, sys: 4.02 ms, total: 14 ms
Wall time: 4.81 s


In [66]:
sample_tfidf.first()

Row(business_id='rPk01P-gHFZnBtRyfc7Wrg', review_id='GD0hHJCuCSkvPHCowIs5Tw', stars=1, text="The food is not good. Enchilada sauce taste very bland and strange. Kids meal are bad and it's all over priced. It's an open kitchen too. I don't see how they pass health inspections . Won't go back.", token=['food', 'good', 'enchilada', 'sauc', 'tast', 'bland', 'strang', 'kid', 'meal', 'bad', 'price', 'open', 'kitchen', 'see', 'pass', 'health', 'inspect', 'go', 'back'], vectors=SparseVector(1128, {0: 1.0, 3: 1.0, 8: 1.0, 11: 1.0, 26: 1.0, 36: 1.0, 50: 1.0, 57: 1.0, 63: 1.0, 109: 1.0, 154: 1.0, 191: 1.0, 251: 1.0, 291: 1.0, 514: 1.0, 1055: 1.0}), features=SparseVector(1128, {0: 0.6098, 3: 0.8801, 8: 1.1456, 11: 1.3166, 26: 1.7221, 36: 1.8439, 50: 2.1368, 57: 2.0314, 63: 2.1794, 109: 2.4061, 154: 2.6407, 191: 2.7628, 251: 3.1748, 291: 3.1554, 514: 3.6726, 1055: 4.7087}))

In [96]:
###number of cluster!! make sence to send so class
cl_num = 20


In [97]:
%%time
km = KMeans(k = cl_num)

model_km = km.fit(sample_tfidf)

centers = model_km.clusterCenters()

CPU times: user 65.4 ms, sys: 21.1 ms, total: 86.6 ms
Wall time: 1.54 s


In [None]:
vocab = np.array(model.vocabulary)
vocab[np.argsort(centers )[:2,-10:]]

In [None]:
sample_tfidf.columns

In [92]:
pred = model_km.transform(sample_tfidf)
pred.columns

['business_id',
 'review_id',
 'stars',
 'text',
 'token',
 'vectors',
 'features',
 'prediction']

In [88]:
ClusteringEvaluator?

In [94]:
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(pred)
print("Silhouette with squared euclidean distance = " + str(silhouette))


Silhouette with squared euclidean distance = -0.36086075681766144


In [100]:
rest_with_cl_train =pred.join(biz.select(api_f ), 'business_id')

In [101]:
rest_with_cl_train.count()

1219

In [102]:
clust_df = rest_with_cl_train.toPandas()

In [105]:
clust_df.groupby('prediction').count()

Unnamed: 0_level_0,business_id,review_id,stars,text,token,vectors,features,RestaurantsPriceRange2,stars,review_count,categories
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,5,5,5,5,5,5,5,5,5,5,5
1,3,3,3,3,3,3,3,3,3,3,3
2,1,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,1
5,1168,1168,1168,1168,1168,1168,1168,1154,1168,1168,1168
6,1,1,1,1,1,1,1,1,1,1,1
7,1,1,1,1,1,1,1,1,1,1,1
8,4,4,4,4,4,4,4,4,4,4,4
9,1,1,1,1,1,1,1,1,1,1,1


In [106]:
clu12 = clust_df[clust_df.prediction == 12]

In [109]:
clu12[['business_id','stars','categories','RestaurantsPriceRange2','token']]

Unnamed: 0,business_id,stars,stars.1,categories,RestaurantsPriceRange2,token
0,qfcdMhm1Ff28JHVpHca20g,2,1.5,"Pizza, Restaurants",2.0,"[good, pizza, deliveri, time, realli, bad, eve..."
9,Z5eukYH32_nFljOTC2DJ0g,3,4.0,"Pizza, Restaurants",2.0,"[classic, ny, pizza, place, interior, design, ..."
127,OXrFWgoz533T8tMRemkiww,2,4.0,"Pizza, Italian, Restaurants",2.0,"[sever, disappoint, trip, il, pizzaiolo, marke..."
180,v9jNkOIBfP4aW2ru50Rn-A,2,2.5,"Restaurants, Italian",2.0,"[boyfriend, weekend, alway, best, experi, food..."
192,A5Rkh7UymKm0_Rxm9K2PJw,2,4.0,"Cocktail Bars, Vegetarian, Steakhouses, Americ...",2.0,"[last, night, visit, yardhous, big, fan, sure,..."
237,rHzf-EDTP9g6gQFYAz0RuQ,1,2.5,"Salad, Restaurants, Pizza, Chicken Wings",,"[disappoint, panago, pizza, order, larg, peppe..."
242,ai8nfTBNvL579cjIuqZajQ,3,4.0,"Food, Restaurants, Pizza, Sandwiches, Desserts...",2.0,"[want, tri, differ, pizza, place, one, ever, c..."
336,CJvN2k3gjR7JspTx21icTQ,3,4.5,"Pizza, Restaurants, Italian",1.0,"[pizza, delici, big, fan, new, york, pizza, pi..."
406,swi7mi1ixWpu5-tZO2mtsg,1,3.0,"Italian, Restaurants, Sandwiches, Pizza",2.0,"[earlier, even, place, order, grub, hub, inch,..."
444,kd1VNJdd92T2eQ4hqvYQ5A,2,4.0,"Pizza, Restaurants, Chicken Wings, Salad",1.0,"[updat, sicilian, receiv, april, order, somewh..."
