In [154]:
import numpy as np
import string
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import udf

from pyspark.sql import Row

PUNCTUATION = set(string.punctuation)
STOPWORDS = set(stopwords.words('english'))

In [155]:
from pyspark.ml.clustering import  KMeansModel

In [2]:
#from nlp_cl_start import print_cl

In [3]:
api_f = ['attributes.RestaurantsPriceRange2', 'business_id', 'stars', 'review_count', 'categories']

In [4]:
from pyspark.sql.types import ArrayType, StringType, BooleanType

In [5]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [6]:
def if_restaurant(text):
    if text is None:
        return False
    else:
        return 'Restaurants' in text

if_rest_udf = udf(if_restaurant, BooleanType())

In [7]:
import pyspark as ps
spark = (ps.sql.SparkSession.builder
        .master("local[4]")
        .appName("yelp_academic")
        .getOrCreate()
        )
sc = spark.sparkContext

In [8]:
import pandas as pd

In [9]:
from nlp_cl_start import kmean_counts

In [10]:
biz = spark.read.json('yelp_dataset/yelp_academic_dataset_business.json')

In [11]:
rev = spark.read.json('yelp_dataset/yelp_academic_dataset_review.json')

In [12]:
bad = rev.filter('stars < 3')

In [12]:
bad.take(2)

[Row(business_id='iCQpiavjjPzJ5_3gPD5Ebg', cool=0, date='2011-02-25', funny=0, review_id='x7mDIiDB3jEiPGPHOmDzyw', stars=2, text="The pizza was okay. Not the best I've had. I prefer Biaggio's on Flamingo / Fort Apache. The chef there can make a MUCH better NY style pizza. The pizzeria @ Cosmo was over priced for the quality and lack of personality in the food. Biaggio's is a much better pick if youre going for italian - family owned, home made recipes, people that actually CARE if you like their food. You dont get that at a pizzeria in a casino. I dont care what you say...", useful=0, user_id='msQe1u7Z_XuqjGoqhB0J5g'),
 Row(business_id='jtQARsP6P-LbkyjbO1qNGg', cool=1, date='2014-10-23', funny=1, review_id='LZp4UX5zK3e-c5ZGSeo3kA', stars=1, text='Terrible. Dry corn bread. Rib tips were all fat and mushy and had no flavor. If you want bbq in this neighborhood go to john mulls roadkill grill. Trust me.', useful=3, user_id='msQe1u7Z_XuqjGoqhB0J5g')]

In [13]:
bad.count()

1345953

In [14]:
bad.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string]

In [13]:
rests = biz.filter(if_rest_udf(biz.categories))

In [111]:
#rest_rev = rev.join(rests.select('business_id', 'stars').withColumnRenamed('stars','rating'),'business_id')

In [15]:
bad_rest_rev = bad.join(rests.select('business_id','stars').withColumnRenamed('stars','rating'),'business_id')

In [19]:
bad_rest_rev.count()

768690

In [20]:
bad_rest_rev.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string]

In [19]:
bad_sample = bad_rest_rev.select('business_id','review_id', 'user_id','stars', 'text', 'rating').sample(False, 0.027, seed =91)

In [20]:
%%time
bad_sample.count()

CPU times: user 1.82 ms, sys: 1.61 ms, total: 3.44 ms
Wall time: 15.3 s


20765

In [21]:
bad_sample.cache()

DataFrame[business_id: string, review_id: string, user_id: string, stars: bigint, text: string, rating: double]

In [22]:
def tokenize(text):
    regex = re.compile('<.+?>|[^a-zA-Z]')
    clean_txt = regex.sub(' ', text)
    tokens = clean_txt.split()
    lowercased = [t.lower() for t in tokens]

    no_punctuation = []
    for word in lowercased:
        punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION])
        no_punctuation.append(punct_removed)
    no_stopwords = [w for w in no_punctuation if not w in STOPWORDS]

    STEMMER = PorterStemmer()
    stemmed = [STEMMER.stem(w) for w in no_stopwords]
    return [w for w in stemmed if w]

In [23]:
udf_tokenize = udf(f=tokenize, returnType=ArrayType(StringType()))

In [24]:
bad_sample = bad_sample.withColumn('token', udf_tokenize('text'))

In [25]:
cv = CountVectorizer(minDF=10, vocabSize=5000, inputCol='token', outputCol='vectors')

In [26]:
splits = bad_sample.randomSplit([0.8,0.1,0.1])

In [27]:
train_k = splits[0]
train_ad = splits[1]
valid = splits[2]

In [28]:
model = cv.fit(train_k)

In [29]:
%%time
sample_vect = model.transform(train_k)

CPU times: user 2.63 ms, sys: 1.89 ms, total: 4.53 ms
Wall time: 159 ms


In [146]:
model.save('countvect')

In [30]:
%%time
sample_vect.limit(3).toPandas()

CPU times: user 15 ms, sys: 10.9 ms, total: 25.9 ms
Wall time: 1.81 s


Unnamed: 0,business_id,review_id,user_id,stars,text,rating,token,vectors
0,-BxWyEIQ6wypT-37MzZizQ,fLFxlSggbp9JJyL11Kt5mw,WBDenhIG-zB271HN9CVAAQ,2,see the waitress? see her ignore you? see a ...,4.0,"[see, waitress, see, ignor, see, bartend, good...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 3.0, 0.0, 0.0, ..."
1,-Miw03v5yXJWjH9MN1aglw,p_zxYEhzM43o52NeFlKunA,Xo99Z-shvqVAijcrQjr_Sg,2,Not a stellar start for Michael Noble. As a r...,4.0,"[stellar, start, michael, nobl, regular, notab...","(3.0, 4.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0, 1.0, ..."
2,-MuatiMmslPOvk9kOMyjkA,iHA6N96oIVQOTJ5biZmhXw,9wF_E3anRNOTq2BrgUXuVg,1,Stopped in for dinner and a glass of wine........,4.0,"[stop, dinner, glass, wine, sat, booth, bar, w...","(1.0, 1.0, 1.0, 3.0, 0.0, 1.0, 0.0, 1.0, 1.0, ..."


In [33]:
sample_vect.cache()

DataFrame[business_id: string, review_id: string, user_id: string, stars: bigint, text: string, rating: double, token: array<string>, vectors: vector]

## Train k-mean on sample (countvectorizer)

In [34]:
%%time
cl = 20

km = KMeans(k = cl, featuresCol='vectors', maxIter= 30)

model_km = km.fit(sample_vect)

centers_c = model_km.clusterCenters()

CPU times: user 219 ms, sys: 56 ms, total: 275 ms
Wall time: 21.3 s


In [147]:
model_km.save('kmeans')

In [153]:
mode_kkk = KMeansModel.load('kmeans')

In [46]:
vocab = np.array(model.vocabulary)
vocab[np.argsort(centers_c )[:2,-10:]]

array([['get', 'back', 'like', 'time', 'servic', 'good', 'order', 'food',
        'go', 'place'],
       ['restaur', 'good', 'go', 'time', 'get', 'one', 'place', 'order',
        'like', 'food']], dtype='<U15')

In [35]:
pred = model_km.transform(sample_vect)
pred.columns

['business_id',
 'review_id',
 'user_id',
 'stars',
 'text',
 'rating',
 'token',
 'vectors',
 'prediction']

In [39]:
ad_vect = model.transform(train_ad)
pred_ad = model_km.transform(ad_vect)

both = pred_ad.union(pred)

In [129]:
to_hold = pred_ad.union(pred)
to_hold = to_hold.select('review_id')
to_hold.cache()

DataFrame[review_id: string]

In [41]:
both = both.select('business_id','user_id','prediction')
both.cache()

DataFrame[business_id: string, user_id: string, prediction: int]

In [43]:
user_pr = both.select('user_id', 'prediction').withColumnRenamed('prediction','user_cl')
biz_pr= both.select('business_id', 'prediction').withColumnRenamed('prediction','biz_cl')

In [37]:
valid.columns

['business_id', 'review_id', 'user_id', 'stars', 'text', 'rating', 'token']

In [42]:
test = valid.select('business_id',  'user_id', 'review_id','stars',  'rating')
test.count()

2063

In [46]:
test_join = test.join(user_pr, 'user_id').join(biz_pr, 'business_id')

In [50]:
test_df =test_join.toPandas()

In [55]:
len(test_df['user_id'].unique()), len(test_df['business_id'].unique()), len(test_df['review_id'].unique())

(108, 107, 108)

In [80]:
test_df['similar']  = (test_df.user_cl == test_df.biz_cl)
test_df.head()

Unnamed: 0,business_id,user_id,review_id,stars,rating,user_cl,biz_cl,similar
0,AWJhMTYsIYzCIYAIUA-waA,SPtjWBJmUQOaVManvSLY5Q,5foH5YEx7irPfLa_mOrg8A,2,2.5,15,2,False
1,AWJhMTYsIYzCIYAIUA-waA,SPtjWBJmUQOaVManvSLY5Q,5foH5YEx7irPfLa_mOrg8A,2,2.5,15,0,False
2,AWJhMTYsIYzCIYAIUA-waA,SPtjWBJmUQOaVManvSLY5Q,5foH5YEx7irPfLa_mOrg8A,2,2.5,15,11,False
3,AWJhMTYsIYzCIYAIUA-waA,SPtjWBJmUQOaVManvSLY5Q,5foH5YEx7irPfLa_mOrg8A,2,2.5,15,1,False
4,AWJhMTYsIYzCIYAIUA-waA,SPtjWBJmUQOaVManvSLY5Q,5foH5YEx7irPfLa_mOrg8A,2,2.5,15,0,False


In [96]:
test_gr_df =  test_df.groupby('review_id').agg({'similar':'sum', 'stars':'mean', 'rating':'mean'})
test_gr_df['pred'] = (test_gr_df.similar > 0)
test_gr_df['act'] = (test_gr_df.stars < 3)
test_gr_df['base'] = (test_gr_df.rating < 3)
test_gr_df['base_3.5'] = (test_gr_df.rating < 3.5)


test_gr_df.head()

Unnamed: 0_level_0,similar,stars,rating,pred,act,base,base_3.5
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1Ct6EYB3JYa9vp092w1JQA,2.0,2,3.0,True,True,False,True
21VS5dUcc5OtF-sREbgRyw,0.0,2,4.0,False,True,False,False
2LuaEfPsNb_77DxPiIIGSg,1.0,2,2.5,True,True,True,True
2Vn2J41b-wrS__5JcaKrlQ,0.0,1,3.5,False,True,False,False
32nl-wa3RtRsihVkzB4tVg,2.0,2,3.5,True,True,False,False


In [115]:
from my_metr import transform_to_score

In [91]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

### Compare accuracy, precision, recall with base model 

In [127]:
recall_score(test_gr_df.act, test_gr_df.pred), recall_score(test_gr_df.act, test_gr_df.base), recall_score(test_gr_df.act, test_gr_df['base_3.5'])

(0.37962962962962965, 0.19444444444444445, 0.4537037037037037)

## New test

In [107]:
new_T = rev.select('business_id',  'user_id', 'review_id','stars').join(rests.select('business_id', 'stars').withColumnRenamed('stars','rating'),'business_id')

In [108]:
new_T.columns

['business_id', 'user_id', 'review_id', 'stars', 'rating']

In [109]:
new_tf = new_T.join(user_pr, 'user_id').join(biz_pr, 'business_id')

In [112]:
BIG_T = new_tf.toPandas()

In [114]:
BIG_T.head()

Unnamed: 0,business_id,user_id,review_id,stars,rating,user_cl,biz_cl
0,3VvPKmr-6LXJ7aWlFtp4Yg,YgavGxfAdjhkkbwlAY_9ZQ,zPRRy6tQZQEAP46IGQn_8Q,4,3.5,10,1
1,leMIHa6TogufHv5HNYjnfw,YgavGxfAdjhkkbwlAY_9ZQ,6PkCZqMq9pgpBBQEe00eDg,2,4.0,10,10
2,d84_mtSFHt8xgYTly2T-9A,YgavGxfAdjhkkbwlAY_9ZQ,FEtMyYYTSQyOArTSrFSrHA,3,2.5,10,1
3,EJmiKQKlWfY-0iVlya7FOQ,YgavGxfAdjhkkbwlAY_9ZQ,8ZMhRRoj3TEq32TQkGfgIA,2,2.5,10,1
4,gOBxVkHpqtjRRxHBIrpnMA,YgavGxfAdjhkkbwlAY_9ZQ,J0WxHbfHRJRwR8v5eZSv5A,2,3.5,10,1


In [117]:
BIG_agg = transform_to_score(BIG_T)
BIG_agg.head()

Unnamed: 0_level_0,similar,stars,rating,pred,act,base,base_3.5
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
---p28WNWGZuG6gLAt-V1w,1.0,2,2.5,True,True,True,True
--0y7xOZPpiGD5d6vgfafg,0.0,1,3.5,False,True,False,False
--4yS1qzrZI9xoH_2Z-sCA,0.0,4,4.0,False,False,False,False
--6WmlklBfY7Ecr3A-Ft0Q,0.0,4,2.5,False,False,True,True
--74laQJk_BAMziNznD5Ig,0.0,3,3.5,False,False,False,False


In [132]:
hold_rev_id = to_hold.toPandas()

In [135]:
hold_rev_id.head()

Unnamed: 0,review_id
0,E_H7LnBlqff0GwiUCWmBmQ
1,f0u86A3T9bkIzn1gX7amUA
2,0rAxUcOIzn42G7LmMs7qVw
3,qlji3Yypqe0CF0_NRxzE9w
4,eZXZhG5Bk3p0LWPSqPtfsw


In [136]:
BIG_filter = BIG_agg.loc[~BIG_agg.index.isin(hold_rev_id)].copy()

In [138]:
BIG_filter.head(3)

Unnamed: 0_level_0,similar,stars,rating,pred,act,base,base_3.5
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
---p28WNWGZuG6gLAt-V1w,1.0,2,2.5,True,True,True,True
--0y7xOZPpiGD5d6vgfafg,0.0,1,3.5,False,True,False,False
--4yS1qzrZI9xoH_2Z-sCA,0.0,4,4.0,False,False,False,False


In [137]:
my_pred = BIG_filter.pred
act = BIG_filter.act
base = BIG_filter.base
base_35 = BIG_filter['base_3.5']

In [139]:
accuracy_score(act, my_pred), accuracy_score(act, base), accuracy_score(act, base_35)

(0.6423021138372792, 0.718671585901869, 0.7003788693479808)

In [140]:
recall_score(act, my_pred), recall_score(act, base), recall_score(act, base_35)

(0.5706969818036538, 0.1737188101550866, 0.3977045726945847)

In [151]:
recall_score(act, my_pred),  recall_score(act, base_35)

(0.5706969818036538, 0.3977045726945847)

In [150]:
accuracy_score(act, my_pred),  accuracy_score(act, base_35)

(0.6423021138372792, 0.7003788693479808)

In [155]:
test.count()

938

In [93]:
test.first()['business_id'], test.first()['user_id']

('3DgPtOe-FKRH0bFE3ptzxA', '6IhssoagVtfNMnNjU71Q1A')

In [156]:
test.cache()

DataFrame[business_id: string, user_id: string, rating: double, stars: bigint]

In [188]:
def cluster_rest(business_id, not_user_id):
    texts = rest_rev.filter('stars < 3').filter((rest_rev.business_id == business_id)
                                        &(rest_rev.user_id!=not_user_id)).select('text')
    texts = texts.withColumn('token', udf_tokenize('text'))
    vect = model.transform(texts)
    #vect.cache()
    pred = model_km.transform(vect).select('prediction')
    return pred
    

In [126]:
%%time
cl_r = cluster_rest('3DgPtOe-FKRH0bFE3ptzxA','6IhssoagVtfNMnNjU71Q1A')

CPU times: user 10.4 ms, sys: 3.64 ms, total: 14 ms
Wall time: 938 ms


In [187]:
def cluster_user(not_business_id, user_id):
    texts = rest_rev.filter('stars < 3').filter((rest_rev.user_id == user_id)
                                        &(rest_rev.business_id!=not_business_id)).select('text')
    texts = texts.withColumn('token', udf_tokenize('text'))
    vect = model.transform(texts)
    #vect.cache()
    pred = model_km.transform(vect).select('prediction')
    return pred

In [127]:
%%time
cl_u =cluster_user('3DgPtOe-FKRH0bFE3ptzxA','6IhssoagVtfNMnNjU71Q1A')

CPU times: user 14.1 ms, sys: 4.93 ms, total: 19 ms
Wall time: 719 ms


In [144]:
%%time
cl_r.intersect(cl_u).count()

CPU times: user 35 ms, sys: 16.8 ms, total: 51.8 ms
Wall time: 1.4 s


0

In [172]:
def predict_bad(row):
    business_id = row['business_id']
    user_id = row['user_id']
    mean_rating = row['rating']
    cl_u = cluster_user(business_id, user_id)
    cl_r = cluster_rest(business_id, user_id)
    if cl_r.intersect(cl_u).count() > 0:
        return 2
    else:
        return mean_rating

In [166]:
test1_df = test.toPandas()

In [191]:
#

In [170]:
ttt = test1_df[:4].copy()

In [174]:
%%time
y_h = ttt.apply(predict_bad, axis =1)

CPU times: user 262 ms, sys: 104 ms, total: 365 ms
Wall time: 3min 11s


In [175]:
test2 = rest_rev.select('business_id','user_id','rating', 'stars').sample(False, 0.0027, seed =91)

In [176]:
test2.cache()

DataFrame[business_id: string, user_id: string, rating: double, stars: bigint]

In [177]:
t2_df = test2.toPandas()

In [None]:
t2_df['y_hat'] = t2_df.apply(predict_bad, axis =1)

['business_id', 'review_id', 'stars', 'text', 'token', 'vectors', 'prediction']

In [48]:
evaluator = ClusteringEvaluator(featuresCol='vectors')

silhouette = evaluator.evaluate(pred)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = -0.1117817836165352


In [76]:
count_tr_df =pred.join(biz.select(api_f ).withColumnRenamed('stars', 'rating'), 'business_id').toPandas()

In [109]:
len(count_tr_df.user_id.unique())

19575

In [110]:
count_tr_df.shape

(20765, 12)

In [77]:
count_tr_df.groupby('prediction').count()

Unnamed: 0_level_0,business_id,review_id,user_id,stars,text,token,vectors,RestaurantsPriceRange2,rating,review_count,categories
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,6841,6841,6841,6841,6841,6841,6841,6718,6841,6841,6841
1,161,161,161,161,161,161,161,161,161,161,161
2,3550,3550,3550,3550,3550,3550,3550,3515,3550,3550,3550
3,1642,1642,1642,1642,1642,1642,1642,1619,1642,1642,1642
4,1244,1244,1244,1244,1244,1244,1244,1221,1244,1244,1244
5,2425,2425,2425,2425,2425,2425,2425,2399,2425,2425,2425
6,48,48,48,48,48,48,48,48,48,48,48
7,511,511,511,511,511,511,511,507,511,511,511
8,219,219,219,219,219,219,219,216,219,219,219
9,355,355,355,355,355,355,355,354,355,355,355


## A little more beatiful