In [1]:
import numpy as np


import string
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, BooleanType
from pyspark.ml import Pipeline

from nlp_cl_start import data_tokenizer

In [None]:
import pyspark as ps
spark = (ps.sql.SparkSession.builder
        .master("local[4]")
        .appName("yelp_academic")
        .getOrCreate()
        )
sc = spark.sparkContext

In [2]:
import pandas as pd

In [3]:
from pipe_spar import api_f, cluster_biz_by_review , cluster_user_by_review

In [4]:
from nlp_cl_start import if_rest_udf

In [84]:
from my_metr import transform_to_score, my_scorer, transform_aggregated
from sklearn.metrics import accuracy_score, recall_score

## Load data
load all data, select bad reviews, define train and test

In [6]:
biz = spark.read.json('yelp_dataset/yelp_academic_dataset_business.json')
rev = spark.read.json('yelp_dataset/yelp_academic_dataset_review.json')

1. Filter restaraunts

In [7]:
rests = biz.filter(if_rest_udf(biz.categories))

In [8]:
rest_rev = rev.join(rests.select('business_id','stars').withColumnRenamed('stars','rating'),'business_id')
bad_reviews = rest_rev.filter('stars < 3')

In [31]:
bad_reviews.count()

768690

### Now I change train sample from 20000 to 100000

In [45]:
bad_sample = bad_reviews.sample(False, 0.127, seed =91)
#bad_sample = bad_reviews.sample(False, 0.027, seed =91)


In [46]:
%%time
bad_sample.count()

CPU times: user 2.23 ms, sys: 2.04 ms, total: 4.26 ms
Wall time: 20 s


97353

In [47]:
bad_sample.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double]

In [107]:
bad_sample.

True

2. Tokenize sample

In [48]:
sample_token= data_tokenizer(bad_sample)

In [49]:
splits = sample_token.randomSplit([0.8, 0.1, 0.1], seed = 91)


In [50]:
train = splits[0]
add_cl = splits[1]
test = splits[2]

In [52]:
train.first()

Row(business_id='--9e1ONYQuAa-CB_Rrw7Tw', cool=0, date='2012-05-11', funny=0, review_id='OaRMJKI6S7LAoa8xoEaqPg', stars=2, text="I feel like Twitter is pretty useless so far; I have an ok amount of followers I guess, but I feel like they're all following so many people that anything I say gets lost in the mess of their feed. I could tweet all day and get very little response to anything aside from a couple loyal folks on there. And it was one such reader that led me to Delmonico recently for what he claimed to be an excellent burger. He's a food writer in town so he should know what he's talking about, right?\n\nDelmonico is located in the small restaurant row of the Venetian right by another great burger spot, although most people would not come here for their burger. The interior is nothing too fancy in either the bar area or more traditional area in the back. I did notice while sitting at the bar that they have an amazing bourbon selection, but since I was there during a lunch break

In [53]:
train.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double, token: array<string>]

### Model to cluster all reviews

In [54]:
cv = CountVectorizer(minDF=10, vocabSize=5000, inputCol='token', outputCol='vectors')
km1 = KMeans(k = 15, featuresCol='vectors', maxIter= 30)


pipe_count = Pipeline(stages=[cv, km1])

In [55]:
%%time
pipe_cv_model = pipe_count.fit(train)

CPU times: user 115 ms, sys: 57.5 ms, total: 173 ms
Wall time: 1min 19s


3. Clustering user and bussiness

In [56]:
both = train.union(add_cl)
both.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double, token: array<string>]

In [57]:
user_with_cl = cluster_user_by_review(both, pipe_cv_model)

In [58]:
biz_with_cl = cluster_biz_by_review(both, pipe_cv_model)

In [62]:
user_with_cl.cache()
biz_with_cl.cache()

DataFrame[business_id: string, biz_cl: int]

['business_id', 'biz_cl']

## Create test set 

I need reviews that I didn't use for clusterization restorants and user. But in same time that contains clusterized restaraunts and users

In [60]:
train_rev_id =both.select('review_id')
train_rev_id.take(2)

[Row(review_id='OaRMJKI6S7LAoa8xoEaqPg'),
 Row(review_id='syUWGlVaWBkr9iI7sMJGcw')]

In [61]:
%%time
user_with_cl.count()

CPU times: user 8.59 ms, sys: 4.93 ms, total: 13.5 ms
Wall time: 54 s


87551

In [63]:
#### a little bit effective
known_rev2 = user_with_cl.join(rest_rev.select('business_id',
 'review_id',
 'stars',
 'user_id',
 'rating'), 'user_id').join(biz_with_cl, 'business_id')

In [64]:
%%time
known_rev2.count()

CPU times: user 32.9 ms, sys: 14.6 ms, total: 47.5 ms
Wall time: 27.5 s


13749260

In [65]:
new_t = known_rev2.join(train_rev_id, 'review_id','left_anti' )

In [66]:
%%time
new_t.count()

CPU times: user 37.1 ms, sys: 17.9 ms, total: 55 ms
Wall time: 27.9 s


12438216

In [67]:
new_t.cache()

DataFrame[review_id: string, business_id: string, user_id: string, user_cl: int, stars: bigint, rating: double, biz_cl: int]

In [71]:
new_t.columns

['review_id', 'business_id', 'user_id', 'user_cl', 'stars', 'rating', 'biz_cl']

In [76]:
new_t = new_t.withColumn('similar', (new_t.user_cl == new_t.biz_cl).cast("int"))

In [80]:
regroup = new_t.groupBy('review_id').agg({'rating': 'mean', 'stars':'mean', 'similar':'sum' })
regroup.columns

['review_id', 'sum(similar)', 'avg(rating)', 'avg(stars)']

In [81]:
%%time
regroup.count()

CPU times: user 17.1 ms, sys: 7.95 ms, total: 25.1 ms
Wall time: 1.97 s


735060

In [82]:
new_grf = regroup.toPandas()
new_grf.head()

Unnamed: 0,review_id,sum(similar),avg(rating),avg(stars)
0,-01ePuKPxMw1bhR4ISbIlw,1,4.0,4.0
1,-0XptEAda6qaK7QrkGF-IQ,1,3.0,4.0
2,-0u6BAh47_WiKXcjYcq8vQ,0,3.0,3.0
3,-1GR5fgGizpSfTMtWFQRUw,1,3.0,3.0
4,-1kQanNhit-7B9RBjY6p_A,6,4.0,3.0


In [90]:
new_grf = transform_aggregated(new_grf)
new_grf.head()

Unnamed: 0,review_id,sum(similar),avg(rating),avg(stars),pred,act,base,base_3.5
0,-01ePuKPxMw1bhR4ISbIlw,1,4.0,4.0,True,False,False,False
1,-0XptEAda6qaK7QrkGF-IQ,1,3.0,4.0,True,False,False,True
2,-0u6BAh47_WiKXcjYcq8vQ,0,3.0,3.0,False,False,False,True
3,-1GR5fgGizpSfTMtWFQRUw,1,3.0,3.0,True,False,False,True
4,-1kQanNhit-7B9RBjY6p_A,6,4.0,3.0,True,False,False,False


In [91]:
my_scorer(new_grf)

Unnamed: 0,base,base_3.5,combo_35,combo_base,pred
accuracy,0.799355,0.748716,0.450313,0.473242,0.472483
recall,0.181595,0.406068,0.761578,0.66857,0.587811


In [93]:
act = new_grf.act
act.mean(), act.sum()

(0.20248279052050172, 148837)

In [105]:
rand = np.random.binomial(1, 0.55, 735060 )

In [106]:
accuracy_score(act, rand), recall_score(act, rand)

(0.47070307185807964, 0.5487479591768176)

In [72]:
####work only for small dataframes
newt_df = new_t.toPandas()

In [69]:
new_gr = transform_to_score(newt_df)

In [30]:
new_gr.head(), new_gr.shape

(                        similar  stars  rating   pred    act   base  base_3.5
 review_id                                                                    
 ---94vtJ_5o_nikEs6hUjg      0.0      5     4.5  False  False  False     False
 ---L4b6VR6HoB-q7cfMWIA      1.0      5     3.5   True  False  False     False
 --0y7xOZPpiGD5d6vgfafg      0.0      1     3.5  False   True  False     False
 --74laQJk_BAMziNznD5Ig      0.0      3     3.5  False  False  False     False
 --7IGGbMWBpvFtEtXZxAZg      0.0      3     4.0  False  False  False     False,
 (163368, 7))

In [43]:
my_scorer(new_gr)

Unnamed: 0,base,base_3.5,combo_35,combo_base,pred
accuracy,0.779333,0.733926,0.562301,0.593195,0.592154
recall,0.168927,0.396413,0.623572,0.478074,0.369211
