In [2]:
import numpy as np


import string
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, BooleanType
from pyspark.ml import Pipeline

from nlp_cl_start import data_tokenizer

In [102]:
import pandas as pd

In [47]:
from pipe_spar import api_f, cluster_biz_by_review , cluster_user_by_review

In [25]:
from nlp_cl_start import if_rest_udf

In [103]:
from my_metr import transform_to_score, my_scorer
from sklearn.metrics import accuracy_score, recall_score

## Load data
load all data, select bad reviews, define train and test

In [26]:
biz = spark.read.json('yelp_dataset/yelp_academic_dataset_business.json')
rev = spark.read.json('yelp_dataset/yelp_academic_dataset_review.json')

1. Filter restaraunts

In [27]:
rests = biz.filter(if_rest_udf(biz.categories))

In [28]:
rest_rev = rev.join(rests.select('business_id','stars').withColumnRenamed('stars','rating'),'business_id')
bad_reviews = rest_rev.filter('stars < 3')

In [31]:
bad_reviews.count()

768690

In [33]:
bad_sample = bad_reviews.sample(False, 0.027, seed =91)

In [34]:
bad_sample.count()

20765

In [35]:
bad_sample.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double]

2. Tokenize sample

In [36]:
sample_token= data_tokenizer(bad_sample)

In [127]:
splits = sample_token.randomSplit([0.8, 0.1, 0.1], seed = 91)


In [111]:
train = splits[0]
add_cl = splits[1]
test = splits[2]

In [112]:
train.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double, token: array<string>]

### Model to cluster all reviews

In [105]:
cv = CountVectorizer(minDF=10, vocabSize=5000, inputCol='token', outputCol='vectors')
km1 = KMeans(k = 20, featuresCol='vectors', maxIter= 30)


pipe_count = Pipeline(stages=[cv, km1])

In [106]:
%%time
pipe_cv_model = pipe_count.fit(train)

CPU times: user 107 ms, sys: 51.1 ms, total: 158 ms
Wall time: 7.33 s


3. Clustering user and bussiness

In [114]:
both = train.union(add_cl)
both.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double, token: array<string>]

In [115]:
user_with_cl = cluster_user_by_review(both, pipe_cv_model)

In [116]:
biz_with_cl = cluster_biz_by_review(both, pipe_cv_model)

In [117]:
user_with_cl.cache()
biz_with_cl.cache()

DataFrame[business_id: string, biz_cl: int]

['business_id', 'biz_cl']

## Create test set 

I need reviews that I didn't use for clusterization restorants and user. But in same time that contains clusterized restaraunts and users

In [118]:
train_rev_id =both.select('review_id')
train_rev_id.take(2)

[Row(review_id='fLFxlSggbp9JJyL11Kt5mw'),
 Row(review_id='YNcg5AGAlf0ZK--F2G0ecw')]

In [75]:
rest_rev.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id',
 'rating']

In [119]:
#### a little bit effective
known_rev2 = user_with_cl.join(rest_rev.select('business_id',
 'review_id',
 'stars',
 'user_id',
 'rating'), 'user_id').join(biz_with_cl, 'business_id')

In [83]:
%%time
known_rev2.count()

CPU times: user 7.35 ms, sys: 4.54 ms, total: 11.9 ms
Wall time: 21.2 s


801005

In [120]:
new_t = known_rev2.join(train_rev_id, 'review_id','left_anti' )

In [121]:
%%time
new_t.count()

CPU times: user 58.9 ms, sys: 26.2 ms, total: 85.1 ms
Wall time: 1min 8s


591218

In [122]:
new_t.cache()

DataFrame[review_id: string, business_id: string, user_id: string, user_cl: int, stars: bigint, rating: double, biz_cl: int]

In [123]:
newt_df = new_t.toPandas()

In [124]:
new_gr = transform_to_score(newt_df)

In [125]:
new_gr.head(), new_gr.shape

(                        similar  stars  rating   pred    act   base  base_3.5
 review_id                                                                    
 ---94vtJ_5o_nikEs6hUjg      0.0      5     4.5  False  False  False     False
 ---L4b6VR6HoB-q7cfMWIA      1.0      5     3.5   True  False  False     False
 --0y7xOZPpiGD5d6vgfafg      0.0      1     3.5  False   True  False     False
 --4yS1qzrZI9xoH_2Z-sCA      0.0      4     4.0  False  False  False     False
 --6WmlklBfY7Ecr3A-Ft0Q      0.0      4     2.5  False  False   True      True,
 (161539, 7))

In [126]:
my_scorer(new_gr)

Unnamed: 0,base,base_3.5,pred
accuracy,0.777181,0.732281,0.591702
recall,0.166429,0.394731,0.364713


In [None]:
Try to 