In [1]:
import numpy as np


import string
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, BooleanType
from pyspark.ml import Pipeline

from nlp_cl_start import data_tokenizer

In [2]:
import pyspark as ps
spark = (ps.sql.SparkSession.builder
        .master("local[4]")
        .appName("yelp_academic")
        .getOrCreate()
        )
sc = spark.sparkContext

In [3]:
import pandas as pd

In [4]:
from pipe_spar import api_f, cluster_biz_by_review , cluster_user_by_review

In [5]:
from nlp_cl_start import if_rest_udf

In [6]:
from my_metr import transform_to_score, my_scorer, transform_aggregated
from sklearn.metrics import accuracy_score, recall_score

## Load data
load all data, select bad reviews, define train and test

In [7]:
biz = spark.read.json('yelp_dataset/yelp_academic_dataset_business.json')
rev = spark.read.json('yelp_dataset/yelp_academic_dataset_review.json')

1. Filter restaraunts

In [8]:
rests = biz.filter(if_rest_udf(biz.categories))

In [9]:
rest_rev = rev.join(rests.select('business_id','stars').withColumnRenamed('stars','rating'),'business_id')
bad_reviews = rest_rev.filter('stars < 3')

In [31]:
bad_reviews.count()

768690

### Now I change train sample from 20000 to 100000

In [203]:
bad_sample = bad_reviews.sample(False, 0.127, seed =91)
#bad_sample = bad_reviews.sample(False, 0.027, seed =91)


In [11]:
%%time
bad_sample.count()

CPU times: user 2.38 ms, sys: 2.14 ms, total: 4.52 ms
Wall time: 18.3 s


97353

In [12]:
bad_sample.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double]

2. Tokenize sample

In [13]:
sample_token= data_tokenizer(bad_sample)

In [14]:
splits = sample_token.randomSplit([0.8, 0.1, 0.1], seed = 91)


In [15]:
train = splits[0]
add_cl = splits[1]
test = splits[2]

In [16]:
train.first()

Row(business_id='--9e1ONYQuAa-CB_Rrw7Tw', cool=0, date='2012-05-11', funny=0, review_id='OaRMJKI6S7LAoa8xoEaqPg', stars=2, text="I feel like Twitter is pretty useless so far; I have an ok amount of followers I guess, but I feel like they're all following so many people that anything I say gets lost in the mess of their feed. I could tweet all day and get very little response to anything aside from a couple loyal folks on there. And it was one such reader that led me to Delmonico recently for what he claimed to be an excellent burger. He's a food writer in town so he should know what he's talking about, right?\n\nDelmonico is located in the small restaurant row of the Venetian right by another great burger spot, although most people would not come here for their burger. The interior is nothing too fancy in either the bar area or more traditional area in the back. I did notice while sitting at the bar that they have an amazing bourbon selection, but since I was there during a lunch break

In [17]:
train.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double, token: array<string>]

### Model to cluster all reviews

In [114]:
cv = CountVectorizer(minDF=5, vocabSize=6000, inputCol='token', outputCol='vectors')
km1 = KMeans(k = 10, featuresCol='vectors', maxIter= 22)


pipe_count = Pipeline(stages=[cv, km1])

### idf 22.33

In [197]:
cv = CountVectorizer(minDF=5, vocabSize=5000, inputCol='token', outputCol='vectors')
idf = IDF(minDocFreq=7, inputCol="vectors", outputCol="features")
km2 = KMeans(k = 18, featuresCol='features', maxIter= 30)
pipe_idf = Pipeline(stages = [cv, idf, km2])

In [148]:
IDF?

In [198]:
%%time
pipe_cv_model = pipe_count.fit(train)

CPU times: user 86.2 ms, sys: 40.2 ms, total: 126 ms
Wall time: 6.79 s


In [199]:
%%time
###!!!!!!!!IDF
pipe_idf_model = pipe_idf.fit(train)

CPU times: user 112 ms, sys: 52.2 ms, total: 165 ms
Wall time: 10.1 s


3. Clustering user and bussiness

In [134]:
both = train.union(add_cl)
both.cache()

DataFrame[business_id: string, cool: bigint, date: string, funny: bigint, review_id: string, stars: bigint, text: string, useful: bigint, user_id: string, rating: double, token: array<string>]

In [117]:
user_with_cl = cluster_user_by_review(both, pipe_cv_model)

In [118]:
biz_with_cl = cluster_biz_by_review(both, pipe_cv_model)

### idf 22.36

In [187]:
user_with_cl = cluster_user_by_review(both, pipe_idf_model)
biz_with_cl = cluster_biz_by_review(both, pipe_idf_model)

In [23]:
user_with_cl.cache()
biz_with_cl.cache()

DataFrame[business_id: string, biz_cl: int]

In [177]:
biz_with_cl.count()

87551

## Create test set 

I need reviews that I didn't use for clusterization restorants and user. But in same time that contains clusterized restaraunts and users

In [119]:
train_rev_id =both.select('review_id')
train_rev_id.take(2)

[Row(review_id='OaRMJKI6S7LAoa8xoEaqPg'),
 Row(review_id='syUWGlVaWBkr9iI7sMJGcw')]

In [120]:
%%time
user_with_cl.count()

CPU times: user 5.28 ms, sys: 3.46 ms, total: 8.74 ms
Wall time: 225 ms


87551

In [188]:
#### a little bit effective
known_rev2 = user_with_cl.join(rest_rev.select('business_id',
 'review_id',
 'stars',
 'user_id',
 'rating'), 'user_id').join(biz_with_cl, 'business_id')

In [64]:
%%time
known_rev2.count()

CPU times: user 32.9 ms, sys: 14.6 ms, total: 47.5 ms
Wall time: 27.5 s


13749260

In [189]:
new_t = known_rev2.join(train_rev_id, 'review_id','left_anti' )

In [190]:
%%time
new_t.count()

CPU times: user 77.3 ms, sys: 32.1 ms, total: 109 ms
Wall time: 1min 1s


12438216

In [67]:
####???
new_t.cache()

DataFrame[review_id: string, business_id: string, user_id: string, user_cl: int, stars: bigint, rating: double, biz_cl: int]

In [191]:
new_t.columns

['review_id', 'business_id', 'user_id', 'user_cl', 'stars', 'rating', 'biz_cl']

In [192]:
new_t = new_t.withColumn('similar', (new_t.user_cl == new_t.biz_cl).cast("int"))

In [193]:
regroup = new_t.groupBy('review_id').agg({'rating': 'mean', 'stars':'mean', 'similar':'sum' })
regroup.columns

['review_id', 'sum(similar)', 'avg(rating)', 'avg(stars)']

In [157]:
%%time
regroup.count()

CPU times: user 79.1 ms, sys: 32 ms, total: 111 ms
Wall time: 1min


735060

In [194]:
%%time
new_grf = regroup.toPandas()
new_grf.head()

CPU times: user 4.5 s, sys: 351 ms, total: 4.86 s
Wall time: 1min 10s


In [195]:
new_grf = transform_aggregated(new_grf)
new_grf.head()

Unnamed: 0,review_id,sum(similar),avg(rating),avg(stars),pred,act,base,base_3.5
0,-01ePuKPxMw1bhR4ISbIlw,5,4.0,4.0,True,False,False,False
1,-0XptEAda6qaK7QrkGF-IQ,2,3.0,4.0,True,False,False,True
2,-0u6BAh47_WiKXcjYcq8vQ,2,3.0,3.0,True,False,False,True
3,-1GR5fgGizpSfTMtWFQRUw,2,3.0,3.0,True,False,False,True
4,-1kQanNhit-7B9RBjY6p_A,6,4.0,3.0,True,False,False,False


In [196]:
my_scorer(new_grf)

Unnamed: 0,base,base_3.5,combo_35,combo_base,pred
accuracy,0.799355,0.748716,0.371719,0.387243,0.387118
recall,0.181595,0.406068,0.844635,0.78592,0.734589
prec,0.512817,0.385574,0.222732,0.218429,0.210122


In [144]:
act = new_grf.act
act.mean()


0.20248279052050172

In [161]:
new_grf['pred'].mean(), new_grf['base_3.5'].mean()

(0.7080836938481212, 0.213245177264441)

### 0.11 5,7,18 best

In [207]:
bad_sample2 = bad_reviews.sample(False, 0.5, seed =94)


In [211]:
sample_token2= data_tokenizer(bad_sample2)

In [212]:

biz_with_cl = cluster_biz_by_review(sample_token2, pipe_idf_model)

In [228]:
biz_with_cl = biz_with_cl.dropDuplicates()

In [229]:
biz_with_cl.cache()

DataFrame[business_id: string, biz_cl: int]

In [230]:
%%time
biz_with_cl.count()

CPU times: user 19 ms, sys: 9.26 ms, total: 28.3 ms
Wall time: 3.17 s


113815

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'neighborhood',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [206]:
biz.select('business_id').count()

188593

In [216]:
rests_id = rests.select('business_id')

In [223]:
rests_id.count()

57173

In [233]:
b0 = rests_id.join(biz_with_cl.filter('biz_cl ==0'), 'business_id', how = 'left')

In [221]:
b0.first()

Row(business_id='--9e1ONYQuAa-CB_Rrw7Tw', biz_cl=0)

In [234]:
%%time
b0.count()

CPU times: user 13.2 ms, sys: 7.83 ms, total: 21 ms
Wall time: 887 ms


57173

In [236]:
b0 = b0.withColumnRenamed('biz_cl', 'cl_0')
b0.columns

['business_id', 'cl_0']

In [237]:
for i in range(1,18):
    cond = 'biz_cl =='+str(i)
    colName = 'cl_'+str(i)
    b0 = b0.join(biz_with_cl.filter(cond), 'business_id', how = 'left').withColumnRenamed('biz_cl', colName)

In [238]:
b0.columns

['business_id',
 'cl_0',
 'cl_1',
 'cl_2',
 'cl_3',
 'cl_4',
 'cl_5',
 'cl_6',
 'cl_7',
 'cl_8',
 'cl_9',
 'cl_10',
 'cl_11',
 'cl_12',
 'cl_13',
 'cl_14',
 'cl_15',
 'cl_16',
 'cl_17']

In [242]:
biz_df_cl = rests.select(api_f).join(b0, 'business_id').toPandas()

In [243]:
biz_df_cl.head()

Unnamed: 0,business_id,RestaurantsPriceRange2,stars,review_count,categories,cl_0,cl_1,cl_2,cl_3,cl_4,...,cl_8,cl_9,cl_10,cl_11,cl_12,cl_13,cl_14,cl_15,cl_16,cl_17
0,--9e1ONYQuAa-CB_Rrw7Tw,4,4.0,1546,"Steakhouses, Restaurants, Cajun/Creole",0.0,1.0,2.0,,4.0,...,8.0,,,,,13.0,,,16.0,17.0
1,-VAsjhmAbKF3Pb_-8rh3xg,1,2.0,10,"Fast Food, Burgers, Restaurants",,1.0,,,,...,,,,,,,,,,
2,-cxD1NimFldATDUsN-oa3A,2,2.0,23,"Mexican, Restaurants",,1.0,,,4.0,...,,,,,,,,,,
3,-r8SvItXXG6_T3mP5GXRAw,2,4.0,10,"Restaurants, Noodles, Food, Cafes, Chinese, Co...",,,,,,...,,,,,,,,,,
4,0859wfd1BQHG46Zpwhc0ZQ,2,4.5,245,"Nightlife, Pizza, Wine Bars, Bars, American (N...",,1.0,,,,...,,,,,,,,,,


In [244]:
biz_df_cl.to_csv('biz_cluster.csv')