In [1]:
import numpy as np
import pandas as pd

from operator import add
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF

from pyspark.ml import Pipeline, PipelineModel


from pyspark.sql.functions import *

from pyspark.sql.types import *

import folium
import html


In [2]:
#path

data = '/Users/jerrygeorge/Desktop/finalproj/data/'
model = '/Users/jerrygeorge/Desktop/finalproj/model/'
output = '/Users/jerrygeorge/Desktop/finalproj/output/'

In [3]:
#load the dataset to spark

business_df = spark.read.parquet(data + 'business-small.parquet')
user_df = spark.read.parquet(data + 'user-small.parquet')
review_df = spark.read.parquet(data + 'review-small.parquet')

In [4]:
business_df.createOrReplaceTempView("businesses")
user_df.createOrReplaceTempView("users")
review_df.createOrReplaceTempView("reviews")

In [5]:
reviews_text = spark.sql("SELECT user_id, review_text FROM reviews")
reviews_text1 = spark.sql("SELECT business_id, review_text FROM reviews")


In [7]:
#user Reviews
reviews_text_rdd = reviews_text.rdd
reviews_by_user_rdd = reviews_text_rdd.map(tuple).reduceByKey(add)  
reviews_by_user_df = spark.createDataFrame(reviews_by_user_rdd)
reviews_by_user_df = reviews_by_user_df \
                            .withColumnRenamed('_1', 'user_id') \
                            .withColumnRenamed('_2', 'text')
reviews_by_user_df.count()

73041

In [8]:
#business Reviews

reviews_text_rdd = reviews_text1.rdd
reviews_by_business_rdd = reviews_text_rdd.map(tuple).reduceByKey(add)  
reviews_by_business_df = spark.createDataFrame(reviews_by_business_rdd)
reviews_by_business_df = reviews_by_business_df \
                            .withColumnRenamed('_1', 'business_id') \
                            .withColumnRenamed('_2', 'text')
reviews_by_business_df.count()

7148

In [43]:


# Build the pipeline 
regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'token')
stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')
countVectorizer = CountVectorizer(inputCol="nostopwrd", outputCol="rawFeature")
iDF = IDF(inputCol="rawFeature", outputCol="idf_vec")
word2Vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'nostopwrd', outputCol = 'word_vec', seed=123)
vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, iDF, word2Vec, vectorAssembler])

# fit the model
pipeline_mdl = pipeline.fit(reviews_by_business_df)
pipeline_mdl1 = pipeline.fit(reviews_by_user_df)

In [44]:
pipeline_mdl.write().overwrite().save(model + 'bmn')

In [10]:
pipeline_mdl.write().overwrite().save(model + 'kmn')

In [9]:
pipeline_mdl = PipelineModel.load(model + 'bmn')
pipeline_mdl1 = PipelineModel.load(model + 'kmn')

In [10]:
reviews_by_business_trf_df = pipeline_mdl.transform(reviews_by_business_df)
all_business_vecs = reviews_by_business_trf_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

In [11]:
reviews_by_user_trf_df = pipeline_mdl1.transform(reviews_by_user_df)
all_user_vecs = reviews_by_user_trf_df.select('user_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

In [17]:
df=reviews_by_user_trf_df.select('user_id', 'word_vec')
df1=reviews_by_business_trf_df.select('business_id', 'word_vec')

In [18]:
#clustering on User Reviews
from pyspark.ml.clustering import KMeans


for i in [5,10,20,40,60,100,200,500,1000]:
    k_means = KMeans(featuresCol='word_vec', k=i)
    model_k2 = k_means.fit(df)
    wssse = model_k2.computeCost(df)
    print("K =",i," Error = " + str(wssse))
    md1=model_k2.transform(reviews_by_business_trf_df)
    cd1=model_k2.transform(df).groupBy('prediction')
    

K = 5  Error = 12802.347581913407
K = 10  Error = 11887.354837578136
K = 20  Error = 11119.188117519327
K = 40  Error = 10256.735516459115
K = 60  Error = 9822.806304125636
K = 100  Error = 9324.91252053553
K = 200  Error = 8737.054560978304
K = 500  Error = 8013.783930887123
K = 1000  Error = 7490.742202247959


In [19]:
#clustering on Business Reviews

for i in [5,10,20,40,60,100,200,500,1000]:
    k_means = KMeans(featuresCol='word_vec', k=i)
    model_k2 = k_means.fit(df1)
    wssse = model_k2.computeCost(df1)
    print("K =",i," Error = " + str(wssse))
    md=model_k2.transform(reviews_by_user_trf_df)
    cd=model_k2.transform(df1).groupBy('prediction')
    

K = 5  Error = 334.72580562117344
K = 10  Error = 281.796801316916
K = 20  Error = 240.84834948249204
K = 40  Error = 208.03118329668746
K = 60  Error = 193.7375857201574
K = 100  Error = 176.12536853593758
K = 200  Error = 155.9337213778811
K = 500  Error = 129.8862568218172
K = 1000  Error = 105.03756355436978


In [27]:
#clustered business with clusters (100)
cd.show()

+--------------------+--------------------+----------+
|         business_id|            word_vec|prediction|
+--------------------+--------------------+----------+
|5r6-G9C4YLbC7Ziz5...|[0.08240829653115...|        78|
|z8oIoCT1cXz7gZP5G...|[0.07068449387273...|        95|
|XWTPNfskXoUL-Lf32...|[0.07005708263746...|        56|
|oWTn2IzrprsRkPfUL...|[0.07971497538309...|        42|
|28adZ4lsuUeVB2aWz...|[0.08529358667228...|        37|
|Xy74meQwdTnloAAyR...|[0.07789672117561...|        47|
|4_GIJk0tX3k0x0FcU...|[0.13063595783159...|        74|
|0-yj2jtzLUHG2b7Pp...|[0.10097199339370...|        70|
|Tn8O4tv1U-n0PRC8k...|[0.13260006795403...|        35|
|vyeQzjZFx6KoL2pJB...|[0.08210715715551...|        87|
|D2PmpZYRdRnzL7q4W...|[0.10713880017141...|         9|
|7Uti5EeAwm3drG14K...|[0.08812031351261...|         3|
|3RlylOY452bA8rwli...|[0.08965089654908...|        90|
|lRwpFFaFr3QQ5fGHk...|[0.08738266912594...|        27|
|IBZ6e3P0OqcPXVjvG...|[0.07212599243266...|        12|
|rdn0vLw8L