In [1]:
import numpy as np
import pandas as pd

from operator import add
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF

from pyspark.ml import Pipeline, PipelineModel


from pyspark.sql.functions import *

from pyspark.sql.types import *

import folium
import html

### Data Loading

In [3]:
business_df = spark.read.parquet( 'business-small.parquet')

root
 |-- business_id: string (nullable = true)
 |-- business_name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [None]:
business_df.createOrReplaceTempView("businesses")

In [5]:
user_df = spark.read.parquet('user-small.parquet')
user_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- fans: long (nullable = true)
 |-- average_stars: double (nullable = true)



In [6]:
user_df.createOrReplaceTempView("users")

In [7]:
review_df = spark.read.parquet('review-small.parquet')
review_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- review_date: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



In [8]:
review_df.createOrReplaceTempView("reviews")

### Обработка текста и создание фич на его основе

In [9]:
reviews_text = spark.sql("SELECT business_id, review_text FROM reviews")
reviews_text.show(3)

+--------------------+--------------------+
|         business_id|         review_text|
+--------------------+--------------------+
|mr4FiPaXTWlJ3qGzp...|I left Table 17 f...|
|mr4FiPaXTWlJ3qGzp...|for the time bein...|
|mr4FiPaXTWlJ3qGzp...|Love this place. ...|
+--------------------+--------------------+
only showing top 3 rows



In [10]:
# соединяем отзывы и рестораны
reviews_text_rdd = reviews_text.rdd
reviews_by_business_rdd = reviews_text_rdd.map(tuple).reduceByKey(add)  
reviews_by_business_df = spark.createDataFrame(reviews_by_business_rdd)
reviews_by_business_df = reviews_by_business_df \
                            .withColumnRenamed('_1', 'business_id') \
                            .withColumnRenamed('_2', 'text')
reviews_by_business_df.count()   

6750

In [None]:
# pipeline генерации фичей
regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'token')
stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')
countVectorizer = CountVectorizer(inputCol="nostopwrd", outputCol="rawFeature")
iDF = IDF(inputCol="rawFeature", outputCol="idf_vec")
word2Vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'nostopwrd', outputCol = 'word_vec', seed=123)
vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, iDF, word2Vec, vectorAssembler])

# fit
pipeline_mdl = pipeline.fit(reviews_by_business_df)

#save
pipeline_mdl.write().overwrite().save('pipe_txt')

In [12]:
pipeline_mdl = PipelineModel.load( 'pipe_txt')

In [13]:
# обработка данных для модели
reviews_by_business_trf_df = pipeline_mdl.transform(reviews_by_business_df)


In [14]:
# 
reviews_by_business_trf_df.select( 'text', 'nostopwrd', 'idf_vec', 'word_vec', 'comb_vec').show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|           nostopwrd|             idf_vec|            word_vec|            comb_vec|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Attention allergy...|[attention, aller...|(126299,[0,1,2,3,...|[0.01133526750371...|(126399,[0,1,2,3,...|
|I don't understan...|[understand, prev...|(126299,[0,1,2,3,...|[-0.0166730810390...|(126399,[0,1,2,3,...|
|Food here is alwa...|[food, always, fr...|(126299,[0,2,3,4,...|[-0.0013763984012...|(126399,[0,2,3,4,...|
|i keep on coming ...|[keep, coming, ba...|(126299,[0,1,2,3,...|[0.04195703681602...|(126399,[0,1,2,3,...|
|I love this place...|[love, place, pro...|(126299,[0,1,2,3,...|[0.03136265755356...|(126399,[0,1,2,3,...|
|We were here last...|[last, night, rap...|(126299,[0,1,2,3,...|[0.05398540855765...|(126399,[0,1,2,3,...|
|Kirei was part of...|[kirei, part, s

In [16]:
reviews_by_business_trf_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostopwrd: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeature: vector (nullable = true)
 |-- idf_vec: vector (nullable = true)
 |-- word_vec: vector (nullable = true)
 |-- comb_vec: vector (nullable = true)



In [17]:
# функция схожести
def CosineSim(vec1, vec2): 
    return np.dot(vec1, vec2) / np.sqrt(np.dot(vec1, vec1)) / np.sqrt(np.dot(vec2, vec2)) 

In [18]:
all_business_vecs = reviews_by_business_trf_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

In [19]:
# 1 строка 
all_business_vecs[1]

('Dl2vgi5W_nbe-A97D0zgfA',
 DenseVector([-0.0167, 0.0002, -0.0424, -0.0282, -0.0148, -0.0422, 0.0008, -0.0571, -0.0222, 0.0302, -0.0607, 0.0654, -0.0822, -0.0371, 0.0366, 0.0218, 0.0188, -0.0654, -0.0116, -0.0427, 0.0087, 0.0133, 0.0079, 0.0058, 0.1021, -0.0085, 0.0447, -0.087, -0.003, -0.0083, -0.0345, 0.0596, -0.0254, 0.0326, 0.0526, 0.0563, 0.0442, 0.0128, -0.0115, -0.0216, 0.0598, -0.0289, -0.0175, 0.0052, 0.06, -0.0747, -0.0769, 0.0693, 0.0589, -0.017, 0.018, -0.0264, 0.0283, -0.0598, 0.0609, 0.0174, 0.0492, 0.0459, -0.0151, -0.0558, 0.0161, -0.1232, 0.0352, -0.0366, -0.0109, -0.0347, -0.0094, -0.0101, -0.024, -0.0322, -0.0297, 0.0323, 0.0171, 0.0436, 0.0587, -0.0266, 0.0826, -0.0328, 0.0251, -0.0253, -0.0636, -0.0724, -0.0517, 0.0041, 0.0043, 0.0803, -0.1038, -0.0452, 0.0507, -0.0056, -0.0278, 0.0256, 0.0247, 0.021, -0.1104, -0.017, 0.0096, -0.0659, -0.002, -0.0284]))

In [23]:
def getSimilarBusinesses(b_ids, sim_bus_limit=10):
    
    schema = StructType([   
                            StructField("business_id", StringType(), True)
                            ,StructField("score", IntegerType(), True)
                            ,StructField("input_business_id", StringType(), True)
                        ])
    
    similar_businesses_df = spark.createDataFrame([], schema)
    
    for b_id in b_ids:
        
        input_vec = [(r[1]) for r in all_business_vecs if r[0] == b_id][0]
        similar_business_rdd = sc.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in all_business_vecs)

        similar_business_df = spark.createDataFrame(similar_business_rdd) \
            .withColumnRenamed('_1', 'business_id') \
            .withColumnRenamed('_2', 'score') \
            .orderBy("score", ascending = False)
            
        similar_business_df = similar_business_df.filter(col("business_id") != b_id).limit(sim_bus_limit)
        similar_business_df = similar_business_df.withColumn('input_business_id', lit(b_id))
        
        similar_businesses_df = similar_businesses_df \
                                    .union(similar_business_df)
        
    
    return similar_businesses_df
    

In [24]:
def getBusinessDetails(in_bus):
    
    a = in_bus.alias("a")
    b = business_df.alias("b")
    
    return a.join(b, col("a.business_id") == col("b.business_id"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.business_name'),col('b.categories'),
                                                           col('b.stars'),col('b.review_count'),
                                                           col('b.latitude'),col('b.longitude')])
    

In [25]:
def showInMap(df):
    
    mp = folium.Map(location=[43.70011, -79.4163], zoom_start=12)

    for i, r in df.toPandas().iterrows():
        folium.Marker(
                    location =[r.latitude, r.longitude], 
                    popup = html.escape(r["business_name"]) + '<br>' + 'Stars: ' + str(r.stars) + '<br>' + 'Reviews: ' + str(r.review_count),    
                    icon = folium.Icon(color='green')).add_to(mp)
    return mp

In [26]:
# тестирование по 2 ID
bids = ['Dl2vgi5W_nbe-A97D0zgfA', 'RtUvSWO_UZ8V3Wpj0n077w']

print('\ninput restaurants details:')
business_df.select('business_id','business_name', 'categories') \
    .filter(business_df.business_id.isin(bids) == True).show(truncate=False)
    
# получаем результат
sims = getBusinessDetails(getSimilarBusinesses(bids))

sims.select('input_business_id','business_name', 'score','categories').toPandas()


input restaurants details:
+----------------------+----------------------+------------------------------------------------------------------------------+
|business_id           |business_name         |categories                                                                    |
+----------------------+----------------------+------------------------------------------------------------------------------+
|RtUvSWO_UZ8V3Wpj0n077w|KINKA IZAKAYA ORIGINAL|[Pubs, Japanese, Restaurants, Bars, Nightlife, Tapas Bars, Tapas/Small Plates]|
|Dl2vgi5W_nbe-A97D0zgfA|Tasty Hut             |[Restaurants, Chinese]                                                        |
+----------------------+----------------------+------------------------------------------------------------------------------+

Top 10 similar restaurants for each input restaurant are:"


Unnamed: 0,input_business_id,business_name,score,categories
0,Dl2vgi5W_nbe-A97D0zgfA,New Regime Restaurant,0.941276,"[Restaurants, Chinese]"
1,Dl2vgi5W_nbe-A97D0zgfA,The Only Cuisine Corp,0.940966,"[Restaurants, Chinese]"
2,Dl2vgi5W_nbe-A97D0zgfA,L's Chinese Eatery,0.940475,"[Chinese, Restaurants]"
3,Dl2vgi5W_nbe-A97D0zgfA,Qin Tang Taste,0.939837,"[Chinese, Restaurants]"
4,Dl2vgi5W_nbe-A97D0zgfA,Congee Me,0.938798,"[Chinese, Restaurants]"
5,Dl2vgi5W_nbe-A97D0zgfA,King Lobster,0.930083,"[Seafood, Restaurants, Chinese]"
6,Dl2vgi5W_nbe-A97D0zgfA,Si Chuan House Cuisine,0.929784,"[Restaurants, Chinese]"
7,Dl2vgi5W_nbe-A97D0zgfA,Golden House,0.929729,"[Restaurants, Chinese]"
8,Dl2vgi5W_nbe-A97D0zgfA,Jackpot Chicken Rice,0.929478,"[Restaurants, Chinese, Bars, Pan Asian, Singap..."
9,Dl2vgi5W_nbe-A97D0zgfA,Verdant Garden Chinese Restaurant,0.92908,"[Chinese, Restaurants]"


In [27]:
def getContentRecoms(u_id, sim_bus_limit=10):
    
    # получаем рестораны по рекомендациям, которые имеют рейтинг (3+)
    query = """
    SELECT distinct business_id FROM reviews  
    where stars >= 3.0 
    and user_id = "{}"
    """.format(u_id)

    usr_rev_bus = sqlContext.sql(query)
    
    # топ 5
    usr_rev_bus = usr_rev_bus.sample(False, 0.5).limit(5)

    usr_rev_bus_det = getBusinessDetails(usr_rev_bus)
    
    # 
    print('\nBusinesses previously reviewed by user:')
    usr_rev_bus_det.select(['business_id', 'business_name', 'categories']).show(truncate = False)

    bus_list = [i.business_id for i in usr_rev_bus.collect()]

    # схожесть
    sim_bus_df = getSimilarBusinesses(bus_list, sim_bus_limit)

    # фильтрация
    s = sim_bus_df.alias("s")
    r = usr_rev_bus.alias("r")
    j = s.join(r, col("s.business_id") == col("r.business_id"), 'left_outer') \
         .where(col("r.business_id").isNull()) \
         .select([col('s.business_id'),col('s.score')])

    a = j.orderBy("score", ascending = False).limit(sim_bus_limit)

    return getBusinessDetails(a)

     

In [28]:
# тест на пользователе

u_id = 'Wc5L6iuvSNF5WGBlqIO8nw'

content_recom_df = getContentRecoms(u_id)

content_recom_df.toPandas()


Businesses previously reviewed by user:
+----------------------+------------------+-------------------------------------------------------------------------------+
|business_id           |business_name     |categories                                                                     |
+----------------------+------------------+-------------------------------------------------------------------------------+
|9jYnZymuaW-XpMIS75YxgQ|The Beaver        |[Canadian (New), Nightlife, Cafes, Bars, Restaurants, Gay Bars, American (New)]|
|_HqZL3gK98-Q4ObAoyM1aw|Rose and Sons Swan|[Breakfast & Brunch, American (Traditional), Restaurants]                      |
|wSojc-y-d7MWiGWdy8deCg|Barque Smokehouse |[Barbeque, Chicken Wings, Food, Restaurants, Smokehouse]                       |
|aAYl1-tsvP9Kzs7YESi8Rg|Electric Mud BBQ  |[Restaurants, Sandwiches, Food, Barbeque, Desserts]                            |
|oc8204pyvf9ixdA4JzzaLA|The Good Son      |[Canadian (New), Restaurants]                   

Unnamed: 0,business_id,score,business_name,categories,stars,review_count,latitude,longitude
0,NrJSy3dgcXErFtOKGREmnw,0.980849,Aft Kitchen & Bar,"[American (Traditional), Bars, Barbeque, Ameri...",4.0,130,43.658654,-79.350779
1,RyDiwx4xD3Lx8sWHx1aFlQ,0.978504,Smoque N' Bones,"[Barbeque, American (Traditional), Comfort Foo...",4.0,273,43.645676,-79.410701
2,Fl2yDmC0B0TTVsNixxVXfA,0.975142,The Emerson,"[Canadian (New), Restaurants]",4.0,104,43.658353,-79.442003
3,onCx6Ye5IwD07emyj-VLrg,0.973462,Hogtown Smoke,"[Food Trucks, Food, Barbeque, Restaurants, Sou...",3.5,132,43.669549,-79.302143
4,oQylTvXwGIkKFdCjmafKVg,0.973422,Fire on the East Side,"[Southern, Restaurants, Breakfast & Brunch, Am...",3.5,119,43.666765,-79.384836
5,WnUttoJffplgWaQGR2J2Xw,0.973141,The Saint Tavern,"[Restaurants, Bars, Nightlife, Gastropubs]",3.5,121,43.649062,-79.420478
6,ShUh_MMkaVp_KXCtNjPvXA,0.973117,Universal Grill,"[American (Traditional), Canadian (New), Break...",3.5,45,43.670521,-79.42644
7,cFyXbPxTAwKRlIqe-XLMcw,0.972669,STACK,"[Restaurants, Barbeque]",4.0,206,43.729286,-79.40309
8,Cew9RRnuldbqaL3MW6p1wg,0.970811,Greenwood Smokehouse BBQ,"[Southern, Soul Food, Barbeque, Restaurants]",3.5,120,43.678998,-79.344287
9,8cusyHsqMhNAsrNSMpzB1A,0.969611,The 420 Smokehouse,"[Food, Restaurants, American (Traditional), Sm...",4.0,53,43.662659,-79.367316


In [29]:
showInMap(content_recom_df)

In [30]:
def getKeyWordsRecoms(key_words, sim_bus_limit):
    
    print('\nBusinesses similar to key words: "' + key_words + '"')
    
    input_words_df = sc.parallelize([(0, key_words)]).toDF(['business_id', 'text'])
    
    # transform the the key words to vectors
    input_words_df = pipeline_mdl.transform(input_words_df)
    
    # choose word2vec vectors
    input_key_words_vec = input_words_df.select('word_vec').collect()[0][0]
    
    # get similarity
    sim_bus_byword_rdd = sc.parallelize((i[0], float(CosineSim(input_key_words_vec, i[1]))) for i in all_business_vecs)

    sim_bus_byword_df = spark.createDataFrame(sim_bus_byword_rdd) \
         .withColumnRenamed('_1', 'business_id') \
         .withColumnRenamed('_2', 'score') \
         .orderBy("score", ascending = False)
    
    # return top 10 similar businesses
    a = sim_bus_byword_df.limit(sim_bus_limit)
    return getBusinessDetails(a)


In [31]:
# test key word similarity to review text

key_words = 'chicken cheese burger'

keywords_recom_df = getKeyWordsRecoms(key_words, 10)
keywords_recom_df.toPandas()


Businesses similar to key words: "chicken cheese burger"


Unnamed: 0,business_id,score,business_name,categories,stars,review_count,latitude,longitude
0,37joQpD9m5AIcrW1c8OBnQ,0.71846,Urban Smoke Fusion BBQ Food Truck,"[Desserts, Barbeque, Food, Restaurants, Food T...",4.0,8,43.718711,-79.470037
1,3Cu-af4en3uWCrAkkqfiHQ,0.697351,Epic Burgers and Waffles,"[Burgers, Food, Restaurants]",2.5,5,43.632351,-79.42128
2,nP87zXxeS-8got7IBvoAuA,0.662935,McCoy Burger Company,"[Local Flavor, Sandwiches, Restaurants, Poutin...",4.0,33,43.731511,-79.404081
3,DiCMYxT69I22-1nfsvYAJQ,0.662169,Gourmet Burger Co,"[Burgers, Restaurants]",3.5,37,43.664683,-79.368279
4,ky9RbwLtChekSrqcYR39kw,0.652767,Big Smoke Burger,"[Burgers, Poutineries, Restaurants]",3.0,6,43.611289,-79.556867
5,ZzF5098L4xg-0COjng2LVA,0.648873,Burgatory,"[Pubs, Burgers, Food Trucks, Nightlife, Bars, ...",3.0,9,43.655055,-79.418563
6,UN0UwUh7jaeX6Jg3lZImCg,0.644995,Holy Chuck,"[Food, Restaurants, Desserts, Poutineries, Bur...",3.0,43,43.665211,-79.384925
7,ycAW6Q5quaCSDX5zwQ3tPg,0.64092,New York Fries,"[Canadian (New), Specialty Food, Food, Restaur...",3.5,8,43.776875,-79.256655
8,PkeaeQS8aJTeS8PS_Hl_-g,0.635419,Steak and Cheese Factory,"[Sandwiches, Cheesesteaks, Restaurants]",3.0,3,43.708213,-79.392367
9,67Pa_CtXthgJzXfY8JzLDQ,0.635081,Holy Chuck,"[Burgers, Restaurants]",3.5,263,43.687527,-79.39406


In [31]:
showInMap(keywords_recom_df)