In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(style="darkgrid")


from pyspark.sql.functions import *
from pyspark.sql.types import *



import folium
import html

### Load Data

##### Load business data

In [3]:
# Load the business dataset from its Parquet file.
# NOTE(review): `spark` is not created in the visible cells; presumably the
# kernel/environment provides the session — confirm.
business_df = spark.read.parquet('business-small.parquet')

In [7]:
# Register the DataFrame as a temporary SQL view so it can be queried
# with ordinary SQL statements under the name "businesses".
business_df.createOrReplaceTempView("businesses")

##### Load user data

In [8]:
# Load the user dataset from its Parquet file.
# The file name is given directly, exactly like the other dataset loads in
# this notebook; `data_path` is not defined in any visible cell, so relying
# on it would raise NameError on a fresh kernel.
user_df = spark.read.parquet('user-small.parquet')

root
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- fans: long (nullable = true)
 |-- average_stars: double (nullable = true)



In [11]:
# Expose the user data to SQL queries under the view name "users".
user_df.createOrReplaceTempView("users")

##### Load users' friends data

In [12]:
# Load the user -> friend pairs (columns user_id, friend_id) from Parquet.
friend_df = spark.read.parquet('friend-small.parquet')

root
 |-- user_id: string (nullable = true)
 |-- friend_id: string (nullable = true)



In [21]:
# Expose the friendship pairs to SQL queries under the view name "friends".
friend_df.createOrReplaceTempView("friends")

##### Load review data

In [15]:
# Load the review dataset from its Parquet file.
review_df = spark.read.parquet('review-small.parquet')

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- review_date: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



In [17]:
# Convert review_date from string to a date type so it is easier to work with.
review_df = review_df.withColumn("review_date", col("review_date").cast("date"))

review_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- review_date: date (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



In [20]:
# Expose the review data to SQL queries under the view name "reviews".
review_df.createOrReplaceTempView("reviews")

### Простая рекомендашка (как посоветуют друзья)

In [54]:
# User for whom recommendations are computed in this example.
u_id = 'uxKSnOVAoEj4I6X9YhLBlg'

# For every business, count the reviews with at least 4 stars written by the
# user's friends (only friends that also appear in `users`), leaving out
# businesses the user has already reviewed. Keep the 100 businesses with the
# highest counts. The user id is substituted into both "{}" placeholders.
query = """
select business_id, count(*) as 4_5_stars_count 
from reviews
where user_id in
    (select f.friend_id from friends f
    inner join users u on f.friend_id = u.user_id
    where f.user_id = "{}") 
and stars >= 4 
and business_id not in (select business_id from reviews where user_id = "{}")
group by business_id
order by count(*) desc limit 100
""".format(u_id, u_id)

# NOTE(review): `sqlContext` is not created in the visible cells; presumably
# it is provided by the environment — confirm.
friend_recoms_df = sqlContext.sql(query)

# Convert the result to pandas so the notebook renders it as a table.
friend_recoms_df.toPandas()

Unnamed: 0,business_id,4_5_stars_count
0,SGP1jf6k7spXkgwBlhiUVw,5
1,kOFDVcnj-8fd3doIpCQ06A,5
2,0a2O150ytxrDjDzXNfRWkA,4
3,k6zmSLmYAquCpJGKNnTgSQ,4
4,SjgeuBlgKER9yegpoxT99w,4
5,b4LmLgVdbhM-nc1IZc5Weg,3
6,-J6FVdY9pSgAdFmmalO-pQ,3
7,28adZ4lsuUeVB2aWzohK9g,3
8,G6EkDTXZ6zMUovg7JTG4YQ,3
9,RwRNR4z3kY-4OsFqigY5sw,3


In [58]:
# Take a random sample of the recommendations (fraction 0.5, without
# replacement) and keep at most 10 rows.
# A fixed seed is passed so the sample can be reproduced; without it every run
# shows a different set of businesses.
# This cell overwrites friend_recoms_df, so re-run the query cell before
# re-running this one to sample from the full result again.
friend_recoms_df = friend_recoms_df.sample(False, 0.5, seed=42).limit(10)
friend_recoms_df.toPandas()

Unnamed: 0,business_id,4_5_stars_count
0,SGP1jf6k7spXkgwBlhiUVw,5
1,kOFDVcnj-8fd3doIpCQ06A,5
2,SjgeuBlgKER9yegpoxT99w,4
3,GcxE5hK_TaHP4EZFDYz2mg,3
4,crstB-H5rOfbXhV8pX0e6g,3
5,28adZ4lsuUeVB2aWzohK9g,3
6,HUYEadSbGSQNHXFmT2Ujjw,3
7,RwRNR4z3kY-4OsFqigY5sw,3
8,b4LmLgVdbhM-nc1IZc5Weg,3
9,a8pmtlVKf7NiSLI-4KejIw,2


In [38]:
def getBusinessDetails(in_bus):
    """Attach descriptive business columns to a DataFrame of business ids.

    ``in_bus`` is inner-joined with the notebook-level ``business_df`` on
    ``business_id``. The result keeps every column of ``in_bus`` followed by
    ``business_name``, ``categories``, ``stars``, ``review_count``,
    ``latitude`` and ``longitude`` taken from ``business_df``.
    """
    left = in_bus.alias("a")
    right = business_df.alias("b")

    # Columns from the input first, then the descriptive business columns.
    wanted = [col('a.' + name) for name in left.columns]
    wanted += [
        col('b.business_name'),
        col('b.categories'),
        col('b.stars'),
        col('b.review_count'),
        col('b.latitude'),
        col('b.longitude'),
    ]

    same_business = col("a.business_id") == col("b.business_id")
    joined = left.join(right, same_business, 'inner')
    return joined.select(wanted)
    

In [51]:
def showInMap(df, center=(43.70011, -79.4163), zoom_start=12):
    """Draw every business of ``df`` as an orange marker on a folium map.

    Parameters
    ----------
    df : DataFrame
        Object exposing ``toPandas()`` whose result has the columns
        ``latitude``, ``longitude``, ``business_name``, ``stars`` and
        ``review_count``.
    center : sequence of two floats, optional
        Initial map centre as ``(latitude, longitude)``. Defaults to the
        point this notebook has always used.
    zoom_start : int, optional
        Initial zoom level passed to ``folium.Map``. Defaults to 12.

    Returns
    -------
    folium.Map
        The map with one marker per business that has usable coordinates.
        Each popup shows the HTML-escaped business name, its stars and its
        review count.
    """
    mp = folium.Map(location=list(center), zoom_start=zoom_start)

    for _, r in df.toPandas().iterrows():
        # A marker cannot be placed without coordinates; skip such rows
        # instead of letting one bad record abort the whole map.
        if pd.isna(r["latitude"]) or pd.isna(r["longitude"]):
            continue

        # html.escape() only accepts strings, so a missing name is shown as
        # an empty label rather than raising.
        raw_name = r["business_name"]
        label = "" if pd.isna(raw_name) else html.escape(str(raw_name))

        popup_html = (label + '<br>' + 'Stars: ' + str(r.stars)
                      + '<br>' + 'Reviews: ' + str(r.review_count))

        folium.Marker(
            location=[r.latitude, r.longitude],
            popup=popup_html,
            icon=folium.Icon(color='orange')).add_to(mp)
    return mp

In [55]:
def getFriendRecoms(u_id, sim_bus_limit=10, candidate_limit=100,
                    sample_fraction=0.5, seed=None):
    """Recommend businesses that a user's friends rated with 4 or 5 stars.

    The SQL query counts, per business, the reviews with ``stars >= 4``
    written by the user's friends (only friends that also appear in the
    ``users`` view), excluding businesses the user has already reviewed. The
    top ``candidate_limit`` businesses by that count are randomly sampled and
    truncated to ``sim_bus_limit`` rows, then enriched via
    ``getBusinessDetails``.

    Parameters
    ----------
    u_id : str
        Id of the user to recommend for.
    sim_bus_limit : int, optional
        Maximum number of businesses returned. Defaults to 10.
    candidate_limit : int, optional
        Number of top-ranked businesses kept before sampling. Defaults to 100.
    sample_fraction : float, optional
        Fraction passed to ``DataFrame.sample`` (without replacement).
        Defaults to 0.5.
    seed : int or None, optional
        Seed passed to ``DataFrame.sample``. ``None`` (the default) leaves
        the sampling unseeded, so results differ between calls.

    Returns
    -------
    DataFrame
        The result of ``getBusinessDetails`` applied to the sampled rows.

    Raises
    ------
    ValueError
        If ``u_id`` contains a double quote or a backslash. The id is embedded
        in a double-quoted SQL literal, so such characters would corrupt (or
        alter the meaning of) the query.
    """
    user_key = str(u_id)
    if '"' in user_key or '\\' in user_key:
        raise ValueError(
            "u_id must not contain double quotes or backslashes: {!r}".format(u_id))

    # int() guarantees that only a number is ever interpolated after LIMIT.
    query = """
    select business_id, count(*) as 4_5_stars_count
    from reviews
    where user_id in
        (select f.friend_id from friends f
        inner join users u on f.friend_id = u.user_id
        where f.user_id = "{}")
    and stars >= 4
    and business_id not in (select business_id from reviews where user_id = "{}")
    group by business_id
    order by count(*) desc limit {}
    """.format(user_key, user_key, int(candidate_limit))

    friend_recoms_df = sqlContext.sql(query)

    # Random subset of the candidates, capped at the requested size.
    friend_recoms_df = (friend_recoms_df
                        .sample(False, sample_fraction, seed)
                        .limit(sim_bus_limit))

    return getBusinessDetails(friend_recoms_df)

In [56]:
# Try the recommender on a single user.

u_id = 'uxKSnOVAoEj4I6X9YhLBlg'

friend_recom_df = getFriendRecoms(u_id)

friend_recom_df.toPandas()

Businesses recommended to user by best useful friends:


Unnamed: 0,business_id,4_5_stars_count,business_name,categories,stars,review_count,latitude,longitude
0,SGP1jf6k7spXkgwBlhiUVw,5,Kekou Gelato House,"[Food, Restaurants, Ice Cream & Frozen Yogurt,...",4.5,332,43.655983,-79.392686
1,kOFDVcnj-8fd3doIpCQ06A,5,Mildred's Temple Kitchen,"[Comfort Food, Event Planning & Services, Vege...",4.0,472,43.639911,-79.420424
2,0a2O150ytxrDjDzXNfRWkA,4,Miku Toronto,"[Sushi Bars, Restaurants, Seafood, Japanese]",4.0,384,43.641235,-79.37737
3,G6EkDTXZ6zMUovg7JTG4YQ,3,Vietnam Noodle Star,"[Restaurants, Vietnamese, Noodles]",3.5,148,43.804603,-79.287842
4,RwRNR4z3kY-4OsFqigY5sw,3,Uncle Tetsu's Japanese Cheesecake,"[Desserts, Japanese, Restaurants, Bakeries, Food]",3.5,806,43.655969,-79.384013
5,Yv4P4qUwd7F-qQ4Y4eD1JQ,3,Han Ba Tang,"[Nightlife, Pubs, Lounges, Korean, Asian Fusio...",3.5,213,43.762928,-79.411511
6,dTuT_G3Zp79RZmnF3oxfiA,3,The Bier Markt,"[Belgian, Nightlife, Bars, Gastropubs, Canadia...",3.0,197,43.647095,-79.373915
7,MhiBpIBNTCAm1Xd3WzRzjQ,3,Messini Authentic Gyros,"[Mediterranean, Sandwiches, Greek, Restaurants...",3.5,372,43.677691,-79.350536
8,9_CGhHMz8698M9-PkVf0CQ,2,Little Coxwell Vietnamese & Thai Cuisine,"[Vietnamese, Thai, Restaurants]",4.0,109,43.696175,-79.329092
9,ofw8aDSEg1HoQdmCgvLtaQ,2,The Pie Commission,"[Canadian (New), Fast Food, Food, Do-It-Yourse...",4.5,183,43.623881,-79.512074


In [57]:
# Render the recommended businesses as markers on an interactive map.
showInMap(friend_recom_df)