![image-2.png](attachment:image-2.png)

In [6]:
# Reference SQL for the task (kept as a comment, not executed):
# for every city, take the 10 businesses with the largest number of
# reviews rated below 3 stars.
#
# WITH rating AS (
# SELECT joined_table.business_id,
# joined_table.city,
# ROW_NUMBER() OVER(PARTITION BY joined_table.city ORDER BY joined_table.count_stars DESC) number,
# joined_table.count_stars
# FROM 
#     (SELECT 
#     review.business_id,
#     business.city,
#     count(review.stars) as count_stars
#     FROM review 
#     JOIN business ON review.business_id = business.business_id
#     WHERE review.stars<3
#     GROUP BY review.business_id, business.city
#     ) joined_table)
#
# SELECT business_id, city, count_stars        
# FROM rating
# WHERE number<=10

In [8]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import Window

In [9]:
# Create (or reuse) the Spark session for this notebook, with YARN as master.
spark = (
    SparkSession.builder
    .appName('Spark DF task1')
    .master('yarn')
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook renders the session object as the cell output.
spark

In [10]:
# Load the Yelp business dataset (JSON) into a DataFrame.
business = (
    spark.read
    .json("/data/yelp/business")
)

In [11]:
# Preview one business record, one field per line and without truncation.
business.show(1, truncate=False, vertical=True)

In [12]:
# Load the Yelp review dataset (JSON) into a DataFrame.
review = (
    spark.read
    .json('/data/yelp/review')
)

In [13]:
# Preview one review record, one field per line and without truncation.
review.show(1, truncate=False, vertical=True)

In [14]:
# Lower the automatic broadcast-join threshold to 10 (bytes, per the Spark SQL
# configuration docs); the broadcast below is requested explicitly instead.
spark.sql("SET spark.sql.autoBroadcastJoinThreshold = 10")

# Inner-join every review with its business. `stars` is dropped from the
# business side first, so `stars` in the result is the review's own rating.
business_review = review.join(
    f.broadcast(business.drop('stars')),
    on='business_id',
    how='inner',
)

In [15]:
# Preview one joined review/business record, one field per line.
business_review.show(1, truncate=False, vertical=True)

In [16]:
# Keep only the columns needed for the per-city ranking.
business_review_processed = business_review.select(
    "business_id",
    "city",
    "stars",
)

In [17]:
# Keep only low-rated reviews (fewer than 3 stars).
business_review_processed = (
    business_review_processed
    .where("stars <3")
)

In [18]:
# Show the first 40 filtered rows.
business_review_processed.show(40)

In [19]:
# Count the low-rated reviews per (business, city) pair.
# The aggregate column keeps Spark's default name, 'count(stars)'.
business_review_grouped = (
    business_review_processed
    .groupBy("business_id", "city")
    .agg(f.count('stars'))
)

In [None]:
# Show the first 5 aggregated rows.
business_review_grouped.show(5)

In [None]:
# Number the businesses inside each city by their low-rated review count,
# highest count first.
#
# row_number() is used instead of rank() to match the ROW_NUMBER() of the
# reference SQL in the first cell: rank() gives tied businesses the same
# position, so a later `rank <= 10` filter could return more than 10 rows
# per city. business_id is a secondary sort key so that ties are broken the
# same way on every run.
low_rating_window = (
    Window.partitionBy('city')
    .orderBy(f.col('count(stars)').desc(), f.col('business_id'))
)

# The position column keeps the name 'rank' because later cells refer to it.
business_review_rank = business_review_grouped.select(
    "business_id",
    "city",
    'count(stars)',
    f.row_number().over(low_rating_window).alias('rank'),
)

In [None]:
# Show the first 20 ranked rows.
business_review_rank.show(20)

In [None]:
# Rename the aggregate column 'count(stars)' to 'stars' and fix the column
# order to (business_id, city, rank, stars).
#
# This cell overwrites its own input, so it must survive being run twice:
# withColumnRenamed is a no-op when 'count(stars)' is no longer present,
# whereas looking the column up directly would fail on the second run.
business_review_rank = (
    business_review_rank
    .withColumnRenamed('count(stars)', 'stars')
    .select('business_id', 'city', 'rank', 'stars')
)

In [None]:
# Expose the DataFrame to Spark SQL under the name "business_review_rank".
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; re-running the cell simply replaces the view.
business_review_rank.createOrReplaceTempView("business_review_rank")

In [None]:
# SQL over the "business_review_rank" table: keep rows whose rank is at most 10
# and return business_id, city and stars.
query_str = """
SELECT business_review_rank.business_id, business_review_rank.city, business_review_rank.stars      
FROM business_review_rank
WHERE business_review_rank.rank<=10
"""

In [None]:
# Run the query through Spark SQL; the result is the final `rating` DataFrame.
rating = spark.sql(query_str)

In [None]:
# Show the first 50 rows of the result.
rating.show(50)

In [None]:
# Total number of rows in the result.
rating.count()

In [None]:
# Save the result as tab-separated text, replacing any previous output
# at the same (relative) path.
(
    rating.write
    .mode('overwrite')
    .option('sep', '\t')
    .csv("business_review_counts.tsv")
)

In [None]:
# Shell escape: list HDFS with no explicit path argument.
! hdfs dfs -ls 

In [None]:
# Shell escape: print the first 100 lines of the files under student44/ in HDFS.
# NOTE(review): the write cell above saves to "business_review_counts.tsv";
# confirm that student44/ is really where the output should be read from.
! hdfs dfs -cat student44/* | head -n 100 

In [None]:
# Read "/data/yelp/business" with the CSV reader into `rating_hive`.
# NOTE(review): the same path is loaded as JSON when `business` is created
# earlier in this notebook, and the variable name suggests a different source
# was intended — confirm the path and format before relying on this frame.
rating_hive = (spark.read.csv("/data/yelp/business")
)