In [150]:
!conda activate myenv

In [151]:
!where python

C:\Users\strix\miniconda3\envs\myenv\python.exe
C:\Users\strix\AppData\Local\Programs\Python\Python311\python.exe
C:\Users\strix\AppData\Local\Programs\Python\Python36-32\python.exe


In [152]:
!set PYSPARK_PYTHON=C:\Users\strix\miniconda3\envs\myenv\python.exe

In [153]:
import os
os.environ['PYSPARK_PYTHON'] = 'C:\\Users\\strix\\miniconda3\\envs\\myenv\\python.exe'

In [154]:
import pyspark
import itertools

In [155]:
from pyspark.sql import SparkSession


# Create a Spark session
spark = SparkSession.builder \
    .appName("Large JSON Processing") \
    .config("spark.executor.heartbeatInterval", "200000")\
    .config("spark.network.timeout", "300000")\
    .config("spark.executor.memory", "47185920000") \
    .getOrCreate()


In [156]:
json_file_path = 'yelp_academic_dataset_review.json'

# Read the JSON file
df = spark.read.json(json_file_path)

In [157]:
df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [158]:
df.show()

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|XQfwVwDr-v0ZS3_Cb...|   0|2018-07-07 22:09:11|    0|KU_O5udG6zpxOg-Vc...|  3.0|If you decide to ...|     0|mh_-eMZ6K5RLWhZyI...|
|7ATYjTIgM3jUlt4UM...|   1|2012-01-03 15:28:18|    0|BiTunyQ73aT9WBnpR...|  5.0|I've taken a lot ...|     1|OyoGAe7OKpv6SyGZT...|
|YjUWPpI6HXG530lwP...|   0|2014-02-05 20:30:30|    0|saUsX_uimxRlCVr67...|  3.0|Family diner. Had...|     0|8g_iMtfSiwikVnbP2...|
|kxX2SOes4o-D3ZQBk...|   1|2015-01-04 00:01:03|    0|AqPFMleE6RsU23_au...|  5.0|Wow!  Yummy, diff...|     1|_7bHUi9Uuf5__HHc_...|
|e4Vwtrqf-wpJfwesg...|   1|2017-01-14 20:54:15|    0|Sx8TMOWLNuJBWer-0...|  4.0|Cute inter

In [159]:
df.count()

6990280

We select only the business_id and stars columns to reduce the data to be processed

In [160]:
data_frame = df.select("business_id", "stars")

In [161]:
data_frame.show()

+--------------------+-----+
|         business_id|stars|
+--------------------+-----+
|XQfwVwDr-v0ZS3_Cb...|  3.0|
|7ATYjTIgM3jUlt4UM...|  5.0|
|YjUWPpI6HXG530lwP...|  3.0|
|kxX2SOes4o-D3ZQBk...|  5.0|
|e4Vwtrqf-wpJfwesg...|  4.0|
|04UD14gamNjLY0IDY...|  1.0|
|gmjsEdUsKpj9Xxu6p...|  5.0|
|LHSTtnW3YHCeUkRDG...|  5.0|
|B5XSoSG3SfvQGtKEG...|  3.0|
|gebiRewfieSdtt17P...|  3.0|
|uMvVYRgGNXf5boolA...|  5.0|
|EQ-TZ2eeD_E0BHuvo...|  4.0|
|lj-E32x9_FA7GmUrB...|  4.0|
|RZtGWDLCAtuipwaZ-...|  4.0|
|otQS34_MymijPTdNB...|  4.0|
|BVndHaLihEYbr76Z0...|  5.0|
|YtSqYv1Q_pOltsVPS...|  5.0|
|rBdG_23USc7DletfZ...|  4.0|
|CLEWowfkj-wKYJlQD...|  5.0|
|eFvzHawVJofxSnD7T...|  5.0|
+--------------------+-----+
only showing top 20 rows



We make a map to count the positive reviews (>4) and the negative ones (<=4)

In [162]:
mapped_rdd = data_frame.rdd.map(lambda x: (x[0], (1,0)) if x[1] > 4.0  else (x[0], (0,1)))

We do a reduction to gather all the positive reviews (>4) and the negative ones (<=4) and add them up. Finally, we added the totals of negatives and positives, since reviewing the dataframe we realized that there were counts of reviews in the business dataset that did not match the reviews dataset.

In [163]:
mapped_rdd = mapped_rdd.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1], x[0] + y[0] + x[1] + y[1]))

In [164]:
mapped_rdd.top(10)


[('zzyx5x0Z7xXWWvWnZFuxlQ', (0, 8, 8)),
 ('zzw66H6hVjXQEt0Js3Mo4A', (3, 2, 5)),
 ('zzu6_r3DxBJuXcjnOYVdTw', (1, 7, 8)),
 ('zztOG2cKm87I6Iw_tleZsQ', (5, 1, 6)),
 ('zznZqH9CiAznbkV6fXyHWA', (11, 1, 12)),
 ('zznJox6-nmXlGYNWgTDwQQ', (3, 27, 30)),
 ('zzjFdJwXuxBOGe9JeY_EMw', (27, 21, 48)),
 ('zzjCxn89a7RQo8keIOO_Ag', (0, 5, 5)),
 ('zziDpuuJw-Km1J4BaGpBKA', (1, 5, 6)),
 ('zzg-Il9zxsaVXlCDrcG7hg', (16, 0, 16))]

With this mapper we sort the rdd the different attributes that we have added

In [165]:
mapped_rdd = mapped_rdd.map(lambda x: (x[0], x[1][0], x[1][1], x[1][2]))

In [166]:
mapped_rdd.top(10)

[('zzyx5x0Z7xXWWvWnZFuxlQ', 0, 8, 8),
 ('zzw66H6hVjXQEt0Js3Mo4A', 3, 2, 5),
 ('zzu6_r3DxBJuXcjnOYVdTw', 1, 7, 8),
 ('zztOG2cKm87I6Iw_tleZsQ', 5, 1, 6),
 ('zznZqH9CiAznbkV6fXyHWA', 11, 1, 12),
 ('zznJox6-nmXlGYNWgTDwQQ', 3, 27, 30),
 ('zzjFdJwXuxBOGe9JeY_EMw', 27, 21, 48),
 ('zzjCxn89a7RQo8keIOO_Ag', 0, 5, 5),
 ('zziDpuuJw-Km1J4BaGpBKA', 1, 5, 6),
 ('zzg-Il9zxsaVXlCDrcG7hg', 16, 0, 16)]

We created a dataframe from the rdd with the renamed columns so that they are more intuitive when dealing with them

In [167]:
df_values = mapped_rdd.toDF(["business_id_rev", "negative_reviews", "positive_reviews", "review_count_reviews"])

In [168]:
df_values.count()

150346

Finally, we load the business dataset to be able to join the columns by the business_id and thus be able to extract a new json with the new columns.

In [169]:
json_file_path_business = 'yelp_academic_dataset_business.json'

df_business = spark.read.json(json_file_path_business)

In [170]:
df_business.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [171]:
df_business.count()

150346

In [172]:
final_df = df_business.join(df_values, df_business.business_id==df_values.business_id_rev)

In [173]:
final_df.show()

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+--------------------+----------------+----------------+--------------------+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|     latitude|      longitude|                name|postal_code|review_count|stars|state|     business_id_rev|negative_reviews|positive_reviews|review_count_reviews|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+--------------------+----------------+----------------+--------------------+
| 10359 104 Street NW|{NULL, NULL, u'be...|WKMJwqnfZKsAae75R...|Coffee & Tea, Foo...|     Edmonton|{8:0-18:0, 8:0-18...| 

In [174]:
final_df

DataFrame[address: string, attributes: struct<AcceptsInsurance:string,AgesAllowed:string,Alcohol:string,Ambience:string,BYOB:string,BYOBCorkage:string,BestNights:string,BikeParking:string,BusinessAcceptsBitcoin:string,BusinessAcceptsCreditCards:string,BusinessParking:string,ByAppointmentOnly:string,Caters:string,CoatCheck:string,Corkage:string,DietaryRestrictions:string,DogsAllowed:string,DriveThru:string,GoodForDancing:string,GoodForKids:string,GoodForMeal:string,HairSpecializesIn:string,HappyHour:string,HasTV:string,Music:string,NoiseLevel:string,Open24Hours:string,OutdoorSeating:string,RestaurantsAttire:string,RestaurantsCounterService:string,RestaurantsDelivery:string,RestaurantsGoodForGroups:string,RestaurantsPriceRange2:string,RestaurantsReservations:string,RestaurantsTableService:string,RestaurantsTakeOut:string,Smoking:string,WheelchairAccessible:string,WiFi:string>, business_id: string, categories: string, city: string, hours: struct<Friday:string,Monday:string,Saturday:string

In [175]:
pandas_df = final_df.toPandas()

In [176]:
pandas_df.to_json("C:/Users/strix/OneDrive/Desktop/uni/explotacion de dades/yelp_academic_dataset_business_add.json", orient='records', lines=True)