In [1]:
import pyspark as ps
spark = (ps.sql.SparkSession.builder 
        .master("local[4]") 
        .appName("yelp_academic") 
        .getOrCreate()
        )
sc = spark.sparkContext

In [2]:
yelp_a_biz = spark.read.json('yelp_dataset/yelp_academic_dataset_business.json')

In [4]:
yelp_rev = spark.read.json('yelp_dataset/yelp_academic_dataset_review.json')

In [5]:
sc = spark.sparkContext

In [8]:
yelp_a_biz.createOrReplaceTempView('yelp_business')

In [12]:
yelp_a_biz.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [45]:
select = '''
SELECT business_id, name, stars, categories
FROM yelp_business 
WHERE  city = 'Phoenix' and review_count > 10 and review_count < 40 


'''
phoe_gr_40 = spark.sql(select)
phoe_gr_40.show(10)

+--------------------+--------------------+-----+--------------------+
|         business_id|                name|stars|          categories|
+--------------------+--------------------+-----+--------------------+
|CeuTRtwsq6w5rztGO...|            Salsitas|  2.5|Mexican, Restaurants|
|Zkesf4VRamyOBswJA...|       Audio Express|  3.5|Automotive, Auto ...|
|Ju2tEB-zllh9FoCma...|Above and Beyond ...|  3.5|Pets, Pet Groomer...|
|27fSAl_GsDDhrxzpE...|    Jiffy Lube #1664|  3.0|Oil Change Statio...|
|yBf7mMFELRa1vRora...|        Desert Homes|  2.0|Apartments, Home ...|
|WnRtn80rdsWOa2nWZ...|Central Phoenix W...|  3.5|Obstetricians & G...|
|8vA1d9_w4hBjOcrM7...|       Boston Market|  3.5|American (Traditi...|
|xnJl1DcMULTTzY21I...|Verizon Authorize...|  3.5|Professional Serv...|
|RQOorijTwEMC7G7Mi...|Integrity Air Con...|  4.0|Home Services, He...|
|GWpKZSrf3kO25vby0...|Valencia Park Apa...|  2.5|Real Estate, Apar...|
+--------------------+--------------------+-----+--------------------+
only s

In [46]:
phoe_biz = phoe_gr_40.toPandas()

(5475, 4)

In [26]:
yelp_rev.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [27]:
bid = phoe_biz.business_id[:3]

In [29]:
list(bid)

['45bWSZtniwPRiqlivpS8Og', '_c3ixq9jYKxhLUB0czi0ug', 'cKRMmytHxaSt8F0SMEzKqg']

In [31]:
rviews = yelp_rev.select('business_id', 'user_id', 'stars')

In [48]:
pho_rv = rviews.filter(rviews.business_id.isin(list(phoe_biz.business_id)))


In [49]:
pho_rv.count()

111814

In [50]:
pho_rv.take(2)

[Row(business_id='Szcr-yMxq76zRaSTUapAcw', user_id='0pf5VuzE4_1pwj5NJHG5TQ', stars=3),
 Row(business_id='BNzcfz3jmBo_1wUB5YIsXg', user_id='0pf5VuzE4_1pwj5NJHG5TQ', stars=4)]

In [55]:
hash('0pf5VuzE4_1pwj5NJHG5TQ')

-131021924558

In [60]:
hash_udf = udf(lambda idx: hash(idx) , IntegerType())

In [62]:
pho_rv.withColumn?

In [67]:
pho_rv_n = pho_rv.withColumn('user_num', hash_udf(pho_rv.user_id))
pho_rv_nn = pho_rv_n.withColumn('biz_num', hash_udf(pho_rv.business_id))

pho_rv_nn.first()

Row(business_id='Szcr-yMxq76zRaSTUapAcw', user_id='0pf5VuzE4_1pwj5NJHG5TQ', stars=3, user_num=-1178701720, biz_num=-759057679)

In [70]:
splits = pho_rv_nn.randomSplit([0.7,0.3], seed =91)
train = splits[0]
test = splits[1]

In [71]:
train.columns, test.columns

(['business_id', 'user_id', 'stars', 'user_num', 'biz_num'],
 ['business_id', 'user_id', 'stars', 'user_num', 'biz_num'])

In [59]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

In [87]:
als = ALS(rank=10, regParam=0.1, 
          itemCol='biz_num', 
          userCol='user_num',
          ratingCol='stars')
mdl = als.fit(train)


In [101]:
train_res = mdl.transform(train)
test_res = mdl.transform(test)

In [74]:
from pyspark.ml.evaluation import RegressionEvaluator

In [75]:
train_res.first()

Row(business_id='KCVnCb6TvTzoDp4t-Qf4wQ', user_id='8JAx8FnnLFZVc0h5W71zHw', stars=5, user_num=-1070722737, biz_num=-2024711065, prediction=4.839774131774902)

In [98]:
test_res.na.drop?

In [77]:
evaluator = RegressionEvaluator(labelCol="stars",predictionCol="prediction")

In [None]:
### Have to create some basic metric

In [102]:
test_res1 = test_res.na.drop()
test_res2 = test_res.fillna(3.6)

In [100]:
 evaluator.evaluate(train_res)

(3.57183013844712, 0.12047564570355435)

In [103]:
evaluator.evaluate(test_res1), evaluator.evaluate(test_res2)

(3.5718301384471194, 2.705844144986854)

In [106]:
test_res3 = test_res.withColumn?

In [112]:
test_res3 = test_res.withColumn('prediction', test_res.prediction*0+ 3.6)
test_res3 = test_res3.fillna(3.6)

In [113]:
evaluator.evaluate(test_res3)

1.6644441298302348

In [83]:
train.agg({'stars':'mean'}).collect()

[Row(avg(stars)=3.6376998518669867)]

To create model first I have to filter all bussiness and all reviews by category  'restaurants' and may be selected cities.


In [117]:
yelp_a_biz.filter(yelp_a_biz.city == 'San Francisco' ).count()

0

In [116]:
yelp_a_biz.filter?

In [3]:
select = '''
SELECT  city
FROM yelp_business 
WHERE review_count > 10  
GROUP BY city


'''
sf40 = spark.sql(select)
sf40.show(20)

AnalysisException: 'java.lang.RuntimeException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient;'

In [2]:
type(sf40)

NameError: name 'sf40' is not defined