In [None]:
#### INSTALLATION OF JAVA/SPARK
!sudo apt update
!sudo apt install openjdk-17-jdk -y
!curl -JLO 'https://apache.osuosl.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz'
!tar xvf spark-3.3.1-bin-hadoop3.tgz
!mv spark-3.3.1-bin-hadoop3 /opt/spark

#### INSTALLING LIBRARIES
!pip install findspark
!pip install pyspark
!pip install databricks
!pip install koalas

In [None]:
# IMPORT LIBRIRIES
import databricks.koalas as ks
import os
import findspark
from pyspark.sql import SparkSession
import pyspark.pandas as ps

In [None]:
#### SPARK
#### DIRECTORY SETTING FOR SPARK 
os.environ["JAVA_HOME"] = "/usr"
os.environ["SPARK_HOME"] = "/opt/spark"

findspark.init();
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.3")\
    .getOrCreate();

spark.sparkContext.setLogLevel("OFF");

ps.set_option('compute.ops_on_diff_frames', True);

Import data

In [None]:
review = ps.read_json("./data/review.json", lines=True)
user = ps.read_json("./data/user.json", lines=True)
business = ps.read_json("./data/business.json", lines=True)
checkin = ps.read_json("./data/checkin.json", lines=True)

In [5]:
user.head()

                                                                                

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,elite,fans,friends,funny,name,review_count,useful,user_id,yelping_since
0,3.91,467,56,467,250,18,65,232,180,844,55,239,5994,2007,267,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",1259,Walker,585,7217,qVc8ODYU5SZjKXVBgXdI7w,2007-01-25 16:47:26
1,3.74,3131,157,3131,1145,251,264,1847,1946,7054,184,1521,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...",3138,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",13066,Daniel,4333,43091,j14WgRoU_-2ZE1aw1dXrJg,2009-01-25 04:35:42
2,3.32,119,17,119,89,3,13,66,18,96,10,35,1003,20092010201120122013,52,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",1010,Steph,665,2086,2WnXYQFK0hXEoTxPtV2zvg,2008-07-25 10:41:00
3,4.27,26,6,26,24,2,4,12,9,16,1,10,299,200920102011,28,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",330,Gwen,224,512,SZDeASXq7o05mMNLshsdIA,2005-11-29 04:38:33
4,3.54,0,0,0,1,0,1,1,0,1,0,0,7,,1,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",15,Karen,79,29,hA5lMy-EnncsH4JoR-hFGQ,2007-01-05 19:40:59


## General objective

In order to create a Mahcine Laerning model that is capable of predicting whether a business will grow or decline, we need to feed it a target that classifies successful businesses and those that are not.

This target is determined by the following variables:

## INFLUENCER SCORE 
---
The influencer score is determined by the relationship between the number of interactions received between the number of friends and fans

> $$ Influencer = \left(RI\over {fans + friends} \right) $$

_Influencer_  = Influencer rate<br>_I_ = received interactions<br>_SI_ = sent interactions
 
### Normalized influencer indicator (NI)
Then it becomes an index between 0 and 1

> $$ {\displaystyle IS = 1-{1 \over 1+ Influencer}}$$

_IS_ = Normalized influencer indicator<br>_I_ = Influencer rate 

In [6]:
def get_len(value):
  """
  It takes a string of comma separated names and returns the number of names in the string
  
  :param value: the value of the column you're applying the function to
  :return: The number of friends in the list.
  """
  ls = value.split(', ')
  return len(ls)

In [7]:
def influencer_Score(user): 
      """
      The function takes a dataframe of users and returns a dataframe with the influencer score for each
      user
      
      :param user: the user dataframe
      :return: A dataframe with the columns: n_interacionts_received, n_interactions_send, fans,
      friends_number, Score_influencer, Influencer, user_id
      """
      user['n_interacionts_received'] = user[[ 'compliment_hot',
      'compliment_more', 'compliment_profile', 'compliment_cute',
      'compliment_list', 'compliment_note', 'compliment_plain',
      'compliment_cool', 'compliment_funny', 'compliment_writer',
      'compliment_photos']].sum(axis=1)

      user['n_interactions_send'] = user['useful'] + user['funny'] + user['cool']
      user['friends_number'] = user.friends.apply(get_len)
      user['Influencer'] = user['n_interacionts_received'] / (1 + user['friends_number'] + user['fans'])
      user['Influencer'].fillna(0, inplace = True)
      user['Influencer_Score'] = 1 - (1 / (1 + user['Influencer']))
      
      user_df = user[['n_interacionts_received', 'n_interactions_send', 'fans', 'friends_number',
      'Influencer', 'Influencer_Score', 'user_id']]

      return user_df

In [8]:
user_df = influencer_Score(user)

                                                                                

In [10]:
user_df.sort_values(by='Influencer_Score', ascending=False).head()

                                                                                

Unnamed: 0,n_interacionts_received,n_interactions_send,fans,friends_number,Influencer,Influencer_Score,user_id
400147,277314,119736,319,934,221.143541,0.995498,Tqm7Wu7IBJ1td3Ab5ZpUhw
5052,82084,99266,247,310,147.103943,0.993248,UXbCcmkYGl3DH_Py5UOtbQ
1009800,37213,60377,125,208,111.416168,0.991104,8l_lV5khhzgs8SfLn9-9UQ
217667,87324,69921,298,503,108.882793,0.990899,w0Gp4qYFLhB6PbKrPhn6Tw
617655,88363,30753,216,601,108.023227,0.990828,JRAy4P4op3PCISZaMRA9_w


In [11]:
user_df.sort_values(by='n_interacionts_received', ascending=False).head()

                                                                                

Unnamed: 0,n_interacionts_received,n_interactions_send,fans,friends_number,Influencer,Influencer_Score,user_id
200787,324328,486573,3243,7228,30.97097,0.968722,JjXuiru1_ONzDkYVrHN0aw
400147,277314,119736,319,934,221.143541,0.995498,Tqm7Wu7IBJ1td3Ab5ZpUhw
800350,154351,323513,605,1794,64.312917,0.984689,ax7SnXOTIpatbsmqHLqVow
207385,133351,578739,880,3982,27.42155,0.964815,--2vR0DIsmQ6WfcSzKWigw
202196,132032,119510,804,1126,68.374935,0.985586,h4oOQdnfjpEHbygEJDsFbg


In [12]:
user_df.describe()

                                                                                

Unnamed: 0,n_interacionts_received,n_interactions_send,fans,friends_number,Influencer,Influencer_Score
count,1987897.0,1987897.0,1987897.0,1987897.0,1987897.0,1987897.0
mean,14.87311,83.05979,1.46574,53.37501,0.161577,0.07890231
std,547.1341,1600.32,18.13075,146.4435,0.6015574,0.1646012
min,0.0,0.0,0.0,1.0,0.0,0.0
25%,0.0,1.0,0.0,1.0,0.0,0.0
50%,0.0,4.0,0.0,2.0,0.0,0.0
75%,1.0,18.0,0.0,46.0,0.03636364,0.03508772
max,324328.0,587933.0,12497.0,14995.0,221.1435,0.9954984


### Reactions (R)
---
Creating a new column called 'reactions' and adding the values of the 'cool', 'funny', and 'useful' columns.

$$ R = c + f + u $$

c = cool 
f = funny 
u = useful


In [13]:
review.head()

                                                                                

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,XQfwVwDr-v0ZS3_CbbE5Xw,0,2018-07-07 22:09:11,0,KU_O5udG6zpxOg-VcAEodg,3.0,"If you decide to eat here, just be aware it is...",0,mh_-eMZ6K5RLWhZyISBhwA
1,7ATYjTIgM3jUlt4UM3IypQ,1,2012-01-03 15:28:18,0,BiTunyQ73aT9WBnpR9DZGw,5.0,I've taken a lot of spin classes over the year...,1,OyoGAe7OKpv6SyGZT5g77Q
2,YjUWPpI6HXG530lwP-fb2A,0,2014-02-05 20:30:30,0,saUsX_uimxRlCVr67Z4Jig,3.0,Family diner. Had the buffet. Eclectic assortm...,0,8g_iMtfSiwikVnbP2etR0A
3,kxX2SOes4o-D3ZQBkiMRfA,1,2015-01-04 00:01:03,0,AqPFMleE6RsU23_auESxiA,5.0,"Wow! Yummy, different, delicious. Our favo...",1,_7bHUi9Uuf5__HHc_Q8guQ
4,e4Vwtrqf-wpJfwesgvdgxQ,1,2017-01-14 20:54:15,0,Sx8TMOWLNuJBWer-0pcmoA,4.0,Cute interior and owner (?) gave us tour of up...,1,bcjbaE6dDog4jkNY91ncLQ


In [14]:
review['reactions'] = review['cool'] + review['funny'] + review['useful']

In [15]:
review['reactions'].describe()

                                                                                

count    6.990280e+06
mean     2.009786e+00
std      6.196532e+00
min     -3.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      1.182000e+03
Name: reactions, dtype: float64

In [16]:
review.sort_values(by='reactions', ascending=False).head()

                                                                                

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,reactions
6793158,3RcQ7yTvJn9LLuXthTkWiQ,0,2016-11-22 01:12:43,0,b0HBLNn02tkpC9rIyPRc_Q,1.0,Avoid at ALL costs. Do NOT do business with D...,1182,aMR0N5sy95Qz0sUjg1bmYA,1182
6885693,iRIHK8-EwpeffwvoO4nzIA,107,2017-02-19 13:26:19,792,pL_jmXOSPOMq71ZwrlRynQ,1.0,Went there for a birthday dinner and had reser...,112,sMfFISqyKkxlLsRzYaL9Lw,1011
6640623,3RcQ7yTvJn9LLuXthTkWiQ,0,2016-01-29 19:48:53,0,kTSpD7akIapty9AtLwJ11g,1.0,WORST COMPANY EVER!\n\nI had the unfortunate e...,997,aPpOZxh2p1SoWxRL3ulYGQ,997
6595985,3RcQ7yTvJn9LLuXthTkWiQ,0,2016-09-28 23:04:48,0,_KhSKjKKXoLgINKv7lTr0Q,1.0,I set up for automatic withdrawals online. On...,969,CHN_EDxhElh9DS1Vud6-hA,969
6599431,3RcQ7yTvJn9LLuXthTkWiQ,0,2017-01-13 18:26:56,0,TFlWosaJn3lVJwgm3nrVYA,1.0,If you are unfortunate enough to have to deal ...,933,9YOVI-bxiiOx0bTZ-J_Irw,933


# Subset Checkin

In [17]:
checkin.head()

Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22"
3,--7PUidqRWpRSpXebiyxTg,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012..."
4,--7jw19RH9JKXgFohspgQw,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014..."


In [18]:
checkin['total'] = checkin.date.apply(lambda x: get_len(x))

# Export and import from spark to koalas

In [19]:
k_business = ks.read_json("./data/business.json", lines=True)

Checkin

In [20]:
checkin.to_json('checkin_json.json')
checkin_k = ks.read_json("./checkin_json.json", lines=True)

                                                                                

User

In [21]:
user_df.to_json('user_json.json')
user_k = ks.read_json("./user_json.json", lines=True)

                                                                                

Review

In [22]:
review.to_json('review_json.json')
review_k = ks.read_json("./review_json.json", lines=True)

                                                                                

In [23]:
user_k.head()

                                                                                

Unnamed: 0,Influencer,Influencer_Score,fans,friends_number,n_interacionts_received,n_interactions_send,user_id
0,4.443966,0.81631,44,187,1031,4254,SgiBkhXeqIKl1PlFpZOycQ
1,0.47619,0.322581,2,18,10,188,zjPbmmvO4QzE_nE9uErLTg
2,2.010232,0.6678,316,5840,12377,30349,QJI9OSEn6ujRCtrX06vs1w
3,1.041935,0.510269,25,284,323,1004,HoiOETUtwO4CL0PhjDCnSw
4,0.95098,0.487437,35,168,194,2138,KxrKVxdXGkfMJ9XwJZzoLQ


In [24]:
review.head()

                                                                                

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,reactions
0,XQfwVwDr-v0ZS3_CbbE5Xw,0,2018-07-07 22:09:11,0,KU_O5udG6zpxOg-VcAEodg,3.0,"If you decide to eat here, just be aware it is...",0,mh_-eMZ6K5RLWhZyISBhwA,0
1,7ATYjTIgM3jUlt4UM3IypQ,1,2012-01-03 15:28:18,0,BiTunyQ73aT9WBnpR9DZGw,5.0,I've taken a lot of spin classes over the year...,1,OyoGAe7OKpv6SyGZT5g77Q,2
2,YjUWPpI6HXG530lwP-fb2A,0,2014-02-05 20:30:30,0,saUsX_uimxRlCVr67Z4Jig,3.0,Family diner. Had the buffet. Eclectic assortm...,0,8g_iMtfSiwikVnbP2etR0A,0
3,kxX2SOes4o-D3ZQBkiMRfA,1,2015-01-04 00:01:03,0,AqPFMleE6RsU23_auESxiA,5.0,"Wow! Yummy, different, delicious. Our favo...",1,_7bHUi9Uuf5__HHc_Q8guQ,2
4,e4Vwtrqf-wpJfwesgvdgxQ,1,2017-01-14 20:54:15,0,Sx8TMOWLNuJBWer-0pcmoA,4.0,Cute interior and owner (?) gave us tour of up...,1,bcjbaE6dDog4jkNY91ncLQ,2


## Query of success indicator 

In [25]:
# Join Query with Checkin

success_q = ks.sql('''select b.business_id, b.review_count, AVG(r.reactions), AVG(r.stars), AVG(u.Influencer), sum(c.total)

from {k_business} b 

join {review_k} r
    on b.business_id == r.business_id  

left join {user_k} u
    on u.user_id == r.user_id 

join {checkin_k} c
    on b.business_id == c.business_id 

group by b.business_id, b.review_count, b.stars''')

In [26]:
success_q.shape

                                                                                

(131930, 6)

In [27]:
# Join Query without Checkin
success_qc = ks.sql('''select b.business_id, b.review_count, AVG(r.reactions), AVG(r.stars), AVG(u.Influencer)

from {k_business} b 

join {review_k} r
    on b.business_id == r.business_id  

left join {user_k} u
    on u.user_id == r.user_id 

group by b.business_id, b.review_count, b.stars''')

In [28]:
success_qc.shape

                                                                                

(150346, 5)

In [44]:
# Values dropped due to join with checkin table
print(round((1 - success_q.shape[0]/business.shape[0])*100,2), '%')



12.25 %


                                                                                

In [30]:
success_q.head()

                                                                                

Unnamed: 0,business_id,review_count,avg(reactions),avg(stars),avg(Influencer),sum(total)
0,OJpwmYvsZnXt62sxco3F0Q,399,2.576642,3.822384,0.576839,153714
1,oZAdXhal_EZHePbjxo6s9g,15,2.866667,3.666667,0.217222,90
2,bvkZndsHPy0nwpn3_iKCQQ,99,1.519608,3.558824,0.856587,17544
3,DmkUXt42gLCFsQh_MVsAqw,60,1.66129,3.467742,0.728788,4960
4,wGNNCB-EjhTby2BD2iqJCw,18,1.555556,2.888889,0.668744,1242


In [32]:
success_qc.head()

                                                                                

Unnamed: 0,business_id,review_count,avg(reactions),avg(stars),avg(Influencer)
0,OJpwmYvsZnXt62sxco3F0Q,399,2.576642,3.822384,0.576839
1,oZAdXhal_EZHePbjxo6s9g,15,2.866667,3.666667,0.217222
2,bvkZndsHPy0nwpn3_iKCQQ,99,1.519608,3.558824,0.856587
3,DmkUXt42gLCFsQh_MVsAqw,60,1.66129,3.467742,0.728788
4,wGNNCB-EjhTby2BD2iqJCw,18,1.555556,2.888889,0.668744


In [33]:
success_q.corr()

                                                                                

Unnamed: 0,review_count,avg(reactions),avg(stars),avg(Influencer),sum(total)
review_count,1.0,-0.034464,0.072353,-0.017602,0.592467
avg(reactions),-0.034464,1.0,0.019554,0.333701,-0.006219
avg(stars),0.072353,0.019554,1.0,0.058765,0.014664
avg(Influencer),-0.017602,0.333701,0.058765,1.0,0.0015
sum(total),0.592467,-0.006219,0.014664,0.0015,1.0


In [34]:
success_qc.corr()

                                                                                

Unnamed: 0,review_count,avg(reactions),avg(stars),avg(Influencer)
review_count,1.0,-0.024723,0.063175,0.000919
avg(reactions),-0.024723,1.0,-0.02113,0.32888
avg(stars),0.063175,-0.02113,1.0,0.054347
avg(Influencer),0.000919,0.32888,0.054347,1.0


In [35]:
success_qc.corr(method="spearman")

                                                                                

Unnamed: 0,review_count,avg(reactions),avg(stars),avg(Influencer)
review_count,1.0,0.11763,-0.003367,0.169433
avg(reactions),0.11763,1.0,-0.033701,0.332409
avg(stars),-0.003367,-0.033701,1.0,0.031366
avg(Influencer),0.169433,0.332409,0.031366,1.0


## SUCESS SCORE

The success of a business is directly proportional to the number of reviews received, reviews reactions, review stars and the influencer score of its clients.

$$ SS = R * RC * S * IS $$ 

_SS_ = Sucess Score<br>
_RC_ = Review Counts<br>
_R_ = Reactions<br>
_S_ = Stars<br>
_IS_ = Influencer Score<br>

In [36]:
success_qc['success_score'] = success_qc['review_count'] * success_qc['avg(reactions)'] * success_qc['avg(stars)'] * success_qc['avg(Influencer)']

In [37]:
success_qc.success_score.describe()

                                                                                

count    150346.000000
mean        200.304303
std         681.963220
min           0.000000
25%           9.078790
50%          38.709749
75%         148.080786
max      109888.274094
Name: success_score, dtype: float64

In [38]:
success_qc[success_qc['success_score']==0].sort_values(by='avg(stars)', ascending=False).head()

                                                                                

Unnamed: 0,business_id,review_count,avg(reactions),avg(stars),avg(Influencer),success_score
397,G_mYBzkWfW8ugRdGf4w93g,5,0.0,5.0,0.026007,0.0
548,gjFl3SkRXNe5Aprdh9YD7Q,5,0.0,5.0,0.000656,0.0
1071,ty-U3L3b2ovOTt5JOilSkg,6,0.0,5.0,0.138603,0.0
1925,LllYcETCcistMoHZuhalXA,6,0.0,5.0,0.515957,0.0
2278,tagYDDu5iUAdOqIp8uGuHg,11,0.0,5.0,0.002525,0.0


In [39]:
# Number of businesses with zero score
len(success_qc[success_qc['success_score']==0])

                                                                                

3823

In [40]:
success_qc['target'] = 0 
success_qc.loc[success_qc['success_score']>=success_qc.success_score.mean(), 'target'] = 1

                                                                                

Export dataframe successful target

In [41]:
success_qc.head()

                                                                                

Unnamed: 0,business_id,review_count,avg(reactions),avg(stars),avg(Influencer),success_score,target
0,OJpwmYvsZnXt62sxco3F0Q,399,2.576642,3.822384,0.576839,2266.813432,1
1,oZAdXhal_EZHePbjxo6s9g,15,2.866667,3.666667,0.217222,34.248704,0
2,bvkZndsHPy0nwpn3_iKCQQ,99,1.519608,3.558824,0.856587,458.611143,1
3,DmkUXt42gLCFsQh_MVsAqw,60,1.66129,3.467742,0.728788,251.909527,1
4,wGNNCB-EjhTby2BD2iqJCw,18,1.555556,2.888889,0.668744,54.093995,0


In [42]:
success_q.to_csv('./data/success_target_with_checkin.csv')
success_qc.to_csv('./data/success_target_without_checkin.csv')

                                                                                