In [1]:
import os
import sys
spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()
import pyspark

In [2]:
number_cores = 2
memory_gb = 5
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set("spark.executor.heartbeatInterval","3600s").set('spark.driver.memory', '{}g'.format(memory_gb)).set("spark.network.timeout","10000s").set("spark.storage.blockManagerSlaveTimeoutMs","12000s"))
sc = pyspark.SparkContext(conf=conf)

In [28]:
sc.stop()

### Initial look at data

In [3]:
raw_data = sc.textFile("Documents/WCU/CSC496/yelp_academic_dataset_user.json.gz").cache()

In [44]:
%%time
raw_data.count()

CPU times: user 13.2 ms, sys: 25.7 ms, total: 38.9 ms
Wall time: 52.7 s


1968703

In [45]:
%%time 
raw_data.count()

CPU times: user 12.7 ms, sys: 23.7 ms, total: 36.4 ms
Wall time: 1min 4s


1968703

In [4]:
raw_data.take(1)

['{"user_id":"ntlvfPzc8eglqvk92iDIAw","name":"Rafael","review_count":553,"yelping_since":"2007-07-06 03:27:11","useful":628,"funny":225,"cool":227,"elite":"","friends":"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX-ROT_D5lam5uNg, 0S6EI51ej5J7dgYz3-O0lA, woDt8raW-AorxQM_tIE2eA, hWUnSE5gKXNe7bDc8uAG9A, c_3LDSO2RHwZ94_Q6j_O7w, -uv1wDiaplY6eXXS0VwQiA, QFjqxXn3acDC7hckFGUKMg, ErOqapICmHPTN8YobZIcfQ, mJLRvqLOKhqEdkgt9iEaCQ, VKX7jlScJSA-ja5hYRw12Q, ijIC9w5PRcj3dWVlanjZeg, CIZGlEw-Bp0rmkP8M6yQ9Q, OC6fT5WZ8EU7tEVJ3bzPBQ, UZSDGTDpycDzrlfUlyw2dQ, deL6e_z9xqZTIODKqnvRXQ, 5mG2ENw2PylIWElqHSMGqg, Uh5Kug2fvDd51RYmsNZkGg, 4dI4uoShugD9z84fYupelQ, EQpFHqGT9Tk6YSwORTtwpg, o4EGL2-ICGmRJzJ3GxB-vw, s8gK7sdVzJcYKcPv2dkZXw, vOYVZgb_GVe-kdtjQwSUHw, wBbjgHsrKr7BsPBrQwJf2w, p59u2EC_qcmCmLeX1jCi5Q, VSAZI1eHDrOPRWMK4Q2DIQ, efMfeI_dkhpeGykaRJqxfQ, x6qYcQ8_i0mMDzSLsFCbZg, K_zSmtNGw1fu-vm

Testing if all entries are standard user data format by seeing if count for certain formatting features matches count of raw data:

In [47]:
names = raw_data.map(lambda line: line.split(":")[3].split(",")[0])\
.filter(lambda line: True if line.isnumeric() else False)
names.count()

1968703

### SQL Test:

In [4]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [5]:
df_json = sqlContext.read.json("Documents/WCU/CSC496/yelp_academic_dataset_user.json.gz").cache()

In [50]:
df_json.count()

1968703

In [7]:
df_json.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- yelping_since: string (nullable = true)



In [8]:
df_json.registerTempTable("tb_json")

### List users with more than 2000 fans:

In [9]:
%%time
lots_fan = sqlContext.sql("SELECT user_id, fans FROM tb_json WHERE fans >2000")
print(lots_fan.count())
lots_fan.show()

13
+--------------------+-----+
|             user_id| fans|
+--------------------+-----+
|fgwI3rYHOv1ipfVfC...| 2113|
|eKUGKQRE-Ywi5dY55...| 2916|
|VHdY6oG2JPVNjihWh...| 2140|
|ITa3vh5ERI90G_WP4...| 2280|
|Hi10sGSZNxQH3NLyW...| 2718|
|nkN_do3fJ9xekchVC...| 2046|
|hizGc5W1tBHPghM5Y...| 3315|
|m07sy7eLtOjVdZ8oN...| 2034|
|j14WgRoU_-2ZE1aw1...| 2634|
|UsXqCXRZwSCSw0AT7...| 2263|
|JjXuiru1_ONzDkYVr...| 2316|
|iLjMdZi0Tm7DQxX1C...| 2516|
|37cpUoM8hlkSQfReI...|11568|
+--------------------+-----+

CPU times: user 9.42 ms, sys: 17 ms, total: 26.4 ms
Wall time: 1min 36s


#### Lots_Fans in RDD:

In [54]:
rdd_lots_fan = lots_fan.rdd
rdd_lots_fan.take(5)

[Row(user_id='fgwI3rYHOv1ipfVfCSx7pg', fans=2113),
 Row(user_id='eKUGKQRE-Ywi5dY55_zChg', fans=2916),
 Row(user_id='VHdY6oG2JPVNjihWhOooAQ', fans=2140),
 Row(user_id='ITa3vh5ERI90G_WP4SmGUQ', fans=2280),
 Row(user_id='Hi10sGSZNxQH3NLyWSZ1oA', fans=2718)]

In [55]:
rdd_lots_fan.map(lambda r: r['user_id']).take(5)

['fgwI3rYHOv1ipfVfCSx7pg',
 'eKUGKQRE-Ywi5dY55_zChg',
 'VHdY6oG2JPVNjihWhOooAQ',
 'ITa3vh5ERI90G_WP4SmGUQ',
 'Hi10sGSZNxQH3NLyWSZ1oA']

In [56]:
rdd_lots_fan.takeOrdered(10)

[Row(user_id='37cpUoM8hlkSQfReIEBd-Q', fans=11568),
 Row(user_id='Hi10sGSZNxQH3NLyWSZ1oA', fans=2718),
 Row(user_id='ITa3vh5ERI90G_WP4SmGUQ', fans=2280),
 Row(user_id='JjXuiru1_ONzDkYVrHN0aw', fans=2316),
 Row(user_id='UsXqCXRZwSCSw0AT7y1uBg', fans=2263),
 Row(user_id='VHdY6oG2JPVNjihWhOooAQ', fans=2140),
 Row(user_id='eKUGKQRE-Ywi5dY55_zChg', fans=2916),
 Row(user_id='fgwI3rYHOv1ipfVfCSx7pg', fans=2113),
 Row(user_id='hizGc5W1tBHPghM5YKCAtg', fans=3315),
 Row(user_id='iLjMdZi0Tm7DQxX1C1_2dg', fans=2516)]

#### Parse JSON:

In [5]:
import json

def parseJSON(t):
    res = json.loads(t)
    return res

test = '{"user_id":"ntlvfPzc8eglqvk92iDIAw","name":"Rafael","review_count":553,"yelping_since":"2007-07-06 03:27:11","useful":628,"funny":225,"cool":227,"elite":"","friends":"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX-ROT_D5lam5uNg, 0S6EI51ej5J7dgYz3-O0lA, woDt8raW-AorxQM_tIE2eA, hWUnSE5gKXNe7bDc8uAG9A, c_3LDSO2RHwZ94_Q6j_O7w, -uv1wDiaplY6eXXS0VwQiA, QFjqxXn3acDC7hckFGUKMg, ErOqapICmHPTN8YobZIcfQ, mJLRvqLOKhqEdkgt9iEaCQ, VKX7jlScJSA-ja5hYRw12Q, ijIC9w5PRcj3dWVlanjZeg, CIZGlEw-Bp0rmkP8M6yQ9Q, OC6fT5WZ8EU7tEVJ3bzPBQ, UZSDGTDpycDzrlfUlyw2dQ, deL6e_z9xqZTIODKqnvRXQ, 5mG2ENw2PylIWElqHSMGqg, Uh5Kug2fvDd51RYmsNZkGg, 4dI4uoShugD9z84fYupelQ, EQpFHqGT9Tk6YSwORTtwpg, o4EGL2-ICGmRJzJ3GxB-vw, s8gK7sdVzJcYKcPv2dkZXw, vOYVZgb_GVe-kdtjQwSUHw, wBbjgHsrKr7BsPBrQwJf2w, p59u2EC_qcmCmLeX1jCi5Q, VSAZI1eHDrOPRWMK4Q2DIQ, efMfeI_dkhpeGykaRJqxfQ, x6qYcQ8_i0mMDzSLsFCbZg, K_zSmtNGw1fu-vmxyTVfCQ, 5IM6YPQCK-NABkXmHhlRGQ, U_w8ZMD26vnkeeS1sD7s4Q, AbfS_oXF8H6HJb5jFqhrLw, hbcjX4_D4KIfonNnwrH-cg, UKf66_MPz0zHCP70mF6p1g, hK2gYbxZRTqcqlSiQQcrtQ, 2Q45w_Twx_T9dXqlE16xtQ, BwRn8qcKSeA77HLaOTbfiQ, jouOn4VS_DtFPtMR2w8VDA, ESteyJabbfvqas6CEDs3pQ","fans":14,"average_stars":3.57,"compliment_hot":3,"compliment_more":2,"compliment_profile":1,"compliment_cute":0,"compliment_list":1,"compliment_note":11,"compliment_plain":15,"compliment_cool":22,"compliment_funny":22,"compliment_writer":10,"compliment_photos":0}'
parseJSON(test)

{'user_id': 'ntlvfPzc8eglqvk92iDIAw',
 'name': 'Rafael',
 'review_count': 553,
 'yelping_since': '2007-07-06 03:27:11',
 'useful': 628,
 'funny': 225,
 'cool': 227,
 'elite': '',
 'friends': 'oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX-ROT_D5lam5uNg, 0S6EI51ej5J7dgYz3-O0lA, woDt8raW-AorxQM_tIE2eA, hWUnSE5gKXNe7bDc8uAG9A, c_3LDSO2RHwZ94_Q6j_O7w, -uv1wDiaplY6eXXS0VwQiA, QFjqxXn3acDC7hckFGUKMg, ErOqapICmHPTN8YobZIcfQ, mJLRvqLOKhqEdkgt9iEaCQ, VKX7jlScJSA-ja5hYRw12Q, ijIC9w5PRcj3dWVlanjZeg, CIZGlEw-Bp0rmkP8M6yQ9Q, OC6fT5WZ8EU7tEVJ3bzPBQ, UZSDGTDpycDzrlfUlyw2dQ, deL6e_z9xqZTIODKqnvRXQ, 5mG2ENw2PylIWElqHSMGqg, Uh5Kug2fvDd51RYmsNZkGg, 4dI4uoShugD9z84fYupelQ, EQpFHqGT9Tk6YSwORTtwpg, o4EGL2-ICGmRJzJ3GxB-vw, s8gK7sdVzJcYKcPv2dkZXw, vOYVZgb_GVe-kdtjQwSUHw, wBbjgHsrKr7BsPBrQwJf2w, p59u2EC_qcmCmLeX1jCi5Q, VSAZI1eHDrOPRWMK4Q2DIQ, efMfeI_dkhpeGykaRJqxfQ, x6qYcQ8_i0mMDzSL

## "Interesting Analysis (Throughout and after which you will find "interesting user attributes")": 
### Expanding on example from class:  exploring relationship between "cute" and "hot" compliments and number of fans:

In [6]:
import json
from pyspark.ml.linalg import Vectors

def parseJSON_ch(t):
    res = json.loads(t)
    return ([Vectors.dense(res['fans'], res['compliment_hot'], res['compliment_cute'])])

test = '{"user_id":"ntlvfPzc8eglqvk92iDIAw","name":"Rafael","review_count":553,"yelping_since":"2007-07-06 03:27:11","useful":628,"funny":225,"cool":227,"elite":"","friends":"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX-ROT_D5lam5uNg, 0S6EI51ej5J7dgYz3-O0lA, woDt8raW-AorxQM_tIE2eA, hWUnSE5gKXNe7bDc8uAG9A, c_3LDSO2RHwZ94_Q6j_O7w, -uv1wDiaplY6eXXS0VwQiA, QFjqxXn3acDC7hckFGUKMg, ErOqapICmHPTN8YobZIcfQ, mJLRvqLOKhqEdkgt9iEaCQ, VKX7jlScJSA-ja5hYRw12Q, ijIC9w5PRcj3dWVlanjZeg, CIZGlEw-Bp0rmkP8M6yQ9Q, OC6fT5WZ8EU7tEVJ3bzPBQ, UZSDGTDpycDzrlfUlyw2dQ, deL6e_z9xqZTIODKqnvRXQ, 5mG2ENw2PylIWElqHSMGqg, Uh5Kug2fvDd51RYmsNZkGg, 4dI4uoShugD9z84fYupelQ, EQpFHqGT9Tk6YSwORTtwpg, o4EGL2-ICGmRJzJ3GxB-vw, s8gK7sdVzJcYKcPv2dkZXw, vOYVZgb_GVe-kdtjQwSUHw, wBbjgHsrKr7BsPBrQwJf2w, p59u2EC_qcmCmLeX1jCi5Q, VSAZI1eHDrOPRWMK4Q2DIQ, efMfeI_dkhpeGykaRJqxfQ, x6qYcQ8_i0mMDzSLsFCbZg, K_zSmtNGw1fu-vmxyTVfCQ, 5IM6YPQCK-NABkXmHhlRGQ, U_w8ZMD26vnkeeS1sD7s4Q, AbfS_oXF8H6HJb5jFqhrLw, hbcjX4_D4KIfonNnwrH-cg, UKf66_MPz0zHCP70mF6p1g, hK2gYbxZRTqcqlSiQQcrtQ, 2Q45w_Twx_T9dXqlE16xtQ, BwRn8qcKSeA77HLaOTbfiQ, jouOn4VS_DtFPtMR2w8VDA, ESteyJabbfvqas6CEDs3pQ","fans":14,"average_stars":3.57,"compliment_hot":3,"compliment_more":2,"compliment_profile":1,"compliment_cute":0,"compliment_list":1,"compliment_note":11,"compliment_plain":15,"compliment_cool":22,"compliment_funny":22,"compliment_writer":10,"compliment_photos":0}'
parseJSON_ch(test)

[DenseVector([14.0, 3.0, 0.0])]

In [8]:
df_vector1 = raw_data.map(parseJSON_ch)
df_vector1.take(5)

[[DenseVector([14.0, 3.0, 0.0])],
 [DenseVector([27.0, 36.0, 2.0])],
 [DenseVector([5.0, 9.0, 1.0])],
 [DenseVector([6.0, 2.0, 0.0])],
 [DenseVector([78.0, 8.0, 1.0])]]

In [7]:
from pyspark.ml.stat import Correlation

spark = pyspark.sql.SparkSession(sc)

In [9]:
df_vector1 = spark.createDataFrame(df_vector1, ['features'])

In [33]:
df_vector1.take(5)

[Row(features=DenseVector([14.0, 3.0, 0.0])),
 Row(features=DenseVector([27.0, 36.0, 2.0])),
 Row(features=DenseVector([5.0, 9.0, 1.0])),
 Row(features=DenseVector([6.0, 2.0, 0.0])),
 Row(features=DenseVector([78.0, 8.0, 1.0]))]

### Correlation between number of "cute" and "hot" compliments and number of fans:

In [10]:
pearsonCorr = Correlation.corr(df_vector1, 'features', 'pearson').collect()[0][0]

Matrix:
   left:right & top:bottom: fans, hot, cute
    
    

In [17]:
print(str(pearsonCorr).replace('nan', 'NaN'))

DenseMatrix([[1.        , 0.45536087, 0.2263288 ],
             [0.45536087, 1.        , 0.58995499],
             [0.2263288 , 0.58995499, 1.        ]])


**Analysis**:  "Hot" compliments have a relatively strong correlation with number of fans at 0.45, and "cute" compliments have a modest positive correlation with number of fans at 0.22.  

In the hypothetical scenario where x "cute" and "hot" compliments are given to every user for every y followers they amass (i.e. every user is equally attractive, using these compliments as a proxy), number of cute/hot compliments would both correlate strongly with number of followers, but would explain nothing about the influence of cuteness/hotness on a user's number of followers.  Thus I don't think these correlations, when examined together, soundly point to any conclusion that attractive users tend to have more followers.  However, looking at the stronger correlation between "hot" compliments and number of fans than "cute" compliments, and making the (probably sound) assumption that "hot" compliments are a stronger indication of user attractiveness than "cute" compliments, the data may suggest that user attractiveness has some influence over number of fans.    


To better gauge this, I find the correlation between number of fans and cute/hot compliments *per fan*:


### Correlation between "hot" and "cute" compliments *per fan* and total number of fans:

#### Exclude people with zero fans b/c ratio can't be calculated

In [10]:
nonzerofans = raw_data.map(parseJSON).filter(lambda r: r['fans']>0)
#number of users with fans
nonzerofans.count()

423544

In [12]:
def comp_fan_ratio(t):
    return ([Vectors.dense(t['fans'], t['compliment_hot']/t['fans'], t['compliment_cute']/t['fans'])])

In [13]:
df_vector2 = nonzerofans.map(comp_fan_ratio)
df_vector2.take(5)

[[DenseVector([14.0, 0.2143, 0.0])],
 [DenseVector([27.0, 1.3333, 0.0741])],
 [DenseVector([5.0, 1.8, 0.2])],
 [DenseVector([6.0, 0.3333, 0.0])],
 [DenseVector([78.0, 0.1026, 0.0128])]]

In [14]:
df_vector2 = spark.createDataFrame(df_vector2, ['features'])

In [70]:
pearsonCorr_ = Correlation.corr(df_vector2, 'features', 'pearson').collect()[0][0]

Matrix: left:right & top:bottom: fans, (hot comps)/fans, (cute comps)/fans

In [71]:
print(str(pearsonCorr_).replace('nan', 'NaN'))

DenseMatrix([[1.        , 0.19678927, 0.08173216],
             [0.19678927, 1.        , 0.37055255],
             [0.08173216, 0.37055255, 1.        ]])


**Analysis**: There is a 0.19 correlation between number of fans and hot compliments per fan and a 0.08 correlation between number of fans and cute compliments per fan.  

Only considering users with >=1 fan, This probably suggests that there is a relatively small but existent relationship between "hot" compliment reception rate and number of fans.  The "cute" compliment reception rate appears to have a very small but perhaps non-negligible effect (given the sample size) on number of fans.

However, this analysis doesn't take the users with no fans (the large majority of users in this dataset) into account.  As we can see from the counts, there are 1968703 users, but only 423544 of them have fans.  Are the users without fans getting "cute" and "hot" compliments at a remotely comparable volume?  

Let's take some summary statistics to get a better idea of how these groups compare:

In [15]:
#Users with zero fans
zerofans = raw_data.map(parseJSON).filter(lambda r: r['fans']==0)
zerofans.take(5)

[{'user_id': 'ttumcu6hWshk_EJVWrduDg',
  'name': 'Stuart',
  'review_count': 12,
  'yelping_since': '2010-05-12 16:30:08',
  'useful': 29,
  'funny': 4,
  'cool': 6,
  'elite': '',
  'friends': '1pKOc55fFx09J5t2rPzTxw, HYpE14yYEKSgu3VNVzObHA, bW2AQqcSPascYvKXsF0S-g, sa5zd7bwBQzajQB48Y9SUw, GISGKvSU9j-71TQaopfFog, k0Qt0Lo7uEOBOPQU3RZU8w, KhQlPlX0FEBZSWnt7N_03A, _u3fnAioKgbcMRczm1cjbA, 13oHusx9NQiXOtzGLHkWKA, 5w7lSpikd7rt88KZGqAq3Q, GOWBkpDlkJY4jvHL0xWGlg, gVQRvH2Ee07x8cK7GoQgNQ, 0S6OtXUGAZX__HuXlqIO0g, x-UEmeYnm-guCKVEsBLqjA, WBpPeK_eDwP6dzDXML4eQA, fxxDma1W3aahZzLNP89WAA, 9GpMrHTbb_1lQuVB5UF4qA, Dfhd4xa16wnivWEZzvnZ_Q, mqX4eUbe-FloqluSb1w1DQ, FepUxtZ5xOPts7dGIUmipA, hjEzRcAf__Ggc1DqPGbeeQ, huMZyHKofLRnVqXIrdnE-w, Fr1AHwaEhj1zryGBR49Z5Q, 5i96QHviF0zFGDpiydAEzA, fGILVLvubXC6-91Uy0rydQ, _Ac7EfJcJoIFRJf6a9Tcag, GfJAY8vU4giYXMIy0UXFjg, eWAhBfihO8WhWhNAwaokbA, zV4ax3qXTbYea_RaGRYvtQ, KJX8uKN3_xu_lLa4Ft4ASQ, RG5yLxmEZpaMCtuUXdfpEQ, X2rDsx9adK0JxC0obtiJxg, CBjPeXSf9uaWTMoaL8rgIQ, FrtD1XOePjNdK

In [16]:
zerofansdf = spark.createDataFrame(zerofans, ['features'])



In [17]:
nonzerofansdf = spark.createDataFrame(nonzerofans, ['features'])



#### Approximate 50th, 90th, and 99th percentile quantities for number of "cute" and "hot" compliments for people with 0 fans and >=1 fan

In [18]:
quants = zerofansdf.approxQuantile(['compliment_cute', 'compliment_hot'], \
                                   [0.5, 0.9, 0.99], 0.0001)
quants

[[0.0, 0.0, 0.0], [0.0, 0.0, 1.0]]

**Outputs**: cute: 50th, 90th, 99th, hot: 50th, 90th, 99th

In [19]:
quantsnz = nonzerofansdf.approxQuantile(['compliment_cute', 'compliment_hot'], \
                                   [0.5, 0.9, 0.99], 0.0001)
quantsnz

[[0.0, 1.0, 9.0], [0.0, 5.0, 125.0]]

**Analysis**: As we can see from taking these metrics with a relatively high degree of accuracy (3rd argument in approxQuantile - Spark ran out of memory when attempting 100% accuracy), the vast majority (90+%) of users without fans recieve no "cute" and "hot" compliments.  I believe this gives the possible effects of "cute" and "hot" compliment reception rates more validity because it rules out the possibilty that fanless users could also be recieving these compliments at any comparable volume, which would weaken the evidence for a positive relationship between cute/hot compliment reception rate (or aptitude) and ability to amass fans.

I think further distinctions can be drawn.  Factors accounting for fan counts might be different for users with very few and very many fans.  People with large numbers of fans probably tend to post more regularly and post more photos, which would perhaps be the strongest impetus for giving cute/hot compliments.  Let's test this:

In [22]:
df_vector4 = raw_data.map(parseJSON)

In [23]:
medfansdf = spark.createDataFrame(df_vector4, ['features'])



#### Approximate 50th, 90th, and 99th percentile quantities for number of fans:

In [24]:
quantsmf = medfansdf.approxQuantile(['fans'], \
                                   [0.5,0.9,0.99], 0.0001)
quantsmf

[[0.0, 2.0, 26.0]]

In [25]:
manyfans = raw_data.map(parseJSON).filter(lambda r: r['fans']>25)
manyfans.count()

20029

In [26]:
dfvector_5 = manyfans.map(comp_fan_ratio)
dfvector_5.take(5)

[[DenseVector([27.0, 1.3333, 0.0741])],
 [DenseVector([78.0, 0.1026, 0.0128])],
 [DenseVector([137.0, 3.5255, 0.2555])],
 [DenseVector([31.0, 1.2581, 0.1613])],
 [DenseVector([197.0, 1.0761, 0.066])]]

In [27]:
dfvector_5 = spark.createDataFrame(dfvector_5, ['features'])

#### Correlation between cute/hot compliments per fan and total fans for users with >25 fans (top 1%)

In [28]:
_pearsonCorr = Correlation.corr(dfvector_5, 'features', 'pearson').collect()[0][0]

Matrix: left:right & top:bottom: fans, (hot comps)/fans, (cute comps)/fans

In [30]:
print(str(_pearsonCorr).replace('nan', 'NaN'))

DenseMatrix([[1.        , 0.12722572, 0.05123545],
             [0.12722572, 1.        , 0.59515282],
             [0.05123545, 0.59515282, 1.        ]])


The correlation between hot compliments per fan and total fans is 0.12 and the correlation between cute compliments per fan and total fans is 0.05.  These are weaker correlations than those for users with >=1 fan (0.19, 0.08, respectively).

We can also try this for >100 (much more than that and the sample size begins to get dubiously small)

In [32]:
mmanyfans = raw_data.map(parseJSON).filter(lambda r: r['fans']>100)
mmanyfans.count()

3487

#### Correlation between cute/hot compliments per fan and total fans for users with >100 fans

In [33]:
dfvector_6 = mmanyfans.map(comp_fan_ratio)
dfvector_6 = spark.createDataFrame(dfvector_6, ['features'])
__pearsonCorr = Correlation.corr(dfvector_6, 'features', 'pearson').collect()[0][0]
print(str(__pearsonCorr).replace('nan', 'NaN'))


DenseMatrix([[1.        , 0.01322553, 0.007071  ],
             [0.01322553, 1.        , 0.52511703],
             [0.007071  , 0.52511703, 1.        ]])


**Analysis** These correlations practically disappear for users with greater than 100 fans (0.01 hot, 0.007 cute), perhaps suggesting that an aptitude for recieving these compliments is essentially a nonfactor in amassing large followings.

A possible complicating factor is that people with large numbers of fans will likely tend to know a smaller proportion of their fans.  I'd infer that people are more likely to give hot/cute compliments to people they personally know, and that people may be less comfortable giving one of these compliments on a profile with a large following/lots of visibility.  I don't think these are really testable given the data, but I think it would probably be sound to assume that this effect of scale is at least somewhat uniform across users.  If all users with large followings experience this effect uniformly, then they'd likely retain their aptitudes for recieving these compliments relative to each other, such that if this aptitude was important to amassing followers, it would still appear in a correlation when compared with other users with large followings.  This of course is all rather speculative.  

Another factor is that such a large majority of users have relatively small followings (median=5) in terms of compliment reception sample size that the reception rate for most users might not be an accurate "experiment" gauging their reception aptitude.  

### Re-running some of these analyses with friends instead of fans:

In [15]:
def friendCount(t):
        if t['friends'] is 'None':
            return 0
        else:
            return len(t['friends'].split())    

In [23]:
nonzerofriends = raw_data.map(parseJSON).filter(lambda line: friendCount(line)>=1)
nonzerofriends.count()

1968703

In [10]:
zerofriends = raw_data.map(parseJSON).filter(lambda line: friendCount(line)==0)
zerofriends.count()

0

1133852 + 834851 = 1968703 = total count

In [28]:
zerofriends.take(5)

[{'user_id': '0HpQodiJEQ-bJ36AjeF2Nw',
  'name': 'Sarah',
  'review_count': 4,
  'yelping_since': '2013-11-20 18:18:29',
  'useful': 0,
  'funny': 0,
  'cool': 0,
  'elite': '',
  'friends': 'None',
  'fans': 0,
  'average_stars': 5.0,
  'compliment_hot': 0,
  'compliment_more': 0,
  'compliment_profile': 0,
  'compliment_cute': 0,
  'compliment_list': 0,
  'compliment_note': 0,
  'compliment_plain': 0,
  'compliment_cool': 0,
  'compliment_funny': 0,
  'compliment_writer': 0,
  'compliment_photos': 0},
 {'user_id': '-UkOzCgwkwrJtFj7bl1kig',
  'name': 'D',
  'review_count': 18,
  'yelping_since': '2017-08-20 17:14:59',
  'useful': 11,
  'funny': 2,
  'cool': 2,
  'elite': '',
  'friends': 'None',
  'fans': 0,
  'average_stars': 4.89,
  'compliment_hot': 0,
  'compliment_more': 0,
  'compliment_profile': 0,
  'compliment_cute': 0,
  'compliment_list': 0,
  'compliment_note': 0,
  'compliment_plain': 0,
  'compliment_cool': 0,
  'compliment_funny': 0,
  'compliment_writer': 0,
  'complim

In [16]:
def comp_friend_ratio(t):
    return ([Vectors.dense(friendCount(t), t['compliment_hot']/friendCount(t), t['compliment_cute']/friendCount(t))])

#### Correlation between total friends and cute/hot compliments per friend:

In [47]:
nzfriends = nonzerofriends.map(comp_friend_ratio)
nzfriends = spark.createDataFrame(nzfriends, ['features'])
__pearsonCorr_ = Correlation.corr(nzfriends, 'features', 'pearson').collect()[0][0]
print(str(__pearsonCorr_).replace('nan', 'NaN'))

DenseMatrix([[1.        , 0.01293239, 0.0049211 ],
             [0.01293239, 1.        , 0.47299973],
             [0.0049211 , 0.47299973, 1.        ]])


The correlation between "hot" compliments per friend and number of friends is 0.009 and for "cute" compliments per friend and number of friends it is 0.005.  These are an order of magnitude weaker than for fans (0.19, 0.08)  

#### 50th, 90th, 99th percentile for number of hot and cute compliments recieved for users with zero friends and >zero friends

In [42]:
zf_df = spark.createDataFrame(zerofriends, ['features'])

quantszfr = zf_df.approxQuantile(['compliment_hot', 'compliment_cute'], \
                                   [0.5, 0.9, 0.99], 0.0001)
quantszfr



[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]

In [43]:
nzf_df = spark.createDataFrame(nonzerofriends, ['features'])

quantsnzfr = nzf_df.approxQuantile(['compliment_hot', 'compliment_cute'], \
                                   [0.5, 0.9, 0.99], 0.0001)
quantsnzfr



[[0.0, 1.0, 34.0], [0.0, 0.0, 3.0]]

As with fans, it appears that users without friends essentially do not recieve these compliments.

In [17]:
num_friends = raw_data.map(parseJSON).map(lambda line: friendCount(line))
num_friends.take(5)

[45, 213, 35, 173, 895]

In [59]:
from pyspark.sql import Row

row = Row("val")

numfriends_df = num_friends.map(row).toDF()

numfriends_df.printSchema()

root
 |-- val: long (nullable = true)



#### 50th, 90th, 99th percentile for number of friends

In [65]:
quantsnumfr = numfriends_df.approxQuantile(['val'], \
                                   [0.5, 0.9, 0.99], 0.0001)
quantsnumfr

[[2.0, 149.0, 592.0]]

In [68]:
lotsfriends = raw_data.map(parseJSON).filter(lambda line: friendCount(line)>=149)
lotsfriends.count()

197361

#### Correlation between "hot" and "cute" compliments per friend and total friends amongst people with 149+ friends (top 10%)

In [69]:
lotsfriends_ = lotsfriends.map(comp_friend_ratio)
lotsfriends_ = spark.createDataFrame(lotsfriends_, ['features'])
__pearsonCorr_ = Correlation.corr(lotsfriends_, 'features', 'pearson').collect()[0][0]
print(str(__pearsonCorr_).replace('nan', 'NaN'))

DenseMatrix([[1.        , 0.10055694, 0.0434173 ],
             [0.10055694, 1.        , 0.5860906 ],
             [0.0434173 , 0.5860906 , 1.        ]])


Here I included the top 10% instead of just the top 1% because 150 friends is vastly different in quality from the median of 2, whereas the 90th percentile for fans was only 2.  

The correlation between hot compliments per friend and number of friends is 0.1 and the correlation between cute compliments per friend and number of friends is 0.04.  These are substantially stronger than these correlations for the overall user base with >=1 friend (0.009, 0.005) 

In [19]:
tonsfriends = raw_data.map(parseJSON).filter(lambda line: friendCount(line)>=592)
tonsfriends.count()

19683

#### Correlation between "hot" and "cute" compliments per friend and total friends amongst people with 592+ friends (top 1%)

In [72]:
tonsfriends_ = tonsfriends.map(comp_friend_ratio)
tonsfriends_ = spark.createDataFrame(tonsfriends_, ['features'])
___pearsonCorr_ = Correlation.corr(tonsfriends_, 'features', 'pearson').collect()[0][0]
print(str(___pearsonCorr_).replace('nan', 'NaN'))

DenseMatrix([[1.        , 0.08222323, 0.0237262 ],
             [0.08222323, 1.        , 0.58084266],
             [0.0237262 , 0.58084266, 1.        ]])


These correlations (.08 hot, .02 cute) are a bit lower than for the top 10% but still substantially stronger than for the general user base with >=1 friend.  I think these vast increases in correlation may suggest that aptitude for recieving cute/hot compliments (particularly hot) has a small but existent effect on a user's ability to amass friends.

This trend from an essentially nonexistent correlation in the overall userbase to a small correlation in those with many friends is the reverse of the trend when measuring this for fans, where there was a somewhat substantial correlation for the general user base that essentially dissapeared amongst the subsets of the populations with large numbers of fans.  

Reading some of the Yelp forums, it seems that people tend to follow/fan a person more purely for the content of their reviews whereas people may add friends for other personal and/or networking reasons.  This may serve to explain why aptitude for recieving "cute" and "hot" compliments has a stronger relationship with number of friends than number of fans, although both relationships are ultimately quite weak.   

### Correlation between "cool" compliments per friend and friends:

Briefly analyzing this one as well:

In [12]:
def cool_friend_ratio(t):
    return ([Vectors.dense(friendCount(t), t['compliment_cool']/friendCount(t))])


In [24]:
cool_friends = nonzerofriends.map(cool_friend_ratio)
cool_friends.take(5)

[[DenseVector([45.0, 0.4889])],
 [DenseVector([213.0, 0.2958])],
 [DenseVector([35.0, 0.4857])],
 [DenseVector([173.0, 0.0405])],
 [DenseVector([895.0, 0.0346])]]

In [26]:
cool_friends_= spark.createDataFrame(cool_friends, ['features'])

In [27]:
pearsonCorr2 = Correlation.corr(cool_friends_, 'features', 'pearson').collect()[0][0]
print(str(pearsonCorr2).replace('nan', 'NaN'))

DenseMatrix([[1.        , 0.00938486],
             [0.00938486, 1.        ]])


It doesn't look like people tend to follow people because they think they're cool either.

In retrospect, it may have been interesting to combine friends and fans for the above analyses, although it very likely stands to reason that friend count correlates strongly with fan count, and as we saw, both ultimately have little correlation with "cuteness" or "hotness" via the proxy of these types of compliments.  

## Finding the top 10 most influential users:

Exclude those with 0 friends - cuts the matrix size by a factor of ~4.

In [63]:
usersandfriends = nonzerofriends.map(lambda line: \
        (line['user_id'], line['friends']))
usersandfriends.take(5)

[('ntlvfPzc8eglqvk92iDIAw',
  'oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX-ROT_D5lam5uNg, 0S6EI51ej5J7dgYz3-O0lA, woDt8raW-AorxQM_tIE2eA, hWUnSE5gKXNe7bDc8uAG9A, c_3LDSO2RHwZ94_Q6j_O7w, -uv1wDiaplY6eXXS0VwQiA, QFjqxXn3acDC7hckFGUKMg, ErOqapICmHPTN8YobZIcfQ, mJLRvqLOKhqEdkgt9iEaCQ, VKX7jlScJSA-ja5hYRw12Q, ijIC9w5PRcj3dWVlanjZeg, CIZGlEw-Bp0rmkP8M6yQ9Q, OC6fT5WZ8EU7tEVJ3bzPBQ, UZSDGTDpycDzrlfUlyw2dQ, deL6e_z9xqZTIODKqnvRXQ, 5mG2ENw2PylIWElqHSMGqg, Uh5Kug2fvDd51RYmsNZkGg, 4dI4uoShugD9z84fYupelQ, EQpFHqGT9Tk6YSwORTtwpg, o4EGL2-ICGmRJzJ3GxB-vw, s8gK7sdVzJcYKcPv2dkZXw, vOYVZgb_GVe-kdtjQwSUHw, wBbjgHsrKr7BsPBrQwJf2w, p59u2EC_qcmCmLeX1jCi5Q, VSAZI1eHDrOPRWMK4Q2DIQ, efMfeI_dkhpeGykaRJqxfQ, x6qYcQ8_i0mMDzSLsFCbZg, K_zSmtNGw1fu-vmxyTVfCQ, 5IM6YPQCK-NABkXmHhlRGQ, U_w8ZMD26vnkeeS1sD7s4Q, AbfS_oXF8H6HJb5jFqhrLw, hbcjX4_D4KIfonNnwrH-cg, UKf66_MPz0zHCP70mF6p1g, hK2gYbxZR

In [64]:
N = usersandfriends.count()
ranks = usersandfriends.map(lambda line: (line[0], 1/N))
ranks.take(100)

[('ntlvfPzc8eglqvk92iDIAw', 5.079486342023149e-07),
 ('FOBRPlBHa3WPHFB5qYDlVg', 5.079486342023149e-07),
 ('zZUnPeh2hEp0WydbAZEOOg', 5.079486342023149e-07),
 ('QaELAmRcDc5TfJEylaaP8g', 5.079486342023149e-07),
 ('xvu8G900tezTzbbfqmTKvA', 5.079486342023149e-07),
 ('z5_82komKV3mI4ASGe2-FQ', 5.079486342023149e-07),
 ('ttumcu6hWshk_EJVWrduDg', 5.079486342023149e-07),
 ('f4_MRNHvN-yRn7EA8YWRxg', 5.079486342023149e-07),
 ('UYACF30806j2mfbB5vdmJA', 5.079486342023149e-07),
 ('QG13XBbgHWydzThRBGJtyw', 5.079486342023149e-07),
 ('f6YuZP6iennHFVlnFJOXLQ', 5.079486342023149e-07),
 ('I_6wY8_RsewziNnKhGZg4g', 5.079486342023149e-07),
 ('q-v8elVPvKz0KvK69QSj1Q', 5.079486342023149e-07),
 ('HwPGLzF_uXB3MF8bc5u5dg', 5.079486342023149e-07),
 ('y4UuVowA9i3zj2hHyRMfHw', 5.079486342023149e-07),
 ('1WBxJ2r3A2QYfRSEzgcmkQ', 5.079486342023149e-07),
 ('-TT5e-YQU9xLb1JAGCGkQw', 5.079486342023149e-07),
 ('6bbHSJ0PrgSxh7e5nigKMw', 5.079486342023149e-07),
 ('4VmuXuSRhv5UxYUy3tMpiQ', 5.079486342023149e-07),
 ('pVU2DdtBF

In [65]:
votes = ranks.join(usersandfriends)
votes.take(5)

[('q-v8elVPvKz0KvK69QSj1Q',
  (5.079486342023149e-07,
   'rt1KveqwFMnkN6dXKg5Qyg, NfnKx3z7zFottS3yHabw1g, H0jfY5R0W__smHIwYZ0niQ, B7ecAeAIrXg7sgmabS38pg, Y1q1tQV-MRmNmXg59PPVuA, 6QfkjPi0E2DP8n37CgJHnA, 5ND4p0rvTvjWilQG6Z8LiA, TKSoV9RFqhaUhZRg9UZtgg, A_Ftwpx-PXgo8y_WkCyJ9Q, JxbOYBzGdRhI1Ip9-chmkg, 04tkmJ-qZ9waV0t5n4_6sg, 2oiPE2rww3Ofw09aishi2A, JpBtLSlYXdrts7VGPVYflw, 4qzuqkYUrXWlcBImGp5oRQ, ISF8J60dilGeHKcLPlev4w, jVYzrVblDFSuL3GHtt8ZSA, pWVy4iCapVG-3DDb6celgA, aN3bpfDLsMjQloJ_4-P13Q, 2woU927sw2EM9HFFnuZM8g, Wx7cbLDqYEL3_aVZwh82Ww, -lLuGyseNbCXzhfYvOFJJA, y4k-G_W69QRMgbfrVqEBXg, CDsG_HB8ZghDmR91sDYguw, vUbBLitXfn7aeaRCOoKjdg, NC6tvKSt2mL8w8VN7S_EmQ, A3eAIFfkvfudRM9QeEaIJw, cGa4vDjLuXMZSUaWvnmlqA, TAE4Pf9T52wYMUfjis92ew, 1h1DD2hgfgud7V4NCE9mVg, nHoV323Fh7ZHSgXuNbDxkw, 28jl4Zl98ddSHxvue_KBcA, e0wirbsHLo8g-Ls1epPdiA, WyVh94BghK3ojOc9gKYccg, BbNhbZbPAPOIhsth4DdWWA, wI-g2UsyogOv2LtpJ4ek5Q, oORWfRFf8wnyo6gkghViJA, BidqjIJxDsJQ88AAJUSGmg, -ZzhWTKwcD_3TTLkZeRqXA, 6H4GQG0x5IX2v-nWvrT3tg, sKEFXF

In [13]:
def calculateVotes(t):
    res = []
    for item in t[1][1]:
        count = len(t[1][1])
        res.append((item, t[1][0] / count))
    return res
calculateVotes(('y', (0.3333333333333333, ['y', 'a'])))

[('y', 0.16666666666666666), ('a', 0.16666666666666666)]

In [None]:
votes = ranks.join(users) \
    .flatMap(calculateVotes)
votes.collect()

In [None]:
ranks = votes.reduceByKey(lambda x, y: x+y)
ranks.collect()

From textbook: "In practice, for the Web itself, 50–75 iterations are
sufficient to converge to within the error limits of double-precision arithmetic."

In [15]:
%%time
for i in range(1):
    votes = ranks.join(usersandfriends) \
        .flatMap(calculateVotes)
    ranks = votes.reduceByKey(lambda x, y: x+y)
print(ranks.take(100))

[('r', 0.007952090854380261), ('i', 0.00800818885633463), ('l', 0.008017051582176533), ('S', 0.008011782052404446), ('K', 0.008000823227890065), ('W', 0.008042094561150435), ('s', 0.00798333516854943), ('j', 0.007995073087960703), ('N', 0.11401577145746969), ('8', 0.00801956193495561), ('R', 0.007978802218570153), ('p', 0.008015432914378768), ('c', 0.008023159794259282), ('4', 0.007993417931155558), ('y', 0.008005868784597272), ('9', 0.007987747341631043), ('0', 0.007992931798829359), ('O', 0.007982403969400599), ('h', 0.008027740265091995), ('g', 0.014075688248381507), ('L', 0.007981587547396654), ('1', 0.007999826306525671), ('_', 0.007966162873627346), ('J', 0.007991495578296936), ('C', 0.00798497187426169), ('b', 0.007975426995543312), ('d', 0.008001411184246963), ('G', 0.008012727008186546), ('n', 0.11403039429577098), ('z', 0.007983776913974436), ('6', 0.00801478344546874), ('7', 0.008044761507008554), ('V', 0.007994056693895453), ('q', 0.00801116043701286), ('w', 0.0141081959139

In [None]:
%%time
for i in range(6):
    votes = ranks.join(usersandfriends) \
        .flatMap(calculateVotes)
    ranks = votes.reduceByKey(lambda x, y: x+y)
print(ranks.take(100))

I tried to do page rank but jupyter notebook crashed even with corrections for time out and only users with friends.  Also I acknowledge that the keys in the output need to be fixed here, but I think this would be unrelated to the performace issues.

I think page rank would have been a very strong measure of influence because (very informally) it finds the users with the most friends who themselves have the most friends.  People with the most friends influence the most opinions, and those they influence who also have lots of friends spread that influence.  Further, many measures of influence that can be gleaned from this dataset are likely highly correlated with number of friends (and influential friends) e.g. years of elite status, number of reviews, etc.  Let's examine these:  

In [15]:
def friendsandreviews(t):
    return ([Vectors.dense(friendCount(t), t['review_count'])])



In [16]:
df_vector7 = raw_data.map(parseJSON).map(friendsandreviews)

In [19]:
df_vector7 = spark.createDataFrame(df_vector7, ['features'])

**Correlation between number of friends and number of reviews**:

In [20]:
__pearsonCorr__ = Correlation.corr(df_vector7, 'features', 'pearson').collect()[0][0]
print(str(__pearsonCorr__).replace('nan', 'NaN'))

DenseMatrix([[1.        , 0.38977946],
             [0.38977946, 1.        ]])


In [57]:
df_vector9 = raw_data.map(parseJSON).map(lambda line: line['review_count'])

In [60]:
df_reviews = df_vector9.map(row).toDF()

### 50th, 90th, and 99th percentile for review count:

In [61]:

quantsreviews = df_reviews.approxQuantile(['val'], \
                                   [0.5, 0.9, 0.99], 0.0001)
quantsreviews

[[5.0, 45.0, 298.0]]

In [27]:
def hasbeenElite(t):
    for i in t['elite']:
        if i.isnumeric():
            return True
    return False

In [28]:
eliteusers = raw_data.map(parseJSON).filter(lambda line: hasbeenElite(line) is True)
eliteusers.take(5)

[{'user_id': 'FOBRPlBHa3WPHFB5qYDlVg',
  'name': 'Michelle',
  'review_count': 564,
  'yelping_since': '2008-04-28 01:29:25',
  'useful': 790,
  'funny': 316,
  'cool': 400,
  'elite': '2008,2009,2010,2011,2012,2013',
  'friends': 'ly7EnE8leJmyqyePVYFlug, pRlR63iDytsnnniPb3AOug, kc-rnN-ndnFTdHG4TfIgeQ, GYndf-h6dAwpGP0lDBz2Wg, FPo3SwQuAK53QVZm_eIyBg, 9fF_T3pQu3ay1oA7h_VYNA, G5T3bd6dUs5zkQ2VMZtRUw, tufuEc5f9TWR05_yko46QQ, 4lMab047cJsEt8puhy8dew, bEbLwLpJauEG-HW0w_0IKw, xz9o3T3uJxyeLZ6Cyb8cFQ, LxBh46T0wO9VlsJUK5q2cw, dUALcXVUTC7NYHuE5GKjtw, LM6pxbcfdyUS8M2jWoQk1g, nQG7XkgrsG4MXXXfexJwAA, 8t6qbx5mNimrLrqs01VIvg, A-pcQqvtlsxd-JgBOIdwHw, 2i2NOMPClqwUA64SkzyIvQ, bnf7t21SCFw3HWfV_soG_Q, b_ED_rPl7OLwEpUlr4VI5A, Avlw4sAK8uGpcQY89Gk0Tg, euxheBcyvFLJqNQSbsFsJQ, 1Ksjxp5WuG2Cni24Ipal6w, nH5NCbQ7eacONPMV8VpdEw, AMqRKxavwt1yTBzlMrG8sQ, f71r-LEp9O8yJu76XhPadQ, cjotiAHFSbFuvbxTWmGi1Q, _KfzNJcprrOQlvoAOawS2Q, PjdEARXumUBwx5zj6jTkng, mGTObjvjjIVy6C3FLNH2pw, wwHxfdwZoGLGmvYznFMvgw, 1hup0FxTWkjNAcVf4Xvj6w, 

In [29]:
eliteusers_friends = eliteusers.map(friendCount)

In [34]:
elitefriends_df = eliteusers_friends.map(row).toDF()

#### Median, 90th and 99th percentile number of friends for elite users

In [35]:
quantsefr = elitefriends_df.approxQuantile(['val'], \
                                   [0.5, 0.9, 0.99], 0.0001)
quantsefr

[[128.0, 547.0, 1952.0]]

As we can see, the median number of friends for elite users is 128, which is near the 90th percentile for the overall userbase.  However, I don't see something inherent in being an elite user that makes them more influential than someone who simply has a lot of followers and produces a lot of reviews.  Elite status is likely moreso a symptom of influence rather than a determining factor in it (although perhaps some people may take elite users' opinions more seriously, even if subconciously).  It could also be argued that elite status is a symptom of review quality, but it could probably equally be argued that total following is as much or moreso a symptom of review quality and/or volume.  I think this is the most plausible primary factor in amassing followers/friends, and as we can infer from the hot/cute/cool analysis, people probably don't tend to follow others for these reasons.

In lieu of being able to calculate the level of influence of user's friends, I will approximate influence using the sum of a user's friends and fans multiplied by their total reviews.  Basically how many people do they directly "influence", and how much have they influenced that number of people.  Obviously total following increases over time and thus a user hasn't reached an audience of their current size with every review, but this effect applies for every user subject to this evaluation, so I think it serves as an effective proxy.

### InfluenceScore = (friends + fans) * review_count

In [29]:
def valueFunction(t):
    return (friendCount(t)+t['fans'])*t['review_count']

In [53]:
#Calculate value function and modify JSON to include it and friend count.  Also omit 
#friend list to make output more readable
def modJSON(res):
    res['friend_count'] = friendCount(res)
    res['influence_score'] = valueFunction(res)
    res.pop('friends', None)
    return res


In [54]:
influence_rank = raw_data.map(parseJSON).map(modJSON)
influence_rank.take(5)

[{'user_id': 'ntlvfPzc8eglqvk92iDIAw',
  'name': 'Rafael',
  'review_count': 553,
  'yelping_since': '2007-07-06 03:27:11',
  'useful': 628,
  'funny': 225,
  'cool': 227,
  'elite': '',
  'fans': 14,
  'average_stars': 3.57,
  'compliment_hot': 3,
  'compliment_more': 2,
  'compliment_profile': 1,
  'compliment_cute': 0,
  'compliment_list': 1,
  'compliment_note': 11,
  'compliment_plain': 15,
  'compliment_cool': 22,
  'compliment_funny': 22,
  'compliment_writer': 10,
  'compliment_photos': 0,
  'friend_count': 45,
  'influence_score': 32627},
 {'user_id': 'FOBRPlBHa3WPHFB5qYDlVg',
  'name': 'Michelle',
  'review_count': 564,
  'yelping_since': '2008-04-28 01:29:25',
  'useful': 790,
  'funny': 316,
  'cool': 400,
  'elite': '2008,2009,2010,2011,2012,2013',
  'fans': 27,
  'average_stars': 3.84,
  'compliment_hot': 36,
  'compliment_more': 4,
  'compliment_profile': 5,
  'compliment_cute': 2,
  'compliment_list': 1,
  'compliment_note': 33,
  'compliment_plain': 37,
  'compliment_c

## Top 10 most influential users:

In [56]:
influence_rank.takeOrdered(10, key = lambda x: -x['influence_score'])

[{'user_id': '8k3aO-mPeyhbR5HUucA5aA',
  'name': 'Victor',
  'review_count': 14455,
  'yelping_since': '2007-12-08 14:56:45',
  'useful': 101960,
  'funny': 62685,
  'cool': 79646,
  'elite': '2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018',
  'fans': 1271,
  'average_stars': 3.28,
  'compliment_hot': 2129,
  'compliment_more': 246,
  'compliment_profile': 210,
  'compliment_cute': 55,
  'compliment_list': 84,
  'compliment_note': 2009,
  'compliment_plain': 3552,
  'compliment_cool': 3304,
  'compliment_funny': 3304,
  'compliment_writer': 2636,
  'compliment_photos': 63,
  'friend_count': 5380,
  'influence_score': 96140205},
 {'user_id': 'Hi10sGSZNxQH3NLyWSZ1oA',
  'name': 'Fox',
  'review_count': 11112,
  'yelping_since': '2009-05-26 11:33:58',
  'useful': 145838,
  'funny': 135459,
  'cool': 139117,
  'elite': '2014,2015,2016,2017,2018',
  'fans': 2718,
  'average_stars': 3.8,
  'compliment_hot': 3763,
  'compliment_more': 402,
  'compliment_profile': 383,
  'compliment_cu

Unsurprisngly, the top 10 users by influence score are all well above the 99th percentile in fans, friends and review count.  Further, all of them have been elite users for at least five years, and they all have thousands of compliments.  Clearly these are all highly influential users in terms of giving as many opinions as possible to as many people as possible.  It would be interesting to compare this to a PageRank for friends.  