In [None]:
// Environment: 12 GB of memory and 4 CPU cores. The machine runs only two Docker containers: Cassandra and the IPython notebook.
// Question under investigation: is there a relationship between review count and fan count,
// i.e. does a user who has more reviews also have more fans?

In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.cassandra._
// Build (or reuse) a SparkSession configured from the existing SparkContext `sc`.
// NOTE(review): `sc` is not defined in this notebook; presumably it is injected by the kernel — confirm.
val sparkSession = {
  val contextConf = sc.getConf
  SparkSession.builder().config(contextConf).getOrCreate()
}
import sparkSession.implicits._

In [2]:
// Load the Cassandra table "user" from keyspace "yelp" and expose it to Spark SQL as the temp view "user".
val yelpUserTable = sparkSession.read.cassandraFormat(table = "user", keyspace = "yelp").load()
yelpUserTable.createOrReplaceTempView("user")

In [3]:
// Per-user totals of review_count and fans from the "user" view, largest review totals first.
val users_reviews_and_fans = {
  val reviewsAndFansSql = """
SELECT u.user_id, sum(u.review_count) as sum_reviews, sum(u.fans) as sum_fans
FROM user u
GROUP BY u.user_id
ORDER BY sum_reviews DESC
"""
  sparkSession.sql(reviewsAndFansSql)
}

In [4]:
// Preview the first 30 rows (highest review totals first) without truncating the cell contents.
users_reviews_and_fans.show(numRows = 30, truncate = false)

+----------------------+-----------+--------+
|user_id               |sum_reviews|sum_fans|
+----------------------+-----------+--------+
|hWDybu_KvYLSdEFzGrniTw|7125       |337     |
|CxDOIDnH8gp9KXzpBHJYXw|5596       |508     |
|Xj0O2l0bp633ebmG468aZw|4241       |165     |
|GwlrUwkULm4ZLN-Kum5nag|3639       |194     |
|U4INQZOPSUaj8hMjLlZ3KA|3632       |607     |
|CQUDh80m48xnzUkx-X5NAw|3568       |475     |
|cMEtAiW60I5wE_vLfTxoJQ|3480       |140     |
|bLbSNkLggFnqwNNzzq-Ijw|3463       |331     |
|Hi10sGSZNxQH3NLyWSZ1oA|3006       |574     |
|re3AdczLFP4D7xwI6DkBNA|2525       |126     |
|OapLWhQI4_Gm32-nYbHmUA|2299       |374     |
|PKEzKWv_FktMm2mGPjwd0Q|2049       |192     |
|62GNFh5FySkA3MbrQmnqvg|2012       |123     |
|Um5bfs5DH6eizgjH3xZsvg|1987       |95      |
|QJI9OSEn6ujRCtrX06vs1w|1983       |268     |
|DK57YibC5ShBmqQl97CKog|1741       |151     |
|dIIKEfOgo0KqUfGQvGikPg|1730       |722     |
|YRcaNlwQ6XXPFDXWtuMGdA|1729       |77      |
|j5ezF5mEGpnDwdTmAHci7Q|1677      

In [5]:
// From the sample above, the relationship does not look strong, but let's calculate the correlation coefficient to check.
import org.apache.spark.mllib.stat.Statistics

// The correlation function requires Double values, so inspect the column types before calculating it.
users_reviews_and_fans.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- sum_reviews: long (nullable = true)
 |-- sum_fans: long (nullable = true)



In [6]:
// Re-select the same three columns, casting both aggregated counts from long to double.
val df = {
  val castExpressions = Seq(
    "user_id",
    "cast(sum_reviews as double) as sum_reviews",
    "cast(sum_fans as double) as sum_fans"
  )
  users_reviews_and_fans.selectExpr(castExpressions: _*)
}

// Confirm that both numeric columns are now of type double.
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- sum_reviews: double (nullable = true)
 |-- sum_fans: double (nullable = true)



In [7]:
// Prepare the two input sequences for Statistics.corr.
//
// Statistics.corr(x, y, method) pairs the two RDDs element by element, so a user's
// review total must sit at the same position as that user's fan total. Building the
// RDDs from two independent `select(...).rdd` calls re-runs the sorted query twice and
// relies on both runs producing identical partitioning and tie ordering. Instead, both
// values are read from the same row into one cached RDD of (reviews, fans) pairs, and
// the two sequences are projected from it, so they are aligned by construction.
val reviewFanPairs = {
  // Row.getDouble throws on null, so drop rows where either total is missing;
  // such rows cannot contribute to a correlation anyway.
  val pairedColumns = df.select($"sum_reviews", $"sum_fans").na.drop()
  pairedColumns.rdd.map(row => (row.getDouble(0), row.getDouble(1))).cache()
}
val reviews = reviewFanPairs.map(_._1)
val fans = reviewFanPairs.map(_._2)

In [8]:
// Pearson correlation coefficient between per-user review totals and fan totals.
val p_correlation: Double = {
  val method = "pearson"
  Statistics.corr(reviews, fans, method)
}
print(p_correlation)

0.6616127674231399

In [9]:
// Pearson correlation is highly sensitive to outliers, and we can imagine many outliers in this data,
// so let's also compute the Spearman correlation.

val s_correlation: Double = {
  val method = "spearman"
  Statistics.corr(reviews, fans, method)
}
print(s_correlation)

0.783139779501409

In [None]:
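// The correlation looks quite high: users who write more reviews tend to have more fans.
// Note that this is only an association; it does not prove that writing more reviews will earn you more fans.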