In [1]:
import os

# spark_version
spark_version = 'spark-3.2.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:8 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [73.0 kB]
Get:9 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:10 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:13 http://ppa.launchpad.net/cran/

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudVineAnalysis").getOrCreate()

In [3]:
# Read in data from S3 Bucket
from pyspark import SparkFiles
url="https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
electronics_df = spark.read.csv(SparkFiles.get("amazon_reviews_us_Electronics_v1_00.tsv.gz"), sep="\t", header=True, inferSchema=True)

# Show DataFrame
electronics_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   41409413|R2MTG1GCZLR2DK|B00428R89M|     112201306|yoomall 5M Antenn...|     Electronics|          5|            0|          0|   N|                Y|          Five Stars|       As described.| 2015-08-31|
|         US|   49668221|R2HBOEM8LE9928|B000068O48|     734576678|Hosa GPM-103 3.5m...|     Electronics|          5|    

In [5]:
vine_df = electronics_df.select(["review_id", "star_rating", "helpful_votes", "total_votes", "vine"])
vine_df = vine_df.drop_duplicates(["review_id"])
vine_df = vine_df.dropna()
vine_df.show(5)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R10000WMGXS51T|          5|            1|          1|   N|
|R10003OLR2P5UE|          3|            0|          0|   N|
|R10005O193PJ6W|          3|            2|          2|   N|
|R10008LR7CU84N|          1|            2|          4|   N|
|R10009JN2UWOJC|          5|            0|          0|   N|
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



In [42]:
# Number of total reviews
total_reviews = vine_df.count()
total_reviews

3093861

## Paid Reviews (part of Vine program)

In [22]:
# Reviews were written as part of the Vine program (people received some sort of payment for writing the reviews)
paid_reviews = vine_df.filter(vine_df['vine'] == "Y")
paid_reviews.show(5)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R1009ULIBH7XJ7|          3|            0|          0|   Y|
|R1017GXIYICPPZ|          5|            3|          3|   Y|
|R1017MTXDRZ89M|          5|            3|          4|   Y|
|R102O6JM0AG213|          5|            7|          7|   Y|
|R1032U1ID9BC0K|          3|            0|          1|   Y|
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



In [23]:
# Summary Statistics 
paid_reviews.describe().show()

+-------+--------------+------------------+------------------+-----------------+-----+
|summary|     review_id|       star_rating|     helpful_votes|      total_votes| vine|
+-------+--------------+------------------+------------------+-----------------+-----+
|  count|         18512|             18512|             18512|            18512|18512|
|   mean|          null|  4.13504753673293| 5.824654278305964|7.212564822817632| null|
| stddev|          null|0.9629397942126239|37.449657439000525|40.05881133654962| null|
|    min|R1009ULIBH7XJ7|                 1|                 0|                0|    Y|
|    max| RZZQWDXNY2SQA|                 5|              2561|             2688|    Y|
+-------+--------------+------------------+------------------+-----------------+-----+



In [24]:
# Number of reviews as a part of Vine program 
paidrev_total = paid_reviews.count()
paidrev_total

18512

In [44]:
# Percentage of paid reviews
percent_paidrev = (paidrev_total / total_reviews) * 100
percent_paidrev

0.5983462088309721

In [25]:
# Get all 5-star reviews (part of Vine)
FiveStar_paidrev_total = paid_reviews.filter(paid_reviews["star_rating"] == 5).count()
FiveStar_paidrev_total

8044

In [26]:
# Percentage of 5-star paid reviews
FiveStar_percent = (FiveStar_paidrev_total / paidrev_total) * 100
FiveStar_percent

43.452895419187556

In [27]:
# Get all 1-star reviews (part of Vine)
OneStar_paidrev_total = paid_reviews.filter(paid_reviews["star_rating"] == 1).count()
OneStar_paidrev_total


342

In [28]:
# Percentage of 1-star paid reviews
OneStar_percent = (OneStar_paidrev_total / paidrev_total) * 100
OneStar_percent

1.8474503025064821

In [29]:
from pyspark.sql.functions import desc

avg_ratings_vine =paid_reviews.select(["star_rating", "helpful_votes"])\
  .groupby("helpful_votes").agg({"star_rating": "avg", "helpful_votes":"count"})\
  .orderBy(desc("count(helpful_votes)"))

avg_ratings_vine.show(truncate=False)

+-------------+------------------+--------------------+
|helpful_votes|avg(star_rating)  |count(helpful_votes)|
+-------------+------------------+--------------------+
|0            |4.142897130860741 |7145                |
|1            |4.138399597382989 |3974                |
|2            |4.127016129032258 |1984                |
|3            |4.135254988913526 |1353                |
|4            |4.158551810237204 |801                 |
|5            |4.09126213592233  |515                 |
|6            |4.144702842377261 |387                 |
|7            |4.207843137254902 |255                 |
|8            |4.0424528301886795|212                 |
|9            |4.034285714285715 |175                 |
|11           |4.08              |125                 |
|10           |4.040650406504065 |123                 |
|12           |4.177570093457944 |107                 |
|13           |4.032967032967033 |91                  |
|14           |4.141025641025641 |78            

In [20]:
# Number of Helpfuls Votes with at least 1 vote
heplful_votes_paid = paid_reviews.filter(paid_reviews['helpful_votes'] >= 1).count()
heplful_votes_paid

11367

In [30]:
# Percentage of Helpfuls Votes with at least 1 vote
heplful_percent_paidv = (heplful_votes_paid / paidrev_total) * 100
heplful_percent_paidv

61.403414001728606

## Unpaid Reviews (not part of Vine program)

In [15]:
# Reviews were writen not related to Vine Program (these people haven't received payment or free products for writing reviews)
unpaid_reviews = vine_df.filter(vine_df['vine'] == "N")
unpaid_reviews.show(5)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R10000WMGXS51T|          5|            1|          1|   N|
|R10003OLR2P5UE|          3|            0|          0|   N|
|R10005O193PJ6W|          3|            2|          2|   N|
|R10008LR7CU84N|          1|            2|          4|   N|
|R10009JN2UWOJC|          5|            0|          0|   N|
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



In [35]:
# Summary Statistics 
unpaid_reviews.describe().show()

+-------+--------------+-----------------+------------------+------------------+-------+
|summary|     review_id|      star_rating|     helpful_votes|       total_votes|   vine|
+-------+--------------+-----------------+------------------+------------------+-------+
|  count|       3075349|          3075349|           3075349|           3075349|3075349|
|   mean|          null|4.034907257680348|1.8358947228428384|2.3419426543133803|   null|
| stddev|          null|1.389579414886884|21.191344994625712| 22.33640449765777|   null|
|    min|R10000WMGXS51T|                1|                 0|                 0|      N|
|    max| RZZZVZAZKMT9J|                5|             12786|             12944|      N|
+-------+--------------+-----------------+------------------+------------------+-------+



In [36]:
# Number of reviews which aren't part of Vine program
unpaidrev_total = unpaid_reviews.count()
unpaidrev_total

3075349

In [45]:
# Percentage of paid reviews
percent_unpaidrev = (unpaidrev_total / total_reviews) * 100
percent_unpaidrev

99.40165379116902

In [37]:
# Get all 5-star reviews (not part of Vine)
FiveStar_unpaidrev_total = unpaid_reviews.filter(unpaid_reviews["star_rating"] == 5).count()
FiveStar_unpaidrev_total

1773112

In [38]:
# Percentage of 5-star unpaid reviews
FiveStar_nonvine_percent = (FiveStar_unpaidrev_total / unpaidrev_total) * 100
FiveStar_nonvine_percent

57.65563518156801

In [39]:
# Get all 1-star reviews (part of Vine)
OneStar_unpaidrev_total = unpaid_reviews.filter(unpaid_reviews["star_rating"] == 1).count()
OneStar_unpaidrev_total


357777

In [40]:
# Percentage of 1-star paid reviews
OneStar_nonvine_percent = (OneStar_unpaidrev_total / unpaidrev_total) * 100
OneStar_nonvine_percent

11.633704012130005

In [34]:
avg_ratings_nonvine = unpaid_reviews.select(["star_rating", "helpful_votes"])\
  .groupby("helpful_votes").agg({"star_rating": "avg", "helpful_votes":"count"})\
  .orderBy(desc("avg(star_votes)"))

avg_ratings_nonvine.show(truncate=False)

AnalysisException: ignored

In [21]:
# Number of Helpfuls Votes with at least 1 vote
heplful_votes_unpaid = unpaid_reviews.filter(unpaid_reviews['helpful_votes'] >= 1).count()
heplful_votes_unpaid 

991372

In [33]:
# Percentage of Helpfuls Votes with at least 1 vote
heplful_percent_unpaidv = (heplful_votes_unpaid / unpaidrev_total) * 100
heplful_percent_unpaidv

32.2360811732262