In [1]:
import os
# Find the latest version of spark 3.2 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.0'
spark_version = 'spark-3.2.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Wait                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:5 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:8 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:10 htt

In [2]:
from pyspark.sql import SparkSession

# Build the session, or reuse one that already exists. The extra driver
# classpath entry points at a jar that is presumably the PostgreSQL JDBC
# driver — verify the file is actually present at that path.
spark = (
    SparkSession.builder
    .appName("M16-Challenge-Deliverable-2")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [3]:
# Read in the Review dataset as a DataFrame
from pyspark import SparkFiles

url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

# SparkFiles.get() expects the name of the file that was added. Passing ""
# resolves to the SparkFiles root directory itself, so the read would pick up
# every file ever added to this SparkContext rather than just this dataset.
reviews_file = url.rsplit("/", 1)[-1]
df = spark.read.option("encoding", "UTF-8").csv(
    SparkFiles.get(reviews_file), sep="\t", header=True, inferSchema=True
)
df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+-------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|   product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+-------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   45610553| RMDCHWD0Y5OZ9|B00HH62VB6|     618218723|AGPtek® 10 Isolat...|Musical Instruments|          3|            0|          1|   N|                N|         Three Stars|Works very good, ...| 2015-08-31|
|         US|   14640079| RZSL0BALIYUNU|B003LRN53I|     986692292|Sennheiser HD203 ...|Musical Instruments| 

In [4]:
# Build the vine_table DataFrame: the subset of review columns used by the
# analysis cells that follow.
vine_columns = [
    "review_id",
    "star_rating",
    "helpful_votes",
    "total_votes",
    "vine",
    "verified_purchase",
]
vine_df = df.select(vine_columns)
vine_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
| RMDCHWD0Y5OZ9|          3|            0|          1|   N|                N|
| RZSL0BALIYUNU|          5|            0|          0|   N|                Y|
| RIZR67JKUDBI0|          3|            0|          1|   N|                Y|
|R27HL570VNL85F|          5|            0|          0|   N|                Y|
|R34EBU9QDWJ1GD|          5|            0|          0|   N|                Y|
|R1WCUI4Z1SIQEO|          5|            0|          0|   N|                N|
| RL5LNO26GAVJ1|          2|            3|          4|   N|                Y|
|R3GYQ5W8JHP8SB|          5|            0|          0|   N|                Y|
|R30SHYQXGG5EYC|          5|            0|          0|   N|                Y|
|R14YLXA56NP51I|          5|            1|          1|   N|     

In [5]:
# Keep only the reviews that received at least 20 total votes.
# The condition references vine_df (the frame being filtered) rather than
# borrowing the column from its parent df.
votes_filtered_df = vine_df.filter(vine_df['total_votes'] >= 20)

# Preview sorted by total_votes, lowest first. The keyword orderBy understands
# is `ascending`; an `asc=` keyword is not recognised and is silently ignored.
votes_filtered_df.orderBy("total_votes", ascending=True).show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R1NEJI84WI7208|          1|           13|         20|   N|                Y|
|R2G5WF2T9E73KU|          5|           19|         20|   N|                Y|
| RWI2QZJMZYTVB|          2|            1|         20|   N|                Y|
|R37WGVXA94D62S|          2|            4|         20|   N|                Y|
|R360YLELRRDAAW|          1|           15|         20|   N|                Y|
|R2R6JPF9KOD2HJ|          5|           19|         20|   N|                Y|
| RO9DTZNGL1HL6|          5|           19|         20|   N|                Y|
|R2VF37RUEH137I|          5|           20|         20|   N|                N|
|R3P2ZNUZM4GJ2Y|          4|           19|         20|   N|                Y|
|R33DL69KQBPI0G|          1|           20|         20|   N|     

In [6]:
# Keep the rows of votes_filtered_df where helpful_votes/total_votes >= 0.50.
# The ratio is built from votes_filtered_df's own columns rather than from the
# parent df.
helpful_ratio = votes_filtered_df["helpful_votes"] / votes_filtered_df["total_votes"]
votes_divided_df = votes_filtered_df.filter(helpful_ratio >= 0.50)

# Preview sorted by helpful_votes, lowest first. The keyword orderBy understands
# is `ascending`; an `asc=` keyword is not recognised and is silently ignored.
votes_divided_df.orderBy("helpful_votes", ascending=True).show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
| RSCJ6Z1N54LLP|          1|           10|         20|   N|                Y|
|R1RHATKZEFRWQU|          4|           10|         20|   N|                N|
|R1NGLGZKBVL8HS|          1|           10|         20|   N|                Y|
|R32BKEJVRR3SJ9|          1|           10|         20|   N|                N|
|R1J3A9I51LN3B9|          2|           10|         20|   N|                N|
|R14COWZU3Y4G2W|          4|           10|         20|   N|                Y|
|R2IO7LFBOJGMAJ|          1|           10|         20|   N|                N|
|R2PEIHMPPOB1YJ|          1|           10|         20|   N|                N|
|R2NR2R7AHAHAUG|          3|           10|         20|   N|                Y|
|R1Y4IT49H24Z40|          1|           10|         20|   N|     

In [15]:
# Rows of votes_divided_df whose vine column equals 'Y'
vineuser_df = votes_divided_df.where(df.vine == 'Y')

print("VINE REVIEWS")
vineuser_df.show()

VINE REVIEWS
+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R1R9RU7JW0MFR2|          4|           20|         23|   Y|                N|
|R19EFYNN3W8Q07|          5|           26|         32|   Y|                N|
|R34DJ1R8AEU0SG|          5|           29|         35|   Y|                N|
|R25P5CXK5L9RHF|          5|          146|        161|   Y|                N|
|R2E9VZB3I4LSN5|          5|           55|         59|   Y|                N|
| RKYLHZL7EPELX|          4|           19|         25|   Y|                N|
|R1U13EKGQD3ZE6|          5|           22|         25|   Y|                N|
| RYW05F1MUEF01|          5|           87|        102|   Y|                N|
|R2SW4NXNO7HZJ5|          4|           28|         33|   Y|                N|
|R2016NFLSUR97Y|          2|           26|         

In [16]:
# Rows of votes_divided_df whose vine column equals 'N'
nonvineuser_df = votes_divided_df.where(df.vine == 'N')

print("Non-Vine Reviews")
nonvineuser_df.show()

Non-Vine Reviews
+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R2243Y3OD8U6KQ|          5|           47|         61|   N|                N|
|R2TGT0CDTCAAHW|          5|           21|         23|   N|                Y|
| RX4D22YSXEF4P|          1|           37|         38|   N|                Y|
|R3FL2NTLFUSPTQ|          5|           33|         37|   N|                N|
|R3QTP3YNZXAPPF|          3|           23|         24|   N|                Y|
|R36V6V42VN5AS5|          5|           34|         37|   N|                Y|
|R27LZWE27BJPOB|          5|           22|         23|   N|                N|
|R1P7GJ0IN2BRNH|          5|           37|         37|   N|                Y|
|R2R6JPF9KOD2HJ|          5|           19|         20|   N|                Y|
|R2J0ZZGFXKM8KR|          2|           21|     

In [9]:
# Total number of rows in vineuser_df (the vine == 'Y' subset).
# The bare name on the last line makes the notebook display the value.
vine_reviews_count = vineuser_df.count()
vine_reviews_count

60

In [10]:
# Number of rows in vineuser_df with a star_rating of exactly 5
vine_five_star_count = vineuser_df.where(df.star_rating == 5).count()
vine_five_star_count

34

In [19]:
# Proportion of all vine reviews that are 5-star reviews (a fraction in [0, 1]).
# Guarded so a dataset with no qualifying vine reviews yields 0.0 instead of
# raising ZeroDivisionError.
vine_percent = (
    vine_five_star_count / vine_reviews_count if vine_reviews_count else 0.0
)
vine_percent

0.5666666666666667

In [12]:
# Total number of rows in nonvineuser_df (the vine == 'N' subset).
# The bare name on the last line makes the notebook display the value.
nonvine_reviews_count = nonvineuser_df.count()
nonvine_reviews_count

14477

In [13]:
# Number of rows in nonvineuser_df with a star_rating of exactly 5
nonvine_five_star_count = nonvineuser_df.where(df.star_rating == 5).count()
nonvine_five_star_count

8212

In [18]:
# Proportion of all non-vine reviews that are 5-star reviews (a fraction in [0, 1]).
# Guarded so a dataset with no qualifying non-vine reviews yields 0.0 instead of
# raising ZeroDivisionError.
nonvine_percent = (
    nonvine_five_star_count / nonvine_reviews_count if nonvine_reviews_count else 0.0
)
nonvine_percent

0.5672445948746287

In [24]:
# Summary table with one row per reviewer type.
# NOTE(review): the "% 5-Star Reviews" column receives vine_percent and
# nonvine_percent as-is; the recorded output shows values such as 0.5667, i.e.
# fractions rather than percentages — confirm whether they should be scaled by 100.
summary_columns = ["Reviewer Type", "5-Star Reviews", "Total Reviews", "% 5-Star Reviews"]
summary_rows = [
    ("Vine Reviewer", vine_five_star_count, vine_reviews_count, vine_percent),
    ("non-Vine Reviewer", nonvine_five_star_count, nonvine_reviews_count, nonvine_percent),
]
results_df = spark.createDataFrame(summary_rows, summary_columns)
results_df.show()

+-----------------+--------------+-------------+------------------+
|    Reviewer Type|5-Star Reviews|Total Reviews|  % 5-Star Reviews|
+-----------------+--------------+-------------+------------------+
|    Vine Reviewer|            34|           60|0.5666666666666667|
|non-Vine Reviewer|          8212|        14477|0.5672445948746287|
+-----------------+--------------+-------------+------------------+

