In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()
import pyspark

In [2]:
!pip install -q findspark
!pip install pyspark
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2020-03-15 19:38:10--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar.2’


2020-03-15 19:38:11 (4.79 MB/s) - ‘postgresql-42.2.9.jar.2’ saved [914037/914037]



In [0]:
from pyspark import SparkFiles
from pyspark.sql import SparkSession

# Build (or reuse) the session. The PostgreSQL JDBC jar is placed on the driver
# classpath so that org.postgresql.Driver can be loaded for the JDBC writes.
spark = (
    SparkSession.builder
    .appName("ChallengeExample")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [0]:
# Location of the gzipped, tab-separated Home Improvement review file.
bucketUrl = (
    "https://s3.amazonaws.com/amazon-reviews-pds/tsv/"
    "amazon_reviews_us_Home_Improvement_v1_00.tsv.gz"
)

In [0]:
# Register the remote file with the SparkContext; it is later resolved by file
# name through SparkFiles.get() when the DataFrame is read.
spark.sparkContext.addFile(bucketUrl)

In [0]:
# Read the downloaded file into a DataFrame: tab-separated values, first row
# used as the header, column types inferred from the data.
sparkDF = spark.read.csv(
    SparkFiles.get("amazon_reviews_us_Home_Improvement_v1_00.tsv.gz"),
    sep="\t",
    header=True,
    inferSchema=True,
)

In [7]:
# Preview the raw load (show() prints the first 20 rows, truncating long cells).
sparkDF.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   48881148|R215C9BDXTDQOW|B00FR4YQYK|     381800308|SadoTech Model C ...|Home Improvement|          4|            0|          0|   N|                Y|          Four Stars|        good product|2015-08-31 00:00:00|
|         US|   47882936|R1DTPUV1J57YHA|B00439MYYE|     921341748|iSpring T32M 3.2 ...|H

In [8]:
# Row count of the raw load; baseline for the cleaning steps that follow.
sparkDF.count()

2634781

In [9]:
# De-duplicate on all columns (keeps one copy of each fully identical row),
# then report how many rows remain.
removeDupliDf = sparkDF.dropDuplicates()
removeDupliDf.count()

2634781

In [10]:
# Drop rows with missing values (duplicates were already removed in the
# previous step), then preview the cleaned DataFrame.
dropna_df =  removeDupliDf.dropna()
dropna_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   11701583|R100FAA6WE6G1N|B0018P1RTE|     570747356|Hardware House Be...|Home Improvement|          5|            0|          0|   N|                Y|           Beautiful|I purchased 2 of ...|2014-04-23 00:00:00|
|         US|   48172200|R100H3FSUUYEHI|B000XQ02PO|     749047588|Porcelain Medium ...|H

In [11]:
# Per-review key columns (review, customer, product, product parent, date),
# with repeated rows removed.
review_id_table = (
    dropna_df
    .select("review_id", "customer_id", "product_id", "product_parent", "review_date")
    .distinct()
)
review_id_table.show()

+--------------+-----------+----------+--------------+-------------------+
|     review_id|customer_id|product_id|product_parent|        review_date|
+--------------+-----------+----------+--------------+-------------------+
|R10EMQ669R1OW7|   31988888|B00JMXK0ZG|     369223727|2015-01-30 00:00:00|
|R10L5G41J5P1L9|   44128995|B002BA5VVA|     890071563|2010-04-30 00:00:00|
|R14LXTZ8EVXBNC|   14732814|B00L595FA0|     356117024|2014-12-28 00:00:00|
|R1AK12POOU7J7P|   14202343|B007M48PH2|     932684268|2013-10-28 00:00:00|
|R1BC3SGL2S4L1R|    4370855|B008DI5XRQ|     538890034|2014-06-06 00:00:00|
|R1EJKIPHJCUIAQ|   10655988|B00BJAD4CQ|     339677559|2015-07-14 00:00:00|
|R1G1IUSIB5R1QN|   13219423|B002P4O6QI|     337284137|2011-06-25 00:00:00|
|R1GORSQRAS5RUF|   26720035|B00IJHIF30|     359773806|2014-09-01 00:00:00|
|R1GSSOUTKG7AG6|   32777471|B00N9WCXWO|     188913262|2015-03-18 00:00:00|
|R1GYPKRXGPGVH3|   12474469|B007RKVT4C|      84351799|2013-02-24 00:00:00|
|R1H7GFKJN6JN2N|   371604

In [12]:
# Distinct (product_id, product_title) pairs.
# NOTE(review): uniqueness is on the pair, not on product_id alone - confirm
# this matches the key of the target `products` table.
products_df = (
    dropna_df
    .select("product_id", "product_title")
    .distinct()
)
products_df.show()

+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|B0006VVN1I|Culligan Certifie...|
|B00011KLH0|Frost King G9 Nai...|
|B003BIGD8Q|GE WB44X200 Bake ...|
|B00D82R8WU|Moen G2245622 220...|
|B0019K914Q|Rust-Oleum 69384 ...|
|B000GKZ2RU|RoomMates WT1068S...|
|B005WQIDHY|Woods 50007 Indoo...|
|B00IEDHJFE|LG LT120F Replace...|
|B00SM7MCO0|Kanthal A1 28 Gau...|
|B00CJ5EO2E|Gorilla Super Glu...|
|B001AHASM2|Unger Industrial ...|
|B002YD7UB8|Pfister Langston ...|
|B009VA1AAY|LEDJUMP SNOWFALL ...|
|B004T3KG4Y|Baldwin Prestige ...|
|B001AZYTP6|Speakman S-2252-P...|
|B00B7DI4N4|Vktech® 10Pcs 30M...|
|B0007XJRP4|Ultra Hardware 83...|
|B00CXAEQKK|Home Décor Iron K...|
|B001DECMSA|EZ Moves II Furni...|
|B002MVY9PW|Tie Down Engineer...|
+----------+--------------------+
only showing top 20 rows



In [13]:
# Number of rows per customer. The aggregate column comes back named
# "count(customer_id)" and is renamed to "customer_count".
customers_df = (
    dropna_df
    .groupBy("customer_id")
    .agg({"customer_id": "count"})
    .withColumnRenamed("count(customer_id)", "customer_count")
)
customers_df.show()

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|   27367718|             3|
|   23782815|             1|
|   11555870|             2|
|   41936145|             2|
|   23541828|             2|
|   53089613|             1|
|   15032511|             1|
|   10354035|             4|
|   16909581|             1|
|   27040634|             4|
|   23732246|             4|
|   35931570|             1|
|   42538375|             1|
|   34886680|             1|
|   17900576|             1|
|   10200670|             1|
|   46588922|             2|
|   40366377|             1|
|   20096732|             5|
|   17416090|             1|
+-----------+--------------+
only showing top 20 rows



In [11]:
# Rating and vote columns per review, together with the vine flag,
# with repeated rows removed.
vine_table_df = (
    dropna_df
    .select("review_id", "star_rating", "helpful_votes", "total_votes", "vine")
    .distinct()
)
vine_table_df.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R101OHILN612XM|          5|            0|          0|   N|
|R173K9BNHWJV15|          4|            0|          1|   N|
|R17SCQ76TEJEVB|          4|            0|          0|   N|
|R18VL0S120JE8B|          5|            0|          1|   N|
|R1AR2F1YD6IH16|          5|            0|          0|   N|
|R1BDE0RDHUR9TM|          5|            0|          0|   N|
|R1BIE6H29JOXZL|          1|            0|          0|   N|
|R1BMQELNOB6ONN|          5|            0|          0|   N|
|R1CGJDMTON7NZ8|          5|            0|          0|   N|
|R1CXXT29O4MUW6|          5|            0|          0|   N|
|R1EMLRDGRRUELB|          5|            0|          0|   N|
|R1LDUQAWH79GGO|          5|            0|          0|   N|
|R1QTMMPXP2FMCY|          1|           12|         12|   N|
|R1R5OTCZXP0RGW|          5|            

In [0]:
# Configure settings for RDS
import os
from getpass import getpass
import pyspark

# Write mode used by the JDBC writes: rows are appended to the target tables.
mode = "append"
jdbc_url="jdbc:postgresql://dataviz1.cjfblavlxb2k.us-east-2.rds.amazonaws.com:5432/postgres"

# The database password is deliberately not stored in the notebook. It is read
# from the RDS_PASSWORD environment variable; when that is unset or empty the
# user is prompted for it (input is not echoed).
# NOTE(review): the password that was previously committed in this cell should
# be rotated, since it remains visible in the notebook's history.
config = {"user": "postgres1",
          "password": os.environ.get("RDS_PASSWORD") or getpass("RDS password: "),
          "driver": "org.postgresql.Driver"}

In [16]:
# Rows whose vine flag is 'Y'.
vine_df = vine_table_df.where(vine_table_df["vine"] == 'Y')
vine_df.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
| R84KTVCF5UW21|          4|            0|          0|   Y|
| R380OMSX9O0AF|          5|            0|          0|   Y|
|R2ZA6RL7HWQ0Z8|          5|            3|          3|   Y|
|R2QCMEEMVUMGNK|          3|            1|          1|   Y|
|R2TR8A897ZRDOH|          4|            0|          1|   Y|
|R183QLLWJVOG1Z|          5|            3|          3|   Y|
|R18TTQ2XEMSPTV|          5|            0|          0|   Y|
|R21GGVNKFKEPVW|          4|            0|          0|   Y|
|R21PX3JTSDVBHK|          5|            0|          0|   Y|
| RFJ8BSGG7APST|          5|            0|          0|   Y|
|R1NSI2ARQNG3OE|          5|            1|          2|   Y|
|R1NCVFQ82FG4OI|          2|            1|          1|   Y|
|R2VKJYVKUNAB0Y|          4|            1|          3|   Y|
| RIHDW8JO4AXEW|          5|            

In [17]:
# Rows whose vine flag is 'N'.
no_vine_df = vine_table_df.where(vine_table_df["vine"] == 'N')
no_vine_df.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R101OHILN612XM|          5|            0|          0|   N|
|R173K9BNHWJV15|          4|            0|          1|   N|
|R17SCQ76TEJEVB|          4|            0|          0|   N|
|R18VL0S120JE8B|          5|            0|          1|   N|
|R1AR2F1YD6IH16|          5|            0|          0|   N|
|R1BDE0RDHUR9TM|          5|            0|          0|   N|
|R1BIE6H29JOXZL|          1|            0|          0|   N|
|R1BMQELNOB6ONN|          5|            0|          0|   N|
|R1CGJDMTON7NZ8|          5|            0|          0|   N|
|R1CXXT29O4MUW6|          5|            0|          0|   N|
|R1EMLRDGRRUELB|          5|            0|          0|   N|
|R1LDUQAWH79GGO|          5|            0|          0|   N|
|R1QTMMPXP2FMCY|          1|           12|         12|   N|
|R1R5OTCZXP0RGW|          5|            

In [18]:
# Number of reviews in the vine subset (vine == 'Y')
vine_df.count()

10779

In [19]:
# Number of reviews in the non-vine subset (vine == 'N').
# The statement starts at column 0: a leading space here makes the cell an
# IndentationError when it is run as plain Python (e.g. exported to a script).
no_vine_df.count()

2623476

In [20]:
# Count of vine rows with a 5-star rating.
vine_df.where(vine_df["star_rating"] == '5').count()

5557

In [21]:
# Number of 5-star reviews for non-vine (this query runs on no_vine_df)
no_vine_df.filter(no_vine_df.star_rating == '5').count()

1650739

In [22]:
# Average star rating over the vine subset, printed as a one-row table
vine_df.agg({'star_rating':'avg'}).show()

+-----------------+
| avg(star_rating)|
+-----------------+
|4.305965302903794|
+-----------------+



In [23]:
# Average star rating over the non-vine subset, printed as a one-row table
no_vine_df.agg({'star_rating':'avg'}).show()

+-----------------+
| avg(star_rating)|
+-----------------+
|4.181878927041833|
+-----------------+



In [24]:
 # Total helpful votes summed over the vine subset
 vine_df.agg({'helpful_votes':'sum'}).show()

+------------------+
|sum(helpful_votes)|
+------------------+
|             28568|
+------------------+



In [25]:
 # Total helpful votes summed over the non-vine subset
 no_vine_df.agg({'helpful_votes':'sum'}).show()

+------------------+
|sum(helpful_votes)|
+------------------+
|           4402065|
+------------------+



In [0]:
# Write products_df to the `products` table in RDS over JDBC.
# NOTE(review): mode is "append", so re-running this cell presumably inserts
# the same rows again - verify before re-executing.
products_df.write.jdbc(url=jdbc_url, table='products', mode=mode, properties=config)

In [0]:
# Write review_id_table to the `review_id_table` table in RDS over JDBC.
# NOTE(review): mode is "append", so re-running this cell presumably inserts
# the same rows again - verify before re-executing.
review_id_table.write.jdbc(url=jdbc_url, table='review_id_table', mode=mode, properties=config)

In [0]:
# Write customers_df to the `customers` table in RDS over JDBC.
# NOTE(review): mode is "append", so re-running this cell presumably inserts
# the same rows again - verify before re-executing.
customers_df.write.jdbc(url=jdbc_url, table='customers', mode=mode, properties=config)

In [0]:
# Write vine_table_df to the `vine_table` table in RDS over JDBC.
# NOTE(review): mode is "append", so re-running this cell presumably inserts
# the same rows again - verify before re-executing.
vine_table_df.write.jdbc(url=jdbc_url, table='vine_table', mode=mode, properties=config)