In [1]:
import os
# The latest version of spark 3.2  from http://www.apache.org/dist/spark/ 
spark_version = 'spark-3.2.3'
# spark_version = 'spark-3.'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()


!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Waiting for headers] [Waiting for headers] [1 InRelease 3,626 B/3,626 B 1000% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Get:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
0% [1 InRelease gpgv 3,626 B] [3 InRelease 14.2 kB/88.7 kB 16%] [Waiting for he                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 http

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the "bigDataETL" session with the PostgreSQL JDBC jar
# added to the driver classpath.
spark = (
    SparkSession.builder
    .appName("bigDataETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [7]:
from pyspark import SparkFiles

# Register the remote gzipped TSV with the SparkContext, then read it from the
# local path that SparkFiles resolves for that file name.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Apps_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
reviews_path = SparkFiles.get("amazon_reviews_us_Mobile_Apps_v1_00.tsv.gz")

# Tab-separated file whose first line holds the column names.
mobile_df = spark.read.csv(reviews_path, header=True, sep="\t")
mobile_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|    1774101|R3PMQXEVYR4ZT1|B00DVKOYBM|     263046246|         8 Ball Pool|     Mobile_Apps|          3|            0|          0|   N|                Y|                 Fun|It is a fun game ...| 2015-08-31|
|         US|   13679234|R1I1K3EO98EMUV|B00R1IA4RS|     128868854|Christmas Cookie ...|     Mobile_Apps|          5|    

In [9]:
# Remove rows containing nulls and exact duplicate rows, then display how many
# rows remain.
mobile_df = mobile_df.dropna().dropDuplicates()
mobile_df.count()

5033238

In [10]:
# Print the inferred column types so they can be compared with the SQL table
# definitions.
mobile_df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



In [15]:
# Cast the string columns to the types used by the SQL tables.
from pyspark.sql.types import StructField, StringType, IntegerType, StructType, DateType

# (column name, target type) pairs, applied one after another in this order.
column_casts = [
    ("review_date", DateType()),
    ("customer_id", IntegerType()),
    ("product_parent", IntegerType()),
    ("star_rating", IntegerType()),
    ("helpful_votes", IntegerType()),
    ("total_votes", IntegerType()),
]
for column_name, target_type in column_casts:
    mobile_df = mobile_df.withColumn(column_name, mobile_df[column_name].cast(target_type))

# Display the schema after casting.
mobile_df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: date (nullable = true)



In [16]:
# Select the columns that make up the review_id_table schema.
review_id = mobile_df.select(
    "review_id",
    "customer_id",
    "product_id",
    "product_parent",
    "review_date",
)
review_id.show()

+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R1004E2Q9GNK0X|   42549410|B005XQLBU4|     464872742| 2015-04-04|
|R1007PYPFDDGVX|    6174184|B009KS4XRO|     490050721| 2014-03-05|
|R1007WCV4QRLNG|   11902108|B00ATDOVNK|     519315001| 2013-01-26|
|R1009WBZG5V78G|   29303557|B00FAI2IUM|     900991309| 2015-07-08|
|R100BR0ZNQVPJK|   51124520|B00NG6CIKA|     985585723| 2015-01-26|
|R100JN8KBE9F9C|   11416458|B00GRBESP4|     609209651| 2014-08-02|
|R100OA8RSE9MFC|   46416338|B00E8KLWB4|     488246640| 2015-07-19|
|R100QCVNR5J92R|   27033827|B0097HTPE4|     554042181| 2012-10-21|
|R100T4P8XWEGN2|   14091623|B00BIK524A|     454994353| 2014-05-08|
|R100UKJQ88UESQ|   42915717|B0052AZJV8|     260966219| 2015-08-16|
|R1012OJCOW2YGR|   31030939|B00CPZ7DBW|     788907793| 2014-08-21|
|R1016YYC2UMN3U|    1252887|B005ZFOOE8|     447864507| 2014-07

In [17]:
# Build products_df: the distinct (product_id, product_title) pairs.
# NOTE(review): duplicates are removed on the pair, not on product_id alone;
# if the target table keys on product_id, confirm each id has a single title.
products_df = mobile_df.select("product_id", "product_title").dropDuplicates()
products_df.show(5)

+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|B00NG6CIKA|Solitaire TriPeak...|
|B00ZDVV7Q2|    Frozen Free Fall|
|B00N0RK9JM|How Draw Adventur...|
|B00SYZDSXO|       Bee Brilliant|
|B00HAPRVWS|Sonic The Hedgehog 2|
+----------+--------------------+
only showing top 5 rows



In [25]:
# Build customers_df: one row per customer_id with the number of rows
# (reviews) that customer has in mobile_df.
# groupBy("customer_id") already produces exactly one row per key, so no
# dropDuplicates() pass is needed after the aggregation.
customers_df = (
    mobile_df
    .groupBy("customer_id")
    .count()
    .withColumnRenamed("count", "customer_count")
)

# Cast the aggregated count to an integer column.
customers_df = customers_df.withColumn(
    "customer_count", customers_df["customer_count"].cast(IntegerType())
)
customers_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_count: integer (nullable = false)



In [19]:
# Preview the first five rows of the customer counts.
customers_df.show(5)


+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|   39055014|             1|
|   19270404|            15|
|   46351928|             1|
|   19029691|            11|
|   48940424|            26|
+-----------+--------------+
only showing top 5 rows



In [29]:
# Build vine_df: the rating and vote columns per review, without duplicate rows.
vine_df = mobile_df.select(
    "review_id", "star_rating", "helpful_votes", "total_votes", "vine"
).dropDuplicates()
vine_df.show(5)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|  R100GFVSX5IV|          5|            0|          0|   N|
|R100GMH6FXXLZX|          5|            0|          1|   N|
|R100UN38Y41RB8|          5|            0|          0|   N|
|R101FZGWPA0TWP|          5|            1|          1|   N|
|R101HPZZ9P804D|          5|            0|          0|   N|
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



In [22]:
# Configure settings for RDS.
# Credentials are read from the RDS_USER / RDS_PASSWORD environment variables
# so that secrets never have to be typed into (and saved with) the notebook.
# When a variable is not set the value falls back to an empty string.
import os

# Save mode passed to every DataFrame write below.
mode = "append"
jdbc_url = "jdbc:postgresql://maelsse-db.ctgl6yfw7uje.us-east-2.rds.amazonaws.com:5432/bigdata"
config = {
    "user": os.environ.get("RDS_USER", ""),
    "password": os.environ.get("RDS_PASSWORD", ""),
    "driver": "org.postgresql.Driver",
}

In [23]:
# Send the review_id DataFrame to the review_id_table table in RDS, using the
# configured save mode and connection properties.
review_id.write.jdbc(
    url=jdbc_url,
    table='review_id_table',
    mode=mode,
    properties=config,
)

In [27]:
# Send customers_df to the customers_extra table in RDS, using the configured
# save mode and connection properties.
customers_df.write.jdbc(
    url=jdbc_url,
    table='customers_extra',
    mode=mode,
    properties=config,
)

In [30]:
# Send vine_df to the vine_table table in RDS, using the configured save mode
# and connection properties.
vine_df.write.jdbc(
    url=jdbc_url,
    table='vine_table',
    mode=mode,
    properties=config,
)

In [31]:
# Send products_df to the products table in RDS, using the configured save
# mode and connection properties.
products_df.write.jdbc(
    url=jdbc_url,
    table='products',
    mode=mode,
    properties=config,
)