In [1]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.6-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

#from google.cloud import storage
#!pip install gcsfs
import pandas as pd

In [2]:
# Download the PostgreSQL JDBC driver jar into the working directory; the
# SparkSession cell puts /content/postgresql-42.2.9.jar on the driver classpath.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2020-07-24 20:48:36--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2020-07-24 20:48:36 (4.80 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the "CloudETL" session with the downloaded PostgreSQL JDBC
# jar on the driver classpath.
spark = (
    SparkSession.builder
    .appName("CloudETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [4]:
# Load the 2020 county health rankings CSV from the public GCS bucket
from pyspark import SparkFiles

url = "https://storage.googleapis.com/team5k/health/us-county-health-rankings-2020.csv"
spark.sparkContext.addFile(url)

# addFile stages the download; SparkFiles.get resolves its local path
health_csv_path = SparkFiles.get("us-county-health-rankings-2020.csv")
health_df = spark.read.csv(health_csv_path, header=True, inferSchema=True)

# Preview the first rows
health_df.show()

+----+-------+--------+----------+---------------------------------+----------------+-----------------+--------+--------------+-------------------------------+--------------------------------+---------------+--------------------------------+---------------------------------+---------------+--------------------------------+---------------------------------+------------------+-----------------------------------+------------------------------------+---------------+--------------------------------+---------------------------------+---------------------------+------------------+-------------------+----------+-------------------------------------------+------------------+-------------------+----------+-----------------------------------------+------------------+-------------------+----------+----------+-----------------------+------------------+-------------------+----------+----------------+---------------------------------+----------------------------------+-----------------+--------------

In [5]:
# List (column name, Spark type) pairs for the health DataFrame, which was
# read with inferSchema=True.
health_df.dtypes

[('fips', 'int'),
 ('state', 'string'),
 ('county', 'string'),
 ('num_deaths', 'int'),
 ('years_of_potential_life_lost_rate', 'double'),
 ('95percent_ci_low', 'double'),
 ('95percent_ci_high', 'double'),
 ('quartile', 'int'),
 ('ypll_rate_aian', 'double'),
 ('ypll_rate_aian_95percent_ci_low', 'double'),
 ('ypll_rate_aian_95percent_ci_high', 'double'),
 ('ypll_rate_asian', 'double'),
 ('ypll_rate_asian_95percent_ci_low', 'double'),
 ('ypll_rate_asian_95percent_ci_high', 'double'),
 ('ypll_rate_black', 'double'),
 ('ypll_rate_black_95percent_ci_low', 'double'),
 ('ypll_rate_black_95percent_ci_high', 'double'),
 ('ypll_rate_hispanic', 'double'),
 ('ypll_rate_hispanic_95percent_ci_low', 'double'),
 ('ypll_rate_hispanic_95percent_ci_high', 'double'),
 ('ypll_rate_white', 'double'),
 ('ypll_rate_white_95percent_ci_low', 'double'),
 ('ypll_rate_white_95percent_ci_high', 'double'),
 ('percent_fair_or_poor_health', 'double'),
 ('95percent_ci_low_2', 'double'),
 ('95percent_ci_high_2', 'double')

In [6]:
# Configure settings for RDS
mode = "append"
jdbc_url="jdbc:postgresql://34.67.52.115/team5k"
config = {"user":"postgres", 
          "password": "team5kteam5k", 
          "driver":"org.postgresql.Driver"}

In [7]:
# Persist the health rankings to the health_metrics table over JDBC
health_df.write.jdbc(
    url=jdbc_url,
    table="health_metrics",
    mode=mode,
    properties=config,
)

In [8]:
# Load the ZIP -> county FIPS lookup CSV from the public GCS bucket
from pyspark import SparkFiles

url = "https://storage.googleapis.com/team5k/zip_codes/ZIP-COUNTY-FIPS_2017-06.csv"
spark.sparkContext.addFile(url)

# NOTE(review): inferSchema=True may type ZIP / STCOUNTYFP as integers, which
# would drop leading zeros - confirm with zip_df.dtypes.
zip_csv_path = SparkFiles.get("ZIP-COUNTY-FIPS_2017-06.csv")
zip_df = spark.read.csv(zip_csv_path, header=True, inferSchema=True)

# Preview the first rows
zip_df.show()

+-----+--------------+-----+----------+-------+
|  ZIP|    COUNTYNAME|STATE|STCOUNTYFP|CLASSFP|
+-----+--------------+-----+----------+-------+
|36003|Autauga County|   AL|      1001|     H1|
|36006|Autauga County|   AL|      1001|     H1|
|36067|Autauga County|   AL|      1001|     H1|
|36066|Autauga County|   AL|      1001|     H1|
|36703|Autauga County|   AL|      1001|     H1|
|36701|Autauga County|   AL|      1001|     H1|
|36091|Autauga County|   AL|      1001|     H1|
|36051|Autauga County|   AL|      1001|     H1|
|36068|Autauga County|   AL|      1001|     H1|
|36008|Autauga County|   AL|      1001|     H1|
|36022|Autauga County|   AL|      1001|     H1|
|36749|Autauga County|   AL|      1001|     H1|
|36758|Autauga County|   AL|      1001|     H1|
|36550|Baldwin County|   AL|      1003|     H1|
|36551|Baldwin County|   AL|      1003|     H1|
|36527|Baldwin County|   AL|      1003|     H1|
|36577|Baldwin County|   AL|      1003|     H1|
|36559|Baldwin County|   AL|      1003| 

In [10]:
# Standardize the ZIP lookup's column names to lowercase
zip_column_renames = [
    ("ZIP", "zip"),
    ("COUNTYNAME", "county"),
    ("STATE", "state"),
    ("STCOUNTYFP", "stcountyfp"),
    ("CLASSFP", "classfp"),
]
for old_name, new_name in zip_column_renames:
    zip_df = zip_df.withColumnRenamed(old_name, new_name)

In [11]:
# Persist the ZIP/county lookup to the postal_codes table over JDBC.
# This cell previously wrote health_df, so postal_codes received a copy of the
# health metrics and zip_df was never stored.
# NOTE: mode is "append", so if an earlier run already loaded health data into
# postal_codes, that table has to be cleaned up before re-running this cell.
zip_df.write.jdbc(url=jdbc_url, table="postal_codes", mode=mode, properties=config)

In [13]:
# Load the individual campaign contributions CSV from the public GCS bucket
from pyspark import SparkFiles

url = "https://storage.googleapis.com/team5k/donations/individual_contributions.csv"
spark.sparkContext.addFile(url)

# addFile stages the download; SparkFiles.get resolves its local path
contributions_csv_path = SparkFiles.get("individual_contributions.csv")
df = spark.read.csv(contributions_csv_path, header=True, inferSchema=True)

# Preview the first rows
df.show()

+---+-----+-------------------+--------------+-------------------+------------+--------------------+-------+---------+-------------------+------+------+-------------+-----+-----+----------+----+------------+---------+------+------------------------------+-----------+--------------------+--------------------+------+
| id|cycle|       fec_trans_id|contributor_id|   contributor_name|recipient_id|            org_name|ult_org|real_code|               date|amount|street|         city|state|  zip|recip_code|type|committee_id| other_id|gender|old_format_employer_occupation|  microfilm|          occupation|            employer|source|
+---+-----+-------------------+--------------+-------------------+------------+--------------------+-------+---------+-------------------+------+------+-------------+-----+-----+----------+----+------------+---------+------+------------------------------+-----------+--------------------+--------------------+------+
|  1| 2010|1010420120009248286|  k00019100751|   

In [14]:
# Persist the contributions data to the individual_Contributions table over JDBC
df.write.jdbc(
    url=jdbc_url,
    table="individual_Contributions",
    mode=mode,
    properties=config,
)