In [None]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.6-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

#from google.cloud import storage
#!pip install gcsfs
import pandas as pd

In [None]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the "CloudETL" session with the PostgreSQL JDBC jar
# on the driver classpath.
spark = (
    SparkSession.builder
    .appName("CloudETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [None]:
# Read the ZIP/county/FIPS CSV from Google Cloud Storage
from pyspark import SparkFiles
url ="https://storage.googleapis.com/team5k/zip_codes/ZIP-COUNTY-FIPS_2017-06.csv"

# addFile downloads the file; SparkFiles.get resolves its local path by name
spark.sparkContext.addFile(url)
local_csv = SparkFiles.get("ZIP-COUNTY-FIPS_2017-06.csv")

# First row is the header; column types are inferred from the data
zip_df = spark.read.options(header=True, inferSchema=True).csv(local_csv)

# Preview the first rows
zip_df.show()

+-----+--------------+-----+----------+-------+
|  ZIP|    COUNTYNAME|STATE|STCOUNTYFP|CLASSFP|
+-----+--------------+-----+----------+-------+
|36003|Autauga County|   AL|      1001|     H1|
|36006|Autauga County|   AL|      1001|     H1|
|36067|Autauga County|   AL|      1001|     H1|
|36066|Autauga County|   AL|      1001|     H1|
|36703|Autauga County|   AL|      1001|     H1|
|36701|Autauga County|   AL|      1001|     H1|
|36091|Autauga County|   AL|      1001|     H1|
|36051|Autauga County|   AL|      1001|     H1|
|36068|Autauga County|   AL|      1001|     H1|
|36008|Autauga County|   AL|      1001|     H1|
|36022|Autauga County|   AL|      1001|     H1|
|36749|Autauga County|   AL|      1001|     H1|
|36758|Autauga County|   AL|      1001|     H1|
|36550|Baldwin County|   AL|      1003|     H1|
|36551|Baldwin County|   AL|      1003|     H1|
|36527|Baldwin County|   AL|      1003|     H1|
|36577|Baldwin County|   AL|      1003|     H1|
|36559|Baldwin County|   AL|      1003| 

In [None]:
# Inspect the column types of the loaded DataFrame.
# NOTE(review): the output shows ZIP and STCOUNTYFP as 'int'; integer columns
# cannot keep leading zeros — confirm whether these codes should be strings.
zip_df.dtypes

[('ZIP', 'int'),
 ('COUNTYNAME', 'string'),
 ('STATE', 'string'),
 ('STCOUNTYFP', 'int'),
 ('CLASSFP', 'string')]

In [None]:
#from pyspark.sql.types import StringType
#zip_df = zip_df.withColumn("zip",zip_df["zip"].cast(StringType()))
#zip_df = zip_df.withColumn("stcountyfp",zip_df["stcountyfp"].cast(StringType()))

In [None]:
# Count the number of ZIP rows per state.
# Aggregate directly from zip_df instead of overwriting it with a STATE-only
# projection, so the full frame stays available when cells are re-run.
zip_count = zip_df.groupby("STATE").count()
zip_count.show()

+-----+-----+
|STATE|count|
+-----+-----+
|   AZ|  529|
|   SC|  738|
|   LA|  874|
|   MN| 1427|
|   NJ|  806|
|   DC|  219|
|   OR|  562|
|   VA| 1627|
|   RI|   95|
|   KY| 1418|
|   WY|  215|
|   NH|  312|
|   MI| 1569|
|   NV|  252|
|   WI| 1297|
|   ID|  394|
|   CA| 2632|
|   CT|  440|
|   NE|  970|
|   MT|  508|
+-----+-----+
only showing top 20 rows



In [None]:
#standardize column names
#zip_df=zip_df.withColumnRenamed('ZIP', "zip").withColumnRenamed("COUNTYNAME", "county").withColumnRenamed("STATE", "state").withColumnRenamed("STCOUNTYFP" ,"stcountyfp").withColumnRenamed("CLASSFP" ,"classfp")

In [None]:
# Configure JDBC settings for the PostgreSQL target database
import os
from getpass import getpass

mode = "append"
jdbc_url="jdbc:postgresql://34.67.52.115/team5k"

# Never store the password in the notebook: read it from the
# POSTGRES_PASSWORD environment variable, or prompt for it if unset.
db_password = os.environ.get("POSTGRES_PASSWORD") or getpass("PostgreSQL password: ")

config = {"user":"postgres",
          "password": db_password,
          "driver":"org.postgresql.Driver"}

In [None]:
# Write the per-state counts to the "postal_code" table over JDBC,
# using the save mode and connection properties configured earlier.
state_count_writer = zip_count.write
state_count_writer.jdbc(
    url=jdbc_url,
    table="postal_code",
    mode=mode,
    properties=config,
)