In [None]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.6-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

# Start Spark session
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *

conf = SparkConf().set("spark.files.overwrite", "true").set('spark.driver.extraClassPath', '/content/postgresql-42.2.9.jar')
spark = SparkSession.builder.appName("ETL_Committee").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar",conf=conf).getOrCreate()

--2020-08-04 13:55:37--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2020-08-04 13:55:37 (4.32 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [None]:
#Create Function

def etl_load(table_name,  gcs_bucket, file_name, year):
  """Download one zipped committee file and append its rows to a Postgres table.

  Steps: download ``gcs_bucket + file_name`` into the working directory,
  extract the ``cm.txt`` member, rename it to ``year + file_name``, register
  it with Spark, read it as a pipe-delimited, header-less file with an
  all-string schema, show a preview, and write the rows over JDBC.

  Args:
    table_name: Target table passed to ``DataFrameWriter.jdbc``.
    gcs_bucket: URL prefix; concatenated directly with ``file_name``, so it
      must already end with the path separator.
    file_name: Name of the zip archive, also used as the local download name.
    year: Label used in progress messages and as the prefix of the renamed
      data file. Converted with ``str()`` before use.

  Notes:
    * Rows are written with mode "append", so calling this twice for the same
      file inserts the rows twice.
    * Relies on the notebook-level ``spark`` session and on ``StructType``,
      ``StructField`` and ``StringType`` being in scope.
    * The JDBC URL, user and password can be overridden with the environment
      variables ETL_JDBC_URL, ETL_DB_USER and ETL_DB_PASSWORD.
  """
  import os
  import shutil
  import urllib.request
  import zipfile
  from pyspark import SparkFiles

  year = str(year)

  #Define Schema (every column is a nullable string)
  column_names = [
    "CMTE_ID", "CMTE_NM", "TRES_NM", "CMTE_ST1", "CMTE_ST2",
    "CMTE_CITY", "CMTE_ST", "CMTE_ZIP", "CMTE_DSGN", "CMTE_TP",
    "CMTE_PTY_AFFILIATION", "CMTE_FILING_FREQ", "ORG_TP",
    "CONNECTED_ORG_NM", "CAND_ID"]
  schema = StructType(
    [StructField(name, StringType(), True) for name in column_names])

  #Download the archive, always overwriting any previous copy. A plain
  #`wget` would save a re-download as "<file_name>.1" and the stale archive
  #would then be unzipped again.
  url = gcs_bucket + file_name
  print(url)
  with urllib.request.urlopen(url) as response, open(file_name, "wb") as target:
    shutil.copyfileobj(response, target)

  #Unzip the data member and give it a name that is unique per year
  with zipfile.ZipFile(file_name) as archive:
    archive.extract("cm.txt")
  new_file_name = year + file_name
  os.replace("cm.txt", new_file_name)

  spark.sparkContext.addFile(new_file_name)
  # The explicit schema makes inferSchema unnecessary, so it is not requested.
  df = spark.read.csv(SparkFiles.get(new_file_name), sep="|", header=False, schema=schema)

  #Preview the first rows (no filtering is applied)
  df.show()

  mode = "append"
  # NOTE(review): the fallback values keep existing callers working, but a
  # password committed to source should be rotated and supplied only through
  # the environment.
  jdbc_url = os.environ.get("ETL_JDBC_URL", "jdbc:postgresql://34.67.52.115/team5k")
  postgres_config = {"user": os.environ.get("ETL_DB_USER", "postgres"),
            "password": os.environ.get("ETL_DB_PASSWORD", "team5kteam5k"),
            "driver":"org.postgresql.Driver"}

  print("Starting " +year)
  # Write file
  df.write.jdbc(url=jdbc_url, table=table_name, mode=mode, properties=postgres_config)

  print(year + " Complete")

In [None]:
#Run ETL Function
# One load per two-year election cycle, 2000 through 2016. The archive name
# carries the two-digit cycle year (cm00.zip ... cm16.zip).
for cycle in range(2000, 2017, 2):
  etl_load('committees', 'https://storage.googleapis.com/team5k/donations/', 'cm%02d.zip' % (cycle % 100), str(cycle))
print("All Years Complete")

https://storage.googleapis.com/team5k/donations/cm00.zip
--2020-08-03 14:05:53--  https://storage.googleapis.com/team5k/donations/cm00.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.216.128, 172.217.204.128, 172.217.203.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.216.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 448764 (438K) [application/x-zip-compressed]
Saving to: ‘cm00.zip’


2020-08-03 14:05:53 (98.5 MB/s) - ‘cm00.zip’ saved [448764/448764]

Archive:  cm00.zip
  inflating: cm.txt                  
+---------+--------------------+--------------------+--------------------+-------------+-------------+-------+--------+---------+-------+--------------------+----------------+------+--------------------+---------+
|  CMTE_ID|             CMTE_NM|             TRES_NM|            CMTE_ST1|     CMTE_ST2|    CMTE_CITY|CMTE_ST|CMTE_ZIP|CMTE_DSGN|CMTE_TP|CMTE_PTY_AFFILIATION|CMTE_FILING_FREQ|ORG_TP|   

In [None]:

# Load the two most recent cycles (cm18.zip and cm20.zip).
for cycle in (2018, 2020):
  etl_load('committees', 'https://storage.googleapis.com/team5k/donations/', 'cm%02d.zip' % (cycle % 100), str(cycle))

https://storage.googleapis.com/team5k/donations/cm18.zip
--2020-08-04 13:55:59--  https://storage.googleapis.com/team5k/donations/cm18.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.217.128, 64.233.170.128, 108.177.11.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.217.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 809512 (791K) [application/x-zip-compressed]
Saving to: ‘cm18.zip’


2020-08-04 13:55:59 (106 MB/s) - ‘cm18.zip’ saved [809512/809512]

Archive:  cm18.zip
  inflating: cm.txt                  
+---------+--------------------+--------------------+--------------------+---------------+-------------+-------+---------+---------+-------+--------------------+----------------+------+--------------------+---------+
|  CMTE_ID|             CMTE_NM|             TRES_NM|            CMTE_ST1|       CMTE_ST2|    CMTE_CITY|CMTE_ST| CMTE_ZIP|CMTE_DSGN|CMTE_TP|CMTE_PTY_AFFILIATION|CMTE_FILING_FREQ|ORG_TP|