# Spark Local Test

In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

import configparser
from pyspark.sql import SparkSession, Window
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import udf, col, monotonically_increasing_id, row_number
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import TimestampType, DateType
from pyspark.sql import functions as F

from datetime import datetime

import os

## Configure Connection

In [2]:
# sc.stop()  # Uncomment to stop a SparkContext left over from an earlier run before creating a new one below.

In [3]:
# Relative output directory; the (currently commented-out) airports parquet write joins onto it.
output_data = 'test_output/'

In [4]:
# Spark configuration for this notebook: application name 'udac_config', master 'local'.
configure = SparkConf().setAppName('udac_config').setMaster('local')

# Use getOrCreate() rather than the SparkContext constructor so that re-running this
# cell reuses the context that is already active instead of raising an error about
# multiple SparkContexts. When a context already exists it is returned as-is.
sc = SparkContext.getOrCreate(conf=configure)

In [5]:
# Obtain the SparkSession through builder.getOrCreate(). The configuration listing
# printed by the following cell still reports spark.app.name as 'udac_config', so the
# appName given here did not rename the SparkContext that was already running.
spark = (
    SparkSession.builder
    .appName('udac_cap')
    .config('config option', 'config value')
    .getOrCreate()
)

In [6]:
# Display every (key, value) pair in the configuration of the session's SparkContext.
context_conf = spark.sparkContext.getConf()
context_conf.getAll()

[('spark.master', 'local'),
 ('spark.driver.port', '61700'),
 ('spark.app.id', 'local-1616188128699'),
 ('spark.sql.warehouse.dir',
  'file:/Users/morgan/Documents/10_Udacity/data_eng_nano/usa-tourism-etl/spark-warehouse'),
 ('spark.app.startTime', '1616188127780'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '10.0.0.223'),
 ('spark.app.name', 'udac_config'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

### Airports

In [7]:
# Load the airport codes CSV, treating its first row as the column header.
airports = spark.read.csv("data/airport_codes.csv", header=True)

In [8]:
# Split the comma-separated `coordinates` string and the dash-separated `iso_region` string.
lat_long = F.split(F.col("coordinates"), ",")
region_split = F.split(F.col("iso_region"), "-")

# Element 0 of `coordinates` is stored as longitude and element 1 as latitude;
# element 1 of `iso_region` is stored as the state.
airports = (
    airports
    .withColumn('longitude', lat_long.getItem(0))
    .withColumn('latitude', lat_long.getItem(1))
    .withColumn('state', region_split.getItem(1))
)

In [9]:
# Columns kept for the airports table, in output order.
airport_columns = [
    'ident',
    'iata_code',
    'name',
    'type',
    'municipality',
    'state',
    'local_code',
    'latitude',
    'longitude',
    'elevation_ft',
]

# Keep only rows whose iso_country is "US", then project down to the columns above.
airports = airports.where(airports.iso_country == "US").select(airport_columns)

In [10]:
# Order rows by IATA code (sort() is ascending by default).
airports = airports.sort('iata_code')

In [11]:
# Discard rows that have no value in iata_code.
airports = airports.dropna(subset=['iata_code'])

In [12]:
# Convert the numeric columns from their string form to proper numeric types.
# The elevation cast previously assigned to a misspelt variable (`aiprots`) and a
# misspelt column (`elevation_fit`), so `airports.elevation_ft` was never converted;
# it now overwrites `elevation_ft` on `airports` like the two casts before it.
airports = airports.withColumn("latitude", airports.latitude.cast('float'))
airports = airports.withColumn("longitude", airports.longitude.cast('float'))
airports = airports.withColumn("elevation_ft", airports.elevation_ft.cast('integer'))

In [13]:
# Preview the airports table (show()'s defaults written out: 20 rows, long values truncated).
airports.show(n=20, truncate=True)

+-----+---------+--------------------+--------------+-------------+-----+----------+--------+---------+------------+
|ident|iata_code|                name|          type| municipality|state|local_code|latitude|longitude|elevation_ft|
+-----+---------+--------------------+--------------+-------------+-----+----------+--------+---------+------------+
| KAAF|      AAF|Apalachicola Regi...| small_airport| Apalachicola|   FL|       AAF| 29.7275| -85.0275|          20|
| KAAP|      AAP|      Andrau Airpark|        closed|      Houston|   TX|       AAP| 29.7225| -95.5883|          79|
| KABE|      ABE|Lehigh Valley Int...|medium_airport|    Allentown|   PA|       ABE| 40.6521| -75.4408|         393|
| KABI|      ABI|Abilene Regional ...|medium_airport|      Abilene|   TX|       ABI| 32.4113| -99.6819|        1791|
| PAFM|      ABL|      Ambler Airport|medium_airport|       Ambler|   AK|       AFM| 67.1063| -157.857|         334|
| KABQ|      ABQ|Albuquerque Inter...| large_airport|  Albuquerq

#### Successfully created parquets

In [54]:
#airports.write.mode('overwrite').parquet(os.path.join(output_data, "airports"))

---
## USA Cities Demographics

In [34]:
# Load the US cities demographics CSV: first row is the header, fields are separated by ";".
cities = spark.read.csv(
    "data/us_cities_demographics.csv",
    header=True,
    sep=";",
)

In [36]:
# Source column header -> snake_case column name.
column_renames = {
    "City": "city",
    "State": "state",
    "Median Age": "median_age",
    "Male Population": "male_pop",
    "Female Population": "female_pop",
    "Total Population": "total_pop",
    "Number of Veterans": "num_veterans",
    "Foreign-born": "num_foreigners",
    "Average Household Size": "avg_household_size",
    "State Code": "state_code",
    "Race": "race",
    "Count": "race_pop",
}

# Apply the renames one at a time, in the order listed above.
for source_name, target_name in column_renames.items():
    cities = cities.withColumnRenamed(source_name, target_name)

In [37]:
# Per-race population rows keyed by "<state_code>_<city>".
race_count = cities.select(
    F.concat_ws("_", "state_code", "city").alias("state_city"),
    "race",
    "race_pop",
)

In [38]:
# Preview race_count (show()'s defaults written out: 20 rows, long values truncated).
race_count.show(n=20, truncate=True)

+-------------------+--------------------+--------+
|         state_city|                race|race_pop|
+-------------------+--------------------+--------+
|   MD_Silver Spring|  Hispanic or Latino|   25924|
|          MA_Quincy|               White|   58723|
|          AL_Hoover|               Asian|    4759|
|CA_Rancho Cucamonga|Black or African-...|   24437|
|          NJ_Newark|               White|   76402|
|          IL_Peoria|American Indian a...|    1343|
|        AZ_Avondale|Black or African-...|   11592|
|     CA_West Covina|               Asian|   32716|
|        MO_O'Fallon|  Hispanic or Latino|    2583|
|      NC_High Point|               Asian|   11060|
|          CA_Folsom|  Hispanic or Latino|    5822|
|          CA_Folsom|American Indian a...|     998|
|    PA_Philadelphia|               Asian|  122721|
|         KS_Wichita|  Hispanic or Latino|   65162|
|         KS_Wichita|American Indian a...|    8791|
|      FL_Fort Myers|               White|   50169|
|      PA_Pi

In [26]:
# NOTE(review): this cell originally contained the unfinished expression
# `cities.select(F.)`, which is a SyntaxError and stops the notebook from running.
# The completion below is a best guess — confirm the intended columns.
# It prepends a "<state_code>_<city>" key (the same key built for race_count)
# to all existing columns of `cities`.
cities2 = cities.select(
    F.concat_ws("_", cities.state_code, cities.city).alias("state_city"),
    "*",
)

In [29]:
# Preview the cities table (show()'s defaults written out: 20 rows, long values truncated).
cities.show(n=20, truncate=True)

+----------+-------+----------+--------+----------+---------+------------+--------------+------------------+----------+--------------------+-----------+----------+
|      city|  state|median_age|male_pop|female_pop|total_pop|num_veterans|num_foreigners|avg_household_size|state_code|    predominant_race|pop_of_race|state_city|
+----------+-------+----------+--------+----------+---------+------------+--------------+------------------+----------+--------------------+-----------+----------+
|Birmingham|Alabama|      35.6|  102122|    112789|   214911|       13212|          8258|              2.21|        AL|  Hispanic or Latino|       8940|      null|
|Birmingham|Alabama|      35.6|  102122|    112789|   214911|       13212|          8258|              2.21|        AL|               White|      51728|      null|
|Birmingham|Alabama|      35.6|  102122|    112789|   214911|       13212|          8258|              2.21|        AL|Black or African-...|     157985|      null|
|Birmingham|Alab