# Spark Local Test

In [16]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

import configparser
from pyspark.sql import SparkSession, Window
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import udf, col, monotonically_increasing_id, row_number
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import TimestampType, DateType
from pyspark.sql import functions as F

from datetime import datetime

import os

## Configure Connection

In [22]:
# Stop a SparkContext left over from an earlier run so that a fresh one can be
# created with the desired configuration. On a freshly started kernel `sc` does
# not exist yet, so only stop it when it is defined instead of raising NameError.
if "sc" in globals():
    sc.stop()

In [23]:
# Root directory for parquet output; the (currently commented-out) write cells
# join it with a table name via os.path.join.
output_data = "test_output/"

In [24]:
# Configure a single-node ("local" master) Spark application and start its context.
configure = SparkConf()
configure.setMaster('local')
configure.setAppName('udac_config')

sc = SparkContext(conf=configure)

In [25]:
# getOrCreate() returns the already-running session when there is one and
# applies the builder's options to it; otherwise it creates a new session.
# NOTE(review): 'config option' / 'config value' looks like a placeholder —
# confirm whether a real setting was intended here.
spark = (
    SparkSession.builder
    .appName('udac_cap')
    .config('config option', 'config value')
    .getOrCreate()
)

In [26]:
# Display the configuration of the session's SparkContext as (key, value) pairs.
spark.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('spark.sql.warehouse.dir',
  'file:/Users/morgan/Documents/10_Udacity/data_eng_nano/usa-tourism-etl/spark-warehouse'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.id', 'local-1616306143864'),
 ('spark.driver.host', '10.0.0.223'),
 ('spark.app.name', 'udac_config'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.startTime', '1616306143637'),
 ('spark.driver.port', '58887'),
 ('spark.ui.showConsoleProgress', 'true')]

### Airports

In [27]:
# Load the raw airport codes; the first CSV row holds the column names.
airports = spark.read.csv("data/airport_codes.csv", header=True)

In [28]:
# Break the combined text columns into their parts.
# Despite its name, `lat_long` is read as (longitude, latitude): the first piece
# of `coordinates` is stored as longitude and the second as latitude.
lat_long = F.split(airports["coordinates"], ",")
# The piece of `iso_region` after the "-" is kept as the state.
region_split = F.split(airports["iso_region"], "-")

airports = (
    airports
    .withColumn('longitude', lat_long.getItem(0))
    .withColumn('latitude', lat_long.getItem(1))
    .withColumn('state', region_split.getItem(1))
)

In [29]:
# Keep only US airports, then narrow the frame to the columns of interest.
airports = (
    airports
    .where(F.col("iso_country") == "US")
    .select(
        'ident',
        'iata_code',
        'name',
        'type',
        'municipality',
        'state',
        'local_code',
        'latitude',
        'longitude',
        'elevation_ft',
    )
)

In [30]:
# Order the airports alphabetically by IATA code (ascending is the default).
airports = airports.orderBy('iata_code')

In [31]:
# Discard airports that have no IATA code.
airports = airports.dropna(subset=['iata_code'])

In [32]:
# Convert the numeric columns, which were read from CSV as strings, to proper
# numeric types. All three casts replace the column in place on `airports`
# (previously the elevation cast was written to a misspelled variable and a
# misspelled column name, so `airports.elevation_ft` silently stayed a string).
airports = airports.withColumn("latitude", airports.latitude.cast('float'))
airports = airports.withColumn("longitude", airports.longitude.cast('float'))
airports = airports.withColumn("elevation_ft", airports.elevation_ft.cast('integer'))

In [33]:
# Preview the first rows of the cleaned airports table.
airports.show()

+-----+---------+--------------------+--------------+-------------+-----+----------+--------+---------+------------+
|ident|iata_code|                name|          type| municipality|state|local_code|latitude|longitude|elevation_ft|
+-----+---------+--------------------+--------------+-------------+-----+----------+--------+---------+------------+
| KAAF|      AAF|Apalachicola Regi...| small_airport| Apalachicola|   FL|       AAF| 29.7275| -85.0275|          20|
| KAAP|      AAP|      Andrau Airpark|        closed|      Houston|   TX|       AAP| 29.7225| -95.5883|          79|
| KABE|      ABE|Lehigh Valley Int...|medium_airport|    Allentown|   PA|       ABE| 40.6521| -75.4408|         393|
| KABI|      ABI|Abilene Regional ...|medium_airport|      Abilene|   TX|       ABI| 32.4113| -99.6819|        1791|
| PAFM|      ABL|      Ambler Airport|medium_airport|       Ambler|   AK|       AFM| 67.1063| -157.857|         334|
| KABQ|      ABQ|Albuquerque Inter...| large_airport|  Albuquerq

#### Successfully created parquets

In [34]:
#airports.write.mode('overwrite').parquet(os.path.join(output_data, "airports"))

---
## USA Cities Demographics

In [63]:
# Load the city demographics file: semicolon-delimited, with a header row.
cities = (
    spark.read
    .options(header=True, delimiter=";")
    .csv("data/us_cities_demographics.csv")
)

In [64]:
# Map the CSV headers to snake_case column names.
column_renames = {
    "City": "city",
    "State": "state",
    "Median Age": "median_age",
    "Male Population": "male_pop",
    "Female Population": "female_pop",
    "Total Population": "total_pop",
    "Number of Veterans": "num_veterans",
    "Foreign-born": "num_foreigners",
    "Average Household Size": "avg_household_size",
    "State Code": "state_code",
    "Race": "race",
    "Count": "race_pop",
}
for old_name, new_name in column_renames.items():
    cities = cities.withColumnRenamed(old_name, new_name)

# Build a "<state_code>_<city>" key that identifies a city across its rows.
cities = cities.withColumn(
    "state_city",
    F.concat_ws("_", F.col("state_code"), F.col("city")),
)

In [65]:
# Columns grouped by the numeric type they should be cast to.
integer_vars = ["male_pop", "female_pop", "total_pop", "num_veterans", "num_foreigners", "race_pop"]
float_vars = ["median_age", "avg_household_size"]

# Cast every listed column in place: integer columns first, then float columns.
for target_type, column_names in (('integer', integer_vars), ('float', float_vars)):
    for column_name in column_names:
        cities = cities.withColumn(column_name, cities[column_name].cast(target_type))

In [70]:
# Collapse the data to a single row per city, keyed on state_city.
cities2 = cities.drop_duplicates(subset=["state_city"])

In [66]:
# Pivot the per-race rows into one row per city with one column per race,
# holding that race's population count as a float.
race_count = (
    cities
    .select(
        "state_city",
        "race",
        F.col("race_pop").cast('float').alias("race_pop"),
    )
    .groupBy("state_city")
    .pivot("race")
    .agg(F.first("race_pop"))
)

In [77]:
# Attach the per-race population columns to the one-row-per-city table.
# Both frames derive from `cities`, so joining on the column *name* (rather than
# on `cities2.state_city == race_count.state_city`) avoids comparing two
# references to the same underlying column and keeps a single `state_city`
# column in the result.
cities_final = cities2.join(race_count, on="state_city", how="inner")

# The per-row race columns are superseded by the pivoted columns, and the join
# key is no longer needed once the tables are merged.
cities_final = cities_final.drop("race", "race_pop", "state_city")

In [82]:
# Give the pivoted race columns snake_case "<group>_pop" names.
race_column_names = {
    "American Indian and Alaska Native": "native_american_pop",
    "Asian": "asian_pop",
    "Black or African-American": "black_american_pop",
    "Hispanic or Latino": "hispanic_pop",
    "White": "white_pop",
}
for race_label, pop_column in race_column_names.items():
    cities_final = cities_final.withColumnRenamed(race_label, pop_column)

In [83]:
# Print the column names and types of the final cities table.
cities_final.printSchema()

root
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- median_age: float (nullable = true)
 |-- male_pop: integer (nullable = true)
 |-- female_pop: integer (nullable = true)
 |-- total_pop: integer (nullable = true)
 |-- num_veterans: integer (nullable = true)
 |-- num_foreigners: integer (nullable = true)
 |-- avg_household_size: float (nullable = true)
 |-- state_code: string (nullable = true)
 |-- native_american_pop: float (nullable = true)
 |-- asian_pop: float (nullable = true)
 |-- black_american_pop: float (nullable = true)
 |-- hispanic_pop: float (nullable = true)
 |-- white_pop: float (nullable = true)



#### Successfully created parquets

In [85]:

# cities_final.write.mode('overwrite').parquet(os.path.join(output_data, "cities"))