# Spark Local Test

In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

import configparser
from pyspark.sql import SparkSession, Window
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import udf, col, monotonically_increasing_id, row_number
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import TimestampType, DateType
from pyspark.sql import functions as F

from datetime import datetime

import os

## Configure Connection

In [60]:
sc.stop()

In [53]:
output_data = "test_output/"

In [61]:
configure = SparkConf().setAppName('udac_config').setMaster('local')
sc = SparkContext(conf = configure)

In [62]:
# getOrCreate modifies the parameters of existing Spark Session
spark = SparkSession.builder.appName('udac_cap').config('config option', 'config value').getOrCreate()

In [63]:
spark.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('spark.sql.warehouse.dir',
  'file:/Users/morgan/Documents/10_Udacity/data_eng_nano/usa-tourism-etl/spark-warehouse'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.startTime', '1616116653672'),
 ('spark.driver.host', '10.0.0.223'),
 ('spark.app.name', 'udac_config'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1616116653861'),
 ('spark.driver.port', '62171'),
 ('spark.ui.showConsoleProgress', 'true')]

### Airports

In [58]:
airports = spark.read.option("header", True).csv("data/airport_codes.csv")

In [42]:
airports.printSchema()

root
 |-- ident: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- elevation_ft: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- coordinates: string (nullable = true)



In [43]:
lat_long = F.split(airports.coordinates, ",")
airports = airports.withColumn('longitude', lat_long.getItem(0))
airports = airports.withColumn('latitude', lat_long.getItem(1))

region_split = F.split(airports.iso_region, "-")
airports = airports.withColumn('state', region_split.getItem(1))

In [44]:
airports = airports.select(['ident',
                 'iata_code',
                 'name','type',
                 'municipality',
                 'state',
                 'local_code',
                 'latitude',
                 'longitude',
                 'elevation_ft']).where(airports.iso_country=="US")

In [45]:
airports = airports.sort('iata_code', ascending=True)

In [47]:
airports = airports.na.drop(subset='iata_code')

In [50]:
airports = airports.withColumn("latitude", airports.latitude.cast('float'))
airports = airports.withColumn("longitude", airports.longitude.cast('float'))
aiprots = airports.withColumn("elevation_fit", airports.elevation_ft.cast('integer'))

In [51]:
airports.show()

+-----+---------+--------------------+--------------+-------------+-----+----------+---------+---------+------------+
|ident|iata_code|                name|          type| municipality|state|local_code| latitude|longitude|elevation_ft|
+-----+---------+--------------------+--------------+-------------+-----+----------+---------+---------+------------+
| KAAF|      AAF|Apalachicola Regi...| small_airport| Apalachicola|   FL|       AAF| -85.0275|  29.7275|          20|
| KAAP|      AAP|      Andrau Airpark|        closed|      Houston|   TX|       AAP| -95.5883|  29.7225|          79|
| KABE|      ABE|Lehigh Valley Int...|medium_airport|    Allentown|   PA|       ABE| -75.4408|  40.6521|         393|
| KABI|      ABI|Abilene Regional ...|medium_airport|      Abilene|   TX|       ABI| -99.6819|  32.4113|        1791|
| PAFM|      ABL|      Ambler Airport|medium_airport|       Ambler|   AK|       AFM| -157.857|  67.1063|         334|
| KABQ|      ABQ|Albuquerque Inter...| large_airport|  A

#### Successfully created parquets

In [54]:
#airports.write.mode('overwrite').parquet(os.path.join(output_data, "airports"))

---
## USA Cities Demographics

In [68]:
cities = spark.read.option('header', True) \
        .option('delimiter', ";") \
        .csv("data/us_cities_demographics.csv")

In [74]:
cities = cities.withColumnRenamed("City", "city") \
        .withColumnRenamed("State", "state") \
        .withColumnRenamed("Median Age", "median_age") \
        .withColumnRenamed("Male Population", "male_pop") \
        .withColumnRenamed("Female Population", "female_pop") \
        .withColumnRenamed("Total Population", "total_pop") \
        .withColumnRenamed("Number of Veterans", "num_veterans") \
        .withColumnRenamed("Foreign-born", "num_foreigners") \
        .withColumnRenamed("Average Household Size", "avg_household_size") \
        .withColumnRenamed("State Code", "state_code") \
        .withColumnRenamed("Race", "predominant_race") \
        .withColumnRenamed("Count", "num_races")