# Spark Local Test

In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

import configparser
from pyspark.sql import SparkSession, Window
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import udf, col, monotonically_increasing_id, row_number
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import TimestampType, DateType, StringType
from pyspark.sql import functions as F

from datetime import datetime

import os

## Configure Connection

In [2]:
# sc.stop()

In [3]:
output_data = "test_output/"

In [4]:
configure = SparkConf().setAppName('udac_config').setMaster('local')
sc = SparkContext(conf = configure)

In [5]:
# getOrCreate modifies the parameters of existing Spark Session
spark = SparkSession.builder.appName('udac_cap').config('config option', 'config value').getOrCreate()

In [6]:
spark.sparkContext.getConf().getAll()

[('spark.master', 'local'),
 ('spark.app.startTime', '1616538822367'),
 ('spark.sql.warehouse.dir',
  'file:/Users/morgan/Documents/10_Udacity/data_eng_nano/usa-tourism-etl/spark-warehouse'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '10.0.0.223'),
 ('spark.app.name', 'udac_config'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.port', '51287'),
 ('spark.app.id', 'local-1616538823268')]

### Airports

In [7]:
airports = spark.read.option("header", True).csv("data/airport_codes.csv")

In [8]:
lat_long = F.split(airports.coordinates, ",")
airports = airports.withColumn('longitude', lat_long.getItem(0))
airports = airports.withColumn('latitude', lat_long.getItem(1))

region_split = F.split(airports.iso_region, "-")
airports = airports.withColumn('state', region_split.getItem(1))

In [9]:
airports = airports.select(['ident',
                 'iata_code',
                 'name','type',
                 'municipality',
                 'state',
                 'local_code',
                 'latitude',
                 'longitude',
                 'elevation_ft']).where(airports.iso_country=="US")

In [10]:
airports = airports.sort('iata_code', ascending=True)

In [11]:
airports = airports.na.drop(subset='iata_code')

In [12]:
airports = airports.withColumn("latitude", airports.latitude.cast('float')) \
                    .withColumn("longitude", airports.longitude.cast('float')) \
                    .withColumn("elevation_fit", airports.elevation_ft.cast('integer'))

In [13]:
airports.show()

+-----+---------+--------------------+--------------+-------------+-----+----------+--------+---------+------------+-------------+
|ident|iata_code|                name|          type| municipality|state|local_code|latitude|longitude|elevation_ft|elevation_fit|
+-----+---------+--------------------+--------------+-------------+-----+----------+--------+---------+------------+-------------+
| KAAF|      AAF|Apalachicola Regi...| small_airport| Apalachicola|   FL|       AAF| 29.7275| -85.0275|          20|           20|
| KAAP|      AAP|      Andrau Airpark|        closed|      Houston|   TX|       AAP| 29.7225| -95.5883|          79|           79|
| KABE|      ABE|Lehigh Valley Int...|medium_airport|    Allentown|   PA|       ABE| 40.6521| -75.4408|         393|          393|
| KABI|      ABI|Abilene Regional ...|medium_airport|      Abilene|   TX|       ABI| 32.4113| -99.6819|        1791|         1791|
| PAFM|      ABL|      Ambler Airport|medium_airport|       Ambler|   AK|       AFM

#### Successfully created parquets

In [14]:
#airports.write.mode('overwrite').parquet(os.path.join(output_data, "airports"))

---
## USA Cities Demographics

In [15]:
cities = spark.read.option('header', True) \
        .option('delimiter', ";") \
        .csv("data/us_cities_demographics.csv")

In [16]:
cities = cities.withColumnRenamed("City", "city") \
        .withColumnRenamed("State", "state") \
        .withColumnRenamed("Median Age", "median_age") \
        .withColumnRenamed("Male Population", "male_pop") \
        .withColumnRenamed("Female Population", "female_pop") \
        .withColumnRenamed("Total Population", "total_pop") \
        .withColumnRenamed("Number of Veterans", "num_veterans") \
        .withColumnRenamed("Foreign-born", "num_foreigners") \
        .withColumnRenamed("Average Household Size", "avg_household_size") \
        .withColumnRenamed("State Code", "state_code") \
        .withColumnRenamed("Race", "race") \
        .withColumnRenamed("Count", "race_pop")

cities = cities.withColumn("state_city", F.concat_ws("_", cities.state_code, cities.city))

In [17]:
integer_vars = ["male_pop", "female_pop", "total_pop", "num_veterans", "num_foreigners", "race_pop"]
float_vars = ["median_age", "avg_household_size"]

for i_var in integer_vars:
    cities = cities.withColumn(i_var, cities[i_var].cast('integer'))
    
for f_var in float_vars:
    cities = cities.withColumn(f_var, cities[f_var].cast('float'))

In [18]:
cities2 = cities.dropDuplicates(["state_city"])

In [19]:
race_count = cities.select("state_city", "race", "race_pop")
race_count = race_count.withColumn("race_pop", race_count.race_pop.cast('float'))
race_count = race_count.groupBy("state_city").pivot("race").agg(F.first("race_pop"))

In [20]:
cities_final = cities2.join(race_count, cities2.state_city == race_count.state_city)
cities_final = cities_final.drop("race", "race_pop", "state_city", "state_city")

In [21]:
cities_final = cities_final.withColumnRenamed("American Indian and Alaska Native", "native_american_pop") \
                            .withColumnRenamed("Asian", "asian_pop") \
                            .withColumnRenamed("Black or African-American", "black_american_pop") \
                            .withColumnRenamed("Hispanic or Latino", "hispanic_pop") \
                            .withColumnRenamed("White", "white_pop")

In [22]:
cities_final.printSchema()

root
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- median_age: float (nullable = true)
 |-- male_pop: integer (nullable = true)
 |-- female_pop: integer (nullable = true)
 |-- total_pop: integer (nullable = true)
 |-- num_veterans: integer (nullable = true)
 |-- num_foreigners: integer (nullable = true)
 |-- avg_household_size: float (nullable = true)
 |-- state_code: string (nullable = true)
 |-- native_american_pop: float (nullable = true)
 |-- asian_pop: float (nullable = true)
 |-- black_american_pop: float (nullable = true)
 |-- hispanic_pop: float (nullable = true)
 |-- white_pop: float (nullable = true)



#### Successfully created parquets

In [23]:

# cities_final.write.mode('overwrite').parquet(os.path.join(output_data, "cities"))

---
## USA Temperatures

In [24]:
temperatures = spark.read.option('header', True) \
                .csv("data/GlobalLandTemperaturesByCity.csv")

In [25]:
temperatures.printSchema()

root
 |-- dt: string (nullable = true)
 |-- AverageTemperature: string (nullable = true)
 |-- AverageTemperatureUncertainty: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)



In [26]:
temperatures.show(3)

+----------+------------------+-----------------------------+-----+-------+--------+---------+
|        dt|AverageTemperature|AverageTemperatureUncertainty| City|Country|Latitude|Longitude|
+----------+------------------+-----------------------------+-----+-------+--------+---------+
|1743-11-01|             6.068|           1.7369999999999999|Århus|Denmark|  57.05N|   10.33E|
|1743-12-01|              null|                         null|Århus|Denmark|  57.05N|   10.33E|
|1744-01-01|              null|                         null|Århus|Denmark|  57.05N|   10.33E|
+----------+------------------+-----------------------------+-----+-------+--------+---------+
only showing top 3 rows



In [27]:
temperatures = temperatures.select("*").where((temperatures.Country == "United States") & (temperatures.dt > "1969-12-31"))

In [28]:
temperatures = temperatures.withColumnRenamed("dt", "date_time") \
                            .withColumnRenamed("AverageTemperature", "avg_daily_temp") \
                            .withColumnRenamed("AverageTemperatureUncertainty", "avg_temp_temp_uncertainty") \
                            .withColumnRenamed("City", "city") \
                            .withColumnRenamed("Latitude", "latitude") \
                            .withColumnRenamed("Longitude", "longitude")

In [29]:
temperatures = temperatures.withColumn("lat_length", F.length("latitude")) \
                            .withColumn("long_length", F.length("longitude")) \
                            .withColumn("latitude_2", F.expr("""substr(latitude, 1, lat_length-1)""")) \
                            .withColumn("longitude_2", F.expr("""substr(longitude, 1, long_length-1)""")) 

In [30]:
temperatures = temperatures.withColumn("latitude", temperatures.latitude_2.cast('float')) \
                            .withColumn("longitude", temperatures.longitude_2.cast('float'))

temperatures = temperatures.withColumn("longitude", -1 * col("longitude"))

temperatures = temperatures.drop("Country", "lat_length", "long_length", "latitude_2", "longitude_2")

In [31]:
temperatures.show(2)

+----------+--------------+-------------------------+-------+--------+---------+
| date_time|avg_daily_temp|avg_temp_temp_uncertainty|   city|latitude|longitude|
+----------+--------------+-------------------------+-------+--------+---------+
|1970-01-01|         3.969|                    0.289|Abilene|   32.95|  -100.53|
|1970-02-01|         8.463|                    0.177|Abilene|   32.95|  -100.53|
+----------+--------------+-------------------------+-------+--------+---------+
only showing top 2 rows



In [34]:

#temperatures.write.mode('overwrite').parquet(os.path.join(output_data, "temperatures"))

---
## Visits

In [51]:
visits = spark.read.option('header', True) \
            .option('delimiter', ",") \
            .csv("data/immigration_data_sample.csv")

airports_dict = spark.read.option('header', True) \
                .csv("data/airport_dict.csv")

countries_dict = spark.read.option('header', True) \
                .csv("data/country_codes.csv")

In [70]:
airports2_dict = airports_dict.withColumn("city", F.split(col("airport"), ",").getItem(0)) \
                                .withColumn("state", F.split(col("airport"), ",").getItem(1))

countries2_dict = countries_dict.withColumn("country", F.initcap("country"))

In [72]:
deleteWhitespaceUDF = udf(lambda s: s.replace(" ", ""), StringType())
deleteApostropheUDF = udf(lambda s: s.replace("''", ""), StringType())

In [78]:
airports2_dict = airports2_dict.withColumn("state", deleteWhitespaceUDF("state"))
airports2_dict = airports2_dict.withColumn("state", deleteApostropheUDF("state"))

In [79]:
airports2_dict.show(4)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 450, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 77, in <lambda>
    return lambda *a: g(f(*a))
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 77, in <lambda>
    return lambda *a: g(f(*a))
  File "/Users/morgan/opt/anaconda3/envs/udac_cap/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 77, in <lambda>
    return lambda *a: g(f(*a))
  File "<ipython-input-72-2cc272bc0f94>", line 1, in <lambda>
AttributeError: 'NoneType' object has no attribute 'replace'


In [39]:
visits2 = visits.drop("insnum", "dtadfile", "fltno", 'i94bir', "occup", "matflag", "admnum", "entdepu", "visapost")

In [40]:
visits2 = visits2.withColumnRenamed("_c0", "visit_id") \
            .withColumnRenamed("cicid", "citizen_id") \
            .withColumnRenamed("i94yr", "arrival_yr") \
            .withColumnRenamed("i94mon", "arrival_month") \
            .withColumnRename("")

root
 |-- _c0: string (nullable = true)
 |-- cicid: string (nullable = true)
 |-- i94yr: string (nullable = true)
 |-- i94mon: string (nullable = true)
 |-- i94cit: string (nullable = true)
 |-- i94res: string (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: string (nullable = true)
 |-- i94mode: string (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: string (nullable = true)
 |-- i94visa: string (nullable = true)
 |-- count: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- biryear: string (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- visatype: string (nullable = true)



In [None]:
cities = cities.withColumnRenamed("City", "city") \
        .withColumnRenamed("State", "state") \
        .withColumnRenamed("Median Age", "median_age") \
        .withColumnRenamed("Male Population", "male_pop") \
        .withColumnRenamed("Female Population", "female_pop") \
        .withColumnRenamed("Total Population", "total_pop") \
        .withColumnRenamed("Number of Veterans", "num_veterans") \
        .withColumnRenamed("Foreign-born", "num_foreigners") \
        .withColumnRenamed("Average Household Size", "avg_household_size") \
        .withColumnRenamed("State Code", "state_code") \
        .withColumnRenamed("Race", "race") \
        .withColumnRenamed("Count", "race_pop")