In [16]:
import multiprocessing
import pyspark
from pyspark.sql.types import *
from pyspark.sql.functions import col, column, expr
import pyspark.sql.types as T
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import from_unixtime
from pyspark.sql import functions as F
import pandas as pd
import numpy as np

# Number of logical CPUs reported by the OS; used below to size Spark's parallelism.
nprocs = multiprocessing.cpu_count()

# Build (or reuse) the notebook's SparkSession.
# A bare 'local' master runs Spark with a single worker thread, so the
# nprocs-sized shuffle partitions would be processed one at a time;
# 'local[N]' runs N worker threads instead.
spark = (pyspark.sql.SparkSession.builder
 .master('local[{}]'.format(nprocs))
 .config('spark.jars.packages', 'mysql:mysql-connector-java:8.0.16')
 .config('spark.driver.memory', '4G')
 # NOTE(review): Spark documents spark.driver.cores as a cluster-mode
 # setting -- confirm it is still wanted for a local master.
 .config('spark.driver.cores', nprocs)
 .config('spark.sql.shuffle.partitions', nprocs)
 .appName('MySparkApplication')
 .getOrCreate())

# Bare attribute look-ups on the DataFrameReader: neither method is called,
# so no data is read here. The notebook only echoes the last expression
# (the bound `parquet` method).
spark.read.json
spark.read.parquet

<bound method DataFrameReader.parquet of <pyspark.sql.readwriter.DataFrameReader object at 0x11d206588>>

### Exercises
Using case.csv & dept.csv:

Read both files into the Spark environment (df_case, df_dept).

Write df_case and df_dept back to disk as CSV, each into its own directory (my_cases and my_depts).

Write df_case and df_dept to parquet files (my_cases_parquet and my_depts_parquet)

Read your parquet files back into your spark environment.

Read case.csv and dept.csv into a pandas dataframe. (cases_pdf, depts_pdf)

Convert the pandas dataframes into spark dataframes (cases_sdf, depts_sdf)

Convert the spark dataframes back into pandas dataframes. (cases_pdf1, depts_pdf1)

Write the spark dataframes (cases_sdf, depts_sdf) to Hive tables.

Explore the Hive database/tables you have created using the methods in the lesson.

Read from the tables into two spark dataframes (cases_sdf, depts_sdf)

In [17]:
# Explicit schema for case.csv, for use instead of `inferSchema`
# (apply it with `spark.read.schema(case_schema)`).
# Field names follow the CSV header as displayed by `df_case.show()`.
# NOTE(review): the sample rows show timestamps such as '1/1/18 0:42' and
# flags such as 'YES'/'NO'; the DateType/BooleanType fields below probably
# need a matching date format option, or StringType plus a later cast --
# confirm before relying on this schema.
case_schema = T.StructType([
    T.StructField("case_id", T.StringType()),
    T.StructField("case_opened_date", T.DateType()),
    T.StructField("case_closed_date", T.DateType()),
    T.StructField("SLA_due_date", T.DateType()),
    T.StructField("case_late", T.BooleanType()),
    T.StructField("num_days_late", T.FloatType()),
    T.StructField("case_closed", T.BooleanType()),
    T.StructField("dept_division", T.StringType()),
    T.StructField("service_request_type", T.StringType()),
    T.StructField("SLA_days", T.FloatType()),
    T.StructField("case_status", T.StringType()),
    T.StructField("source_id", T.StringType()),
    T.StructField("request_address", T.StringType()),
    T.StructField("council_district", T.StringType())
])

In [18]:
# Load the 311 case records from CSV: the first line supplies the column
# names and the column types are inferred from the data.
# (To load with an explicit schema instead, pass `schema=...` in place of
# `inferSchema=True`.)
df_case = spark.read.csv('./sa311/case.csv', header=True, inferSchema=True)

In [19]:
# Load the department lookup table from CSV: header row for column names,
# column types inferred from the data.
df_dept = spark.read.csv('./sa311/dept.csv', header=True, inferSchema=True)

In [20]:
# Load the source lookup table from CSV: header row for column names,
# column types inferred from the data.
df_source = spark.read.csv('./sa311/source.csv', header=True, inferSchema=True)

In [21]:
# Print the first three rows to sanity-check the loaded columns.
df_case.show(n=3)

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.5087616000001|        YES|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|       NO|-2.0126041669999997|        YES|     Storm Water|Removal Of Obstru...|4.322222222| 

Write df_case and df_dept back to disk as CSV, each into its own directory (my_cases and my_depts).

Write df_case and df_dept to parquet files (my_cases_parquet and my_depts_parquet)

In [29]:
# Write the cases back to disk as CSV (with a header row) into the
# ./sa311/my_cases directory, replacing any previous output.
# Uses the built-in 'csv' source, the same one the reads use, rather than
# the legacy 'com.databricks.spark.csv' package name.
(df_case.write
 .format('csv')
 .mode('overwrite')
 .option('header', True)
 .save('./sa311/my_cases'))

In [30]:
# Write the departments back to disk as CSV (with a header row) into the
# ./sa311/my_depts directory, replacing any previous output.
# Uses the built-in 'csv' source, the same one the reads use, rather than
# the legacy 'com.databricks.spark.csv' package name.
(df_dept.write
 .format('csv')
 .mode('overwrite')
 .option('header', True)
 .save('./sa311/my_depts'))

Read your parquet files back into your spark environment.

Read case.csv and dept.csv into a pandas dataframe. (cases_pdf, depts_pdf)