In [5]:
# Data Acquisition
# This exercises uses the case.csv, dept.csv, and source.csv files from the san antonio 311 call dataset.

# Read the case, department, and source data into their own spark dataframes.

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

source = spark.read.csv("source.csv", sep=",", header=True, inferSchema=True)

case = spark.read.csv("case.csv", sep=",", header=True, inferSchema=True)

dept = spark.read.csv("dept.csv", sep=",", header=True, inferSchema=True)

source.show()

case.show()

dept.show()

+---------+--------------------+
|source_id|     source_username|
+---------+--------------------+
|   100137|    Merlene Blodgett|
|   103582|         Carmen Cura|
|   106463|     Richard Sanchez|
|   119403|      Betty De Hoyos|
|   119555|      Socorro Quiara|
|   119868| Michelle San Miguel|
|   120752|      Eva T. Kleiber|
|   124405|           Lori Lara|
|   132408|       Leonard Silva|
|   135723|        Amy Cardenas|
|   136202|    Michelle Urrutia|
|   136979|      Leticia Garcia|
|   137943|    Pamela K. Baccus|
|   138605|        Marisa Ozuna|
|   138650|      Kimberly Green|
|   138650|Kimberly Green-Woods|
|   138793| Guadalupe Rodriguez|
|   138810|       Tawona Martin|
|   139342|     Jessica Mendoza|
|   139344|        Isis Mendoza|
+---------+--------------------+
only showing top 20 rows

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+------------------+-----------+---------+--

In [8]:
# Let's see how writing to the local disk works in spark:

# Write the code necessary to store the source data in both csv and json format, 
# store these as sources_csv and sources_json
# Inspect your folder structure. What do you notice?

source.write.json("sources_json", mode="overwrite")

source.write.csv("sources_csv", mode="overwrite")

In [9]:
# Inspect the data in your dataframes. Are the data types appropriate? 
# Write the code necessary to cast the values to the appropriate types.

from pyspark.sql.types import StructType, StructField, StringType

schema = StructType(
    [
        StructField("source_id", StringType()),
        StructField("source_username", StringType()),
    ]
)

spark.read.csv("sources_csv", header=True, schema=schema).show()

+---------+--------------------+
|source_id|     source_username|
+---------+--------------------+
|   103582|         Carmen Cura|
|   106463|     Richard Sanchez|
|   119403|      Betty De Hoyos|
|   119555|      Socorro Quiara|
|   119868| Michelle San Miguel|
|   120752|      Eva T. Kleiber|
|   124405|           Lori Lara|
|   132408|       Leonard Silva|
|   135723|        Amy Cardenas|
|   136202|    Michelle Urrutia|
|   136979|      Leticia Garcia|
|   137943|    Pamela K. Baccus|
|   138605|        Marisa Ozuna|
|   138650|      Kimberly Green|
|   138650|Kimberly Green-Woods|
|   138793| Guadalupe Rodriguez|
|   138810|       Tawona Martin|
|   139342|     Jessica Mendoza|
|   139344|        Isis Mendoza|
|   139345|      Andrea Alvarez|
+---------+--------------------+
only showing top 20 rows



In [12]:
case.show(2, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 SLA_due_date         | 1/5/18 8:30          
 case_late            | NO                   
 num_days_late        | -2.0126041

In [18]:
# How old is the latest (in terms of days past SLA) currently open issue? 

case.filter(case.case_closed=="NO").select(round(max(case.SLA_days)).alias("longest_SLA_days")).show()



+----------------+
|longest_SLA_days|
+----------------+
|          1419.0|
+----------------+



In [23]:
# How many Stray Animal cases are there?

case.filter(case.service_request_type=="Stray Animal").select(count(case.service_request_type).alias("number of stray animal cases")).show()

+----------------------------+
|number of stray animal cases|
+----------------------------+
|                       26760|
+----------------------------+



In [26]:
# How many service requests that are assigned to the Field Operations department (dept_division) 
# are not classified as "Officer Standby" request type (service_request_type)?

case.filter(case.dept_division=="Field Operations").filter(case.service_request_type!="Officer Standby").select(count(case.dept_division).alias("cases")).show()


+------+
| cases|
+------+
|113902|
+------+



In [None]:
# Convert the council_district column to a string column.


In [None]:
# Extract the year from the case_closed_date column.

In [None]:
# Convert num_days_late from days to hours in new columns num_hours_late.

In [None]:
# Join the case data with the source and department data.

In [None]:
# Are there any cases that do not have a request source?

In [None]:

# What are the top 10 service request types in terms of number of requests?

In [None]:
# What are the top 10 service request types in terms of average days late?

In [None]:
# Does number of days late depend on department?

In [None]:
# How do number of days late depend on department and request type?