### Spark Data Wrangling Exercises

In [4]:
import pandas as pd
import numpy as np
import pyspark
import pyspark.sql.functions as f
from pyspark.sql.functions import when
from pyspark.sql.functions import lit
from pydataset import data
from pyspark.sql import SparkSession

In [6]:
# Obtain the Spark session used by every cell below (an already-running
# session is reused; otherwise a new one is started).
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [7]:
# Load the three raw CSV extracts. Each file is comma-separated with a header
# row, and Spark infers the column types from the data.
source_df = spark.read.options(sep=",", header=True, inferSchema=True).csv("source.csv")
case_df = spark.read.options(sep=",", header=True, inferSchema=True).csv("case.csv")
dept_df = spark.read.options(sep=",", header=True, inferSchema=True).csv("dept.csv")

                                                                                

In [9]:
# Peek at the first five rows of each freshly loaded frame, in load order.
for frame in (source_df, case_df, dept_df):
    frame.show(5)

+---------+----------------+
|source_id| source_username|
+---------+----------------+
|   100137|Merlene Blodgett|
|   103582|     Carmen Cura|
|   106463| Richard Sanchez|
|   119403|  Betty De Hoyos|
|   119555|  Socorro Quiara|
+---------+----------------+
only showing top 5 rows

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.508

In [10]:
# Persist each frame as JSON, replacing any output already present at the
# target path.
source_df.write.mode('overwrite').json('source_json')
dept_df.write.mode('overwrite').json('dept_json')
case_df.write.mode('overwrite').json('case_json')

                                                                                

---

In [19]:
# Cross-tabulate cases by whether they are closed and whether they were late.
(
    case_df
    .groupBy("case_closed", "case_late")
    .count()
    .show()
)

+-----------+---------+------+
|case_closed|case_late| count|
+-----------+---------+------+
|         NO|      YES|  6525|
|        YES|      YES| 87978|
|         NO|       NO| 11585|
|        YES|       NO|735616|
+-----------+---------+------+



In [23]:
# Layout of the date strings in the raw case data, e.g. "1/1/18 0:42".
fmt = "M/d/yy H:mm"

# Clean the case frame in a single pass:
#   1. rename SLA_due_date so all three date columns share the case_* prefix,
#   2. turn the "YES"/"NO" flags into booleans,
#   3. store council_district as a string rather than a number,
#   4. parse the three date columns into timestamps.
case_df = (
    case_df
    .withColumnRenamed("SLA_due_date", "case_due_date")
    .withColumn("case_closed", f.expr('case_closed == "YES"'))
    .withColumn("case_late", f.expr('case_late == "YES"'))
    .withColumn("council_district", f.col("council_district").cast("string"))
    .withColumn("case_opened_date", f.to_timestamp(f.col("case_opened_date"), fmt))
    .withColumn("case_closed_date", f.to_timestamp(f.col("case_closed_date"), fmt))
    .withColumn("case_due_date", f.to_timestamp(f.col("case_due_date"), fmt))
)


In [25]:
# Inspect two cleaned records, one field per line and without truncating
# long values.
case_df.show(n=2, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 2018-01-01 00:42:00                  
 case_closed_date     | 2018-01-01 12:29:00                  
 case_due_date        | 2020-09-26 00:42:00                  
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  EL PASO ST, San Antonio, 78207 
 council_district     | 5                                    
-RECORD 1----------------------------------------------------
 case_id

---

In [26]:
# How late (in days past the SLA due date) is the most overdue case that is still open?
# How long (in days since it was opened) has the oldest still-open case been open?

In [29]:
# Count "Stray Animal" requests against all other request types.
(
    case_df
    .groupBy(case_df["service_request_type"] == 'Stray Animal')
    .count()
    .show()
)

+-------------------------------------+------+
|(service_request_type = Stray Animal)| count|
+-------------------------------------+------+
|                                 true| 26760|
|                                false|814944|
+-------------------------------------+------+



In [40]:
# Number of cases per department division, with full division names shown.
(
    case_df
    .groupBy("dept_division")
    .count()
    .show(truncate=False)
)

+----------------------------+------+
|dept_division               |count |
+----------------------------+------+
|Miscellaneous               |45123 |
|Solid Waste                 |813   |
|Field Operations            |116915|
|Streets                     |38510 |
|Waste Collection            |215122|
|Code Enforcement (IntExp)   |2189  |
|Vector                      |538   |
|Dangerous Premise           |15479 |
|311 Call Center             |2849  |
|Brush                       |18212 |
|Dangerous Premise (IntExp)  |36    |
|Traffic Engineering Design  |4334  |
|Code Enforcement (Internal) |198   |
|District 2                  |3     |
|Signals                     |20700 |
|Engineering Division        |1375  |
|Director's Office Horizontal|515   |
|Storm Water                 |13769 |
|Shops                       |112   |
|Storm Water Engineering     |494   |
+----------------------------+------+
only showing top 20 rows



In [39]:
# Within the Field Operations division only, count "Officer Standby"
# requests against every other request type.
(
    case_df
    .where(case_df["dept_division"] == 'Field Operations')
    .groupBy(case_df["service_request_type"] == 'Officer Standby')
    .count()
    .show()
)

+----------------------------------------+------+
|(service_request_type = Officer Standby)| count|
+----------------------------------------+------+
|                                    true|  3013|
|                                   false|113902|
+----------------------------------------+------+



In [42]:
# Preview a derived Year_closed column (calendar year of the close
# timestamp). case_df itself is not reassigned here.
(
    case_df
    .withColumn('Year_closed', f.year(f.col('case_closed_date')))
    .show(n=2, vertical=True)
)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 case_due_date        | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
 Year_closed          | 2018                 
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 2018-01-01 00:46:00  
 case_closed_date     | 2018-01-03 08:11:00  
 case_due_date        | 2018-01-05 08:30:00  
 case_late            | false     