In [1]:
import pyspark.sql
from pyspark.sql.functions import *

In [2]:
# Obtain the notebook's Spark session; getOrCreate() reuses an active
# session if one already exists instead of starting a second one.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

**Getting all the data prep done before exploring**

In [3]:
# Load the three CSV inputs. Every file is read with its first row as the
# header, and column types are inferred from the data.

# case data
case_df = spark.read.options(header = True, inferSchema = True).csv('case.csv')

# department data
department_df = spark.read.options(header = True, inferSchema = True).csv('dept.csv')

# source data
source_df = spark.read.options(header = True, inferSchema = True).csv('source.csv')

In [4]:
# withColumnRenamed(existing, new) returns a DataFrame in which the column
# `existing` is called `new`; here 'SLA_due_date' becomes 'case_due_date'.
case_df = case_df.withColumnRenamed(
    existing = 'SLA_due_date',
    new = 'case_due_date',
)

In [5]:
# Turn case_late and case_closed into booleans: each column is replaced by
# the result of comparing its current value with 'YES'.
case_df = (
    case_df
    .withColumn("case_late", expr("case_late == 'YES'"))
    .withColumn("case_closed", expr("case_closed == 'YES'"))
)

In [6]:
# Convert the opened / closed / due date columns to timestamps.
# Pattern: month/day/two-digit year followed by hour(0-23):minute.
fmt = "M/d/yy H:mm"

case_df = (
    case_df
    .withColumn("case_opened_date", to_timestamp(col("case_opened_date"), fmt))
    .withColumn("case_closed_date", to_timestamp(col("case_closed_date"), fmt))
    .withColumn("case_due_date", to_timestamp(col("case_due_date"), fmt))
)

In [7]:

# Normalise request_address: strip surrounding spaces, then lower-case it.
case_df = case_df.withColumn(
    'request_address',
    lower(trim(col('request_address'))),
)

In [8]:
# Add a zip_code column holding capture group 1 of the pattern below, i.e.
# the run of digits at the very end of request_address.
case_df = case_df.withColumn(
    'zip_code',
    regexp_extract(col('request_address'), r"(\d+)$", 1),
)

In [9]:

# Rewrite council_district as text, left-padded with zeros to a minimum
# width of three digits (e.g. 5 -> "005").
case_df = case_df.withColumn(
    "council_district",
    format_string("%03d", "council_district"),
)


In [10]:

# Derive three duration columns, each a whole number of days:
#   case_age      - days from case_opened_date to the current timestamp
#   days_to_close - days from case_opened_date to case_closed_date
#   case_lifetime - days_to_close when case_closed is true, otherwise case_age
# NOTE: case_age is measured against "now", so its values depend on when the
# notebook is executed.
case_df = (
    case_df
    .withColumn("case_age", datediff(current_timestamp(), col("case_opened_date")))
    .withColumn("days_to_close", datediff(col("case_closed_date"), col("case_opened_date")))
    .withColumn(
        "case_lifetime",
        when(col("case_closed"), col("days_to_close")).otherwise(col("case_age")),
    )
)

In [11]:
# Left join on dept_division: every case row is kept and gains the columns
# of the matching department_df row (nulls where there is no match).
case_df = case_df.join(department_df, on = 'dept_division', how = 'left')

In [12]:
# Left join on source_id: every case row is kept and gains the columns of
# the matching source_df row (nulls where there is no match).
case_df = case_df.join(source_df, on = 'source_id', how = 'left')

## Exploration

In [13]:
import warnings

# Suppress every Python warning raised from this point on, so that cell
# output below is not interleaved with warning messages.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Print only the first row, laid out vertically (one "column | value" line
# per field) so the wide joined frame stays readable.
case_df.show(n = 1, vertical = True)

-RECORD 0--------------------------------------
 source_id              | svcCRMLS             
 dept_division          | Field Operations     
 case_id                | 1014127332           
 case_opened_date       | 2018-01-01 00:42:00  
 case_closed_date       | 2018-01-01 12:29:00  
 case_due_date          | 2020-09-26 00:42:00  
 case_late              | false                
 num_days_late          | -998.5087616000001   
 case_closed            | true                 
 service_request_type   | Stray Animal         
 SLA_days               | 999.0                
 case_status            | Closed               
 request_address        | 2315  el paso st,... 
 council_district       | 005                  
 zip_code               | 78207                
 case_age               | 1065                 
 days_to_close          | 0                    
 case_lifetime          | 0                    
 dept_name              | Animal Care Services 
 standardized_dept_name | Animal Care Se