In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [22]:
spark = SparkSession.builder.getOrCreate()

In [23]:
df = spark.read.csv('case.csv', sep = ',', header = True, inferSchema = True)

In [24]:
df.printSchema()
df.show()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+------------------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|          

In [25]:
# to fix the messiness we can
df.show(5, vertical = True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 SLA_due_date         | 1/5/18 8:30          
 case_late            | NO                   
 num_days_late        | -2.0126041

In [26]:
# renamig the column
df = df.withColumnRenamed('SLA_due_date', 'case_due_date')

In [28]:
df.explain()

== Physical Plan ==
*(1) Project [case_id#606, case_opened_date#607, case_closed_date#608, SLA_due_date#609 AS case_due_date#776, case_late#610, num_days_late#611, case_closed#612, dept_division#613, service_request_type#614, SLA_days#615, case_status#616, source_id#617, request_address#618, council_district#619]
+- FileScan csv [case_id#606,case_opened_date#607,case_closed_date#608,SLA_due_date#609,case_late#610,num_days_late#611,case_closed#612,dept_division#613,service_request_type#614,SLA_days#615,case_status#616,source_id#617,request_address#618,council_district#619] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/Users/bibekmainali/codeup-data-science/distributed-ml-exercises/case.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<case_id:int,case_opened_date:string,case_closed_date:string,SLA_due_date:string,case_late:...




In [30]:
# we want to convert Yes or No to bool val, but first lets check if there are any other values in these columns

df.groupBy('case_late', 'case_closed').count().show()

+---------+-----------+------+
|case_late|case_closed| count|
+---------+-----------+------+
|       NO|        YES|735616|
|      YES|        YES| 87978|
|       NO|         NO| 11585|
|      YES|         NO|  6525|
+---------+-----------+------+



In [34]:
df = df.withColumn('case_late', expr("case_late == 'Yes'"))

In [35]:
df = df.withColumn('case_closed', expr("case_closed == 'Yes'"))

In [37]:
# lets check if it worked
df.show(vertical = True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 case_due_date        | 9/26/20 0:42         
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | false                
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 case_due_date        | 1/5/18 8:30          
 case_late            | false                
 num_days_late        | -2.0126041

In [41]:
# padding the council dirstricts

df = df.withColumn("council_district", format_string("%03d", col("council_district")))

In [None]:
# lets handle date with spark


