In [2]:
import findspark

In [3]:
# Initialise findspark with the Spark installation directory (SPARK_HOME)
spark_home = "/opt/manual/spark"
findspark.init(spark_home)

In [4]:
from pyspark.sql import SparkSession, functions as F

In [5]:
# Build (or reuse) a Spark session that runs locally with two worker threads.
# NOTE(review): the app name looks carried over from another notebook - confirm it is intended.
spark = (
    SparkSession.builder
    .appName("Rename, Add, Drop")
    .master("local[2]")
    .getOrCreate()
)

In [6]:
# Data source: https://data.sfgov.org/Public-Safety/Fire-Incidents/wr8u-xric/data

In [7]:
# ! wget -P ~/datasets \
# https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz

In [8]:
# Confirm that the Fire Incidents archive is present in ~/datasets
! ls -l ~/datasets | grep Fire

-rw-rw-r--. 1 train train  41002480 Oct  6 12:18 Fire_Incidents.csv.gz


In [9]:
# Load the gzip-compressed Fire Incidents CSV from the local filesystem.
# - header: the first row holds the column names
# - inferSchema: let Spark infer column types instead of reading everything as string
# Each option is set exactly once (the header option used to be repeated).
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("compression", "gzip")
    .csv("file:///home/train/datasets/Fire_Incidents.csv.gz")
)

In [10]:
# Total number of rows in the dataset
df.count()

533598

In [11]:
# Number of columns in the dataset
len(df.columns)

80

In [12]:
# The frame is too wide to show in full, so preview only the columns from
# index 30 onward (first 5 rows, converted to pandas for display)

df.select(df.columns[30:]).limit(5).toPandas()

Unnamed: 0,Action Taken Primary,Action Taken Secondary,Action Taken Other,Detector Alerted Occupants,Property Use,Area of Fire Origin,Ignition Cause,Ignition Factor Primary,Ignition Factor Secondary,Heat Source,...,2017 Fix It Zones,HSOC Zones,Central Market/Tenderloin Boundary,Central Market/Tenderloin Boundary Polygon - Updated,HSOC Zones as of 2018-06-05,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods
0,86 investigate,,,,"000 property use, other",,,,,,...,,,,,,,,,,
1,"00 action taken, other",,,,nnn none,,,,,,...,,,,,,,,,,
2,86 investigate,,,,"960 street, other",,,,,,...,,,,,,,,,,
3,92 - standby,-,-,-,963 - street or road in commercial area,,,,,,...,,,,,,,,,,
4,86 investigate,,,,"000 property use, other",,,,,,...,,,,,,,,,,


In [13]:
# Names of the date/time columns used in the rest of the notebook

ts_cols = [
    "Incident Date",
    "Alarm DtTm",
    "Arrival DtTm",
    "Close DtTm",
]

In [14]:
# Inspect the column types Spark inferred while reading the CSV
df.printSchema()

root
 |-- Incident Number: integer (nullable = true)
 |-- Exposure Number: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- Incident Date: string (nullable = true)
 |-- Call Number: integer (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- City: string (nullable = true)
 |-- ZIP Code: string (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- Station Area: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- Suppression Units: integer (nullable = true)
 |-- Suppression Personnel: integer (nullable = true)
 |-- EMS Units: integer (nullable = true)
 |-- EMS Personnel: integer (nullable = true)
 |-- Other Units: integer (nullable = true)
 |-- Other Personnel: integer (nullable = true)
 |-- First Unit On Scene: string (nullable = true)
 |-- Estimated Property Loss: integer (nullable = true)
 |-- Estimated Conten

In [15]:
# Keep only the date/time columns in a separate dataframe

df2 = df.select(*ts_cols)

In [16]:
# Check the types of the selected columns (Spark read all of them as strings)
df2.dtypes

[('Incident Date', 'string'),
 ('Alarm DtTm', 'string'),
 ('Arrival DtTm', 'string'),
 ('Close DtTm', 'string')]

<h1 style="color:blue;">From String to Date Time Conversion</h1>

# Date and Time Operations - to_timestamp

In [19]:
# When Spark reads a text or CSV file without a schema, it treats every column as a string.
# When we tell Spark to inferSchema, it reads the data (or a sample of it) and infers each column's data type.
# Most of the time it infers what we expect, but dates and times get complicated because of the many datetime formats.
# If a datetime format is non-standard, Spark treats the column as a string.
# Therefore most of the struggle is simply converting strings into datetime types.
# To convert properly, you have to describe the existing format according to this:
# https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
# Otherwise your conversion results in None or null.


# In our practice dataset:
# ('Incident Date', 'string'),
# ('Alarm DtTm', 'string'),
# ('Arrival DtTm', 'string'),
# ('Close DtTm', 'string'),
# These columns hold dates/timestamps (ts), but Spark inferred them as strings because their format is non-standard.
# We have to handle them manually and fix the schema by casting/converting them to date/timestamp types.

In [17]:
# Look at the raw string values to see which date/time format they use
df2.limit(5).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM
3,12/30/2005,12/30/2005 10:40:27 PM,12/30/2005 10:46:33 PM,12/30/2005 11:37:23 PM
4,09/13/2018,09/13/2018 08:30:38 PM,09/13/2018 08:34:11 PM,09/13/2018 08:34:29 PM


## Wrong definition of format

In [18]:
# Deliberately wrong pattern: "HH" is the 24-hour field, but the data carries an
# AM/PM marker, so the values fail to parse and come back null (NaT in pandas).
wrong_format = 'MM/dd/yyyy HH:mm:ss a'
df3 = df2.withColumn("Alarm_DtTm", F.to_timestamp(F.col("Alarm DtTm"), wrong_format))
df3.limit(2).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,NaT
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,NaT


In [21]:
# The time uses a 12-hour clock with AM/PM, not a 24-hour clock, so the hour pattern should be hh, not HH.
# As you can see, even a slight mistake in the pattern results in None.

## Correct definition of format

In [19]:

# "hh" is the 12-hour clock field, which pairs with the AM/PM marker "a"
df3 = df2.withColumn(
    "Alarm_DtTm",
    F.to_timestamp(F.col("Alarm DtTm"), 'MM/dd/yyyy hh:mm:ss a'),
)

In [20]:
# Preview the result: Alarm_DtTm should now hold parsed timestamps instead of nulls
df3.limit(2).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05 18:38:01
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29 20:09:25


In [21]:
# Confirm that the new Alarm_DtTm column has the timestamp type
df3.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Alarm_DtTm: timestamp (nullable = true)



# Date and Time Operations - unix_timestamp and from_unixtime 

In [23]:
# unix_timestamp parses the string with the given pattern and returns
# the number of seconds since the Unix epoch
alarm_epoch = F.unix_timestamp(F.col("Alarm DtTm"), 'MM/dd/yyyy hh:mm:ss a')
df3 = df2.withColumn("Alarm_DtTm", alarm_epoch)
df3.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,1528213081
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,1567098565
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,1528997876


In [24]:
# Alarm_DtTm now holds epoch seconds, so it should be reported as long
df3.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Alarm_DtTm: long (nullable = true)



In [27]:
# Round trip: string -> epoch seconds -> formatted string -> timestamp
epoch_seconds = F.unix_timestamp(F.col("Alarm DtTm"), 'MM/dd/yyyy hh:mm:ss a')
epoch_as_text = F.from_unixtime(epoch_seconds)
df3 = df2.withColumn("Alarm_DtTm", F.to_timestamp(epoch_as_text))
df3.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05 18:38:01
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29 20:09:25
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018-06-14 20:37:56


In [28]:
# After the round trip Alarm_DtTm should be a timestamp column again
df3.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Alarm_DtTm: timestamp (nullable = true)



# Date and Time Operations - to_date

In [29]:
# Parse the date-only string column into a proper date column
incident_date = F.to_date(F.col("Incident Date"), 'MM/dd/yyyy')
df3 = df2.withColumn("Incident_Date", incident_date)
df3.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Incident_Date
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018-06-14


In [30]:
# Confirm that Incident_Date has the date type
df3.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Incident_Date: date (nullable = true)



<h1 style="color:blue;">From Date Time to String  Conversion</h1>

In [28]:
# First we need date and timestamp columns

In [31]:
# Build one date column and one timestamp column to format back into strings
df3 = (
    df2
    .withColumn("Incident_Date", F.to_date(F.col("Incident Date"), 'MM/dd/yyyy'))
    .withColumn("Alarm_DtTm", F.to_timestamp(F.col("Alarm DtTm"), 'MM/dd/yyyy hh:mm:ss a'))
)
df3.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Incident_Date,Alarm_DtTm
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05,2018-06-05 18:38:01
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29,2019-08-29 20:09:25
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018-06-14,2018-06-14 20:37:56


In [32]:
# Incident_Date should be a date and Alarm_DtTm a timestamp
df3.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Incident_Date: date (nullable = true)
 |-- Alarm_DtTm: timestamp (nullable = true)



## Convert date and timestamp columns to string in a desired format

In [33]:
# date_format renders a date/timestamp as text; any literal separator can be used
star_pattern = 'MM*dd*yyyy'
df4 = df3.withColumn("Incident_Date_Str", F.date_format(F.col("Incident_Date"), star_pattern))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Incident_Date,Alarm_DtTm,Incident_Date_Str
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05,2018-06-05 18:38:01,06*05*2018
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29,2019-08-29 20:09:25,08*29*2019
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018-06-14,2018-06-14 20:37:56,06*14*2018


In [34]:
# Same date, rendered year-first with colons as separators
df4 = df3.withColumn(
    "Incident_Date_Str",
    F.date_format(F.col("Incident_Date"), 'yyyy:MM:dd'),
)
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Incident_Date,Alarm_DtTm,Incident_Date_Str
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05,2018-06-05 18:38:01,2018:06:05
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29,2019-08-29 20:09:25,2019:08:29
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018-06-14,2018-06-14 20:37:56,2018:06:14


In [35]:
df4 = df3.withColumn("Alarm_DtTm_Str", 
                     F.date_format(F.col("Alarm_DtTm"), 'yyyy-MM-dd HH:ss'))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Incident_Date,Alarm_DtTm,Alarm_DtTm_Str
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05,2018-06-05 18:38:01,2018-06-05 18:01
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29,2019-08-29 20:09:25,2019-08-29 20:25
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018-06-14,2018-06-14 20:37:56,2018-06-14 20:56


In [37]:
# "MMMM" gives the full month name and "E" the abbreviated day of the week
weekday_pattern = 'yyyy MMMM dd E'
df4 = df3.withColumn("Alarm_DtTm_Str", F.date_format(F.col("Alarm_DtTm"), weekday_pattern))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Incident_Date,Alarm_DtTm,Alarm_DtTm_Str
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05,2018-06-05 18:38:01,2018 June 05 Tue
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29,2019-08-29 20:09:25,2019 August 29 Thu
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018-06-14,2018-06-14 20:37:56,2018 June 14 Thu


In [38]:
# "VV" appends the time-zone ID used for rendering
df4 = df3.withColumn(
    "Alarm_DtTm_Str",
    F.date_format(F.col("Alarm_DtTm"), 'yyyy MMMM dd VV'),
)
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Incident_Date,Alarm_DtTm,Alarm_DtTm_Str
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05,2018-06-05 18:38:01,2018 June 05 Europe/Istanbul
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29,2019-08-29 20:09:25,2019 August 29 Europe/Istanbul
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018-06-14,2018-06-14 20:37:56,2018 June 14 Europe/Istanbul


In [39]:
# "OOOO" appends the localized zone offset in its full form (e.g. GMT+03:00)
offset_pattern = 'yyyy MMMM dd OOOO'
df4 = df3.withColumn("Alarm_DtTm_Str", F.date_format(F.col("Alarm_DtTm"), offset_pattern))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Incident_Date,Alarm_DtTm,Alarm_DtTm_Str
0,06/05/2018,06/05/2018 06:38:01 PM,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018-06-05,2018-06-05 18:38:01,2018 June 05 GMT+03:00
1,08/29/2019,08/29/2019 08:09:25 PM,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019-08-29,2019-08-29 20:09:25,2019 August 29 GMT+03:00
2,06/14/2018,06/14/2018 08:37:56 PM,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018-06-14,2018-06-14 20:37:56,2018 June 14 GMT+03:00
