In [1]:
import findspark

In [2]:
# /opt/manual/spark: this is SPARK_HOME path
findspark.init("/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F

In [4]:
spark = SparkSession.builder \
.appName("Rename, Add, Drop") \
.master("local[2]") \
.getOrCreate()

In [5]:
# Data source: https://data.sfgov.org/Public-Safety/Fire-Incidents/wr8u-xric/data

In [6]:
# ! wget -P ~/datasets \
# https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz

In [7]:
! ls -l ~/datasets | grep Fire

-rw-rw-r--. 1 train train  41002480 Oct  6 12:18 Fire_Incidents.csv.gz


In [8]:
df = spark.read.option("header", True) \
.option("inferSchema", True) \
.option("header", True) \
.option("compression","gzip") \
.csv("file:///home/train/datasets/Fire_Incidents.csv.gz")

In [9]:
ts_cols = ['Incident Date','Alarm DtTm','Arrival DtTm','Close DtTm']

In [10]:
# Create new dataframe which includes only date time columns
df2 = df.select(ts_cols)

In [11]:
df2.dtypes

[('Incident Date', 'string'),
 ('Alarm DtTm', 'string'),
 ('Arrival DtTm', 'string'),
 ('Close DtTm', 'string')]

In [12]:
# First we need date and timestamp columns

In [12]:
df3 = df2.withColumn("Incident Date", 
                     F.to_date(F.col("Incident Date"), 'MM/dd/yyyy')) \
.withColumn("Alarm DtTm", 
                     F.to_timestamp(F.col("Alarm DtTm"), 'MM/dd/yyyy hh:mm:ss a'))
df3.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm
0,2018-06-05,2018-06-05 18:38:01,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM
1,2019-08-29,2019-08-29 20:09:25,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM
2,2018-06-14,2018-06-14 20:37:56,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM


In [13]:
df3.printSchema()

root
 |-- Incident Date: date (nullable = true)
 |-- Alarm DtTm: timestamp (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)



## Get year

In [14]:
df4 = df3.withColumn("Alarm_DtTm_Year", 
                     F.year(F.col("Alarm DtTm")))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm_Year
0,2018-06-05,2018-06-05 18:38:01,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,2018
1,2019-08-29,2019-08-29 20:09:25,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,2019
2,2018-06-14,2018-06-14 20:37:56,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,2018


In [16]:
df3.select(F.year("Alarm DtTm").alias("Alarm_DtTm_Year")).show(3)

+---------------+
|Alarm_DtTm_Year|
+---------------+
|           2018|
|           2019|
|           2018|
+---------------+
only showing top 3 rows



## Get month

In [17]:
df4 = df3.withColumn("Alarm_DtTm_Month", 
                     F.month(F.col("Alarm DtTm")))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm_Month
0,2018-06-05,2018-06-05 18:38:01,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,6
1,2019-08-29,2019-08-29 20:09:25,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,8
2,2018-06-14,2018-06-14 20:37:56,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,6


In [18]:
df3.select(F.month("Alarm DtTm")).show(3)

+-----------------+
|month(Alarm DtTm)|
+-----------------+
|                6|
|                8|
|                6|
+-----------------+
only showing top 3 rows



## Get week of year

In [19]:
df4 = df3.withColumn("Alarm_DtTm_Week", 
                     F.weekofyear(F.col("Alarm DtTm")))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm_Week
0,2018-06-05,2018-06-05 18:38:01,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,23
1,2019-08-29,2019-08-29 20:09:25,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,35
2,2018-06-14,2018-06-14 20:37:56,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,24


In [20]:
df3.select(F.weekofyear("Alarm DtTm")).show(3)

+----------------------+
|weekofyear(Alarm DtTm)|
+----------------------+
|                    23|
|                    35|
|                    24|
+----------------------+
only showing top 3 rows



## Get day of month

In [20]:
df4 = df3.withColumn("Alarm_DtTm_DayofMonth", 
                     F.dayofmonth(F.col("Alarm DtTm")))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm_DayofMonth
0,2018-06-05,2018-06-05 18:38:01,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,5
1,2019-08-29,2019-08-29 20:09:25,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,29
2,2018-06-14,2018-06-14 20:37:56,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,14


## Get day of week

In [21]:
df4 = df3.withColumn("Alarm_DtTm_DayofWeek", 
                     F.dayofweek(F.col("Alarm DtTm")))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm_DayofWeek
0,2018-06-05,2018-06-05 18:38:01,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,3
1,2019-08-29,2019-08-29 20:09:25,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,5
2,2018-06-14,2018-06-14 20:37:56,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,5


# Get day name

In [22]:
df4 = df3.withColumn("Alarm_DtTm_DayName", 
                     F.date_format(F.col("Alarm DtTm"), 'E'))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Arrival DtTm,Close DtTm,Alarm_DtTm_DayName
0,2018-06-05,2018-06-05 18:38:01,06/05/2018 06:41:59 PM,06/05/2018 06:42:12 PM,Tue
1,2019-08-29,2019-08-29 20:09:25,08/29/2019 08:11:54 PM,08/29/2019 08:12:24 PM,Thu
2,2018-06-14,2018-06-14 20:37:56,06/14/2018 08:40:37 PM,06/14/2018 08:40:52 PM,Thu


# Get day names other than English

In [23]:
from pyspark.sql.types import StringType

In [24]:
# write a function that converts day number to day name
def switch_tr_day(day_index):
    my_dict = {
        1: 'Pazartesi',
        2: 'Salı',
        3: 'Çarşamba',
        4: 'Perşembe',
        5: 'Cuma',
        6: 'Cumartesi',
        7: 'Pazar'
    }
    
    return my_dict.get(day_index)

In [26]:
switch_tr_day(3)

'Çarşamba'

In [27]:
switch_day_func = F.udf(lambda z: switch_tr_day(z), StringType())

In [28]:
# Register your function


spark.udf.register("switch_day_func", switch_day_func)

<function __main__.<lambda>(z)>

In [29]:
# Use your registered function

df5 = df4 \
.withColumn("Alarm_DtTm_DayofWeek", F.dayofweek(F.col("Alarm DtTm")))


df5.select(switch_day_func('Alarm_DtTm_DayofWeek')).limit(5).toPandas()

Unnamed: 0,<lambda>(Alarm_DtTm_DayofWeek)
0,Çarşamba
1,Cuma
2,Cuma
3,Cumartesi
4,Cuma
