#### Importing the necessary libraries and dependencies

In [8]:
!pip install pandas

Collecting pandas
  Using cached pandas-2.2.3-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl.metadata (59 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp39-cp39-win_amd64.whl (11.6 MB)
Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl (15.9 MB)
Using cached pytz-2025.1-py2.py3-none-any.whl (507 kB)
Using cached tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.0.2 pandas-2.2.3 pytz-2025.1 tzdata-2025.1


In [9]:
#Import libraries
import pandas as pd 
from sqlalchemy import create_engine

# Import PySpark libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, to_date

# Build the Spark session for this ETL job; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("NYC Payroll ETL")
    .getOrCreate()
)


In [10]:
# Evaluate the session object as the cell's last expression so the notebook renders it
spark

### Data Extraction

In [None]:
# Extract: read each master CSV into a Spark DataFrame. The first row supplies the
# column names (header=True) and column types are inferred from the data (inferSchema=True).
# Every path is relative to the notebook's working directory, so the notebook does not
# depend on one user's absolute folder layout.
AgencyMaster_df = spark.read.csv(r'datasets\AgencyMaster.csv', header=True, inferSchema=True)
EmpMaster_df = spark.read.csv(r'datasets\EmpMaster.csv', header=True, inferSchema=True)
TitleMaster_df = spark.read.csv(r'datasets\TitleMaster.csv', header=True, inferSchema=True)


In [None]:
# Preview the first five rows of each master table, in load order
for master_df in (AgencyMaster_df, EmpMaster_df, TitleMaster_df):
    master_df.show(5)


+--------+--------------------+
|AgencyID|          AgencyName|
+--------+--------------------+
|    2001|ADMIN FOR CHILDRE...|
|    2002|ADMIN TRIALS AND ...|
|    2003| BOARD OF CORRECTION|
|    2004|   BOARD OF ELECTION|
|    2005|BOARD OF ELECTION...|
+--------+--------------------+
only showing top 5 rows

+----------+--------+---------+
|EmployeeID|LastName|FirstName|
+----------+--------+---------+
|    100001|  AACHEN|    DAVID|
|    100002|  AACHEN|   MONICA|
|    100003|  AADAMS|  LAMMELL|
|    100004|   AADIL|     IRIS|
|    100005|  AALAAM|     AMIR|
+----------+--------+---------+
only showing top 5 rows

+---------+--------------------+
|TitleCode|    TitleDescription|
+---------+--------------------+
|    40001|*ADM SCHOOL SECUR...|
|    40002|*ADMIN SCHL SECUR...|
|    40003|    *AGENCY ATTORNEY|
|    40004|*ASSISTANT ADVOCA...|
|    40005|*ASSOCIATE EDUCAT...|
+---------+--------------------+
only showing top 5 rows



In [None]:
# Print the inferred schema of each master table to check the column data types
for master_df in (AgencyMaster_df, EmpMaster_df, TitleMaster_df):
    master_df.printSchema()

root
 |-- AgencyID: integer (nullable = true)
 |-- AgencyName: string (nullable = true)

root
 |-- EmployeeID: integer (nullable = true)
 |-- LastName: string (nullable = true)
 |-- FirstName: string (nullable = true)

root
 |-- TitleCode: integer (nullable = true)
 |-- TitleDescription: string (nullable = true)



In [None]:
# Count the null entries in every AgencyMaster column.
from pyspark.sql.functions import col, count, when

# when(...) without otherwise() yields null for rows that do not match, and count()
# skips nulls, so each expression counts only the rows where the column is null.
agency_null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in AgencyMaster_df.columns
]
AgencyMaster_df.select(agency_null_counts).show()


+--------+----------+
|AgencyID|AgencyName|
+--------+----------+
|       0|         0|
+--------+----------+



In [None]:
# Count the null entries in every EmpMaster column
# (count() only counts the rows for which when() produced a non-null value).
emp_null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in EmpMaster_df.columns
]
EmpMaster_df.select(emp_null_counts).show()

+----------+--------+---------+
|EmployeeID|LastName|FirstName|
+----------+--------+---------+
|         0|       0|        0|
+----------+--------+---------+



In [None]:
# Count the null entries in every TitleMaster column
# (count() only counts the rows for which when() produced a non-null value).
title_null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in TitleMaster_df.columns
]
TitleMaster_df.select(title_null_counts).show()

+---------+----------------+
|TitleCode|TitleDescription|
+---------+----------------+
|        0|               1|
+---------+----------------+



In [None]:
# Drop the rows (not the column) whose TitleDescription is null
TitleMaster_df = TitleMaster_df.na.drop(subset=["TitleDescription"])


In [34]:
# Recount the nulls per TitleMaster column to check the current state of the frame
title_null_recheck = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in TitleMaster_df.columns
]
TitleMaster_df.select(title_null_recheck).show()

+---------+----------------+
|TitleCode|TitleDescription|
+---------+----------------+
|        0|               0|
+---------+----------------+



In [21]:
# Extract: read the 2020 and 2021 payroll CSVs into Spark DataFrames.
# Both reads share the same options: first row as header, column types inferred.
payroll_csv_options = {"header": True, "inferSchema": True}

# The paths stay raw strings so the "\n" in "datasets\nycpayroll" is not read as a newline.
nycpayroll2020_df = spark.read.csv(r'datasets\nycpayroll_2020.csv', **payroll_csv_options)
nycpayroll2021_df = spark.read.csv(r'datasets\nycpayroll_2021.csv', **payroll_csv_options)

In [22]:
# Preview the first five rows of each payroll year
for payroll_df in (nycpayroll2020_df, nycpayroll2021_df):
    payroll_df.show(5)

+----------+-------------+--------+--------------------+----------+----------+---------+---------------+-------------------+---------+--------------------+---------------------+----------+---------+------------+----------------+-------+-----------+-------------+
|FiscalYear|PayrollNumber|AgencyID|          AgencyName|EmployeeID|  LastName|FirstName|AgencyStartDate|WorkLocationBorough|TitleCode|    TitleDescription|LeaveStatusasofJune30|BaseSalary| PayBasis|RegularHours|RegularGrossPaid|OTHours|TotalOTPaid|TotalOtherPay|
+----------+-------------+--------+--------------------+----------+----------+---------+---------------+-------------------+---------+--------------------+---------------------+----------+---------+------------+----------------+-------+-----------+-------------+
|      2020|           17|    2120|OFFICE OF EMERGEN...|     10001|    GEAGER| VERONICA|      9/12/2016|           BROOKLYN|    40447|EMERGENCY PREPARE...|               ACTIVE|   86005.0|per Annum|      1820.0|