In [15]:
#Import necessary libraries
import pandas as pd
import numpy as np

#### Data Extraction

In [2]:
nyc2020_df = pd.read_csv(r"datasets\nycpayroll_2020.csv")
nyc2021_df = pd.read_csv(r"datasets\nycpayroll_2021.csv")

In [16]:
nyc2020_df.head(2)
nyc2020_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   FiscalYear             100 non-null    int64  
 1   PayrollNumber          100 non-null    int64  
 2   AgencyID               100 non-null    int64  
 3   AgencyName             100 non-null    object 
 4   EmployeeID             100 non-null    int64  
 5   LastName               100 non-null    object 
 6   FirstName              100 non-null    object 
 7   AgencyStartDate        100 non-null    object 
 8   WorkLocationBorough    100 non-null    object 
 9   TitleCode              100 non-null    int64  
 10  TitleDescription       100 non-null    object 
 11  LeaveStatusasofJune30  100 non-null    object 
 12  BaseSalary             100 non-null    float64
 13  PayBasis               100 non-null    object 
 14  RegularHours           100 non-null    float64
 15  Regular

In [17]:
nyc2021_df.head(3)
nyc2021_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   FiscalYear             101 non-null    int64  
 1   PayrollNumber          101 non-null    int64  
 2   AgencyCode             101 non-null    int64  
 3   AgencyName             101 non-null    object 
 4   EmployeeID             101 non-null    int64  
 5   LastName               101 non-null    object 
 6   FirstName              101 non-null    object 
 7   AgencyStartDate        101 non-null    object 
 8   WorkLocationBorough    101 non-null    object 
 9   TitleCode              101 non-null    int64  
 10  TitleDescription       101 non-null    object 
 11  LeaveStatusasofJune30  101 non-null    object 
 12  BaseSalary             101 non-null    float64
 13  PayBasis               101 non-null    object 
 14  RegularHours           101 non-null    int64  
 15  Regula

In [21]:
#print the numbers of columns in each dataset
print("NYC2 020 Columns:", nyc2020_df.columns)
print("NYC 2021 Columns:", nyc2021_df.columns)

NYC2 020 Columns: Index(['FiscalYear', 'PayrollNumber', 'AgencyID', 'AgencyName', 'EmployeeID',
       'LastName', 'FirstName', 'AgencyStartDate', 'WorkLocationBorough',
       'TitleCode', 'TitleDescription', 'LeaveStatusasofJune30', 'BaseSalary',
       'PayBasis', 'RegularHours', 'RegularGrossPaid', 'OTHours',
       'TotalOTPaid', 'TotalOtherPay'],
      dtype='object')
NYC 2021 Columns: Index(['FiscalYear', 'PayrollNumber', 'AgencyCode', 'AgencyName', 'EmployeeID',
       'LastName', 'FirstName', 'AgencyStartDate', 'WorkLocationBorough',
       'TitleCode', 'TitleDescription', 'LeaveStatusasofJune30', 'BaseSalary',
       'PayBasis', 'RegularHours', 'RegularGrossPaid', 'OTHours',
       'TotalOTPaid', 'TotalOtherPay'],
      dtype='object')


In [22]:
#Checking id the columns mismatch and merging both into a dataset
# Check for column mismatches
set_2020 = set(nyc2020_df.columns)
set_2021 = set(nyc2021_df.columns)

if set_2020 != set_2021:
    print("Columns do not match! Adjust before merging.")
    print("Extra columns in 2020:", set_2020 - set_2021)
    print("Extra columns in 2021:", set_2021 - set_2020)
else:
    print("Columns match, ready to merge.")

# Merge datasets
Mergedpayroll_df = pd.concat([nyc2020_df, nyc2021_df], ignore_index=True)
print(f"Combined dataset has {Mergedpayroll_df.shape[0]} rows and {Mergedpayroll_df.shape[1]} columns.")

Columns do not match! Adjust before merging.
Extra columns in 2020: {'AgencyID'}
Extra columns in 2021: {'AgencyCode'}
Combined dataset has 201 rows and 20 columns.


In [23]:
# Ensure both dataframes have the same columns
all_columns = set(nyc2020_df.columns).union(set(nyc2021_df.columns))

# Adding missing columns to each dataframe with NaN values
for col in all_columns:
    if col not in nyc2020_df.columns:
        nyc2020_df[col] = None  # Add missing column in 2020
    if col not in nyc2021_df.columns:
        nyc2021_df[col] = None  # Add missing column in 2021

# Reorder columns to match before merging
nyc2020_df = nyc2020_df[sorted(all_columns)]
nyc2021_df = nyc2021_df[sorted(all_columns)]

# Merge datasets
Mergedpayroll_df = pd.concat([nyc2020_df, nyc2021_df], ignore_index=True)

# Confirm the merge
print(f"✅ Merged dataset has {Mergedpayroll_df.shape[0]} rows and {Mergedpayroll_df.shape[1]} columns.")


✅ Merged dataset has 201 rows and 20 columns.


In [24]:
Mergedpayroll_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   AgencyCode             101 non-null    object 
 1   AgencyID               100 non-null    object 
 2   AgencyName             201 non-null    object 
 3   AgencyStartDate        201 non-null    object 
 4   BaseSalary             201 non-null    float64
 5   EmployeeID             201 non-null    int64  
 6   FirstName              201 non-null    object 
 7   FiscalYear             201 non-null    int64  
 8   LastName               201 non-null    object 
 9   LeaveStatusasofJune30  201 non-null    object 
 10  OTHours                201 non-null    float64
 11  PayBasis               201 non-null    object 
 12  PayrollNumber          201 non-null    int64  
 13  RegularGrossPaid       201 non-null    float64
 14  RegularHours           201 non-null    float64
 15  TitleC

#### Data Cleaning

In [None]:
#Filling two columns 'AgencyID' and 'AgencyCode' having  null values for a clean dataset
Mergedpayroll_df[['AgencyID', 'AgencyCode']] = Mergedpayroll_df[['AgencyID', 'AgencyCode']].fillna(0.0)


In [None]:
# verifyimg all dataset are clean with no missing values
Mergedpayroll_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   AgencyCode             201 non-null    float64
 1   AgencyID               201 non-null    float64
 2   AgencyName             201 non-null    object 
 3   AgencyStartDate        201 non-null    object 
 4   BaseSalary             201 non-null    float64
 5   EmployeeID             201 non-null    int64  
 6   FirstName              201 non-null    object 
 7   FiscalYear             201 non-null    int64  
 8   LastName               201 non-null    object 
 9   LeaveStatusasofJune30  201 non-null    object 
 10  OTHours                201 non-null    float64
 11  PayBasis               201 non-null    object 
 12  PayrollNumber          201 non-null    int64  
 13  RegularGrossPaid       201 non-null    float64
 14  RegularHours           201 non-null    float64
 15  TitleC

In [28]:
Mergedpayroll_df = Mergedpayroll_df.drop_duplicates()

In [30]:
Mergedpayroll_df['AgencyStartDate'] = pd.to_datetime(Mergedpayroll_df['AgencyStartDate'])

In [32]:
Mergedpayroll_df.info()
Mergedpayroll_df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   AgencyCode             201 non-null    float64       
 1   AgencyID               201 non-null    float64       
 2   AgencyName             201 non-null    object        
 3   AgencyStartDate        201 non-null    datetime64[ns]
 4   BaseSalary             201 non-null    float64       
 5   EmployeeID             201 non-null    int64         
 6   FirstName              201 non-null    object        
 7   FiscalYear             201 non-null    int64         
 8   LastName               201 non-null    object        
 9   LeaveStatusasofJune30  201 non-null    object        
 10  OTHours                201 non-null    float64       
 11  PayBasis               201 non-null    object        
 12  PayrollNumber          201 non-null    int64         
 13  Regul

(201, 20)

In [40]:
#Agency table
agency = Mergedpayroll_df[['AgencyID','AgencyName','AgencyCode','AgencyStartDate']].copy().drop_duplicates().reset_index(drop=True)

agency.head(2)

Unnamed: 0,AgencyID,AgencyName,AgencyCode,AgencyStartDate
0,2120.0,OFFICE OF EMERGENCY MANAGEMENT,0.0,2016-09-12
1,2120.0,OFFICE OF EMERGENCY MANAGEMENT,0.0,2013-09-16


In [37]:
agency.shape

(184, 5)

In [38]:
#employee table
employee = Mergedpayroll_df[['EmployeeID','LastName', 'FirstName','TitleCode','TitleDescription']].copy().drop_duplicates().reset_index()

employee.head(2)

Unnamed: 0,index,EmployeeID,LastName,FirstName,TitleCode,TitleDescription
0,0,10001,GEAGER,VERONICA,40447,EMERGENCY PREPAREDNESS MANAGER
1,1,149612,ROTTA,JONATHAN,40447,EMERGENCY PREPAREDNESS MANAGER


In [None]:
#facts table
facts_table = Mergedpayroll_df[['EmployeeID','LastName', 'FirstName','TitleCode','TitleDescription']].copy().drop_duplicates().reset_index()

employee.head(2)

(200, 6)