# People Analytics Case Study

## 1. Libraries

In [616]:
# import libraries
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import getpass

## 2. Data Overview

In [617]:
# load data
df = pd.read_csv('../data/raw/hr_data.csv')

In [618]:
# View the DataFrame
df

Unnamed: 0,id,first_name,last_name,birthdate,gender,race,department,jobtitle,location,hire_date,termdate,location_city,location_state
0,00-0037846,Kimmy,Walczynski,6/4/1991,Male,Hispanic or Latino,Engineering,Programmer Analyst I,Headquarters,1/20/2002,,Cleveland,Ohio
1,00-0041533,Ignatius,Springett,6/29/1984,Male,White,Business Development,Business Analyst,Headquarters,4/8/2019,,Cleveland,Ohio
2,00-0045747,Corbie,Bittlestone,7/29/1989,Male,Black or African American,Sales,Solutions Engineer Manager,Headquarters,10/12/2010,,Cleveland,Ohio
3,00-0055274,Baxy,Matton,9/14/1982,Female,White,Services,Service Tech,Headquarters,4/10/2005,,Cleveland,Ohio
4,00-0076100,Terrell,Suff,4/11/1994,Female,Two or More Races,Product Management,Business Analyst,Remote,9/29/2010,2029-10-29 06:09:38 UTC,Flint,Michigan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37403,,,,,,,,,,,,,
37404,,,,,,,,,,,,,
37405,,,,,,,,,,,,,
37406,,,,,,,,,,,,,


In [619]:
# General information about the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37408 entries, 0 to 37407
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              22214 non-null  object
 1   first_name      22214 non-null  object
 2   last_name       22214 non-null  object
 3   birthdate       22214 non-null  object
 4   gender          22214 non-null  object
 5   race            22214 non-null  object
 6   department      22214 non-null  object
 7   jobtitle        22267 non-null  object
 8   location        22214 non-null  object
 9   hire_date       22214 non-null  object
 10  termdate        3929 non-null   object
 11  location_city   22214 non-null  object
 12  location_state  22214 non-null  object
dtypes: object(13)
memory usage: 3.7+ MB


## 3. Data Cleaning

In [620]:
# Rename columns in the DataFrame
df = df.rename(columns={
    'birthdate': 'birth_date',
    'jobtitle': 'job_title',
    'termdate': 'term_date',
    'id': 'emp_id'
})

In [621]:
# Remove rows where all values are NaN (empty)
df = df.dropna(how='all')

In [622]:
df

Unnamed: 0,emp_id,first_name,last_name,birth_date,gender,race,department,job_title,location,hire_date,term_date,location_city,location_state
0,00-0037846,Kimmy,Walczynski,6/4/1991,Male,Hispanic or Latino,Engineering,Programmer Analyst I,Headquarters,1/20/2002,,Cleveland,Ohio
1,00-0041533,Ignatius,Springett,6/29/1984,Male,White,Business Development,Business Analyst,Headquarters,4/8/2019,,Cleveland,Ohio
2,00-0045747,Corbie,Bittlestone,7/29/1989,Male,Black or African American,Sales,Solutions Engineer Manager,Headquarters,10/12/2010,,Cleveland,Ohio
3,00-0055274,Baxy,Matton,9/14/1982,Female,White,Services,Service Tech,Headquarters,4/10/2005,,Cleveland,Ohio
4,00-0076100,Terrell,Suff,4/11/1994,Female,Two or More Races,Product Management,Business Analyst,Remote,9/29/2010,2029-10-29 06:09:38 UTC,Flint,Michigan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22433,,,,,,,,Support Staff III,,,,,
22434,,,,,,,,Support Staff III,,,,,
22435,,,,,,,,Support Staff III,,,,,
22436,,,,,,,,Support Staff III,,,,,


In [623]:
# Check for duplicated rows
df.duplicated().sum()

50

In [624]:
# Show duplicated rows
df[df.duplicated()]

Unnamed: 0,emp_id,first_name,last_name,birth_date,gender,race,department,job_title,location,hire_date,term_date,location_city,location_state
22322,,,,,,,,Support Staff II,,,,,
22323,,,,,,,,Support Staff II,,,,,
22324,,,,,,,,Support Staff II,,,,,
22325,,,,,,,,Support Staff II,,,,,
22326,,,,,,,,Support Staff II,,,,,
22327,,,,,,,,Support Staff II,,,,,
22328,,,,,,,,Support Staff II,,,,,
22329,,,,,,,,Support Staff II,,,,,
22330,,,,,,,,Support Staff II,,,,,
22331,,,,,,,,Support Staff II,,,,,


In [625]:
# Remove rows where all columns except 'jobtitle' are NaN
df = df.dropna(how='all', subset=df.columns.difference(['job_title']))

In [626]:
# Count the occurrences of each value in the 'id' column
duplicate_counts = df['emp_id'].value_counts()

# Filter the counts to show only duplicates
duplicates = duplicate_counts[duplicate_counts > 1]

# Show the duplicated ids and their counts
print(duplicates)

Series([], Name: count, dtype: int64)


In [627]:
# Dropping the 'location_city' column as it's not relevant for the analysis
df.drop(columns=['location_city'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['location_city'], inplace=True)


In [628]:
# Count unique values in 'location_state'
df['location_state'].value_counts()

location_state
Ohio            18025
Pennsylvania     1115
Illinois          868
Indiana           700
Michigan          673
Kentucky          451
Wisconsin         382
Name: count, dtype: int64

In [629]:
# Mapping states to California and neighboring states for storytelling purposes.
# The original dataset is fictitious, allowing for creative flexibility in the analysis.

state_mapping = {
    'Ohio': 'California',
    'Pennsylvania': 'Oregon',
    'Illinois': 'Nevada',
    'Indiana': 'Arizona',
    'Michigan': 'Utah',
    'Kentucky': 'New Mexico',
    'Wisconsin': 'Idaho'
}

# Apply the mapping to the 'location_state' column
df['location_state'] = df['location_state'].replace(state_mapping)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['location_state'] = df['location_state'].replace(state_mapping)


In [630]:
# Count unique values in 'location_state'
df['location_state'].value_counts()

location_state
California    18025
Oregon         1115
Nevada          868
Arizona         700
Utah            673
New Mexico      451
Idaho           382
Name: count, dtype: int64

In [631]:
df.dtypes

emp_id            object
first_name        object
last_name         object
birth_date        object
gender            object
race              object
department        object
job_title         object
location          object
hire_date         object
term_date         object
location_state    object
dtype: object

In [632]:
# Count the number of valid (non-null) entries in 'birth_date', 'hire_date', and 'term_date'
valid_counts = df[['birth_date', 'hire_date', 'term_date']].count()

# Print the counts of valid entries for each column
print(valid_counts)

birth_date    22214
hire_date     22214
term_date      3929
dtype: int64


In [633]:
# Convert 'birth_date' and 'hire_date' to datetime format
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')
df['hire_date'] = pd.to_datetime(df['hire_date'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hire_date'] = pd.to_datetime(df['hire_date'], errors='coerce')


In [634]:
# Convert the 'term_date' column from string to datetime format and extract only the date
df['term_date'] = pd.to_datetime(df['term_date']).dt.date

# Convert the 'term_date' column back to datetime format with default time (00:00:00)
df['term_date'] = pd.to_datetime(df['term_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['term_date'] = pd.to_datetime(df['term_date']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['term_date'] = pd.to_datetime(df['term_date'])


In [635]:
# Count the number of valid (non-null) entries in 'birth_date', 'hire_date', and 'term_date'
valid_counts = df[['birth_date', 'hire_date', 'term_date']].count()

# Print the counts of valid entries for each column
print(valid_counts)

birth_date    22214
hire_date     22214
term_date      3929
dtype: int64


In [636]:
# General information about the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22214 entries, 0 to 22213
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   emp_id          22214 non-null  object        
 1   first_name      22214 non-null  object        
 2   last_name       22214 non-null  object        
 3   birth_date      22214 non-null  datetime64[ns]
 4   gender          22214 non-null  object        
 5   race            22214 non-null  object        
 6   department      22214 non-null  object        
 7   job_title       22214 non-null  object        
 8   location        22214 non-null  object        
 9   hire_date       22214 non-null  datetime64[ns]
 10  term_date       3929 non-null   datetime64[ns]
 11  location_state  22214 non-null  object        
dtypes: datetime64[ns](3), object(9)
memory usage: 2.2+ MB


In [637]:
# Create a dictionary to store the min and max values for clarity
date_ranges = {
    'Birth Date': {'Min': df['birth_date'].min(), 'Max': df['birth_date'].max()},
    'Hire Date': {'Min': df['hire_date'].min(), 'Max': df['hire_date'].max()},
    'Term Date': {'Min': df['term_date'].min(), 'Max': df['term_date'].max()}
}

# Convert to a DataFrame for a cleaner output
date_ranges_df = pd.DataFrame(date_ranges)

# Show the date ranges
date_ranges_df

Unnamed: 0,Birth Date,Hire Date,Term Date
Min,1965-10-16,2000-10-17,2001-04-15
Max,2002-09-13,2020-12-13,2041-10-29


In [638]:
# Get the current timestamp
current_timestamp = pd.Timestamp('now')

# Filter rows where 'term_date' is greater than current date
term_date_invalid = df[df['term_date'] > current_timestamp]

# Show the filtered rows
term_date_invalid

Unnamed: 0,emp_id,first_name,last_name,birth_date,gender,race,department,job_title,location,hire_date,term_date,location_state
4,00-0076100,Terrell,Suff,1994-04-11,Female,Two or More Races,Product Management,Business Analyst,Remote,2010-09-29,2029-10-29,Utah
27,00-1268049,Fay,Monnelly,1966-07-09,Male,Native Hawaiian or Other Pacific Islander,Engineering,Software Engineer I,Headquarters,2010-02-24,2030-03-21,California
40,00-1792130,Nobe,Leathe,1993-07-23,Male,Black or African American,Engineering,Developer III,Headquarters,2011-01-23,2024-12-07,California
57,00-2623755,Chrysa,Brownell,1983-04-25,Male,White,Engineering,Administrative Officer,Headquarters,2018-02-22,2027-02-01,California
139,00-6479395,Aura,Steagall,1978-07-19,Male,White,Accounting,Staff Accountant I,Headquarters,2013-03-28,2030-02-23,California
...,...,...,...,...,...,...,...,...,...,...,...,...
22038,99-1005402,Cornela,Livermore,1969-10-14,Female,Two or More Races,Engineering,Software Test Engineer I,Remote,2013-06-02,2030-01-30,Nevada
22048,99-1707394,Patrick,Musicka,1976-04-23,Female,Two or More Races,Human Resources,Senior Recruiter,Headquarters,2016-11-25,2026-08-01,California
22083,99-3706255,Nappy,Burchess,1999-10-22,Male,Native Hawaiian or Other Pacific Islander,Accounting,Budget/Accounting Analyst II,Headquarters,2017-12-31,2035-08-20,California
22095,99-4396036,Flory,Hardy-Piggin,1989-03-28,Male,Black or African American,Accounting,Administrative Officer,Remote,2019-03-26,2027-03-03,Arizona


In [639]:
# Define the cutoff date as 31st December 2020
cutoff_date = pd.Timestamp('2020-12-31')

# Drop rows where 'term_date' is greater than the cutoff date
df = df.drop(df[df['term_date'] > cutoff_date].index)

# Check the number of rows after the removal
remaining_rows = len(df)
remaining_rows

20183

In [640]:
# Define a timedelta of 3 years using pd.DateOffset
# We are adding 3 years to the existing dates for storytelling purposes, so that events that happened in the past
# (such as 2020) will now appear as if they occurred in 2023. This gives the dataset a more current context.
three_years = pd.DateOffset(years=3)

# Add 3 years to each of the date columns ('birth_date', 'hire_date', 'term_date')
# This shifts all dates forward by 3 years, making the dataset feel more relevant to the current time period.
df['birth_date'] = df['birth_date'] + three_years
df['hire_date'] = df['hire_date'] + three_years
df['term_date'] = df['term_date'] + three_years

In [654]:
#BORRAAAAAAAAAAAAAAARRRRRRRRRRRRRRR

df.sort_values(by='hire_date', ascending=False).head(50)

Unnamed: 0,emp_id,first_name,last_name,birth_date,age,gender,race,department,job_title,location,hire_date,term_date,location_state
9802,44-1463409,Jehu,Caroll,1987-06-11,37,Female,Two or More Races,Legal,Executive Assistant,Headquarters,2023-12-13,NaT,California
13278,59-7815002,Kennith,Marjoribanks,1989-04-16,35,Male,Black or African American,Human Resources,HR Manager,Headquarters,2023-12-13,NaT,California
18867,84-5815265,Nanette,Kirgan,2003-04-18,21,Male,White,Business Development,Business Analyst,Remote,2023-12-13,NaT,California
2467,11-2348367,Reese,Nattrass,1992-07-19,32,Male,Black or African American,Engineering,Programmer Analyst II,Headquarters,2023-12-12,NaT,California
6956,31-4715846,Tucky,Crosby,1981-10-14,43,Male,Black or African American,Engineering,Data Visualization Specialist,Headquarters,2023-12-11,NaT,California
17024,76-3309194,Jakie,Jent,1990-09-08,34,Female,Asian,Accounting,Financial Analyst,Headquarters,2023-12-10,NaT,California
2543,11-5197257,Laurens,Law,1977-04-07,47,Male,Two or More Races,Accounting,Budget/Accounting Analyst IV,Headquarters,2023-12-10,NaT,California
19801,89-0545531,Quent,Taffs,1983-08-09,41,Male,Two or More Races,Support,Help Desk Operator,Headquarters,2023-12-09,NaT,California
13440,60-5239423,Angelia,Vlasov,1999-05-31,25,Female,Hispanic or Latino,Services,Service Coordinator,Headquarters,2023-12-09,NaT,California
18783,84-2343788,Aveline,Coffin,1988-10-16,36,Non-Conforming,White,Engineering,Programmer Analyst IV,Headquarters,2023-12-09,NaT,California


In [642]:
# Get distinct values in the 'gender' column
distinct_gender = df['gender'].unique()

# Show the unique values
distinct_gender

array(['Male', 'Female', 'Non-Conforming'], dtype=object)

In [643]:
# Get distinct values in the 'race' column
distinct_race = df['race'].unique()

# Show the unique values
distinct_race

array(['Hispanic or Latino', 'White', 'Black or African American',
       'Asian', 'Two or More Races', 'American Indian or Alaska Native',
       'Native Hawaiian or Other Pacific Islander'], dtype=object)

In [644]:
# Check for empty values in 'race' and 'gender' columns
empty_values = df[df['race'].isnull() | df['gender'].isnull()]

# Show the rows with empty values
empty_values

Unnamed: 0,emp_id,first_name,last_name,birth_date,gender,race,department,job_title,location,hire_date,term_date,location_state


In [645]:
# Check for null values in all columns
null_counts = df.isnull().sum()

# Show columns with null values
null_counts[null_counts > 0]

term_date    18285
dtype: int64

In [646]:
# Calculate age based on 'birth_date'
current_year = pd.Timestamp('now').year
age = current_year - df['birth_date'].dt.year

# Insert the 'age' column at index 4 (fifth position)
df.insert(4, 'age', age)

# Display the DataFrame with the new 'age' column
df

Unnamed: 0,emp_id,first_name,last_name,birth_date,age,gender,race,department,job_title,location,hire_date,term_date,location_state
0,00-0037846,Kimmy,Walczynski,1994-06-04,30,Male,Hispanic or Latino,Engineering,Programmer Analyst I,Headquarters,2005-01-20,NaT,California
1,00-0041533,Ignatius,Springett,1987-06-29,37,Male,White,Business Development,Business Analyst,Headquarters,2022-04-08,NaT,California
2,00-0045747,Corbie,Bittlestone,1992-07-29,32,Male,Black or African American,Sales,Solutions Engineer Manager,Headquarters,2013-10-12,NaT,California
3,00-0055274,Baxy,Matton,1985-09-14,39,Female,White,Services,Service Tech,Headquarters,2008-04-10,NaT,California
5,00-0116166,Kacie,Offiler,1974-01-18,50,Male,Asian,Engineering,Developer III,Headquarters,2021-09-01,NaT,California
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22209,99-9797418,Dorella,Garvan,2001-07-08,23,Female,Hispanic or Latino,Research and Development,Research Assistant I,Headquarters,2015-02-08,NaT,California
22210,99-9869877,Dasie,Thorsby,2004-04-19,20,Female,Two or More Races,Services,Service Manager,Headquarters,2020-10-06,NaT,California
22211,99-9919822,Nerty,Wilding,1973-02-09,51,Female,Two or More Races,Training,Junior Trainer,Headquarters,2004-02-08,NaT,California
22212,99-9960380,Mabelle,Dawks,1988-09-02,36,Male,Two or More Races,Accounting,Staff Accountant I,Headquarters,2008-04-03,2015-12-10,California


In [647]:
# Calculate min, avg, and max for the 'age' column
min_age = df['age'].min()
avg_age = df['age'].mean()
max_age = df['age'].max()

# Display the results
age_statistics = {
    'min_age': min_age,
    'avg_age': avg_age,
    'max_age': max_age
}

# Count the number of rows where age is less than 18
count_under_18 = df[df['age'] < 18].shape[0]

# Display the count
age_statistics, count_under_18


({'min_age': 19, 'avg_age': 37.20091165832631, 'max_age': 56}, 0)

In [648]:
df.sort_values(by='term_date', ascending=True)

Unnamed: 0,emp_id,first_name,last_name,birth_date,age,gender,race,department,job_title,location,hire_date,term_date,location_state
14374,64-6403747,Briggs,Eastway,1971-03-13,53,Female,American Indian or Alaska Native,Business Development,Research Assistant II,Headquarters,2003-12-03,2004-04-15,California
19423,87-3003781,Menard,Bril,1992-09-25,32,Female,American Indian or Alaska Native,Services,Service Manager,Headquarters,2004-08-28,2004-10-06,California
8835,39-8448824,Keir,Dechelle,2005-07-22,19,Male,White,Sales,Relationshiop Manager,Headquarters,2004-05-31,2004-10-24,California
6854,31-0270355,Joey,Agney,1980-03-08,44,Female,Black or African American,Support,Support Staff,Headquarters,2004-09-22,2004-11-11,California
4049,18-3127889,Faydra,Matushevich,1999-12-29,25,Male,Two or More Races,Human Resources,Human Resources Analyst,Headquarters,2004-04-29,2004-11-30,California
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22208,99-9610988,Hillie,Renbold,1968-10-30,56,Female,Two or More Races,Engineering,Web Developer II,Headquarters,2004-12-03,NaT,California
22209,99-9797418,Dorella,Garvan,2001-07-08,23,Female,Hispanic or Latino,Research and Development,Research Assistant I,Headquarters,2015-02-08,NaT,California
22210,99-9869877,Dasie,Thorsby,2004-04-19,20,Female,Two or More Races,Services,Service Manager,Headquarters,2020-10-06,NaT,California
22211,99-9919822,Nerty,Wilding,1973-02-09,51,Female,Two or More Races,Training,Junior Trainer,Headquarters,2004-02-08,NaT,California


In [649]:
# Filter and count rows where the year in 'hire_date' is 2023
hire_date_2023 = df[df['hire_date'].dt.year == 2023].shape[0]

# Filter and count rows where the year in 'term_date' is 2023
term_date_2023 = df[df['term_date'].dt.year == 2023].shape[0]

# Filter and count rows where the year in 'hire_date' is 2003
hire_date_2003 = df[df['hire_date'].dt.year == 2003].shape[0]

# Filter and count rows where the year in 'term_date' is 2003
term_date_2003 = df[df['term_date'].dt.year == 2003].shape[0]

# Print the results separately
print(f"Number of rows where hire_date is in 2023: {hire_date_2023}")
print(f"Number of rows where term_date is in 2023: {term_date_2023}")
print(f"Number of rows where hire_date is in 2003: {hire_date_2003}")
print(f"Number of rows where term_date is in 2003: {term_date_2003}")

Number of rows where hire_date is in 2023: 842
Number of rows where term_date is in 2023: 166
Number of rows where hire_date is in 2003: 219
Number of rows where term_date is in 2003: 0


In [650]:
df.dtypes

emp_id                    object
first_name                object
last_name                 object
birth_date        datetime64[ns]
age                        int32
gender                    object
race                      object
department                object
job_title                 object
location                  object
hire_date         datetime64[ns]
term_date         datetime64[ns]
location_state            object
dtype: object

## 4. Exporting the DataFrame to MySQL Workbench

In [651]:
# Connection parameters
bd = "hr_data"
password = "password"
connection_string = 'mysql+pymysql://root:' + password + '@localhost/' + bd

# Create the connection engine
engine = create_engine(connection_string)

# Send the DataFrame named 'df' to MySQL
table_name = 'employees'
df.to_sql(table_name, con=engine, if_exists='replace', index=False)

20183