# People Analytics Case Study

## Libraries

In [48]:
# import libraries
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import getpass

## 1. Importing Data from MySQL

In [49]:
# Connection parameters
bd = "hr_data"
password = "password"
connection_string = 'mysql+pymysql://root:' + password + '@localhost/' + bd

# Create the connection engine
engine = create_engine(connection_string)

## 2. SQL Queries for Data Analysis

In [50]:
original = pd.read_csv('/Users/josemi/Desktop/hr_clean.csv', encoding='UTF-16', sep='\t')
original

Unnamed: 0,Age Bin,Age group,Birthdate,Department,Emp Id,First Name,Gender,Hire Date,Jobtitle,Last Name,Location,Location City,Location State,Race,Termdate,Age,Employment Length,Number of Terminations,Total Employee,Turnovers
0,40,40-49,14/6/1982,Sales,00-3597335,Amara,Male,7/2/2006,Solutions Engineer,Jacobs,Remote,Columbus,Ohio,White,,41,,0,1,"0,0%"
1,30,30-39,18/3/1988,Engineering,05-7126762,Darcy,Male,20/12/2000,Developer II,Hurl,Headquarters,Cleveland,Ohio,Asian,,35,,0,1,"0,0%"
2,40,40-49,9/5/1976,Services,06-6345157,Charles,Female,10/5/2002,Service Tech,Stowell,Headquarters,Cleveland,Ohio,White,,47,,0,1,"0,0%"
3,40,40-49,18/3/1976,Services,02-9395540,Shirleen,Male,7/4/2005,Service Tech,Vergo,Remote,Philadelphia,Pennsylvania,White,,47,,0,1,"0,0%"
4,40,40-49,15/7/1979,Engineering,99-4836108,Ynez,Female,26/2/2014,Developer I,Hembery,Remote,Peoria,Illinois,White,,44,,0,1,"0,0%"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19771,30,30-39,3/4/1986,Accounting,75-8249434,Kaia,Non-Conforming,15/1/2002,Accountant I,Meecher,Remote,Madison,Wisconsin,White,7/2/2014 1:16:38,37,12.0,1,1,"100,0%"
19772,30,30-39,7/8/1984,Accounting,98-0759640,Hyacinthe,Non-Conforming,24/8/2001,VP Accounting,Pilipets,Remote,Madison,Wisconsin,White,3/1/2019 1:28:46,39,18.0,1,1,"100,0%"
19773,50,50-59,20/1/1968,Product Management,64-0283298,Teddie,Non-Conforming,27/8/2012,Project Manager,Lidgard,Remote,Madison,Wisconsin,White,,55,,0,1,"0,0%"
19774,20,20-29,28/4/1998,Sales,61-7631824,Correy,Non-Conforming,19/6/2018,Pre-Sales Consultant,Bulch,Remote,Green Bay,Wisconsin,White,,25,,0,1,"0,0%"


In [51]:
# 1. What is the gender breakdown of employees in the company?
query = """
SELECT gender, COUNT(*) AS count
FROM employees
GROUP BY gender;
"""

# Execute the query and display the result
gender_breakdown = pd.read_sql(query, engine)

# Visualize the result
gender_breakdown

Unnamed: 0,gender,count
0,Male,10631
1,Female,9686
2,Non-Conforming,567


In [52]:
# 2. What is the race/ethnicity breakdown of employees in the company?
query = """
SELECT race, COUNT(*) AS count
FROM employees
GROUP BY race
ORDER BY count DESC;
"""

# Execute the query and display the result
race_breakdown = pd.read_sql(query, engine)

# Visualize the result
race_breakdown

Unnamed: 0,race,count
0,White,5950
1,Two or More Races,3417
2,Black or African American,3413
3,Asian,3357
4,Hispanic or Latino,2345
5,American Indian or Alaska Native,1252
6,Native Hawaiian or Other Pacific Islander,1150


In [53]:
# 3. What is the age distribution of employees in the company?

# Get the max and min age
query_age_range = """
SELECT MAX(age) AS max_age, MIN(age) AS min_age
FROM employees;
"""

# Execute the query and store the result in a DataFrame
age_range = pd.read_sql(query_age_range, engine)

# Age distribution by group
query_age_distribution = """
SELECT 
  CASE 
    WHEN age < 30 THEN '20-29'
    WHEN age < 40 THEN '30-39'
    WHEN age < 50 THEN '40-49'
    ELSE '50-59'
  END AS age_group, COUNT(*) AS count
FROM employees
GROUP BY age_group
ORDER BY count DESC;
"""

# Execute the query and store the result in a DataFrame
age_distribution = pd.read_sql(query_age_distribution, engine)

# Visualize the results
age_range, age_distribution

(   max_age  min_age
 0       59       22,
   age_group  count
 0     30-39   5803
 1     40-49   5530
 2     50-59   5200
 3     20-29   4351)

In [54]:
# 4. How many employees work at headquarters versus remote locations?
query = """
SELECT location, COUNT(*) AS count
FROM employees
GROUP BY location;
"""

# Execute the query and store the result in a DataFrame
location_breakdown = pd.read_sql(query, engine)

# Visualize the result
location_breakdown

Unnamed: 0,location,count
0,Headquarters,15697
1,Remote,5187


In [55]:
# 5. What is the average length of employment for employees who have been terminated?
query = """
SELECT round(avg(DATEDIFF(term_date, hire_date) / 365.25), 0) AS avg_emp_length
FROM employees
WHERE term_date IS NOT NULL;
"""

# Get the average employment length
avg_emp_length_df = pd.read_sql(query, engine)

# Display the result
print(avg_emp_length_df)

   avg_emp_length
0             8.0


In [56]:
# 6. How does the gender distribution vary across departments?
query = """
SELECT department, gender, count(*) AS employees
FROM employees
GROUP BY department, gender
ORDER BY department, employees DESC;
"""

# Execute the query and fetch the results
results = pd.read_sql(query, engine)
results

Unnamed: 0,department,gender,employees
0,Accounting,Male,1610
1,Accounting,Female,1435
2,Accounting,Non-Conforming,89
3,Auditing,Male,27
4,Auditing,Female,22
5,Business Development,Male,792
6,Business Development,Female,702
7,Business Development,Non-Conforming,46
8,Engineering,Male,3177
9,Engineering,Female,2936


In [57]:
# 7. What is the distribution of job titles across the company?
query = """
SELECT job_title, COUNT(*) AS employees
FROM employees
GROUP BY job_title
ORDER BY employees DESC
LIMIT 10;
"""

# Execute the query and fetch the results
results = pd.read_sql(query, engine)
results

Unnamed: 0,job_title,employees
0,Research Assistant II,705
1,Business Analyst,660
2,Human Resources Analyst II,570
3,Research Assistant I,501
4,Account Executive,469
5,Data Visualization Specialist,429
6,Staff Accountant I,415
7,Human Resources Analyst,387
8,Software Engineer I,369
9,Systems Administrator I,355


In [58]:
# 8. Which department has the highest turnover rate?
query = """
WITH department_count AS (
    SELECT department, COUNT(*) AS total_count,
           SUM(CASE WHEN term_date IS NOT NULL THEN 1 ELSE 0 END) AS termination_count
    FROM employees
    GROUP BY department
)

SELECT department, 
       ROUND((termination_count / total_count) * 100, 1) AS turnover_rate
FROM department_count
ORDER BY turnover_rate DESC
LIMIT 1;
"""

# Execute the query and fetch the results
results = pd.read_sql(query, engine)
results

Unnamed: 0,department,turnover_rate
0,Auditing,18.4


In [59]:
# 9. What is the turnover rate across job titles?
query = """
WITH job_title_count AS (
    SELECT job_title, COUNT(*) AS total_count,
           SUM(CASE WHEN term_date IS NOT NULL THEN 1 ELSE 0 END) AS termination_count
    FROM employees
    GROUP BY job_title
)

SELECT job_title, 
       ROUND((termination_count / total_count) * 100, 1) AS turnover_rate
FROM job_title_count
ORDER BY turnover_rate DESC;
"""

# Execute the query and fetch the results
results = pd.read_sql(query, engine)
results

Unnamed: 0,job_title,turnover_rate
0,Office Assistant II,100.0
1,Executive Secretary,100.0
2,Statistician IV,100.0
3,Statistician III,50.0
4,Sales Representative,40.0
...,...,...
180,Account Coordinator,0.0
181,Associate Professor,0.0
182,VP of Training and Development,0.0
183,Office Assistant IV,0.0


In [60]:
# 10. How have turnover rates changed each year?
query = """
WITH year_cte AS (
    SELECT YEAR(hire_date) AS year,
           COUNT(*) AS total_count,
           SUM(CASE WHEN term_date IS NOT NULL THEN 1 ELSE 0 END) AS termination_count
    FROM employees
    GROUP BY YEAR(hire_date)
)

SELECT year, 
       ROUND((termination_count / total_count) * 100, 1) AS turnover_rate
FROM year_cte
ORDER BY turnover_rate DESC;
"""

# Execute the query and fetch the results
results = pd.read_sql(query, engine)
results

Unnamed: 0,year,turnover_rate
0,2001,18.1
1,2004,18.1
2,2005,17.7
3,2003,17.7
4,2006,17.3
5,2002,16.3
6,2007,14.6
7,2009,14.5
8,2000,14.1
9,2008,13.9


In [61]:
# 11. What is the distribution of employees across states?
query = """
SELECT location_state, COUNT(*) AS employees
FROM employees
GROUP BY location_state
ORDER BY employees DESC, location_state;
"""

# Execute the query and fetch the results
results = pd.read_sql(query, engine)
results

Unnamed: 0,location_state,employees
0,California,16922
1,Oregon,1044
2,Nevada,825
3,Arizona,654
4,Utah,644
5,New Mexico,430
6,Idaho,365


## 3. Exporting Data for Tableau Visualization

In [63]:
# SQL query to execute
consulta = "SELECT * FROM hr_data.employees"

# Read the SQL query into a DataFrame
df_sql = pd.read_sql(consulta, con=engine)

# Save the DataFrame to a CSV file
df_sql.to_csv('../data/cleaned/hr_data_cleaned.csv', index=False)

# Display the first rows of the DataFrame
df_sql

Unnamed: 0,emp_id,first_name,last_name,birth_date,age,gender,race,department,job_title,location,hire_date,term_date,location_state
0,00-0037846,Kimmy,Walczynski,1991-06-04,33,Male,Hispanic or Latino,Engineering,Programmer Analyst I,Headquarters,2002-01-20,NaT,California
1,00-0041533,Ignatius,Springett,1984-06-29,40,Male,White,Business Development,Business Analyst,Headquarters,2019-04-08,NaT,California
2,00-0045747,Corbie,Bittlestone,1989-07-29,35,Male,Black or African American,Sales,Solutions Engineer Manager,Headquarters,2010-10-12,NaT,California
3,00-0055274,Baxy,Matton,1982-09-14,42,Female,White,Services,Service Tech,Headquarters,2005-04-10,NaT,California
4,00-0116166,Kacie,Offiler,1971-01-18,53,Male,Asian,Engineering,Developer III,Headquarters,2018-09-01,NaT,California
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20879,99-9797418,Dorella,Garvan,1998-07-08,26,Female,Hispanic or Latino,Research and Development,Research Assistant I,Headquarters,2012-02-08,NaT,California
20880,99-9869877,Dasie,Thorsby,2001-04-19,23,Female,Two or More Races,Services,Service Manager,Headquarters,2017-10-06,NaT,California
20881,99-9919822,Nerty,Wilding,1970-02-09,54,Female,Two or More Races,Training,Junior Trainer,Headquarters,2001-02-08,NaT,California
20882,99-9960380,Mabelle,Dawks,1985-09-02,39,Male,Two or More Races,Accounting,Staff Accountant I,Headquarters,2005-04-03,2012-12-10,California
