# Creating HR data visualizations using Matplotlib and Seaborn

In [None]:
# data analysis libraries
import numpy as np
import pandas as pd
import scipy

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd

import warnings
warnings.filterwarnings("ignore")

In [None]:
# read the HR records CSV into the working dataframe
dei_data = pd.read_csv('records.csv')
# preview five randomly chosen rows
dei_data.sample(5)

In [None]:
# categorical data profiling with formatted outputs
def category_profiling(df, columns):
    """Print a quick profile of categorical columns.

    First prints the number of unique values of every column in ``df``,
    then, for each name in ``columns``, the share of each value in that
    column as a percentage string rounded to one decimal (e.g. ``'75.0%'``).
    Each printed section is followed by a ``---`` separator line.

    Args:
        df: dataframe to profile.
        columns: iterable of column names in ``df`` to break down.

    Returns:
        None. Everything is written to standard output.
    """
    separator = "---"

    print(df.nunique())
    print(separator)

    for name in columns:
        # relative frequency of each value, as a fraction of the counted rows
        shares = df[name].value_counts(normalize=True)
        # scale to percent, round, and append the percent sign for display
        formatted = (shares * 100).round(1).astype(str) + '%'
        print(formatted)
        print(separator)

# categorical columns to break down by percentage
columns = [
    'department',
    'province',
    'level',
    'gender',
    'ethnicity',
]

# print unique-value counts and the percentage split of each column above
category_profiling(dei_data, columns)

## Analysis of age

In [None]:
# descriptive statistics for the age column
ages = dei_data['age']

print(f"Age: min= {ages.min()}")
print(f"Age: max= {ages.max()}")
print(f"Age: avg= {ages.mean().round(0)}")
print(f"Age: stdev= {ages.std()}")

# the three quartiles are computed in a single percentile call
first_q, second_q, third_q = np.percentile(a=ages, q=[25, 50, 75])
print(f"Age: 1st quart= {first_q}")
print(f"Age: 2nd quart= {second_q}")
print(f"Age: 3rd quart= {third_q}")

# advanced statistical measures (shape of the distribution)
from scipy.stats import kurtosis, skew

print(f"Age: skew= {skew(ages, axis=0, bias=True)}")
print(f"Age: kurtosis= {kurtosis(ages, axis=0, fisher=True, bias=True)}")

In [None]:
# bar chart: number of employees at each distinct age
# value_counts gives the frequency per age; sort_index orders the bars by age
age_graph_data = dei_data['age'].value_counts().sort_index()

plt.figure(figsize=(10, 5))
plt.bar(x=age_graph_data.index,
        height=age_graph_data.values,
        width=0.6,
        color='blue')
plt.gca().set(xlabel='Age',
              ylabel='Frequency',
              title='Distribution of ages')
plt.show()

In [None]:
# make sure age is numeric before binning
dei_data['age'] = pd.to_numeric(dei_data['age'])

# age bands: with right=False every bin is [lower, upper), so the edges
# 22, 31, 41, ... produce the bands 22-30, 31-40, ... for whole-number ages
# NOTE(review): ages below 22 or from 71 upwards fall outside every bin, get
# NaN as AgeGroup and are left out of the counts below -- confirm the range
bins = [22, 31, 41, 51, 61, 71]
labels = ['22-30', '31-40', '41-50', '51-60', '61-70']
dei_data['AgeGroup'] = pd.cut(x=dei_data['age'],
                              bins=bins,
                              labels=labels,
                              right=False)

# count of individuals per age group by gender
# AgeGroup is categorical: observed=False is passed explicitly so that age
# bands without any employee stay in the table as zero rows, independent of
# the pandas version's default for categorical groupers
df_agg = (
    dei_data.groupby(['AgeGroup', 'gender'], observed=False)
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
df_agg

In [None]:
# distribution pyramid graph of ages
# setting up the figure and the seaborn theme
plt.figure(figsize=(10, 5))
sns.set(style="whitegrid")

# male counts are negated so their bars extend to the left of zero
sns.barplot(data=df_agg,
            x=-df_agg['male'],
            y=df_agg['AgeGroup'],
            color='orange',
            label='Male')

# female counts are plotted as-is, to the right of zero
sns.barplot(data=df_agg,
            x=df_agg['female'],
            y=df_agg['AgeGroup'],
            color='purple',
            label='Female')

# data labels for the male bars (anchored at the left end of each bar)
for index, value in enumerate(df_agg['male']):
    plt.text(-value, index, f'{value}', color='black',
             ha='right', va='center')

# data labels for the female bars (anchored at the right end of each bar)
for index, value in enumerate(df_agg['female']):
    plt.text(value, index, f'{value}', color='black',
             ha='left', va='center')

# customizing the axes
plt.xlabel('Population')
plt.ylabel('Age Group')
plt.title('Count of employees by age and gender')
plt.legend()

# symmetric x-axis around zero, with headroom so the data labels at the
# bar ends are not clipped; derived from the data instead of fixed ticks
max_population = max(df_agg['male'].max(), df_agg['female'].max())
if max_population > 0:
    plt.xlim(-1.15 * max_population, 1.15 * max_population)

# the left side holds negated counts: show tick labels as magnitudes
plt.gca().xaxis.set_major_formatter(lambda tick, _pos: f'{abs(tick):g}')

# displaying the graph
plt.show()

## Tenure

In [None]:
# parse the start and termination date columns as datetimes
for date_col in ('start_date', 'term_date'):
    dei_data[date_col] = pd.to_datetime(dei_data[date_col])

In [None]:
# descriptive statistics for the tenure column
# NOTE(review): 'tenure' is not created in any of the visible cells --
# presumably it is already a column of records.csv; confirm
tenures = dei_data['tenure']

print(f"Tenure: min= {tenures.min()}")
print(f"Tenure: max= {tenures.max()}")
print(f"Tenure: avg= {tenures.mean().round(0)}")
print(f"Tenure: stdev= {tenures.std()}")

# the three quartiles are computed in a single percentile call
tenure_q1, tenure_q2, tenure_q3 = np.percentile(a=tenures, q=[25, 50, 75])
print(f"Tenure: 1st quart= {tenure_q1}")
print(f"Tenure: 2nd quart= {tenure_q2}")
print(f"Tenure: 3rd quart= {tenure_q3}")

In [None]:
# bar chart: number of employees at each distinct tenure value
# value_counts gives the frequency per value; sort_index orders the bars
tenure_graph_data = dei_data['tenure'].value_counts().sort_index()

plt.figure(figsize=(10, 5))
plt.bar(x=tenure_graph_data.index,
        height=tenure_graph_data.values,
        width=0.6,
        color='green')
plt.gca().set(xlabel='Tenure (years)',
              ylabel='Frequency',
              title='Distribution of tenure')
plt.show()

In [None]:
# creating tenure bins (e.g., < 1 year, 1-3, 3-5)
# with right=False every bin is [lower, upper). The last edge is infinity so
# that the '30+ years' band is really open-ended: a fixed upper edge would
# exclude tenures equal to or above it, turning them into NaN and silently
# dropping them from the counts below
# NOTE(review): the first inner edge is 1.1, so the '< 1 year' band also
# takes tenures from 1.0 up to (not including) 1.1 -- confirm this matches
# how tenure is rounded in the data
tenure_bins = [0, 1.1, 3.1, 5.1, 10.1, 15.1, 30.1, float('inf')]
tenure_labels = ['< 1 year', '1-3 years', '3-5 years', '5-10 years',
                 '10-15 years', '15-30 years', '30+ years']
dei_data['TenureGroup'] = pd.cut(x=dei_data['tenure'],
                                 bins=tenure_bins,
                                 labels=tenure_labels,
                                 right=False)

# count of individuals per tenure group by gender
# TenureGroup is categorical: observed=False is passed explicitly so that
# bands without any employee stay in the table as zero rows, independent of
# the pandas version's default for categorical groupers
tenure_agg = (
    dei_data.groupby(['TenureGroup', 'gender'], observed=False)
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
tenure_agg

In [None]:
# distribution pyramid of tenure
# setting up the figure and the seaborn theme
plt.figure(figsize=(10, 5))
sns.set(style="whitegrid")

# male counts are negated so their bars extend to the left of zero
sns.barplot(data=tenure_agg,
            x=-tenure_agg['male'],
            y=tenure_agg['TenureGroup'],
            color='orange',
            label='Male')

# female counts are plotted as-is, to the right of zero
sns.barplot(data=tenure_agg,
            x=tenure_agg['female'],
            y=tenure_agg['TenureGroup'],
            color='purple',
            label='Female')

# data labels for the male bars (anchored at the left end of each bar)
for index, value in enumerate(tenure_agg['male']):
    plt.text(-value, index, f'{value}', color='black',
             ha='right', va='center')

# data labels for the female bars (anchored at the right end of each bar)
for index, value in enumerate(tenure_agg['female']):
    plt.text(value, index, f'{value}', color='black',
             ha='left', va='center')

# customizing the axes
plt.xlabel('Population')
plt.ylabel('Tenure Group')
plt.title('Count of employees by tenure and gender')
plt.legend()

# symmetric x-axis around zero, with headroom so the data labels at the
# bar ends are not clipped; derived from the data instead of fixed ticks
max_population = max(tenure_agg['male'].max(), tenure_agg['female'].max())
if max_population > 0:
    plt.xlim(-1.15 * max_population, 1.15 * max_population)

# the left side holds negated counts: show tick labels as magnitudes
plt.gca().xaxis.set_major_formatter(lambda tick, _pos: f'{abs(tick):g}')

# displaying the graph
plt.show()

## Hiring and terminations

In [None]:
# for each date column: make sure it is a datetime, then store its calendar
# year in a new column (TermYear from term_date, StartYear from start_date)
for source_col, year_col in (('term_date', 'TermYear'),
                             ('start_date', 'StartYear')):
    dei_data[source_col] = pd.to_datetime(dei_data[source_col])
    dei_data[year_col] = dei_data[source_col].dt.year

In [None]:
# number of new hires per start year
# size() yields an unnamed series, so reset_index() labels the count
# column 0; both columns are then given readable names
hiring_agg = (
    dei_data.groupby(['StartYear'])
    .size()
    .reset_index()
    .rename(columns={"StartYear": "Year", 0: "Hire"})
)

print(hiring_agg.columns)
hiring_agg.sample(5)

In [None]:
# number of terminations per termination year
# size() yields an unnamed series, so reset_index() labels the count
# column 0; both columns are then given readable names
terms_agg = (
    dei_data.groupby(['TermYear'])
    .size()
    .reset_index()
    .rename(columns={"TermYear": "Year", 0: "Term"})
)

print(terms_agg.columns)
terms_agg.sample(5)

In [None]:
# combining hires and departures
# the outer join keeps every year that appears on either side
ee_movements = pd.merge(hiring_agg, terms_agg,
                        on='Year', how='outer')

# a year present on only one side had no events of the other kind: record
# that as 0 instead of NaN (NaN would also turn the counts into floats and
# be skipped when the series is plotted)
ee_movements[['Hire', 'Term']] = (
    ee_movements[['Hire', 'Term']].fillna(0).astype(int)
)

# the year key can come through as float after the merge (e.g. when one
# side was derived from dates with missing values); keep it as whole years
ee_movements['Year'] = ee_movements['Year'].astype(int)

ee_movements.sample(n=5)

In [None]:
# wide -> long: one row per (Year, Event) pair, where Event is 'Hire' or
# 'Term' and Count holds the corresponding value
ee_movements_graph = ee_movements.melt(
    id_vars=['Year'],
    value_vars=['Hire', 'Term'],
    var_name='Event',
    value_name='Count',
)

ee_movements_graph.sample(5)

In [None]:
# line chart of yearly counts, one line per event type
plt.figure(figsize=(10, 5))
sns.set(style='whitegrid')
sns.lineplot(
    data=ee_movements_graph,
    x='Year',
    y='Count',
    hue='Event',
    marker='o',
)

# titles, axis labels and legend
plt.gca().set(title='Hires and departures 1970-2024',
              xlabel='Year',
              ylabel='Count')
plt.legend(title='Event')

# tilt the year labels and tighten the layout around them
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

## Geographical locations

The boundary shapefile used to map the employee data by province was downloaded from the Open Government Portal.

Reference: Boundary Files, 2016 Census. Statistics Canada Catalogue no. 92-160-X.
Available [here](https://open.canada.ca/data/en/dataset/a883eb14-0c0e-45c4-b8c4-b54c4a819edb).

In [None]:
# read the boundary shapefile into a geodataframe
canada = gpd.read_file('lpr_000b16a_e/lpr_000b16a_e.shp')
# list the available attribute columns and preview five random rows
print(canada.columns)
canada.sample(5)

In [None]:
# number of employees per province
# size() yields an unnamed series, so reset_index() labels the count
# column 0; it is renamed to 'Count'
province_agg = (
    dei_data.groupby(['province'])
    .size()
    .reset_index()
    .rename(columns={0: 'Count'})
)
province_agg

In [None]:
# merging employee data with geospatial data
# report province names in the employee data that match no boundary name,
# since those rows cannot be placed on the map
unmatched = sorted(set(province_agg['province']) - set(canada['PRENAME']))
if unmatched:
    print(f"Provinces without a matching boundary: {unmatched}")

# - columns left over from an earlier run of this cell are dropped first, so
#   re-running does not produce suffixed duplicates of 'province'/'Count'
# - a left join keeps every boundary, so regions without employees are
#   still drawn instead of disappearing from the map
canada = (
    canada.drop(columns=['province', 'Count'], errors='ignore')
    .merge(province_agg,
           left_on='PRENAME',
           right_on='province',
           how='left')
)
# boundaries with no matching employee rows have a count of zero
canada['Count'] = canada['Count'].fillna(0)

# Plot the map
plt.style.use('tableau-colorblind10')
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# outlines first, then the regions coloured by employee count
canada.boundary.plot(ax=ax)
canada.plot(column='Count', ax=ax, legend=True,
            legend_kwds={'label': "Number of Employees",
                         'orientation': "horizontal"})

ax.set_title('Employee Count per Canadian Province')
plt.show()