In [16]:
# Imports 📥

# Packages
#-----------------------------------------------------------------------
import pandas as pd
import numpy as np

# DataTransformer class
import sys                  # allows manipulation of the module search path
import os                   # to handle file paths more flexibly

sys.path.append(os.path.abspath('People-Analytics'))    # Adds the People-Analytics folder to the sys.path, which allows Python to find the data_transformer module.

from data_transformer.data_transformer import DataTransformer     # Import the DataTransformer class from the data_transformer.py file.

# Settings
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # to see all columns in DataFrames

In [17]:
# Import data 
# -----------------------------------------------------------------------
df1 = pd.read_csv("hr_data_transformed.csv", index_col=0)

In [18]:
# Import data 
# -----------------------------------------------------------------------
df2 = pd.read_csv("hr_data_transformed.csv", index_col=0)

In [19]:
# create an object called "analysis1"

analysis1 = DataTransformer(df1)

In [20]:
# create an object called "analysis2"

analysis2 = DataTransformer(df2)

In [21]:
# Explore the dataframe

df1.head(2)

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_since_last_promotion,years_with_curr_manager,date_birth,remote_work
0,51,No,,684.0,Research & Development,6,3,,162.0,1.0,Male,51.0,3,5,Research Director,3,,19537.0,6462,7,No,13,3.0,3,0,,5,3.0,20,15,15,1972,True
1,52,No,,699.0,,1,4,Life Sciences,259.0,3.0,Male,65.0,2,5,Manager,3,,19999.0,5678,0,,14,3.0,1,1,34.0,5,3.0,33,11,9,1971,1


In [22]:
# Explore "employee_number" column

analysis1.quick_check("employee_number")

Column name: employee_number
Data type: float64
Unique values: [162. 259. 319. ... 967. 972. 990.]
Not null count: 1079
Null count: 431
Duplicated values: 430


In [23]:
# check if there are any duplicates

analysis1.df.duplicated(keep=False).sum()

26

In [24]:
# remove "employee_number" column

analysis1.drop_redundant_columns("employee_number")

In [25]:
# check if there are any duplicates

analysis1.df.duplicated(keep=False).sum()

26

In [27]:
# explore duplicated rows

duplicates = analysis1.df[analysis1.df.duplicated(keep=False)]
duplicates.sort_values(by="age")

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_since_last_promotion,years_with_curr_manager,date_birth,remote_work
1539,25,Yes,travel_rarely,688.0,Research & Development,3,3,Medical,1.0,Male,91.0,3,1,Laboratory Technician,1,Married,,9396,5,No,13,3.0,3,1,6.0,5,,2,0,2,1998,True
898,25,Yes,travel_rarely,688.0,Research & Development,3,3,Medical,1.0,Male,91.0,3,1,Laboratory Technician,1,Married,,9396,5,No,13,3.0,3,1,6.0,5,,2,0,2,1998,True
1510,26,No,travel_frequently,1096.0,Research & Development,6,3,Other,3.0,Male,61.0,4,1,Laboratory Technician,4,Married,,7102,0,No,18,3.0,1,1,8.0,3,3.0,7,7,7,1997,True
468,26,No,travel_frequently,1096.0,Research & Development,6,3,Other,3.0,Male,61.0,4,1,Laboratory Technician,4,Married,,7102,0,No,18,3.0,1,1,8.0,3,3.0,7,7,7,1997,True
83,29,No,travel_frequently,1413.0,Sales,1,1,,2.0,Female,42.0,3,3,Sales Executive,4,Married,,6599,1,,14,3.0,4,1,11.0,5,3.0,11,4,1,1994,True
1475,29,No,travel_frequently,1413.0,Sales,1,1,,2.0,Female,42.0,3,3,Sales Executive,4,Married,,6599,1,,14,3.0,4,1,11.0,5,3.0,11,4,1,1994,True
1514,31,No,,,Research & Development,2,4,,3.0,Male,32.0,3,1,Research Scientist,4,,,7747,0,,18,3.0,2,1,3.0,2,1.0,2,2,2,1992,True
1499,31,No,travel_rarely,196.0,Sales,29,4,Marketing,1.0,Female,91.0,2,2,Sales Executive,4,,5468.0,13402,1,,14,3.0,1,2,,3,3.0,12,5,7,1992,1
457,31,No,travel_rarely,196.0,Sales,29,4,Marketing,1.0,Female,91.0,2,2,Sales Executive,4,,5468.0,13402,1,,14,3.0,1,2,,3,3.0,12,5,7,1992,1
873,31,No,,,Research & Development,2,4,,3.0,Male,32.0,3,1,Research Scientist,4,,,7747,0,,18,3.0,2,1,3.0,2,1.0,2,2,2,1992,True


In [28]:
# remove "remote_work" column

analysis2.drop_redundant_columns("remote_work")

In [29]:
# check if there are any duplicates

analysis2.df.duplicated(keep=False).sum()

60

In [30]:
# explore duplicated rows

duplicates = analysis2.df[analysis2.df.duplicated(keep=False)]
duplicates.sort_values(by="age")

Unnamed: 0,age,attrition,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,monthly_rate,num_companies_worked,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_since_last_promotion,years_with_curr_manager,date_birth
884,21,No,,895.0,Sales,9,2,,,1.0,Male,39.0,3,1,Sales Representative,4,Single,,2851,1,,24,4.0,3,0,3.0,3,2.0,3,2,2,2002
1525,21,No,,895.0,Sales,9,2,,,1.0,Male,39.0,3,1,Sales Representative,4,Single,,2851,1,,24,4.0,3,0,3.0,3,2.0,3,2,2,2002
1539,25,Yes,travel_rarely,688.0,Research & Development,3,3,Medical,,1.0,Male,91.0,3,1,Laboratory Technician,1,Married,,9396,5,No,13,3.0,3,1,6.0,5,,2,0,2,1998
898,25,Yes,travel_rarely,688.0,Research & Development,3,3,Medical,,1.0,Male,91.0,3,1,Laboratory Technician,1,Married,,9396,5,No,13,3.0,3,1,6.0,5,,2,0,2,1998
468,26,No,travel_frequently,1096.0,Research & Development,6,3,Other,,3.0,Male,61.0,4,1,Laboratory Technician,4,Married,,7102,0,No,18,3.0,1,1,8.0,3,3.0,7,7,7,1997
1510,26,No,travel_frequently,1096.0,Research & Development,6,3,Other,,3.0,Male,61.0,4,1,Laboratory Technician,4,Married,,7102,0,No,18,3.0,1,1,8.0,3,3.0,7,7,7,1997
1602,26,Yes,,,Research & Development,5,2,Medical,,3.0,Female,88.0,2,1,Research Scientist,3,Married,2366.0,20898,1,,14,3.0,1,1,8.0,2,3.0,8,1,7,1997
359,26,Yes,,,Research & Development,5,2,Medical,,3.0,Female,88.0,2,1,Research Scientist,3,Married,2366.0,20898,1,,14,3.0,1,1,8.0,2,3.0,8,1,7,1997
1528,27,No,travel_rarely,1469.0,Research & Development,1,2,,,4.0,Male,82.0,3,1,Laboratory Technician,2,,,17881,1,No,11,3.0,2,1,5.0,2,3.0,5,0,4,1996
887,27,No,travel_rarely,1469.0,Research & Development,1,2,,,4.0,Male,82.0,3,1,Laboratory Technician,2,,,17881,1,No,11,3.0,2,1,5.0,2,3.0,5,0,4,1996


# Conclusiones

1. Al eliminar la columna "employee_number" del .csv de datos transformados se detectan 13 registros duplicados.

2. Al profundizar el análisis y eliminar la columna "remote_work" (esto se hizo porque se trata de una columna conflictiva), se identifican 30 registros duplicados.