In [1]:
import pandas as pd


employee = pd.read_csv('C:/Users/LOQ/Desktop/HR clean Python/Employee_Cleaned.csv')
performance = pd.read_csv('C:/Users/LOQ/Desktop/HR clean Python/Performance_Cleaned.csv')

print("=== Employee columns ===")
print(employee.columns.tolist())
print("\n=== Performance columns ===")
print(performance.columns.tolist())

print("\n=== Employee sample rows ===")
print(employee.head(5).to_string(index=False))
print("\n=== Performance sample rows ===")
print(performance.head(5).to_string(index=False))


=== Employee columns ===
['EmployeeID', 'FirstName', 'LastName', 'Gender', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome (KM)', 'State', 'Ethnicity', 'EducationField', 'JobRole', 'MaritalStatus', 'Salary', 'StockOptionLevel', 'OverTime', 'HireDate', 'Attrition', 'YearsAtCompany', 'YearsInMostRecentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'EducationLevel.txt', 'SalaryOutlier', 'LogSalary', 'Avg_Salary_NoOutliers', 'Salary_Cleaned']

=== Performance columns ===
['PerformanceID', 'EmployeeID', 'ReviewDate', 'TrainingOpportunitiesWithinYear', 'TrainingOpportunitiesTaken', 'EnviromentSatisfactionLevel', 'JobSatisfactionLevel', 'RelationshipSatisfactionLevel', 'SatisfactionLevel', 'SelfRatingLevel', 'RatingLevel', 'HireDate', 'AttritionDate', 'AttritionFlag']

=== Employee sample rows ===
EmployeeID FirstName LastName Gender  Age BusinessTravel Department  DistanceFromHome (KM) State Ethnicity      EducationField              JobRole MaritalStatus  Salary  StockOp

In [2]:
employee['HireDate'] = pd.to_datetime(employee['HireDate'], errors='coerce')
performance['ReviewDate'] = pd.to_datetime(performance['ReviewDate'], errors='coerce')

print("Employee HireDate dtype:", employee['HireDate'].dtype)
print("Performance ReviewDate dtype:", performance['ReviewDate'].dtype)
print("Employee HireDate nulls:", employee['HireDate'].isna().sum())
print("Performance ReviewDate nulls:", performance['ReviewDate'].isna().sum())

print("\nEmployees with missing HireDate (up to 10 rows):")
print(employee[employee['HireDate'].isna()].head(10).to_string(index=False))


Employee HireDate dtype: datetime64[ns]
Performance ReviewDate dtype: datetime64[ns]
Employee HireDate nulls: 0
Performance ReviewDate nulls: 0

Employees with missing HireDate (up to 10 rows):
Empty DataFrame
Columns: [EmployeeID, FirstName, LastName, Gender, Age, BusinessTravel, Department, DistanceFromHome (KM), State, Ethnicity, EducationField, JobRole, MaritalStatus, Salary, StockOptionLevel, OverTime, HireDate, Attrition, YearsAtCompany, YearsInMostRecentRole, YearsSinceLastPromotion, YearsWithCurrManager, EducationLevel.txt, SalaryOutlier, LogSalary, Avg_Salary_NoOutliers, Salary_Cleaned]
Index: []


In [3]:
employee['Attrition_str'] = employee['Attrition'].astype(str).str.strip().str.lower()

employee['AttritionDate'] = employee.apply(
    lambda x: x['HireDate'] + pd.to_timedelta(x['YearsAtCompany'] * 365, unit='D')
    if x['Attrition_str'] == 'yes' else pd.NaT,
    axis=1
)

print("Sample AttritionDate rows (first 20):")
cols = ['EmployeeID','HireDate','YearsAtCompany','Attrition','AttritionDate']
print(employee[cols].head(20).to_string(index=False))


Sample AttritionDate rows (first 20):
EmployeeID   HireDate  YearsAtCompany Attrition AttritionDate
 C08F-94F9 2015-07-30               0       Yes    2015-07-30
 ADF8-AD57 2016-08-16               0       Yes    2016-08-16
 6C42-A719 2020-03-11               0       Yes    2020-03-11
 D014-7DB2 2020-08-24               0       Yes    2020-08-24
 FD95-9EEB 2020-11-19               0       Yes    2020-11-19
 20BA-5868 2021-03-27               0       Yes    2021-03-27
 DFDC-286C 2021-12-09               0       Yes    2021-12-09
 C28E-0CC3 2022-01-04               0        No           NaT
 7A98-768E 2022-01-25               0       Yes    2022-01-25
 A5CD-8AB8 2022-01-25               0        No           NaT
 4268-9EAD 2022-01-26               0        No           NaT
 B324-A703 2022-01-31               0        No           NaT
 A12D-DAF8 2022-02-06               0        No           NaT
 078A-E0F5 2022-02-11               0        No           NaT
 401F-CC71 2022-02-11           

In [5]:
perf_check = performance.merge(
    employee[['EmployeeID', 'HireDate', 'Attrition', 'AttritionDate']],
    on='EmployeeID',
    how='left',
    validate='m:1'
)

print("perf_check columns:", perf_check.columns.tolist())
print("Rows in performance:", len(performance), "Rows in perf_check:", len(perf_check))

miss_hire = perf_check[perf_check['HireDate_y'].isna()]

print("Performance rows with missing employee HireDate after merge:", len(miss_hire))
print(miss_hire[['PerformanceID', 'EmployeeID', 'ReviewDate']].head(10).to_string(index=False))


perf_check columns: ['PerformanceID', 'EmployeeID', 'ReviewDate', 'TrainingOpportunitiesWithinYear', 'TrainingOpportunitiesTaken', 'EnviromentSatisfactionLevel', 'JobSatisfactionLevel', 'RelationshipSatisfactionLevel', 'SatisfactionLevel', 'SelfRatingLevel', 'RatingLevel', 'HireDate_x', 'AttritionDate_x', 'AttritionFlag', 'HireDate_y', 'Attrition', 'AttritionDate_y']
Rows in performance: 4386 Rows in perf_check: 4386
Performance rows with missing employee HireDate after merge: 0
Empty DataFrame
Columns: [PerformanceID, EmployeeID, ReviewDate]
Index: []


In [6]:
perf_check.rename(columns={
    'HireDate_y': 'Employee_HireDate',
    'HireDate_x': 'Perf_HireDate',
    'AttritionDate_y': 'Employee_AttritionDate',
    'AttritionDate_x': 'Perf_AttritionDate'
}, inplace=True)


In [7]:
miss_hire = perf_check[perf_check['Employee_HireDate'].isna()]


In [8]:
perf_check = performance.merge(
    employee[['EmployeeID', 'HireDate', 'Attrition', 'AttritionDate']],
    on='EmployeeID',
    how='left',
    validate='m:1'
)

perf_check.rename(columns={
    'HireDate_x': 'Perf_HireDate',
    'HireDate_y': 'Emp_HireDate',
    'AttritionDate_x': 'Perf_AttritionDate',
    'AttritionDate_y': 'Emp_AttritionDate'
}, inplace=True)


In [9]:
performance_valid = perf_check[
    (perf_check['ReviewDate'] >= perf_check['Emp_HireDate']) &
    (
        perf_check['Emp_AttritionDate'].isna() |
        (perf_check['ReviewDate'] <= perf_check['Emp_AttritionDate'])
    )
].copy()

print("Valid performance rows after filtering:", len(performance_valid))


Valid performance rows after filtering: 4386


In [10]:
last_perf = (
    performance_valid.sort_values(['EmployeeID', 'ReviewDate'])
    .groupby('EmployeeID', as_index=False)
    .last()
)

print("Unique employees with latest performance:", len(last_perf))


Unique employees with latest performance: 1089


In [11]:
if 'RatingLevel' in last_perf.columns:
    last_perf.rename(columns={'RatingLevel': 'LastPerformance'}, inplace=True)


In [12]:
HR_full = employee.merge(
    last_perf.drop(columns=['Perf_HireDate', 'Perf_AttritionDate', 'Emp_HireDate', 'Emp_AttritionDate'], errors='ignore'),
    on='EmployeeID',
    how='left'
)

print("Merged HR_full shape:", HR_full.shape)


Merged HR_full shape: (1470, 41)


In [13]:
employee.to_csv('employee_cleaned.csv', index=False)
performance_valid.to_csv('performance_valid.csv', index=False)
last_perf.to_csv('last_performance.csv', index=False)
HR_full.to_csv('HR_full.csv', index=False)
print("✅ All 3 CSVs saved successfully!")


✅ All 3 CSVs saved successfully!


In [3]:
print(df.columns.tolist())



['EmployeeID', 'FirstName', 'LastName', 'Gender', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome (KM)', 'State', 'Ethnicity', 'EducationField', 'JobRole', 'MaritalStatus', 'Salary', 'StockOptionLevel', 'OverTime', 'HireDate', 'Attrition_x', 'YearsAtCompany', 'YearsInMostRecentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'EducationLevel.txt', 'SalaryOutlier', 'LogSalary', 'Avg_Salary_NoOutliers', 'Salary_Cleaned', 'Attrition_str', 'AttritionDate', 'PerformanceID', 'ReviewDate', 'TrainingOpportunitiesWithinYear', 'TrainingOpportunitiesTaken', 'EnviromentSatisfactionLevel', 'JobSatisfactionLevel', 'RelationshipSatisfactionLevel', 'SatisfactionLevel', 'SelfRatingLevel', 'LastPerformance', 'AttritionFlag', 'Attrition_y', 'EnviromentSatisfactionLevel_Num', 'JobSatisfactionLevel_Num', 'RelationshipSatisfactionLevel_Num', 'SatisfactionLevel_Num', 'SelfRating_Num']


In [5]:
df = HR_full.copy()

df.drop(columns=['Attrition_x', 'Attrition_y'], errors='ignore', inplace=True)

df.rename(columns={'EducationLevel.txt': 'EducationLevel'}, inplace=True)

df.columns = df.columns.str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

print(df.columns.tolist())


['EmployeeID', 'FirstName', 'LastName', 'Gender', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome_KM', 'State', 'Ethnicity', 'EducationField', 'JobRole', 'MaritalStatus', 'Salary', 'StockOptionLevel', 'OverTime', 'HireDate', 'YearsAtCompany', 'YearsInMostRecentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'EducationLevel', 'SalaryOutlier', 'LogSalary', 'Avg_Salary_NoOutliers', 'Salary_Cleaned', 'Attrition_str', 'AttritionDate', 'PerformanceID', 'ReviewDate', 'TrainingOpportunitiesWithinYear', 'TrainingOpportunitiesTaken', 'EnviromentSatisfactionLevel', 'JobSatisfactionLevel', 'RelationshipSatisfactionLevel', 'SatisfactionLevel', 'SelfRatingLevel', 'LastPerformance', 'AttritionFlag']


In [6]:
df['SalaryBand_DeptRole'] = df.groupby(['Department', 'JobRole'])['Salary_Cleaned'].transform(
    lambda x: pd.qcut(x, q=3, labels=['Low', 'Medium', 'High'])
)


In [10]:
df.to_csv('HR_complete.csv', index=False)
print("✅ HR_complete.csv ready for analysis!")


✅ HR_complete.csv ready for analysis!


In [11]:
df = HR_full.copy()
print("✅ Working copy created with", df.shape[0], "rows and", df.shape[1], "columns")



✅ Working copy created with 1470 rows and 41 columns


In [12]:
df.info()
df.head(3)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 41 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   EmployeeID                       1470 non-null   object 
 1   FirstName                        1470 non-null   object 
 2   LastName                         1470 non-null   object 
 3   Gender                           1470 non-null   object 
 4   Age                              1470 non-null   int64  
 5   BusinessTravel                   1470 non-null   object 
 6   Department                       1470 non-null   object 
 7   DistanceFromHome (KM)            1470 non-null   int64  
 8   State                            1470 non-null   object 
 9   Ethnicity                        1470 non-null   object 
 10  EducationField                   1470 non-null   object 
 11  JobRole                          1470 non-null   object 
 12  MaritalStatus       

Unnamed: 0,EmployeeID,FirstName,LastName,Gender,Age,BusinessTravel,Department,DistanceFromHome (KM),State,Ethnicity,...,TrainingOpportunitiesWithinYear,TrainingOpportunitiesTaken,EnviromentSatisfactionLevel,JobSatisfactionLevel,RelationshipSatisfactionLevel,SatisfactionLevel,SelfRatingLevel,LastPerformance,AttritionFlag,Attrition_y
0,C08F-94F9,Nikos,Karpenko,Male,26,Some Travel,Technology,32,NY,White,...,,,,,,,,,,
1,ADF8-AD57,Trip,Bennell,Male,27,Some Travel,Technology,28,CA,White,...,,,,,,,,,,
2,6C42-A719,Bird,Libbey,Female,22,Some Travel,Technology,37,NY,White,...,,,,,,,,,,


In [13]:
if 'Attrition_y' in df.columns:
    df['Attrition'] = df['Attrition_y'].combine_first(df.get('Attrition_x'))
elif 'Attrition_x' in df.columns:
    df['Attrition'] = df['Attrition_x']

df.drop(columns=[c for c in ['Attrition_x', 'Attrition_y'] if c in df.columns], inplace=True)
print("✅ Attrition column unified")


✅ Attrition column unified


In [14]:
overview = {
    'Total Employees': df['EmployeeID'].nunique(),
    'Average Salary': round(df['Salary_Cleaned'].mean(), 2),
    'Median Salary': round(df['Salary_Cleaned'].median(), 2),
    'Attrition Rate (%)': round((df['Attrition'].eq('Yes').mean()) * 100, 2),
    'Avg Years at Company': round(df['YearsAtCompany'].mean(), 1),
    'Avg Satisfaction (where available)': round(df[['JobSatisfactionLevel', 'SatisfactionLevel']].stack().map({
        'Very Dissatisfied': 1, 'Dissatisfied': 2, 'Neutral': 3, 'Satisfied': 4, 'Very Satisfied': 5
    }).mean(), 2)
}

import pandas as pd
pd.DataFrame([overview])


Unnamed: 0,Total Employees,Average Salary,Median Salary,Attrition Rate (%),Avg Years at Company,Avg Satisfaction (where available)
0,1470,111780.66,70768.5,16.12,4.6,3.35


In [15]:
salary_summary = (
    df.groupby(['Department', 'JobRole'])
    .agg(
        Employees=('EmployeeID', 'nunique'),
        Avg_Salary=('Salary_Cleaned', 'mean'),
        Median_Salary=('Salary_Cleaned', 'median'),
        Min_Salary=('Salary_Cleaned', 'min'),
        Max_Salary=('Salary_Cleaned', 'max')
    )
    .round(0)
    .reset_index()
)

print(salary_summary.head(10))


        Department               JobRole  Employees  Avg_Salary  \
0  Human Resources   Hr Business Partner          7    314002.0   
1  Human Resources          Hr Executive         28     90250.0   
2  Human Resources            Hr Manager          4    449331.0   
3  Human Resources             Recruiter         24     37648.0   
4            Sales               Manager         37    317710.0   
5            Sales       Sales Executive        327    117196.0   
6            Sales  Sales Representative         83     40656.0   
7       Technology     Analytics Manager         52    350980.0   
8       Technology        Data Scientist        261     52625.0   
9       Technology   Engineering Manager         75    285014.0   

   Median_Salary  Min_Salary  Max_Salary  
0       309964.0    190466.0    445906.0  
1        93771.0     42303.0    152909.0  
2       486156.0    315071.0    509940.0  
3        30854.0     20583.0     70703.0  
4       313891.0    149813.0    513262.0  
5   

In [16]:
satisfaction_summary = (
    df.groupby('Department')[['JobSatisfactionLevel', 'SatisfactionLevel', 'SelfRatingLevel']]
    .agg(lambda x: x.value_counts().index[0] if len(x.dropna()) else None)
    .reset_index()
)

print(satisfaction_summary.head())


        Department JobSatisfactionLevel SatisfactionLevel      SelfRatingLevel
0  Human Resources         Dissatisfied      Dissatisfied     Above and Beyond
1            Sales       Very Satisfied         Satisfied  Exceeds Expectation
2       Technology              Neutral      Dissatisfied  Exceeds Expectation


In [17]:
salary_bins = [
    df['Salary_Cleaned'].min() - 1,
    df['Salary_Cleaned'].quantile(0.25),
    df['Salary_Cleaned'].quantile(0.75),
    df['Salary_Cleaned'].max() + 1
]
salary_labels = ['Low', 'Mid', 'High']

df['SalaryRange_Global'] = pd.cut(df['Salary_Cleaned'], bins=salary_bins, labels=salary_labels)

df['SalaryRange_Dept'] = df.groupby('Department')['Salary_Cleaned']\
    .transform(lambda x: pd.qcut(x, q=3, labels=['Low', 'Mid', 'High']))

print(df[['Department', 'Salary_Cleaned', 'SalaryRange_Global', 'SalaryRange_Dept']].head(10))


   Department  Salary_Cleaned SalaryRange_Global SalaryRange_Dept
0  Technology         40100.0                Low              Low
1  Technology         20387.0                Low              Low
2  Technology         30442.0                Low              Low
3       Sales         35754.0                Low              Low
4  Technology         96187.0                Mid             High
5       Sales         29673.0                Low              Low
6  Technology         64364.0                Mid              Mid
7  Technology         41539.0                Low              Low
8       Sales         58523.0                Mid              Low
9  Technology         24906.0                Low              Low


In [18]:
employee_summary = (
    df.groupby(['Department', 'JobRole', 'Gender'])
    .agg(
        Employees=('EmployeeID', 'nunique'),
        Avg_Salary=('Salary_Cleaned', 'mean'),
        Avg_YearsAtCompany=('YearsAtCompany', 'mean'),
        OverTime_Rate=('OverTime', lambda x: (x.eq('Yes').mean()) * 100)
    )
    .round(1)
    .reset_index()
)

print(employee_summary.head(10))


        Department              JobRole      Gender  Employees  Avg_Salary  \
0  Human Resources  Hr Business Partner      Female          2    396225.5   
1  Human Resources  Hr Business Partner        Male          3    239875.0   
2  Human Resources  Hr Business Partner  Non-Binary          2    342970.5   
3  Human Resources         Hr Executive      Female         14     94965.0   
4  Human Resources         Hr Executive        Male         11     86517.1   
5  Human Resources         Hr Executive  Non-Binary          3     81936.3   
6  Human Resources           Hr Manager      Female          2    412505.5   
7  Human Resources           Hr Manager        Male          2    486156.0   
8  Human Resources            Recruiter      Female         15     39754.7   
9  Human Resources            Recruiter        Male          8     34567.0   

   Avg_YearsAtCompany  OverTime_Rate  
0                 1.5            0.0  
1                 5.0           33.3  
2                 6.0   

In [21]:
df.columns[df.columns.str.contains('_Num')]


Index(['SatisfactionLevel_Num', 'SelfRating_Num'], dtype='object')

In [22]:
performance_summary = (
    df.groupby('Department')
    .agg(
        Avg_Satisfaction=('SatisfactionLevel_Num', 'mean'),
        Avg_SelfRating=('SelfRating_Num', 'mean'),
        TrainingTaken_Rate=('TrainingOpportunitiesTaken', lambda x: (x > 0).mean() * 100)
    )
    .round(2)
    .reset_index()
)
performance_summary


Unnamed: 0,Department,Avg_Satisfaction,Avg_SelfRating,TrainingTaken_Rate
0,Human Resources,3.0,4.04,47.62
1,Sales,3.49,3.97,43.18
2,Technology,3.34,3.98,48.85


In [24]:
df.columns.tolist()


['EmployeeID',
 'FirstName',
 'LastName',
 'Gender',
 'Age',
 'BusinessTravel',
 'Department',
 'DistanceFromHome (KM)',
 'State',
 'Ethnicity',
 'EducationField',
 'JobRole',
 'MaritalStatus',
 'Salary',
 'StockOptionLevel',
 'OverTime',
 'HireDate',
 'YearsAtCompany',
 'YearsInMostRecentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager',
 'EducationLevel.txt',
 'SalaryOutlier',
 'LogSalary',
 'Avg_Salary_NoOutliers',
 'Salary_Cleaned',
 'Attrition_str',
 'AttritionDate',
 'PerformanceID',
 'ReviewDate',
 'TrainingOpportunitiesWithinYear',
 'TrainingOpportunitiesTaken',
 'EnviromentSatisfactionLevel',
 'JobSatisfactionLevel',
 'RelationshipSatisfactionLevel',
 'SatisfactionLevel',
 'SelfRatingLevel',
 'LastPerformance',
 'AttritionFlag',
 'Attrition',
 'SalaryRange_Global',
 'SalaryRange_Dept',
 'SatisfactionLevel_Num',
 'SelfRating_Num']

In [27]:
df['OverallSatisfaction_Num'] = df[
    ['EnviromentSatisfactionLevel_Num',
     'JobSatisfactionLevel_Num',
     'RelationshipSatisfactionLevel_Num',
     'SatisfactionLevel_Num']
].mean(axis=1).round(2)

dept_summary = (
    df.groupby('Department')
    .agg(
        Avg_OverallSatisfaction=('OverallSatisfaction_Num', 'mean'),
        Avg_SelfRating=('SelfRating_Num', 'mean'),
        TrainingTakenRate=('TrainingOpportunitiesTaken', lambda x: (x > 0).mean() * 100),
        Avg_Salary=('Salary_Cleaned', 'mean')
    )
    .round(2)
    .reset_index()
)

print("✅ Department-level performance summary:")
print(dept_summary.head())

# === Job Role-level summary ===
role_summary = (
    df.groupby(['Department', 'JobRole'])
    .agg(
        Avg_OverallSatisfaction=('OverallSatisfaction_Num', 'mean'),
        Avg_SelfRating=('SelfRating_Num', 'mean'),
        TrainingTakenRate=('TrainingOpportunitiesTaken', lambda x: (x > 0).mean() * 100),
        Avg_Salary=('Salary_Cleaned', 'mean')
    )
    .round(2)
    .reset_index()
)

print("\n✅ Job Role-level performance summary:")
print(role_summary.head())


✅ Department-level performance summary:
        Department  Avg_OverallSatisfaction  Avg_SelfRating  \
0  Human Resources                     3.23            4.04   
1            Sales                     3.56            3.97   
2       Technology                     3.44            3.98   

   TrainingTakenRate  Avg_Salary  
0              47.62   117871.21  
1              43.18   119580.95  
2              48.85   107748.95  

✅ Job Role-level performance summary:
        Department              JobRole  Avg_OverallSatisfaction  \
0  Human Resources  Hr Business Partner                     3.50   
1  Human Resources         Hr Executive                     3.08   
2  Human Resources           Hr Manager                     3.50   
3  Human Resources            Recruiter                     3.31   
4            Sales              Manager                     3.40   

   Avg_SelfRating  TrainingTakenRate  Avg_Salary  
0            4.20              57.14   314002.43  
1            3.82

In [28]:
dept_summary = df.groupby('Department').agg(
    Avg_Satisfaction=('OverallSatisfaction_Num', 'mean'),
    Avg_SelfRating=('SelfRating_Num', 'mean'),
    TrainingTaken_Rate=('TrainingOpportunitiesTaken', lambda x: (x > 0).mean() * 100),
    Num_Employees=('EmployeeID', 'nunique')
).round(2).reset_index()

print(dept_summary)


        Department  Avg_Satisfaction  Avg_SelfRating  TrainingTaken_Rate  \
0  Human Resources              3.23            4.04               47.62   
1            Sales              3.56            3.97               43.18   
2       Technology              3.44            3.98               48.85   

   Num_Employees  
0             63  
1            447  
2            960  


In [29]:
role_summary = df.groupby(['Department', 'JobRole']).agg(
    Avg_Satisfaction=('OverallSatisfaction_Num', 'mean'),
    Avg_SelfRating=('SelfRating_Num', 'mean'),
    TrainingTaken_Rate=('TrainingOpportunitiesTaken', lambda x: (x > 0).mean() * 100),
    Num_Employees=('EmployeeID', 'nunique')
).round(2).reset_index()

print(role_summary)


         Department                    JobRole  Avg_Satisfaction  \
0   Human Resources        Hr Business Partner              3.50   
1   Human Resources               Hr Executive              3.08   
2   Human Resources                 Hr Manager              3.50   
3   Human Resources                  Recruiter              3.31   
4             Sales                    Manager              3.40   
5             Sales            Sales Executive              3.60   
6             Sales       Sales Representative              3.49   
7        Technology          Analytics Manager              3.39   
8        Technology             Data Scientist              3.40   
9        Technology        Engineering Manager              3.44   
10       Technology  Machine Learning Engineer              3.45   
11       Technology   Senior Software Engineer              3.47   
12       Technology          Software Engineer              3.46   

    Avg_SelfRating  TrainingTaken_Rate  Num_Emp

In [30]:
df['ManagerRating_Num'] = df['LastPerformance'].map(rating_map)

perf_summary = df.groupby('Department').agg(
    Avg_ManagerRating=('ManagerRating_Num', 'mean')
).round(2).reset_index()

print(perf_summary)


        Department  Avg_ManagerRating
0  Human Resources               3.57
1            Sales               3.44
2       Technology               3.47


In [31]:
df['ManagerRating_Num'] = df['LastPerformance'].map(rating_map)

overview_dashboard = df.groupby('Department').agg(
    Num_Employees=('EmployeeID', 'nunique'),
    Avg_Age=('Age', 'mean'),
    Avg_Salary=('Salary', 'mean'),
    Avg_Satisfaction=('SatisfactionLevel_Num', 'mean'),
    Avg_SelfRating=('SelfRating_Num', 'mean'),
    Avg_ManagerRating=('ManagerRating_Num', 'mean'),
    TrainingTaken_Rate=('TrainingOpportunitiesTaken', lambda x: (x > 0).mean() * 100),
    Attrition_Rate=('Attrition_str', lambda x: (x == 'Yes').mean() * 100)
).round(2).reset_index()

overview_dashboard


Unnamed: 0,Department,Num_Employees,Avg_Age,Avg_Salary,Avg_Satisfaction,Avg_SelfRating,Avg_ManagerRating,TrainingTaken_Rate,Attrition_Rate
0,Human Resources,63,29.38,119698.81,3.0,4.04,3.57,47.62,0.0
1,Sales,447,29.49,119566.16,3.49,3.97,3.44,43.18,0.0
2,Technology,960,28.73,109436.41,3.34,3.98,3.47,48.85,0.0


In [32]:
overview_dashboard.to_csv('overview_dashboard.csv', index=False)
print("✅ Overview dashboard CSV saved!")


✅ Overview dashboard CSV saved!


In [33]:
df.to_csv('HR_full_cleaned.csv', index=False)
print("✅ CSV for Tableau saved!")


✅ CSV for Tableau saved!


In [35]:

overview_dashboard = df.groupby('Department').agg(
    Num_Employees=('EmployeeID', 'nunique'),
    Avg_Age=('Age', 'mean'),
    Avg_Salary=('Salary', 'mean'),
    Avg_Satisfaction=('OverallSatisfaction_Num', 'mean'),
    Avg_SelfRating=('SelfRating_Num', 'mean'),
    Avg_ManagerRating=('LastPerformance', lambda x: x.map(rating_map).mean()),
    TrainingTaken_Rate=('TrainingOpportunitiesTaken', lambda x: (x > 0).mean() * 100),
    Attrition_Rate=('Attrition_str', lambda x: (x == 'Yes').mean() * 100)
).round(2).reset_index()

overview_dashboard


Unnamed: 0,Department,Num_Employees,Avg_Age,Avg_Salary,Avg_Satisfaction,Avg_SelfRating,Avg_ManagerRating,TrainingTaken_Rate,Attrition_Rate
0,Human Resources,63,29.38,119698.81,3.23,4.04,3.57,47.62,0.0
1,Sales,447,29.49,119566.16,3.56,3.97,3.44,43.18,0.0
2,Technology,960,28.73,109436.41,3.44,3.98,3.47,48.85,0.0


In [36]:
employee_dashboard = df[[
    'EmployeeID', 'FirstName', 'LastName', 'Department', 'JobRole',
    'Salary', 'OverallSatisfaction_Num', 'SelfRating_Num',
    'LastPerformance', 'TrainingOpportunitiesTaken', 'Attrition_str'
]].copy()

employee_dashboard['ManagerRating_Num'] = employee_dashboard['LastPerformance'].map(rating_map)

employee_dashboard.head()


Unnamed: 0,EmployeeID,FirstName,LastName,Department,JobRole,Salary,OverallSatisfaction_Num,SelfRating_Num,LastPerformance,TrainingOpportunitiesTaken,Attrition_str,ManagerRating_Num
0,C08F-94F9,Nikos,Karpenko,Technology,Software Engineer,40100,,,,,yes,
1,ADF8-AD57,Trip,Bennell,Technology,Data Scientist,20387,,,,,yes,
2,6C42-A719,Bird,Libbey,Technology,Data Scientist,30442,,,,,yes,
3,D014-7DB2,Dolli,Dodgson,Sales,Sales Representative,35754,,,,,yes,
4,FD95-9EEB,Sasha,Incogna,Technology,Data Scientist,96187,,,,,yes,


In [37]:
performance_dashboard = df.groupby(['Department', 'JobRole']).agg(
    Avg_ManagerRating=('LastPerformance', lambda x: x.map(rating_map).mean()),
    Avg_Satisfaction=('OverallSatisfaction_Num', 'mean'),
    Avg_SelfRating=('SelfRating_Num', 'mean'),
    TrainingTaken_Rate=('TrainingOpportunitiesTaken', lambda x: (x > 0).mean() * 100),
    Num_Employees=('EmployeeID', 'nunique')
).round(2).reset_index()

performance_dashboard


Unnamed: 0,Department,JobRole,Avg_ManagerRating,Avg_Satisfaction,Avg_SelfRating,TrainingTaken_Rate,Num_Employees
0,Human Resources,Hr Business Partner,3.8,3.5,4.2,57.14,7
1,Human Resources,Hr Executive,3.23,3.08,3.82,42.86,28
2,Human Resources,Hr Manager,4.0,3.5,4.33,50.0,4
3,Human Resources,Recruiter,3.88,3.31,4.25,50.0,24
4,Sales,Manager,3.64,3.4,4.08,40.54,37
5,Sales,Sales Executive,3.43,3.6,3.96,43.12,327
6,Sales,Sales Representative,3.37,3.49,3.96,44.58,83
7,Technology,Analytics Manager,3.32,3.39,3.85,46.15,52
8,Technology,Data Scientist,3.61,3.4,4.12,50.19,261
9,Technology,Engineering Manager,3.44,3.44,3.93,52.0,75
