In [None]:
#Import necessary libraries
# NOTE(review): numpy and seaborn are not referenced in the cells visible here;
# they may be used elsewhere in the notebook.
import numpy as np
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session. This runs before
# the pandas import, so warnings raised while importing pandas are hidden too.
# NOTE(review): this also hides deprecation warnings emitted by later pandas calls.
warnings.filterwarnings('ignore')
import pandas as pd

In [None]:
# Read the raw HR analytics CSV into the working DataFrame
DATA_PATH = "/content/HR_Analytics.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Inspect the data: per-column dtypes and non-null counts first,
# then summary statistics as the cell's displayed result
df.info()
print(end='\n\n')  # blank spacer between the two reports
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1480 entries, 0 to 1479
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   EmpID                     1480 non-null   object 
 1   Age                       1480 non-null   int64  
 2   AgeGroup                  1480 non-null   object 
 3   Attrition                 1480 non-null   object 
 4   BusinessTravel            1480 non-null   object 
 5   DailyRate                 1480 non-null   int64  
 6   Department                1480 non-null   object 
 7   DistanceFromHome          1480 non-null   int64  
 8   Education                 1480 non-null   int64  
 9   EducationField            1480 non-null   object 
 10  EmployeeCount             1480 non-null   int64  
 11  EmployeeNumber            1480 non-null   int64  
 12  EnvironmentSatisfaction   1480 non-null   int64  
 13  Gender                    1480 non-null   object 
 14  HourlyRa

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1480.0,1480.0,1480.0,1480.0,1480.0,1480.0,1480.0,1480.0,1480.0,1480.0,...,1480.0,1480.0,1480.0,1480.0,1480.0,1480.0,1480.0,1480.0,1480.0,1423.0
mean,36.917568,801.384459,9.22027,2.910811,1.0,1031.860811,2.724324,65.84527,2.72973,2.064865,...,2.708784,80.0,0.791892,11.281757,2.797973,2.760811,7.009459,4.228378,2.182432,4.11806
std,9.128559,403.126988,8.131201,1.023796,0.0,605.955046,1.092579,20.328266,0.713007,1.105574,...,1.081995,0.0,0.850527,7.77087,1.288791,0.707024,6.117945,3.61602,3.219357,3.555484
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,493.75,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,800.0,7.0,3.0,1.0,1027.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1568.25,4.0,83.0,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


DATA CLEANING

In [None]:
# Count the missing entries in every column (isna is the alias of isnull)
df.isna().sum()

Unnamed: 0,0
EmpID,0
Age,0
AgeGroup,0
Attrition,0
BusinessTravel,0
DailyRate,0
Department,0
DistanceFromHome,0
Education,0
EducationField,0


In [None]:
# Impute missing YearsWithCurrManager values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated in pandas 2.x and leaves df unchanged when Copy-on-Write is enabled.
manager_years_median = df['YearsWithCurrManager'].median()
df['YearsWithCurrManager'] = df['YearsWithCurrManager'].fillna(manager_years_median)

In [None]:
# Drop columns that are not used in the analysis.
# NOTE(review): presumably these are constant-valued or identifier columns — confirm.
drop_cols = ['EmployeeCount','StandardHours','Over18','EmployeeNumber','EmpID']
# Rebind df rather than mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=drop_cols, errors='ignore')

In [None]:
# Create salary bands from MonthlyIncome.
# Bins are right-closed: (0, 3000], (3000, 6000], (6000, 12000], (12000, inf].
# The top band is open-ended so an income above 20000 is labelled 'Very High'
# instead of silently becoming NaN.
df['SalaryBand'] = pd.cut(df['MonthlyIncome'],
                          bins=[0, 3000, 6000, 12000, float('inf')],
                          labels=['Low','Medium','High','Very High'])

In [None]:
# Create tenure bands from YearsAtCompany.
# Bins are right-closed: (-1, 3], (3, 7], (7, 12], (12, inf].
# The lower edge is -1 so that 0 years falls in the first band; the upper edge
# is open-ended so '13+ yrs' also covers tenures beyond 40 years instead of
# turning them into NaN.
df['TenureBand'] = pd.cut(df['YearsAtCompany'],
                          bins=[-1, 3, 7, 12, float('inf')],
                          labels=['0-3 yrs','4-7 yrs','8-12 yrs','13+ yrs'])

In [None]:
# Validation after preprocessing: schema, a sample of rows, and remaining nulls
print("\nDataset info after preprocessing:\n")
# df.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nFirst rows after preprocessing:\n")
print(df.head())
print("\nMissing values after preprocessing:\n")
print(df.isnull().sum())


Dataset info after preprocessing:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1480 entries, 0 to 1479
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Age                       1480 non-null   int64   
 1   AgeGroup                  1480 non-null   object  
 2   Attrition                 1480 non-null   object  
 3   BusinessTravel            1480 non-null   object  
 4   DailyRate                 1480 non-null   int64   
 5   Department                1480 non-null   object  
 6   DistanceFromHome          1480 non-null   int64   
 7   Education                 1480 non-null   int64   
 8   EducationField            1480 non-null   object  
 9   EnvironmentSatisfaction   1480 non-null   int64   
 10  Gender                    1480 non-null   object  
 11  HourlyRate                1480 non-null   int64   
 12  JobInvolvement            1480 non-null   int64   
 13  JobLevel    

In [None]:
# Persist the preprocessed frame to disk, leaving the row index out of the file
CLEAN_CSV_NAME = "HR_Analytics_Clean.csv"
df.to_csv(path_or_buf=CLEAN_CSV_NAME, index=False)
print("\n✅ Clean dataset saved as HR_Analytics_Clean.csv")


✅ Clean dataset saved as HR_Analytics_Clean.csv
