HR Analytics - Case Study

In [1]:
import pandas as pd

In [2]:
hr = pd.read_csv(r'C:\Users\Laxman\OneDrive\Desktop\Data Files_1\HR Promotion.csv')
hr.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [3]:
hr.shape

(54808, 14)

In [4]:
hr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


In [5]:
hr.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [6]:
hr['previous_year_rating'].skew()

-0.3106378431385327

In [7]:
#Since the skewness value of rating is between -1 to 1, we can replace the missing values using mean
hr['previous_year_rating'].fillna(hr['previous_year_rating'].mean(),inplace=True)
hr.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating       0
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

The column education is categorical in nature, so we will replace the missing values using the mode

In [8]:
hr['education'].value_counts()

Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: education, dtype: int64

In [9]:
hr['previous_year_rating'].mean()

3.3292557809168257

In [10]:
hr['education'].mode()[0]

"Bachelor's"

In [11]:
hr['education'].fillna(hr['education'].mode()[0],inplace=True)

In [12]:
hr.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [13]:
hr.columns

Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')

In [14]:
cat_col = ['department','region','education','gender','recruitment_channel']
hr_dummy = pd.get_dummies(hr,columns=cat_col,drop_first=True)

In [15]:
hr_dummy.shape

(54808, 55)

In [16]:
hr_dummy.head()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department_Finance,...,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,education_Below Secondary,education_Master's & above,gender_m,recruitment_channel_referred,recruitment_channel_sourcing
0,65438,1,35,5.0,8,1,0,49,0,0,...,0,0,1,0,0,0,1,0,0,1
1,65141,1,30,5.0,4,0,0,60,0,0,...,0,0,0,0,0,0,0,1,0,0
2,7513,1,34,3.0,7,0,0,50,0,0,...,0,0,0,0,0,0,0,1,0,1
3,2542,2,39,1.0,10,0,0,50,0,0,...,0,0,0,0,0,0,0,1,0,0
4,48945,1,45,3.0,2,0,0,73,0,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
y = hr_dummy[['is_promoted']]
x = hr_dummy.drop(columns=['is_promoted','employee_id'])

Lets perform feature scaling on the data using Standardisation

In [18]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_scal = sc.fit_transform(x)

In [19]:
x_scal = pd.DataFrame(x_scal,columns=x.columns)
x_scal.head()

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_Finance,department_HR,department_Legal,...,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,education_Below Secondary,education_Master's & above,gender_m,recruitment_channel_referred,recruitment_channel_sourcing
0,-0.415276,0.025598,1.3789,0.50046,1.356878,-0.154018,-1.075931,-0.220262,-0.214834,-0.139009,...,-0.119055,-0.112916,3.212001,-0.109979,-0.087877,-0.122093,1.634695,-1.536223,-0.145876,1.166353
1,-0.415276,-0.627135,1.3789,-0.437395,-0.736986,-0.154018,-0.253282,-0.220262,-0.214834,-0.139009,...,-0.119055,-0.112916,-0.311332,-0.109979,-0.087877,-0.122093,-0.611735,0.650947,-0.145876,-0.857373
2,-0.415276,-0.104948,-0.271742,0.265996,-0.736986,-0.154018,-1.001145,-0.220262,-0.214834,-0.139009,...,-0.119055,-0.112916,-0.311332,-0.109979,-0.087877,-0.122093,-0.611735,0.650947,-0.145876,1.166353
3,1.226063,0.547785,-1.922383,0.969387,-0.736986,-0.154018,-1.001145,-0.220262,-0.214834,-0.139009,...,-0.119055,-0.112916,-0.311332,-0.109979,-0.087877,-0.122093,-0.611735,0.650947,-0.145876,-0.857373
4,-0.415276,1.331064,-0.271742,-0.906322,-0.736986,-0.154018,0.718939,-0.220262,-0.214834,-0.139009,...,-0.119055,-0.112916,-0.311332,-0.109979,-0.087877,-0.122093,-0.611735,0.650947,-0.145876,-0.857373


In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_scal,y,test_size=0.2,random_state=42)

Building the SVM model

In [21]:
from sklearn.svm import SVC
svc = SVC()
model = svc.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


In [22]:
y_test['Prediction'] = model.predict(x_test)

In [23]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [24]:
print(confusion_matrix(y_test['is_promoted'],y_test['Prediction']))

[[10042    12]
 [  791   117]]


In [25]:
print(accuracy_score(y_test['is_promoted'],y_test['Prediction']))

0.9267469439883232
