<a href="https://colab.research.google.com/github/Keerthana8888/HR-Analytics-/blob/main/Code%20File/HR_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Libraries And Datasets

In [None]:
#Standard Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Models Selection
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from scipy.stats import chi2_contingency
# Evaluators
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

In [None]:
# Read the two CSV files uploaded to the Colab session into DataFrames
train, test = (
  pd.read_csv(csv_path)
  for csv_path in (
    "/content/train_LZdllcl (2) (1).csv",
    "/content/test_2umaH9m (2).csv",
  )
)

In [None]:
# Print the column names, dtypes and non-null counts of the training frame
train.info()

In [None]:
# Same summary for the test frame
test.info()

# Exploratory Data Analysis

## Univariate Analysis

In [None]:
# Descriptive statistics of the training frame
train.describe()

In [None]:
# Draw a histogram with a KDE overlay for each of the listed columns, in the
# order they appear in `train`; every other column is skipped.
for i in train.columns:
  if i not in ('no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'avg_training_score'):
    continue
  sns.displot(train[i], kde=True)

In [None]:
# Columns of interest for count plots. The channel column is spelled
# 'recruitment_channel', matching the name used when the column is read from
# `train` in the chi-square cells.
columns_to_plot = ['department','region','education','gender','recruitment_channel','KPIs_met >80%','awards_won?','is_promoted']

In [None]:
# Number of rows per value of 'is_promoted'
train['is_promoted'].value_counts()

In [None]:
# Bar chart of the number of rows per department
train['department'].value_counts().plot(kind='bar')

## Bivariate Analysis

In [None]:
# Keep only the object-dtype columns of `train` and show their names
categ_columns = train.select_dtypes(include='object')
categ_columns.columns

## Chi Square

In [None]:
def chi2_sq_test(var1,var2):
  """Run a chi-square test of independence on two series.

  The contingency table of ``var1`` against ``var2`` is built with
  ``pd.crosstab`` and passed to ``chi2_contingency``. A message naming the
  accepted hypothesis (alternate when p <= 0.05, null otherwise) is printed
  together with the p-value rounded to two decimals.

  Args:
    var1: values used for the rows of the contingency table.
    var2: values used for the columns of the contingency table.

  Returns:
    The contingency table produced by ``pd.crosstab``.
  """
  contingency = pd.crosstab(var1, var2)
  _stat, p_value, _dof, _expected = chi2_contingency(contingency)
  if p_value > 0.05:
    verdict = 'Accept the Null Hypothesis'
  else:
    verdict = 'Accept the Alternate Hypothesis'
  print(verdict, round(p_value, 2))
  return contingency

In [None]:
# Chi-square test of each column below against 'is_promoted'; each call prints
# the accepted hypothesis with the p-value and returns the contingency table.
chi2_sq_test(train['department'],train['is_promoted'])

In [None]:
# region vs. is_promoted
chi2_sq_test(train['region'],train['is_promoted'])

In [None]:
# education vs. is_promoted
chi2_sq_test(train['education'],train['is_promoted'])

In [None]:
# gender vs. is_promoted
chi2_sq_test(train['gender'],train['is_promoted'])

In [None]:
# recruitment_channel vs. is_promoted
chi2_sq_test(train['recruitment_channel'],train['is_promoted'])

# Data Preparation

In [None]:
# Most frequent education level. `mode()` returns a Series of modal values, so
# index its first entry; writing `mode([0])` instead hands the list to the
# `dropna` parameter and never selects an element.
train['education'].mode()[0]

In [None]:
# Bar chart of how often each previous_year_rating value occurs, sorted by count
train['previous_year_rating'].value_counts().sort_values().plot(kind='bar')

In [None]:
# Missing values: fill 'education' with its most frequent value and
# 'previous_year_rating' with its median
train['education'] = train['education'].fillna(train['education'].mode(dropna=True)[0])
train['previous_year_rating'] = train['previous_year_rating'].fillna(train['previous_year_rating'].median(skipna=True))

In [None]:
# Re-check non-null counts after the imputation
train.info()

In [None]:
# Cast the rating column to integer dtype
train['previous_year_rating'] = train['previous_year_rating'].astype('int')

## Encoding

In [None]:
# One-hot encode `train` with pandas' default get_dummies settings
train_enc = pd.get_dummies(train)

In [None]:
# Remove the employee_id column from the encoded frame
train_enc.drop(['employee_id'], axis=1, inplace=True)

In [None]:
# Inspect the columns of the encoded frame
train_enc.info()

# Divide Data into X and Y

In [None]:
y = train['is_promoted']
x = train_enc

In [None]:
oversample = SMOTE()
x, y = oversample.fit_resample(x, y)

In [None]:
print(x.shape)
print(y.shape)

In [None]:
y.value_counts().plot(kind='bar')

**Train Test Split**

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=123)

In [None]:
x_train.drop(['is_promoted'], axis = 1, inplace = True)
x_test.drop(['is_promoted'], axis = 1, inplace = True)

In [None]:
# Shapes of the four objects returned by the train/test split
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

# Multivariate Analysis

## Logistic Regression

In [None]:
# Fit a logistic regression with default settings on the training partition
lr = LogisticRegression()
lr.fit(x_train,y_train)

In [None]:
# Predicted labels for the training and test partitions
y_hat_train_lr = lr.predict(x_train)
y_hat_test_lr = lr.predict(x_test)

In [None]:
def model_eval(actual, predicted):
  """Print the accuracy, confusion matrix and classification report.

  Args:
    actual: the true labels.
    predicted: the labels predicted by a model for the same rows.

  Nothing is returned; the three results are written to standard output,
  with the accuracy rounded to two decimals.
  """
  # Compute everything first so nothing is printed if a metric fails
  accuracy = accuracy_score(actual, predicted)
  matrix = confusion_matrix(actual, predicted)
  report = classification_report(actual, predicted)
  print('The Accuracy of the model is: ', round(accuracy, 2))
  for section in (matrix, report):
    print(section)

In [None]:
# Logistic regression: metrics on the training partition
model_eval(y_train, y_hat_train_lr)

In [None]:
# Logistic regression: metrics on the test partition
model_eval(y_test, y_hat_test_lr)

## Decision Tree

In [None]:
# Fit a decision tree with default settings on the training partition
dtree = DecisionTreeClassifier()
dtree.fit(x_train,y_train)

In [None]:
# Predicted labels for the training and test partitions
y_hat_train_dtree=dtree.predict(x_train)
y_hat_test_dtree=dtree.predict(x_test)

In [None]:
model_eval(y_train,y_hat_train_dtree)

In [None]:
model_eval(y_train,y_hat_train_dtree)

In [None]:
plt.figure(figsize=(15,10))
# A separate tree limited to depth 4 is fitted for the plot (presumably to
# keep the figure readable); it does not replace `dtree`.
clf=DecisionTreeClassifier(max_depth=4)
clf.fit(x_train, y_train)
# Label the split nodes with the column names instead of positional indices
plot_tree(clf, filled=True, feature_names=list(x_train.columns))
plt.title("Decision tree trained on HR Analytics data")
plt.show()

## Random Forest

In [None]:
# Fit a random forest with default settings on the training partition
rf=RandomForestClassifier()
rf.fit(x_train, y_train)

In [None]:
# Predicted labels for the training and test partitions
y_hat_train_rf=rf.predict(x_train)
y_hat_test_rf=rf.predict(x_test)

In [None]:
model_eval(y_test, y_hat_test_rf)

In [None]:
model_eval(y_test, y_hat_test_rf)

## AdaBoost

In [None]:
# Fit AdaBoost with default settings on the training partition
ada=AdaBoostClassifier()
ada.fit(x_train, y_train)

In [None]:
# Predicted labels for the training and test partitions
y_hat_train_ada = ada.predict(x_train)
y_hat_test_ada = ada.predict(x_test)

In [None]:
# AdaBoost: metrics on the training partition
model_eval(y_train, y_hat_train_ada)

In [None]:
# AdaBoost: metrics on the test partition
model_eval(y_test, y_hat_test_ada)

## Gradient Boost

In [None]:
# Fit gradient boosting with default settings on the training partition
gb=GradientBoostingClassifier()
gb.fit(x_train, y_train)

In [None]:
# Predicted labels for the training and test partitions
y_hat_train_gb=gb.predict(x_train)
y_hat_test_gb=gb.predict(x_test)

In [None]:
# Gradient boosting: metrics on the training partition
model_eval(y_train, y_hat_train_gb)

In [None]:
# Gradient boosting: metrics on the test partition
model_eval(y_test, y_hat_test_gb)


## XG Boosting

In [None]:
le=LabelEncoder()
# Learn the label-to-integer mapping from the training labels only
y_train_enc=le.fit_transform(y_train)
# Reuse that mapping for the test labels. Calling fit_transform a second time
# would refit the encoder on the test labels, which can produce a different
# mapping whenever the two partitions do not contain the same set of classes.
y_test_enc=le.transform(y_test)

In [None]:
# Fit XGBoost with default settings on the label-encoded training targets
xgb=XGBClassifier()
xgb.fit(x_train, y_train_enc)

In [None]:
# Predicted labels for the training and test partitions
y_hat_train_xgb = xgb.predict(x_train)
y_hat_test_xgb = xgb.predict(x_test)

In [None]:
# XGBoost: metrics on the training partition (against the encoded labels)
model_eval(y_train_enc, y_hat_train_xgb)

In [None]:
# XGBoost: metrics on the test partition (against the encoded labels)
model_eval(y_test_enc, y_hat_test_xgb)

## Naive Bayes

In [None]:
# Fit Gaussian naive Bayes on the training partition
nb = GaussianNB()
nb.fit(x_train, y_train)

In [None]:
# Predicted labels for the training and test partitions
y_hat_train_nb = nb.predict(x_train)
y_hat_test_nb = nb.predict(x_test)

In [None]:
# Naive Bayes: metrics on the training partition
model_eval(y_train,y_hat_train_nb)

In [None]:
# Naive Bayes: metrics on the test partition
model_eval(y_test,y_hat_test_nb)

In [None]:
# Fill the gaps in the test data with statistics taken from the (already
# imputed) training data rather than from the test data itself, so the test
# rows are prepared with the same fill values the models were trained on.
test['education'] = test['education'].fillna(train['education'].mode(dropna=True)[0])
test['previous_year_rating'] = test['previous_year_rating'].fillna(train['previous_year_rating'].median(skipna=True))

In [None]:
# Cast the rating column to integer dtype, as was done for the training data
test['previous_year_rating'] = test['previous_year_rating'].astype('int')

In [None]:
# One-hot encode `test` with pandas' default get_dummies settings
test_enc = pd.get_dummies(test)

In [None]:
# Remove the employee_id column from the encoded frame
test_enc.drop(['employee_id'], axis = 1, inplace = True)

In [None]:
# Align the encoded test columns with the training features: same names, same
# order, and any dummy column missing from the test data filled with 0.
test_features = test_enc.reindex(columns=x_train.columns, fill_value=0)
# Predict with the random forest, the model this notebook's conclusion names
# as the best performer.
test['is_promoted'] = rf.predict(test_features)

In [None]:
# Preview the first rows of `test`, now including the predicted 'is_promoted'
test.head()

In [None]:
# Test-partition accuracy of every fitted model, rounded to two decimals.
# Each model is scored against the labels it was trained with: XGBoost used
# the label-encoded targets, all other models used y_train / y_test.
accuracy_table=[['Logistic Regression',round(accuracy_score(y_test , y_hat_test_lr),2)],
  ['Decision Tree', round(accuracy_score(y_test , y_hat_test_dtree),2)],
  ['Random Forest' , round(accuracy_score(y_test , y_hat_test_rf),2)],
  ['Ada Boosting' , round(accuracy_score(y_test , y_hat_test_ada),2)],
  ['Gradient Boosting' , round(accuracy_score(y_test , y_hat_test_gb),2)],
  ['XGBoosting' , round(accuracy_score(y_test_enc, y_hat_test_xgb),2)],
  ['Navie Bayes' , round(accuracy_score(y_test, y_hat_test_nb),2)]]
df1 = pd.DataFrame(accuracy_table, columns = ['Model','Test_Accuracy'])
print(df1)

**Random Forest** is the best-performing model, with a test accuracy of 0.96, which makes it the preferred choice for predicting promotions.