<a href="https://www.kaggle.com/code/mostafamamdouhhassan/hr-analytics-using-logistic-regression?scriptVersionId=216318509" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Reading Data

In [2]:
df = pd.read_csv('/kaggle/input/hr-analytics/HR_comma_sep.csv')


In [3]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
df.isna().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
Department               0
salary                   0
dtype: int64

In [6]:
df_train = df.drop(['Department' , 'left'] , axis=1)
df_train

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary
0,0.38,0.53,2,157,3,0,0,low
1,0.80,0.86,5,262,6,0,0,medium
2,0.11,0.88,7,272,4,0,0,medium
3,0.72,0.87,5,223,5,0,0,low
4,0.37,0.52,2,159,3,0,0,low
...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,low
14995,0.37,0.48,2,160,3,0,0,low
14996,0.37,0.53,2,143,3,0,0,low
14997,0.11,0.96,6,280,4,0,0,low


In [7]:
df_test = df['left']
df_test

0        1
1        1
2        1
3        1
4        1
        ..
14994    1
14995    1
14996    1
14997    1
14998    1
Name: left, Length: 14999, dtype: int64

# Encoding

In [8]:
salary_dummies = pd.get_dummies(df_train['salary'], prefix="salary")
salary_dummies = salary_dummies.astype(int)

In [9]:
df_with_dummies = pd.concat([df_train,salary_dummies],axis='columns')
df_with_dummies.drop(['salary'],axis=1 , inplace=True)
df_with_dummies

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,0,0,1,0
1,0.80,0.86,5,262,6,0,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,1,0
4,0.37,0.52,2,159,3,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,0,1,0
14995,0.37,0.48,2,160,3,0,0,0,1,0
14996,0.37,0.53,2,143,3,0,0,0,1,0
14997,0.11,0.96,6,280,4,0,0,0,1,0


# Preparing Data

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_with_dummies,df_test,train_size=0.8,random_state=42)

# Logestic Regression Model

In [11]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)

In [12]:
model.fit(X_train,y_train)

In [13]:
model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
model.score(X_test,y_test)

0.7746666666666666

# 2nd Order Polynomial


In [15]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2 , include_bias=False)
second_order = poly.fit_transform(df_with_dummies)
second_order

array([[0.38, 0.53, 2.  , ..., 1.  , 0.  , 0.  ],
       [0.8 , 0.86, 5.  , ..., 0.  , 0.  , 1.  ],
       [0.11, 0.88, 7.  , ..., 0.  , 0.  , 1.  ],
       ...,
       [0.37, 0.53, 2.  , ..., 1.  , 0.  , 0.  ],
       [0.11, 0.96, 6.  , ..., 1.  , 0.  , 0.  ],
       [0.37, 0.52, 2.  , ..., 1.  , 0.  , 0.  ]])

In [16]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(second_order, df_test, test_size=0.3, random_state=42)

In [17]:
model2 = LogisticRegression(max_iter=1000)

In [18]:
model2.fit(X_train2,y_train2)

In [19]:
model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
model2.score(X_test2,y_test2)

0.8444444444444444