# Import Packages

In [1]:
import numpy as np
import pandas as pd

# Import Data

In [29]:
df = pd.read_csv('hr_prediction.csv')

## Check Data

In [30]:
# Metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18359 entries, 0 to 18358
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             18359 non-null  int64  
 1   city                    18359 non-null  object 
 2   city_development_index  18359 non-null  float64
 3   gender                  14261 non-null  object 
 4   relevent_experience     18359 non-null  object 
 5   enrolled_university     18017 non-null  object 
 6   education_level         17902 non-null  object 
 7   major_discipline        15521 non-null  object 
 8   experience              18300 non-null  object 
 9   company_size            13580 non-null  object 
 10  company_type            13320 non-null  object 
 11  last_new_job            17992 non-null  object 
 12  training_hours          18359 non-null  int64  
 13  target                  18359 non-null  int64  
dtypes: float64(1), int64(3), object(10)
me

## Summary Statistics

In [31]:
df.describe()

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
count,18359.0,18359.0,18359.0,18359.0
mean,16729.360096,0.84714,65.899014,0.132088
std,9643.749725,0.110189,60.8853,0.338595
min,1.0,0.448,1.0,0.0
25%,8378.5,0.796,23.0,0.0
50%,16706.0,0.91,47.0,0.0
75%,25148.5,0.92,89.0,0.0
max,33380.0,0.949,336.0,1.0


In [32]:
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1,69,0
2,46,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0
4,21751,city_100,0.887,,No relevent experience,no_enrollment,Masters,STEM,8,,,2,88,1


## Check Missing Values

In [33]:
df.isnull().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                    4098
relevent_experience          0
enrolled_university        342
education_level            457
major_discipline          2838
experience                  59
company_size              4779
company_type              5039
last_new_job               367
training_hours               0
target                       0
dtype: int64

## Treat Missing Values

### Check and Treat Col 1

In [34]:
df['gender'].value_counts(dropna=False)

Male      12884
NaN        4098
Female     1188
Other       189
Name: gender, dtype: int64

In [35]:
# Treat missing values
df['gender'] = df['gender'].fillna('missing')

In [36]:
df['gender'].value_counts()

Male       12884
missing     4098
Female      1188
Other        189
Name: gender, dtype: int64

### Check & treat Col 2

In [37]:
df['enrolled_university'].value_counts(dropna=False)

no_enrollment       13659
Full time course     3187
Part time course     1171
NaN                   342
Name: enrolled_university, dtype: int64

In [38]:
# Treat missing values
df['enrolled_university'] = df['enrolled_university'].fillna('missing')

In [39]:
df['enrolled_university'].value_counts(dropna=False)

no_enrollment       13659
Full time course     3187
Part time course     1171
missing               342
Name: enrolled_university, dtype: int64

### Treat rest

In [40]:
df.isnull().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                       0
relevent_experience          0
enrolled_university          0
education_level            457
major_discipline          2838
experience                  59
company_size              4779
company_type              5039
last_new_job               367
training_hours               0
target                       0
dtype: int64

In [41]:
df['education_level'] = df['education_level'].fillna('missing')
df['major_discipline'] = df['major_discipline'].fillna('missing')
df['company_size'] = df['company_size'].fillna('missing')
df['company_type'] = df['company_type'].fillna('missing')

In [42]:
df.isnull().sum()

enrollee_id                 0
city                        0
city_development_index      0
gender                      0
relevent_experience         0
enrolled_university         0
education_level             0
major_discipline            0
experience                 59
company_size                0
company_type                0
last_new_job              367
training_hours              0
target                      0
dtype: int64

### Treat Numericals

In [43]:
df['experience'] = df['experience'].apply(lambda x : 21 if x == '>20' else x)
df['experience'] = df['experience'].apply(lambda x : 0 if x == '<1' else x)

In [45]:
df['experience'] = df['experience'].fillna(df['experience'].median())

In [46]:
df['last_new_job'] = df['last_new_job'].apply(lambda x : 5 if x == '>4' else x)
df['last_new_job'] = df['last_new_job'].apply(lambda x : 0 if x == 'never' else x)
df['last_new_job'] = df['last_new_job'].fillna(df['last_new_job'].median())

In [47]:
df.isnull().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
target                    0
dtype: int64

# Model

## Divide Data into X & y

In [54]:
y = df['target']
X = df.drop(['enrollee_id','target'], axis = 1)

In [56]:
X.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106
1,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1,69
2,city_16,0.91,missing,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4
3,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26
4,city_100,0.887,missing,No relevent experience,no_enrollment,Masters,STEM,8,missing,missing,2,88


## Encode Categorical Features

In [61]:
X = pd.get_dummies(X, drop_first = True)

In [68]:
X.head()

Unnamed: 0,city_development_index,training_hours,city_city_10,city_city_100,city_city_101,city_city_102,city_city_103,city_city_104,city_city_105,city_city_106,...,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,company_type_missing,last_new_job_1.0,last_new_job_5,last_new_job_1,last_new_job_2,last_new_job_3,last_new_job_4
0,0.689,106,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0.923,69,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.91,4,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,0.666,26,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0.887,88,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


## Build Model

In [62]:
# Import Package
from sklearn.linear_model import LogisticRegression

In [69]:
# Initiate an Instance
lr = LogisticRegression(max_iter = 1000)

In [70]:
# Fit on Train data
lr.fit(X, y)

LogisticRegression(max_iter=1000)

In [79]:
#lr.coef_

## Make Predictions

In [71]:
y_prediction = lr.predict(X)

In [74]:
y_prediction

25

## Accuracy

In [80]:
from sklearn.metrics import classification_report

In [82]:
print(classification_report(y_prediction, y))

              precision    recall  f1-score   support

           0       1.00      0.87      0.93     18334
           1       0.01      0.56      0.01        25

    accuracy                           0.87     18359
   macro avg       0.50      0.71      0.47     18359
weighted avg       1.00      0.87      0.93     18359



# Add predictions to Table

In [75]:
df['y_prediction'] = y_prediction

In [78]:
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,y_prediction
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0,0
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1,69,0,0
2,46,city_16,0.91,missing,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0,0
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0,0
4,21751,city_100,0.887,missing,No relevent experience,no_enrollment,Masters,STEM,8,missing,missing,2,88,1,0


In [77]:
df.to_csv('hr_prediction_export.csv')