In [1]:
import numpy as np
import pandas as pd

#### enrollee_id : Unique ID for candidate

#### city : City code

#### city_development_index : Development index of the city (scaled)

#### gender : Gender of candidate

#### relevent_experience : Relevant experience of candidate

#### enrolled_university : Type of University course enrolled if any

#### education_level : Education level of candidate

#### major_discipline : Education major discipline of candidate

#### experience : Candidate total experience in years

#### company_size : Number of employees in current employer's company

#### company_type : Type of current employer

#### last_new_job : Difference in years between previous job and current job

#### training_hours : Training hours completed

#### target : 0 – Not looking for job change, 1 – Looking for a job change

# Cleansing Data

In [2]:
# Read the raw train and test splits from the current working directory.
# NOTE(review): hrd_test is not referenced by any other cell visible here —
# confirm it is cleaned the same way elsewhere.
hrd = pd.read_csv(filepath_or_buffer='aug_train.csv')
hrd_test = pd.read_csv(filepath_or_buffer='aug_test.csv')

In [3]:
# Preview the first five rows of the raw training frame.
hrd.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [4]:
# Show each column's dtype and non-null count to see where values are missing.
hrd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

## Imputation

In [5]:
# Number of missing values in each column of the raw training frame.
hrd.isna().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
last_new_job               423
training_hours               0
target                       0
dtype: int64

In [6]:
# Rank the columns of the raw frame by missing-value count and add the
# share of rows affected, expressed as a percentage.
missing1 = (
    hrd.isnull()
    .sum()
    .sort_values(ascending=False)
    .rename_axis('features')
    .reset_index(name='missing_num')
)
missing1['percentage'] = missing1['missing_num'] / len(hrd) * 100
missing1

Unnamed: 0,features,missing_num,percentage
0,company_type,6140,32.049274
1,company_size,5938,30.994885
2,gender,4508,23.53064
3,major_discipline,2813,14.683161
4,education_level,460,2.401086
5,last_new_job,423,2.207955
6,enrolled_university,386,2.014824
7,experience,65,0.339284
8,target,0,0.0
9,training_hours,0,0.0


Because `company_type` and `company_size` each have more than 30% missing values, I am going to drop both columns.

In [7]:
# Build the working training frame without the two columns that are
# missing in more than 30% of rows; `hrd` itself is left untouched.
hrd_train = hrd.drop(columns=['company_type', 'company_size'])

In [8]:
# Recompute the missing-value summary on the frame with the sparse columns removed.
missing2 = hrd_train.isnull().sum().sort_values(ascending=False).reset_index()
missing2.columns = ['features','missing_num']
# Use hrd_train's own row count as the denominator so the percentage always
# describes the frame being summarised (previously it read hrd.shape[0], which
# only matched because no rows had been dropped).
missing2['percentage'] = missing2['missing_num']/hrd_train.shape[0]*100
missing2

Unnamed: 0,features,missing_num,percentage
0,gender,4508,23.53064
1,major_discipline,2813,14.683161
2,education_level,460,2.401086
3,last_new_job,423,2.207955
4,enrolled_university,386,2.014824
5,experience,65,0.339284
6,target,0,0.0
7,training_hours,0,0.0
8,relevent_experience,0,0.0
9,city_development_index,0,0.0


Now I am going to fill the missing values in the remaining columns with each column's mode (most frequent value).

In [9]:
# Columns that still contain missing values after the column drop.
col_mode = ['gender','major_discipline','education_level','last_new_job','enrolled_university', 'experience']
# Impute each column with its own most frequent value.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on hrd_train[col]: that form is a chained in-place
# update on a column selection, which emits a FutureWarning on pandas 2.x and
# leaves hrd_train unchanged when Copy-on-Write is enabled.
for col in col_mode:
    hrd_train[col] = hrd_train[col].fillna(hrd_train[col].mode()[0])

In [10]:
# Re-check missing values after imputation; every count is expected to be zero.
missing3 = hrd_train.isnull().sum().sort_values(ascending=False).reset_index()
missing3.columns = ['features','missing_num']
# Multiply by 100 so this column is a percentage, in the same units as the
# 'percentage' column of the earlier missing-value tables (the factor was
# previously omitted here, yielding a fraction instead).
missing3['percentage'] = missing3['missing_num']/hrd_train.shape[0]*100
missing3

Unnamed: 0,features,missing_num,percentage
0,target,0,0.0
1,training_hours,0,0.0
2,last_new_job,0,0.0
3,experience,0,0.0
4,major_discipline,0,0.0
5,education_level,0,0.0
6,enrolled_university,0,0.0
7,relevent_experience,0,0.0
8,gender,0,0.0
9,city_development_index,0,0.0


In [11]:
# Spot-check the first ten rows of the imputed training frame.
hrd_train.head(10)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,>4,47,0.0
2,11561,city_21,0.624,Male,No relevent experience,Full time course,Graduate,STEM,5,never,83,0.0
3,33241,city_115,0.789,Male,No relevent experience,no_enrollment,Graduate,Business Degree,<1,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,4,8,0.0
5,21651,city_176,0.764,Male,Has relevent experience,Part time course,Graduate,STEM,11,1,24,1.0
6,28806,city_160,0.92,Male,Has relevent experience,no_enrollment,High School,STEM,5,1,24,0.0
7,402,city_46,0.762,Male,Has relevent experience,no_enrollment,Graduate,STEM,13,>4,18,1.0
8,27107,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,7,1,46,1.0
9,699,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,17,>4,123,0.0


In [12]:
# Write the cleaned training frame to disk.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column; pass index=False if downstream readers do not expect
# it — confirm against whatever loads 'hrd_train.csv' before changing.
hrd_train.to_csv('hrd_train.csv')