# Missing Values

## Import Libraries

In [2]:
import pandas as pd

## Import Data

In [3]:
# Training data file; the bare file name is resolved against the notebook's working directory.
TRAIN_CSV_PATH = 'train_classification.csv'
train = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
# Columns whose non-null count is below the row count contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18359 entries, 0 to 18358
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             18359 non-null  int64  
 1   city                    18359 non-null  object 
 2   city_development_index  18359 non-null  float64
 3   gender                  14261 non-null  object 
 4   relevent_experience     18359 non-null  object 
 5   enrolled_university     18017 non-null  object 
 6   education_level         17902 non-null  object 
 7   major_discipline        15521 non-null  object 
 8   experience              18300 non-null  object 
 9   company_size            13580 non-null  object 
 10  company_type            13320 non-null  object 
 11  last_new_job            17992 non-null  object 
 12  training_hours          18359 non-null  int64  
 13  target                  18359 non-null  int64  
dtypes: float64(1), int64(3), object(10)
memory usage: 2.0+ MB

## Partition Data into X and y

In [6]:
# Features: everything except the label ('target') and 'enrollee_id'
# (presumably a row identifier with no predictive meaning - confirm).
X = train.drop(columns=['target', 'enrollee_id'])
# Label: the column the model is meant to predict.
y = train['target']

## Get Numerical and Categorical Column Names

In [8]:
# Collect the names of all numeric columns.
# 'number' matches every numeric dtype (any int/float width), so a column is not
# silently dropped from this list just because it is stored as e.g. int32 or float32
# rather than int64/float64.
numerical_features = X.select_dtypes(include='number').columns.tolist()
numerical_features

['city_development_index', 'training_hours']

In [9]:
# Categorical columns are defined as "every column that is not in numerical_features".
# Deriving the list as a complement (instead of repeating the dtype list) guarantees
# the two lists always partition X's columns: none is listed twice, none is left out.
# Column order follows X.columns.
categorical_features = [col for col in X.columns if col not in numerical_features]
categorical_features

['city',
 'gender',
 'relevent_experience',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'experience',
 'company_size',
 'company_type',
 'last_new_job']

# Missing-Value Strategy

## 1/ Identify

In [16]:
# Count the missing entries in each feature column.
X.isna().sum()

city                         0
city_development_index       0
gender                    4098
relevent_experience          0
enrolled_university        342
education_level            457
major_discipline          2838
experience                  59
company_size              4779
company_type              5039
last_new_job               367
training_hours               0
dtype: int64

In [22]:
# Check Numerical Features: list them again to compare against the null counts above
# (both numerical columns show 0 missing values there).
numerical_features

['city_development_index', 'training_hours']

## 2/ Numerical - Treat Missing Values

In [13]:
from sklearn.impute import SimpleImputer

In [15]:
# Create Instance: imputer that fills missing numerical values with the column median
num_impute = SimpleImputer(strategy = 'median')

In [17]:
# Fit - learn the median of each numerical column (strategy='median', not the mean)
num_impute.fit(X[numerical_features])

SimpleImputer(strategy='median')

In [21]:
# Transform and replace in the X DataFrame: the numerical columns are overwritten
# with the imputer's output.
X[numerical_features] = num_impute.transform(X[numerical_features])

## 3/ Categorical - Treat Missing Values

In [24]:
# Imputer for the categorical columns: every gap is filled with the fixed label
# 'missing', so "no value recorded" becomes a category of its own.
cat_impute = SimpleImputer(
    fill_value='missing',
    strategy='constant',
)

In [25]:
# Fit on the categorical columns; the fill value is the fixed constant 'missing'
# configured on the imputer.
cat_impute.fit(X[categorical_features])

SimpleImputer(fill_value='missing', strategy='constant')

In [26]:
# Transform and replace in the X DataFrame: the categorical columns are overwritten
# with the imputer's output.
X[categorical_features] = cat_impute.transform(X[categorical_features])

## 4/ Check That No Missing Values Remain

In [27]:
# Verify the imputation: count the missing values left in each column and fail loudly
# if any remain, so a full re-run cannot pass silently with leftover NaNs.
remaining_nulls = X.isnull().sum()
assert remaining_nulls.sum() == 0, (
    f"Columns still containing missing values: "
    f"{remaining_nulls[remaining_nulls > 0].to_dict()}"
)
# Display the per-column counts (all zeros when the check passes).
remaining_nulls

city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64