### Identifying missing values in tabular data

In [7]:
import pandas as pd
from io import StringIO

from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the
# last one; the cells below rely on this to show several results each.
InteractiveShell.ast_node_interactivity = 'all'

# Small in-memory CSV with two missing entries: column C in the second data
# row and column D in the third. The literal starts with a blank line, which
# read_csv skips by default.
csv_data = '''
A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
'''
df = pd.read_csv(StringIO(csv_data))
df

# Count the missing values in each column
df.isnull().sum()

# The underlying NumPy array is always accessible, which is handy because the
# scikit-learn API sometimes accepts NumPy arrays as input. pandas recommends
# DataFrame.to_numpy() over the older .values attribute.
df.to_numpy()

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

### Eliminating training examples or features with missing values


In [12]:
# Remove every row that contains at least one NaN
# (axis='index' is the same as axis=0, axis='columns' the same as axis=1)
df.dropna(axis='index')

# Remove every column that contains at least one NaN in any row
df.dropna(axis='columns')

# Remove a row only when all of its columns are NaN
df.dropna(how='all')

# Keep only rows with at least 3 non-NaN values,
# i.e. drop rows that have fewer than 3 real values
df.dropna(thresh=3)

# Remove a row only when NaN appears in the listed columns (here: 'C')
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


### Imputing missing values

In [15]:
from sklearn.impute import SimpleImputer
import numpy as np

# Mean imputation: every missing value is replaced by the mean of its
# feature (column). Other values accepted by `strategy` are 'median' and
# 'most_frequent'; 'most_frequent' is useful for imputing categorical
# variables.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means from df and fill its NaNs in a single call
imputed_data = imr.fit_transform(df.values)
imputed_data

# The same mean imputation done directly in pandas
df.fillna(df.mean())

# Another option: a KNN imputer, which handles missing values with a
# k-nearest-neighbors approach.

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


### Handling categorical data


In [21]:
# Two kinds of categorical features:
#   ordinal - the categories have an order, i.e. they can be sorted
#   nominal - the categories have no order
import pandas as pd
df = pd.DataFrame(
    [
        ['Red', 'M', 11.5, 'class1'],
        ['Blue', 'L', 13.5, 'class2'],
        ['Green', 'XL', 15.5, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

# Encode the ordinal 'size' feature by hand: a larger size gets a larger integer
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)
df

# Reverse lookup (integer -> original label) to recover the size strings
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)

Unnamed: 0,color,size,price,classlabel
0,Red,M,11.5,class1
1,Blue,L,13.5,class2
2,Green,XL,15.5,class1


Unnamed: 0,color,size,price,classlabel
0,Red,1,11.5,class1
1,Blue,2,13.5,class2
2,Green,3,15.5,class1


0     M
1     L
2    XL
Name: size, dtype: object