Handling Missing Data in scikit-learn

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Toy single-column frame with one missing entry (np.nan at position 2).
# NOTE(review): the column label is spelled 'farthsest_run_mi' (sic); it is
# kept verbatim because the recorded outputs in this notebook use the same label.
miles = pd.DataFrame(
    [50, 62, np.nan, 100, 26, 13, 31, 50],
    columns=['farthsest_run_mi'],
)

In [4]:
# Display the frame; row 2 is the missing value that the imputers will fill.
miles

Unnamed: 0,farthsest_run_mi
0,50.0
1,62.0
2,
3,100.0
4,26.0
5,13.0
6,31.0
7,50.0


In [5]:
# Count missing values per column (sum the boolean NaN mask down the rows).
miles.isna().sum(axis=0)

farthsest_run_mi    1
dtype: int64

In [6]:
from sklearn.impute import SimpleImputer

In [7]:
# Imputer that fills missing entries with the mean of the observed values
# in each column.
imp_mean = SimpleImputer(strategy='mean')

In [8]:
# Fit and transform in one call. The result is a NumPy array in which the
# missing row holds 47.43, the mean of the seven observed values (332 / 7).
imp_mean.fit_transform(miles)

array([[ 50.        ],
       [ 62.        ],
       [ 47.42857143],
       [100.        ],
       [ 26.        ],
       [ 13.        ],
       [ 31.        ],
       [ 50.        ]])

In [9]:
# Imputer that fills missing entries with the median of the observed values
# in each column.
imp_median = SimpleImputer(strategy='median')

In [10]:
# The missing row becomes 50, the middle value of the seven observed entries
# (13, 26, 31, 50, 50, 62, 100).
imp_median.fit_transform(miles)

array([[ 50.],
       [ 62.],
       [ 50.],
       [100.],
       [ 26.],
       [ 13.],
       [ 31.],
       [ 50.]])

In [11]:
# Imputer that fills missing entries with the most frequent observed value
# (the mode) of each column.
imp_mode = SimpleImputer(strategy='most_frequent')

In [12]:
# The missing row becomes 50, the only value that occurs more than once in
# the column. For this data the result happens to match the median fill.
imp_mode.fit_transform(miles)

array([[ 50.],
       [ 62.],
       [ 50.],
       [100.],
       [ 26.],
       [ 13.],
       [ 31.],
       [ 50.]])

In [13]:
# Imputer that fills missing entries with a fixed value (13 here).
# NOTE(review): this imputer is created but not applied in any visible cell;
# a following `imp_constant.fit_transform(miles)` cell appears to be missing.
imp_constant = SimpleImputer(strategy='constant', fill_value=13)

In [14]:
# Toy single-column frame of strings with one missing entry (np.nan at
# position 4), used to demonstrate imputation on non-numeric data.
# NOTE(review): 'wanger' looks like a transposition of 'wagner'; it is kept
# verbatim because the recorded outputs show the same spelling. Confirm with
# the author.
names = pd.DataFrame(
    ['ryan', 'nolan', 'honus', 'wanger', np.nan, 'ruth'],
    columns=['names'],
)

In [15]:
# Display the frame; row 4 is the missing name.
names

Unnamed: 0,names
0,ryan
1,nolan
2,honus
3,wanger
4,
5,ruth


In [16]:
# Constant-fill imputer for text data: missing entries are replaced with the
# string 'babe'.
imp_constant_cat = SimpleImputer(strategy='constant', fill_value='babe')

In [17]:
# The result is an object-dtype array; the missing entry (row 4) is now 'babe'.
imp_constant_cat.fit_transform(names)

array([['ryan'],
       ['nolan'],
       ['honus'],
       ['wanger'],
       ['babe'],
       ['ruth']], dtype=object)

In [18]:
# Mean imputer that also appends a missing-value indicator column, so that
# downstream steps can tell which rows were imputed.
imp_mean_marked = SimpleImputer(strategy='mean', add_indicator=True)

In [19]:
# The output has two columns: the mean-imputed values, then an indicator that
# is 1 on the row that was originally missing (row 2) and 0 elsewhere.
imp_mean_marked.fit_transform(miles)

array([[ 50.        ,   0.        ],
       [ 62.        ,   0.        ],
       [ 47.42857143,   1.        ],
       [100.        ,   0.        ],
       [ 26.        ,   0.        ],
       [ 13.        ,   0.        ],
       [ 31.        ,   0.        ],
       [ 50.        ,   0.        ]])

In [20]:
# Load the example data from a CSV file; the path is relative, so it is
# resolved against the notebook's working directory.
df = pd.read_csv('simple_imputer_csv.csv')

In [21]:
# Display the loaded frame: 'Name' is missing in row 5 and 'farthest_run_mi'
# is missing in row 2.
df

Unnamed: 0,Name,farthest_run_mi
0,Ryan,50.0
1,Nolan,62.0
2,Walter,
3,Honus,100.0
4,Christy,26.0
5,,13.0
6,Napoleon,31.0
7,Tris,50.0


In [22]:
from sklearn.compose import make_column_transformer

In [23]:
# Apply a different imputer to each column: the constant 'babe' fill for the
# text column and the mean fill for the numeric column. Columns not listed
# here are dropped from the result.
ct = make_column_transformer(
    (imp_constant_cat, ['Name']),
    (imp_mean, ['farthest_run_mi']),
    remainder='drop',
)

In [24]:
# Ask the column transformer to return pandas DataFrames (with column names)
# from transform / fit_transform instead of bare NumPy arrays.
ct.set_output(transform='pandas')

In [25]:
# Fit both imputers on their columns and transform the frame in one step;
# with pandas output configured on `ct`, the result is a DataFrame.
df_pandas = ct.fit_transform(df)

In [26]:
# Display the imputed frame. Column labels are prefixed with the
# auto-generated transformer names (e.g. 'simpleimputer-1__Name').
df_pandas

Unnamed: 0,simpleimputer-1__Name,simpleimputer-2__farthest_run_mi
0,Ryan,50.0
1,Nolan,62.0
2,Walter,47.428571
3,Honus,100.0
4,Christy,26.0
5,babe,13.0
6,Napoleon,31.0
7,Tris,50.0
