In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Small demo frame with one boolean, one integer and one float column,
# used below to try out dtype-based column selection.
df = pd.DataFrame(
    {
        'boolean_column': [True, False, True, False],
        'integer_column': [1, 2, 3, 4],
        'float_column': [1.0, 2.0, 3.0, 4.0],
    }
)
df

Unnamed: 0,boolean_column,integer_column,float_column
0,True,1,1.0
1,False,2,2.0
2,True,3,3.0
3,False,4,4.0


In [5]:
# NOTE(review): only the value of a cell's last expression is rendered, so the
# boolean selection on the next line is computed but never shown; the table
# below comes from the numeric selection alone. Wrap both calls in display(...)
# if both results should appear.
df.select_dtypes(include='bool')
df.select_dtypes(include=np.number)

Unnamed: 0,integer_column,float_column
0,1,1.0
1,2,2.0
2,3,3.0
3,4,4.0


In [11]:
class TypeSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the DataFrame columns of a given dtype.

    Parameters
    ----------
    dtype
        Forwarded unchanged as ``include=`` to ``DataFrame.select_dtypes``
        (this notebook passes ``'bool'`` and ``np.number``).
    """

    def __init__(self, dtype):
        # Stored under the parameter's own name; nothing else happens here.
        self.dtype = dtype

    def fit(self, X, y=None):
        """No-op: nothing is learned from ``X``; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` selected by ``self.dtype``.

        ``y`` is accepted but not used.
        """
        wanted = self.dtype
        return X.select_dtypes(include=wanted)

In [12]:
# Keep only the boolean column. transform is called without fit, which works
# because TypeSelector.fit does nothing.
a = TypeSelector(dtype='bool')
a.transform(X=df)

Unnamed: 0,boolean_column
0,True
1,False
2,True
3,False


In [14]:
# Same selector, this time asking for every numeric dtype: both the integer
# and the float column come back.
b = TypeSelector(dtype=np.number)
b.transform(X=df)

Unnamed: 0,integer_column,float_column
0,1,1.0
1,2,2.0
2,3,3.0
3,4,4.0


In [19]:
# Rebind df to a one-column frame that contains a missing value and is stored
# as a categorical, to explore how categoricals can be encoded.
df = pd.DataFrame(
    {'eye_color': ['green', np.nan, 'blue', 'brown']}
).astype({'eye_color': 'category'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
eye_color    3 non-null category
dtypes: category(1)
memory usage: 236.0 bytes


In [27]:
# One indicator column per category; dummy_na=True adds the extra NaN column
# that flags the row whose eye colour is missing.
pd.get_dummies(df['eye_color'],dummy_na=True)

Unnamed: 0,blue,brown,green,NaN
0,0,0,1,0
1,0,0,0,1
2,1,0,0,0
3,0,1,0,0


In [29]:
# Integer code of each row's category; the missing entry shows up as -1.
df['eye_color'].cat.codes

0    2
1   -1
2    0
3    1
dtype: int8

In [31]:
class StringIndex(BaseEstimator, TransformerMixin):
    """Transformer that replaces categorical columns by their integer codes.

    Every column of the input is accessed through ``.cat``. Each value is
    mapped to its category code, and the ``-1`` code is replaced by the
    number of categories of that column.
    """

    def __init__(self):
        """Nothing to configure."""

    def fit(self, X, y=None):
        """No-op: returns the transformer itself without inspecting ``X``."""
        return self

    def transform(self, X, y=None):
        """Return a frame of integer codes, one column per column of ``X``.

        ``y`` is accepted but not used.
        """
        def encode(column):
            n_categories = len(column.cat.categories)
            # -1 is the code seen for the missing entry; move it to the first
            # index not used by a real category.
            return column.cat.codes.replace({-1: n_categories})

        return X.apply(encode)

In [32]:
# Encode the eye_color frame: the three categories get codes 0-2 and the
# missing row gets 3, the number of categories.
c = StringIndex()
c.transform(X=df)

Unnamed: 0,eye_color
0,2
1,3
2,0
3,1


In [33]:
# Show df again: it still holds the original labels, so the transform above
# returned a new frame rather than modifying df.
df

Unnamed: 0,eye_color
0,green
1,
2,blue
3,brown
