In [74]:
# https://github.com/scikit-learn-contrib/sklearn-pandas

from sklearn_pandas import DataFrameMapper, cross_val_score
import pandas as pd
import numpy as np
import sklearn.preprocessing, sklearn.decomposition, sklearn.linear_model, sklearn.pipeline, sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.DataFrame({'pet':      ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
                     'children': [4., 6, 3, 3, 2, 3, 5, 4],
                     'salary':   [90., 24, 44, 27, 32, 59, 36, 27]})

In [3]:
data

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0
5,dog,3.0,59.0
6,cat,5.0,36.0
7,fish,4.0,27.0


set
mapper = DataFrameMapper([(<font color=red>'pet'</font>,<font color=green> sklearn.preprocessing.LabelBinarizer())</font>,
                        (<font color=red>['children']</font>,<font color=green>sklearn.preprocessing.StandardScaler())</font>])
#### DataFrameMapper takes a list of tuples:
- <font color=red> first element- column name</font>
- <font color=green> second element- object which perform the transformation</font>
- third element- is optional, dict with transformation options

Diffrence between <font color=red>'pet'</font> and <font color=red>['children']</font> is the SHAPE, some transformers need 1-dim input und some 2-dim input

In [14]:
mapper = DataFrameMapper([('pet', sklearn.preprocessing.LabelBinarizer()),
                        (['children'], sklearn.preprocessing.StandardScaler())
                        ],df_out=True)

In [15]:
mapper

DataFrameMapper(default=False, df_out=True,
                features=[('pet',
                           LabelBinarizer(neg_label=0, pos_label=1,
                                          sparse_output=False)),
                          (['children'],
                           StandardScaler(copy=True, with_mean=True,
                                          with_std=True))],
                input_df=False, sparse=False)

## Test the Transformation

In [16]:
np.round(mapper.fit_transform(data.copy()),2)

# here firts 3 columns are for column pet, and fourth column is standarize children column

Unnamed: 0,pet_cat,pet_dog,pet_fish,children
0,1,0,0,0.21
1,0,1,0,1.88
2,0,1,0,-0.63
3,0,0,1,-0.63
4,1,0,0,-1.46
5,0,1,0,-0.63
6,1,0,0,1.04
7,0,0,1,0.21


In [17]:
sample = pd.DataFrame({'pet': ['cat'], 'children': [5.]})
np.round(mapper.transform(sample), 2)


Unnamed: 0,pet_cat,pet_dog,pet_fish,children
0,1,0,0,1.04


### Output features names

In [18]:
mapper.transformed_names_


['pet_cat', 'pet_dog', 'pet_fish', 'children']

### Customer columns name for transformed features

In [22]:
mapper_alias = DataFrameMapper([
            (['children'], sklearn.preprocessing.StandardScaler(),
             {'alias': 'children_scaled'})
])

np.round(mapper_alias.fit_transform(data.copy()),2)
mapper_alias.transformed_names_

# here we change the name of transformed column

['children_scaled']

### Passing Series/ DataFrames to the transformer


In [41]:
from sklearn.base import TransformerMixin

class DateEncoder(TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        dt = X.dt
        return pd.concat([dt.year, dt.month, dt.day], axis=1)
dates_df = pd.DataFrame({'dates': pd.date_range('2015-10-30', '2015-11-02')})

mapper_dates = DataFrameMapper([
            ('dates', DateEncoder())], input_df=True)
mapper_dates.fit_transform(dates_df)


array([[2015,   10,   30],
       [2015,   10,   31],
       [2015,   11,    1],
       [2015,   11,    2]], dtype=int64)

### Outputting a dataframe

#### this doesn't work with deflaut = True or sparse = True

In [43]:
mapper_df = DataFrameMapper([('pet', sklearn.preprocessing.LabelBinarizer()),
                             (['children'], sklearn.preprocessing.StandardScaler())],
                             df_out=True)
np.round(mapper_df.fit_transform(data.copy()), 2)

Unnamed: 0,pet_cat,pet_dog,pet_fish,children
0,1,0,0,0.21
1,0,1,0,1.88
2,0,1,0,-0.63
3,0,0,1,-0.63
4,1,0,0,-1.46
5,0,1,0,-0.63
6,1,0,0,1.04
7,0,0,1,0.21


### Transform multiple columns
- sometimes transformer requires multiple columns as a input

In [48]:
mapper2 = DataFrameMapper([(['children','salary'],sklearn.decomposition.PCA(1))])

mapper2.fit_transform(data.copy())

array([[ 47.62195051],
       [-18.39077736],
       [  1.63037658],
       [-15.36917967],
       [-10.36208485],
       [ 16.62998504],
       [ -6.38386526],
       [-15.376405  ]])

### Multiple transformers for the same columns
- we can specify multiple transformers the same way like multiple columns. To do this we use a list of transformers

In [51]:
mapper3 = DataFrameMapper([(['age'], [sklearn.preprocessing.Imputer(),sklearn.preprocessing.StandardScaler()])])
data_3 = pd.DataFrame({'age': [1, np.nan, 3]})
mapper3.fit_transform(data_3)




array([[-1.22474487],
       [ 0.        ],
       [ 1.22474487]])

### Columns that don't need any transformation

In [53]:
mapper3 = DataFrameMapper([('pet',sklearn.preprocessing.LabelBinarizer()),
                           ('children',None)])

mapper3.fit_transform(data.copy())

array([[1., 0., 0., 4.],
       [0., 1., 0., 6.],
       [0., 1., 0., 3.],
       [0., 0., 1., 3.],
       [1., 0., 0., 2.],
       [0., 1., 0., 3.],
       [1., 0., 0., 5.],
       [0., 0., 1., 4.]])

### Applaying a deflaut transformer

In [59]:
mapper4 = DataFrameMapper([
         ('pet', sklearn.preprocessing.LabelBinarizer()),
         ('children', None)], 
         default=sklearn.preprocessing.StandardScaler())
np.round(mapper4.fit_transform(data.copy()), 1)
# here for 'children' column we don't do the transformetion, for columns which are not selected/defined  we do the deflaut transformetion

array([[ 1. ,  0. ,  0. ,  4. ,  2.3],
       [ 0. ,  1. ,  0. ,  6. , -0.9],
       [ 0. ,  1. ,  0. ,  3. ,  0.1],
       [ 0. ,  0. ,  1. ,  3. , -0.7],
       [ 1. ,  0. ,  0. ,  2. , -0.5],
       [ 0. ,  1. ,  0. ,  3. ,  0.8],
       [ 1. ,  0. ,  0. ,  5. , -0.3],
       [ 0. ,  0. ,  1. ,  4. , -0.7]])

In [56]:
mapper4 = DataFrameMapper([
         ('pet', sklearn.preprocessing.LabelBinarizer())], 
         default=sklearn.preprocessing.StandardScaler())
np.round(mapper4.fit_transform(data.copy()), 1)
# for columns which are not selected/defined we do the deflaut transformetion

array([[ 1. ,  0. ,  0. ,  0.2,  2.3],
       [ 0. ,  1. ,  0. ,  1.9, -0.9],
       [ 0. ,  1. ,  0. , -0.6,  0.1],
       [ 0. ,  0. ,  1. , -0.6, -0.7],
       [ 1. ,  0. ,  0. , -1.5, -0.5],
       [ 0. ,  1. ,  0. , -0.6,  0.8],
       [ 1. ,  0. ,  0. ,  1. , -0.3],
       [ 0. ,  0. ,  1. ,  0.2, -0.7]])

### Same transformers for the multiple columns

In [65]:
from sklearn_pandas import gen_features
features_def = gen_features(
            columns = ['col1','col2','col3'],
            classes = [sklearn.preprocessing.LabelEncoder])
features_def
mapper5 = DataFrameMapper(features_def)

data5 = pd.DataFrame({
     'col1': ['yes', 'no', 'yes'],
     'col2': [True, False, False],
     'col3': ['one', 'two', 'three']})
    
mapper5.fit_transform(data5)

array([[1, 1, 0],
       [0, 0, 2],
       [1, 0, 1]], dtype=int64)

In [70]:
from sklearn.feature_selection import SelectKBest, chi2
mapper_fs = DataFrameMapper([(['children','salary'], SelectKBest(chi2, k=1))])
mapper_fs.fit_transform(data[['children','salary']], data['pet'])


array([[90.],
       [24.],
       [44.],
       [27.],
       [32.],
       [59.],
       [36.],
       [27.]])

In [72]:
pipe = sklearn.pipeline.Pipeline([
    ('featurize', mapper),
    ('lm', sklearn.linear_model.LinearRegression())])
np.round(cross_val_score(pipe, X=data.copy(), y=data.salary, scoring='r2'), 2)




TypeError: 'builtin_function_or_method' object is not iterable