# Sklearn Pandas

In [1]:
import pandas as pd
# Read ./nyc_hs_sat.csv into a DataFrame, using the file's first column as the row index.
sat_df = pd.read_csv(
    './nyc_hs_sat.csv',
    index_col=0,
)

In [4]:
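# This notebook needs the third-party sklearn_pandas package.
# If it is not installed in the kernel's environment, run once: %pip install sklearn_pandas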

In [7]:
import pandas as pd
import numpy as np
import sklearn.preprocessing, sklearn.decomposition, \
sklearn.linear_model, sklearn.pipeline, sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
from sklearn_pandas import DataFrameMapper, cross_val_score

In [25]:
# Small example frame: one string column ('pet') and two float columns.
data = pd.DataFrame(
    {
        'pet': ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
        'children': [4.0, 6.0, 3.0, 3.0, 2.0, 3.0, 5.0, 4.0],
        'salary': [90.0, 24.0, 44.0, 27.0, 32.0, 59.0, 36.0, 27.0],
    }
)

In [28]:
# Column-wise preprocessing:
#   * 'pet' is label-binarized (one indicator column per pet),
#   * 'children' is standardized and exposed under the alias 'children_scaled'.
# df_out=True makes the mapper return a labelled DataFrame instead of a bare array.
mapper = DataFrameMapper(
    [
        ('pet', sklearn.preprocessing.LabelBinarizer()),
        (
            ['children'],
            sklearn.preprocessing.StandardScaler(),
            {'alias': 'children_scaled'},
        ),
    ],
    df_out=True,
)

In [29]:
# Fit the mapper on a copy of `data` and display the transformed frame.
mapper.fit_transform(data.copy())

Unnamed: 0,pet_cat,pet_dog,pet_fish,children_scaled
0,1,0,0,0.208514
1,0,1,0,1.87663
2,0,1,0,-0.625543
3,0,0,1,-0.625543
4,1,0,0,-1.459601
5,0,1,0,-0.625543
6,1,0,0,1.042572
7,0,0,1,0.208514


In [24]:
# One-row frame used to try the fitted mapper on unseen input.
sample = pd.DataFrame(
    {
        'pet': ['cat'],
        'children': [5.0],
    }
)

In [14]:
# Transform the one-row `sample` frame with the mapper fitted above.
# NOTE(review): the recorded output below is a bare array even though that mapper sets
# df_out=True, and this cell's execution count (14) is lower than the mapper cell's (28).
# The output looks stale -- re-run on a fresh kernel to confirm.
mapper.transform(sample)

array([[1.        , 0.        , 0.        , 1.04257207]])

In [16]:
# Inspect the output column names recorded by the mapper.
# NOTE(review): the recorded output lists 'children' rather than the 'children_scaled'
# alias configured on the mapper above; it presumably predates the alias -- re-run to confirm.
mapper.transformed_names_

['pet_cat', 'pet_dog', 'pet_fish', 'children']

### Custom Column Names

In [50]:
# Same transformers as before, but without df_out, so the result is a plain array.
# 'pet' is listed twice, which is why its three indicator columns appear both before
# and after the scaled 'children' column in the output.
mapper = DataFrameMapper(
    [
        ('pet', sklearn.preprocessing.LabelBinarizer()),
        (
            ['children'],
            sklearn.preprocessing.StandardScaler(),
            {'alias': 'children_scaled'},
        ),
        ('pet', sklearn.preprocessing.LabelBinarizer()),
    ]
)

In [51]:
# Fit the mapper on a copy of `data` and display the transformed array.
mapper.fit_transform(data.copy())

array([[ 1.        ,  0.        ,  0.        ,  0.20851441,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ,  1.87662973,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        , -0.62554324,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  1.        , -0.62554324,  0.        ,
         0.        ,  1.        ],
       [ 1.        ,  0.        ,  0.        , -1.4596009 ,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        , -0.62554324,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ,  1.04257207,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.        ,  0.20851441,  0.        ,
         0.        ,  1.        ]])

In [21]:
# Inspect the output column names recorded by the mapper.
# NOTE(review): the recorded output has four names, while the array shown above has seven
# columns; this cell's execution count (21) is lower than the mapper cell's (50), so the
# output is likely stale -- re-run to confirm.
mapper.transformed_names_

['pet_cat', 'pet_dog', 'pet_fish', 'children_scaled']

### Custom Encoder

In [46]:
from sklearn.base import TransformerMixin
class DateEncoder(TransformerMixin):
    """Stateless transformer that splits a datetime Series into year, month and day.

    Nothing is learned in ``fit``; ``transform`` reads the input through its ``.dt``
    accessor, so the input must be a datetime-like pandas Series.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (the encoder has no state to learn)."""
        return self

    def transform(self, X):
        """Return a DataFrame with ``year``, ``month`` and ``day`` columns.

        Parameters
        ----------
        X : pandas.Series
            Datetime-like Series (accessed via ``X.dt``).

        Returns
        -------
        pandas.DataFrame
            One row per element of ``X``, with integer columns ``year``, ``month``, ``day``.
        """
        dt = X.dt
        # `keys` labels the concatenated columns. The previous `names=` argument only
        # names the levels of a hierarchical index and is ignored without `keys`, which
        # left all three columns carrying the input Series' own name.
        return pd.concat(
            [dt.year, dt.month, dt.day],
            axis=1,
            keys=['year', 'month', 'day'],
        )

In [47]:
# Four consecutive daily timestamps that cross a month boundary.
dates_df = pd.DataFrame(
    {'dates': pd.date_range(start='2015-10-30', end='2015-11-02')}
)


In [52]:
# Send the 'dates' column through the custom DateEncoder and return a DataFrame.
# input_df=True is presumably needed so the encoder receives a pandas Series
# (it relies on the `.dt` accessor) -- verify against the sklearn-pandas docs.
mapper_dates = DataFrameMapper(
    [('dates', DateEncoder())],
    df_out=True,
    input_df=True,
)

In [53]:
# Fit and apply the date mapper to `dates_df`; the recorded output has three integer
# columns labelled dates_0, dates_1 and dates_2.
mapper_dates.fit_transform(dates_df)

Unnamed: 0,dates_0,dates_1,dates_2
0,2015,10,30
1,2015,10,31
2,2015,11,1
3,2015,11,2


### Combining Columns

In [57]:
from sklearn.impute import MissingIndicator

In [56]:
# Display the example frame again for reference before mapping it.
data

Unnamed: 0,pet,children,salary
0,cat,4.0,90.0
1,dog,6.0,24.0
2,dog,3.0,44.0
3,fish,3.0,27.0
4,cat,2.0,32.0
5,dog,3.0,59.0
6,cat,5.0,36.0
7,fish,4.0,27.0


In [68]:
# Binarize 'pet' and carry 'children' through unchanged (transformer None) twice,
# so the output contains the original 'children' values in two columns.
mapper = DataFrameMapper(
    [
        ('pet', sklearn.preprocessing.LabelBinarizer()),
        ('children', None),
        ('children', None),
    ],
    df_out=True,
    input_df=True,
)

In [69]:
# Fit and apply the mapper to `data` directly (no copy is taken here) and display the result.
mapper.fit_transform(data)

Unnamed: 0,pet_cat,pet_dog,pet_fish,children,children.1
0,1,0,0,4.0,4.0
1,0,1,0,6.0,6.0
2,0,1,0,3.0,3.0
3,0,0,1,3.0,3.0
4,1,0,0,2.0,2.0
5,0,1,0,3.0,3.0
6,1,0,0,5.0,5.0
7,0,0,1,4.0,4.0


### Resources

[Sklearn pandas](https://github.com/scikit-learn-contrib/sklearn-pandas)