### Applying pandas qcut bins to new data

In [21]:
import pandas as pd
import numpy as np
prng = np.random.RandomState(0)
df = pd.DataFrame(prng.randn(100, 2), columns = ["A", "B"])

In [22]:
col_c = ['a','b','c','d'] * 25

In [23]:
df['C'] = col_c

In [24]:
df.head()

Unnamed: 0,A,B,C
0,1.764052,0.400157,a
1,0.978738,2.240893,b
2,1.867558,-0.977278,c
3,0.950088,-0.151357,d
4,-0.103219,0.410599,a


In [28]:
grp = df.groupby(by='C')
for col in ['A','B']:
    print(grp[col])
    

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x0000000008A83CF8>
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000000087CF208>


In [87]:
bins_dict = {}
for group, subset in grp:
    print(group)
    print(subset)

    for col in ['B']:
        ser, bins = pd.qcut(subset[col], 5, retbins=True, labels=range(5))
        col_box = col + '_box'
        subset[col_box] = ser
        key = group + '_' + col
        bins_dict[key] = bins
    print(subset)
        

a
           A         B  C
0   1.764052  0.400157  a
4  -0.103219  0.410599  a
8   1.494079 -0.205158  a
12  2.269755 -1.454366  a
16 -0.887786 -1.980796  a
20 -1.048553 -1.420018  a
24 -1.613898 -0.212740  a
28  0.066517  0.302472  a
32  0.177426 -0.401781  a
36  1.139401 -1.234826  a
40 -1.165150  0.900826  a
44 -1.070753  1.054452  a
48  0.010500  1.785870  a
52 -1.173123  1.943621  a
56 -0.861226  1.910065  a
60  0.376426 -1.099401  a
64  0.672295  0.407462  a
68  0.576591 -0.208299  a
72  2.383145  0.944479  a
76 -0.744755 -0.826439  a
80 -0.498032  1.929532  a
84  1.188030  0.316943  a
88 -0.803410 -0.689550  a
92  0.625231 -1.602058  a
96 -0.039283 -1.168093  a
           A         B  C B_box
0   1.764052  0.400157  a     3
4  -0.103219  0.410599  a     3
8   1.494079 -0.205158  a     2
12  2.269755 -1.454366  a     0
16 -0.887786 -1.980796  a     0
20 -1.048553 -1.420018  a     0
24 -1.613898 -0.212740  a     2
28  0.066517  0.302472  a     2
32  0.177426 -0.401781  a     1
36

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [44]:
bins_dict

{'a_A': array([-1.61389785, -0.91993919, -0.26114429,  0.2570259 ,  1.14912651,
         2.38314477]),
 'a_B': array([-1.98079647, -1.18143996, -0.28835654,  0.35022845,  0.96647393,
         1.94362119]),
 'b_A': array([-1.70627019, -0.79502617, -0.37001807,  0.13376468,  0.4150058 ,
         0.97873798]),
 'b_B': array([-1.53624369, -0.66774465,  0.03829071,  0.39293724,  1.13810204,
         2.2408932 ]),
 'c_A': array([-2.55298982, -0.87809739, -0.57546726,  0.76533886,  1.28188298,
         1.92294203]),
 'c_B': array([-1.37495129, -0.71627605, -0.23682732,  0.24876169,  1.0217872 ,
         1.89588918]),
 'd_A': array([-1.29285691, -1.00702085, -0.40645751,  0.15963785,  0.75615969,
         2.16323595]),
 'd_B': array([-2.22340315, -0.32940621,  0.21182369,  0.65159218,  0.90927706,
         1.84926373])}

In [49]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [50]:
save_obj(bins_dict, 'bins_dict')

In [52]:
load_obj('bins_dict')

{'a_A': array([-1.61389785, -0.91993919, -0.26114429,  0.2570259 ,  1.14912651,
         2.38314477]),
 'a_B': array([-1.98079647, -1.18143996, -0.28835654,  0.35022845,  0.96647393,
         1.94362119]),
 'b_A': array([-1.70627019, -0.79502617, -0.37001807,  0.13376468,  0.4150058 ,
         0.97873798]),
 'b_B': array([-1.53624369, -0.66774465,  0.03829071,  0.39293724,  1.13810204,
         2.2408932 ]),
 'c_A': array([-2.55298982, -0.87809739, -0.57546726,  0.76533886,  1.28188298,
         1.92294203]),
 'c_B': array([-1.37495129, -0.71627605, -0.23682732,  0.24876169,  1.0217872 ,
         1.89588918]),
 'd_A': array([-1.29285691, -1.00702085, -0.40645751,  0.15963785,  0.75615969,
         2.16323595]),
 'd_B': array([-2.22340315, -0.32940621,  0.21182369,  0.65159218,  0.90927706,
         1.84926373])}

In [54]:
bins_dict.get('a_A')

array([-1.61389785, -0.91993919, -0.26114429,  0.2570259 ,  1.14912651,
        2.38314477])

In [63]:
bins_dict = {}
res = []
for group, subset in grp:
    for col in ['A','B']:
        col_box = col + '_box'
        key = group + '_' + col
        bins = bins_dict.get(key)
        ser, bins = pd.qcut(subset[col], 5, retbins=True, labels=range(5))        
        subset[col_box] = pd.cut(subset[col], bins=bins, labels=False, include_lowest=True)
    res.append(subset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [64]:
r = pd.concat(res, axis=0)

In [65]:
r.head()

Unnamed: 0,A,B,C,A_box,B_box
0,1.764052,0.400157,a,4,3
4,-0.103219,0.410599,a,2,3
8,1.494079,-0.205158,a,4,2
12,2.269755,-1.454366,a,4,0
16,-0.887786,-1.980796,a,1,0


### apply LabelEncoder to new data

In [69]:
df = pd.DataFrame([['a','a','b','c','d','d'],['cat','cat','cat','bird','bird','dog']]).T
df.columns = ['type','name']

In [73]:
from sklearn.preprocessing import LabelEncoder

In [77]:
le_dict = {}
for col in ['type','name']:
    col_en = col + '_en'
    le = LabelEncoder()
    le.fit(df[col])
    df[col_en] = le.transform(df[col])
    le_dict[col] = le
    

In [81]:
le_dict.get('type').transform(df['type'])

array([0, 0, 1, 2, 3, 3], dtype=int64)

In [82]:
save_obj(obj=le_dict, name='le_dict')

In [83]:
le_dict1 = load_obj(name='le_dict')

In [84]:
le_dict1.get('type').transform(df['type'])

array([0, 0, 1, 2, 3, 3], dtype=int64)

In [85]:
df

Unnamed: 0,type,name,type_en,name_en
0,a,cat,0,1
1,a,cat,0,1
2,b,cat,1,1
3,c,bird,2,0
4,d,bird,3,0
5,d,dog,3,2


### model_selection

In [2]:
import numpy as np

from sklearn.model_selection import LeavePOut

X = np.ones(4)
lpo = LeavePOut(p=2)

for train, test in lpo.split(X):
    print("%s %s" % (train, test))

[2 3] [0 1]
[1 3] [0 2]
[1 2] [0 3]
[0 3] [1 2]
[0 2] [1 3]
[0 1] [2 3]


In [6]:
X = np.ones(4)
X

array([ 1.,  1.,  1.,  1.])