In [2]:
import numpy as np 
from sklearn.datasets import load_boston

# Load the Boston housing data. The target is reshaped into a single column
# because the binarizing helpers used below operate on 2-D input.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases.
boston = load_boston()
X = boston.data
y = boston.target.reshape(-1, 1)

In [3]:
# Two ways to binarize features in scikit-learn: preprocessing.binarize and preprocessing.Binarizer

In [6]:
# Approach 1: the functional API, preprocessing.binarize.
# Targets above the mean target value become 1, everything else becomes 0.
from sklearn import preprocessing

new_target = preprocessing.binarize(
    y,
    threshold=boston.target.mean(),
)
new_target[:5, :]

array([[1.],
       [0.],
       [1.],
       [1.],
       [1.]])

In [5]:
# Sanity check: the same thresholding written directly with NumPy.
np.greater(y[:5], y.mean()).astype(int)

array([[1],
       [0],
       [1],
       [1],
       [1]])

In [8]:
# Approach 2: the estimator API, preprocessing.Binarizer.
# It applies the same thresholding as preprocessing.binarize but as a
# transformer object with fit/transform methods.
# The threshold is passed by keyword: recent scikit-learn releases accept
# constructor parameters by keyword only, so the positional form raises
# TypeError there. The keyword form works on old and new releases alike.
binar = preprocessing.Binarizer(threshold=y.mean())
new_target = binar.fit_transform(y)
new_target[:5]

array([[1.],
       [0.],
       [1.],
       [1.],
       [1.]])

In [9]:
# Sparse matrices: binarize refuses a negative threshold on sparse input
# (every implicit zero would be above the threshold and become a 1).
# coo_matrix is imported from the public scipy.sparse namespace; the
# scipy.sparse.coo submodule is deprecated.
from scipy.sparse import coo_matrix

# A dedicated, seeded RandomState keeps the sample reproducible without
# touching NumPy's global random state.
spar = coo_matrix(np.random.RandomState(0).binomial(1, .25, 100))

# The error is the point of this cell, so catch it and display it instead of
# letting it abort a Restart & Run All.
try:
    preprocessing.binarize(spar, threshold=-1)
except ValueError as err:
    print("{}: {}".format(type(err).__name__, err))

ValueError: Cannot binarize a sparse matrix with threshold < 0

In [12]:
# Categorical variables: switch to the iris data set.
# Note that X and y are rebound here and no longer refer to the Boston data.
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target

In [15]:
# One-hot encode the class labels. The encoder expects a 2-D input, so the
# label vector is reshaped into a single column first; the sparse result is
# densified only to preview the first five rows.
cat_encoder = preprocessing.OneHotEncoder()
cat_encoder.fit_transform(np.reshape(y, (-1, 1))).toarray()[:5]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [16]:
# Encode three samples that all carry the label 1 with the fitted encoder.
cat_encoder.transform(np.ones(shape=(3, 1))).toarray()

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [17]:
from sklearn.linear_model import Ridge

# Base regressor; alpha=1.0 is Ridge's default, spelled out for the reader.
ridge_inst = Ridge(alpha=1.0)

In [18]:
from sklearn.multioutput import MultiOutputRegressor

# Wrap the ridge regressor so that one copy is fitted per target column;
# n_jobs=-1 asks for all available cores.
multi_ridge = MultiOutputRegressor(estimator=ridge_inst, n_jobs=-1)

In [21]:
# Dense one-hot target matrix: one indicator column per iris class.
y_multi = cat_encoder.fit_transform(np.reshape(y, (-1, 1))).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [22]:
from sklearn.model_selection import train_test_split

# Stratify on the original integer labels so the split keeps the class
# proportions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_multi,
    random_state=7,
    stratify=y,
)

In [23]:
# Fit the multi-output ridge model on the training split.
multi_ridge.fit(X=X_train, y=y_train)

MultiOutputRegressor(estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                                     max_iter=None, normalize=False,
                                     random_state=None, solver='auto',
                                     tol=0.001),
                     n_jobs=-1)

In [24]:
# Raw regression outputs, one column per class. These are continuous scores,
# not probabilities, so they can fall outside [0, 1].
y_multi_pre = multi_ridge.predict(X=X_test)
y_multi_pre[:5, :]


array([[ 0.81327059,  0.37348962, -0.1867602 ],
       [ 0.9524035 ,  0.17877376, -0.13117727],
       [-0.01657826,  0.36538441,  0.65119386],
       [ 0.1772412 ,  0.47768186,  0.34507695],
       [ 0.87693053,  0.14905609, -0.02598662]])

In [26]:
# Turn the continuous scores into hard 0/1 labels with a 0.5 cut-off.
# Each column is thresholded independently, so a row can end up with no
# positive label at all (see the fourth row of the preview).
y_multi_pred = preprocessing.binarize(
    y_multi_pre,
    threshold=0.5,
)
y_multi_pred[:5, :]

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [1., 0., 0.]])

In [28]:
from sklearn.metrics import roc_auc_score

# Overall ROC AUC computed from the continuous scores; 'macro' (the default)
# averages the per-class AUC values with equal weight.
roc_auc_score(y_test, y_multi_pre, average='macro')

0.9198717948717948

In [29]:
from sklearn.metrics import accuracy_score

# Per-class report: accuracy uses the thresholded labels (y_multi_pred),
# while AUC uses the continuous scores (y_multi_pre).
print("Multi-Output Scores for the Iris Flowers: ")
for column_number in range(3):
    print(
        'Accuracy score of flower {}'.format(column_number),
        accuracy_score(y_test[:, column_number], y_multi_pred[:, column_number]),
    )
    print(
        'AUC score of flower {}'.format(column_number),
        roc_auc_score(y_test[:, column_number], y_multi_pre[:, column_number]),
    )
    print('')
 

Multi-Output Scores for the Iris Flowers: 
Accuracy score of flower 0 1.0
AUC score of flower 0 1.0

Accuracy score of flower 1 0.7368421052631579
AUC score of flower 1 0.7692307692307693

Accuracy score of flower 2 0.9736842105263158
AUC score of flower 2 0.9903846153846154



In [30]:
# DictVectorizer: turn string-valued features into indicator columns.
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer()
# One {'species': <name>} record per sample; the names are looked up for all
# labels at once by indexing target_names with the label array.
my_dict = [{'species': species_name} for species_name in iris.target_names[y]]
dv.fit_transform(my_dict).toarray()[:5]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])