In [1]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# LabelEncoder can only convert a single list (one feature) at a time.
le=LabelEncoder().fit(['one','two','three'])
le.transform(['one','two','three'])

# OrdinalEncoder is much more powerful, since it can be applied to several
# features at once (a pd.DataFrame or np.array is accepted as well).
oe=OrdinalEncoder().fit([[1.0,'one'],[2.0,'two'],[3.0,'three']])
oe.transform([[2.0,'one']])

array([[1., 0.]])

In [2]:
class new_OrdinalEncoder:
    """Ordinal encoder for DataFrame columns that tolerates unseen categories.

    Each selected column is mapped to integer codes ``0..k-1`` following the
    sorted order returned by ``np.unique``. At transform time a value that was
    not present during ``fit`` receives the extra code ``k`` instead of
    raising an error.

    Parameters
    ----------
    cat_index : 'all' or sequence of int, default 'all'
        Positional indices of the columns to encode. ``'all'`` selects every
        column of the frame passed to ``fit``.

    Attributes
    ----------
    dicts : dict
        ``{column position: {category: code}}``, rebuilt on every ``fit``.
    cat_index_ : list of int
        Column positions resolved during the last ``fit``. ``cat_index``
        itself is never modified, so refitting on a frame with a different
        number of columns works as expected.
    """

    def __init__(self,cat_index='all'):
        # Per-column {category: code} mappings, filled in by fit().
        self.dicts={}
        # cat_index is the categorical feature index list ('all' = every column).
        self.cat_index=cat_index

    def __str__(self):
        return 'New_OrdinalEncoder()'

    __repr__=__str__

    def _resolve_index(self,df):
        """Return the positional column indices to encode for ``df``."""
        # The isinstance guard keeps an array-like cat_index from being
        # compared element-wise against the string 'all'.
        if isinstance(self.cat_index,str) and self.cat_index=='all':
            return list(range(df.shape[1]))
        return list(self.cat_index)

    def fit(self,df,*y):
        """Learn one category -> code mapping per selected column.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data; columns are addressed by position via ``iloc``.
        *y : ignored
            Accepted so the encoder can be used as a pipeline step.

        Returns
        -------
        self
        """
        index=self._resolve_index(df)
        # Rebuild from scratch so a refit never keeps stale mappings.
        self.dicts={
            feat:{value:code for code,value in enumerate(np.unique(df.iloc[:,feat]))}
            for feat in index
        }
        self.cat_index_=index
        return self

    def fit_transform(self,df,*y):
        """Fit on ``df`` and return its encoded copy (``df`` is not modified)."""
        return self.fit(df,*y).transform(df)

    def transform(self,df):
        """Return a copy of ``df`` with the fitted columns replaced by codes.

        Raises
        ------
        AttributeError
            If the encoder has not been fitted yet.
        """
        if not hasattr(self,'cat_index_'):
            raise AttributeError(
                'This new_OrdinalEncoder instance is not fitted yet; '
                'call fit or fit_transform first.')
        df_output=df.copy()
        for feat in self.cat_index_:
            dic=self.dicts[feat]
            df_output.iloc[:,feat]=df.iloc[:,feat].apply(self.unknown_value,args=(dic,))
        return df_output

    def unknown_value(self,value,dic):
        """Return the code of ``value``; unknown values get the new integer ``len(dic)``."""
        try:
            return dic[value]
        except (KeyError,TypeError):
            # KeyError: category unseen during fit. TypeError: unhashable value.
            # Both are deliberately mapped to the extra "unknown" code.
            return len(dic)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Load data (save the data and the notebook in the same folder!)
data=pd.read_csv('transaction.csv')

# Identify the categorical and continuous features
cat=['account_id','date','type','operation','k_symbol','bank']
conti=['amount','balance']

# Fit the OrdinalEncoder on the entire dataset, before splitting
oe=OrdinalEncoder()
oe.fit(data[cat])

# Split the data into a training and a testing set
train, test = train_test_split(data,test_size=0.3,random_state=0)

# Convert the training categorical features to numeric codes, then put the
# continuous features next to them so the categorical columns come first
train_cat=oe.transform(train[cat])
train_oe=np.append(train_cat,train[conti],axis=1)

# One-hot encode the leading categorical columns; the OneHotEncoder is
# fitted on the training set only
cat_index=list(range(len(cat)))
ohe=OneHotEncoder(handle_unknown='ignore',categorical_features=cat_index)
train_ohe=ohe.fit_transform(train_oe)

# Let's check the train_ohe
train_ohe



<161325x4199 sparse matrix of type '<class 'numpy.float64'>'
	with 1290600 stored elements in COOrdinate format>

In [4]:
# it is very easy to perform conversion
cat_index=[0,1,2,3,6,7]# This is the categorical features index list
# The index list must go to the constructor: fit_transform(df, *y) ignores its
# extra arguments, so passing it there would leave cat_index='all' and
# ordinal-encode the continuous columns as well.
new_oe=new_OrdinalEncoder(cat_index)
train_oe=new_oe.fit_transform(train)
ohe=OneHotEncoder(handle_unknown='ignore',categorical_features=cat_index)
train_ohe=ohe.fit_transform(train_oe)
# Values unseen during fit get a fresh ordinal code from new_oe.transform.
test_oe=new_oe.transform(test)
test_ohe=ohe.transform(test_oe)



In [5]:
from sklearn.pipeline import Pipeline

# Step 1 turns the categorical columns into integer codes;
# step 2 one-hot encodes those same columns.
estimators=[
    ('new_ordianlencoder',new_OrdinalEncoder(cat_index)),
    ('onehotencoder',OneHotEncoder(handle_unknown='ignore',categorical_features=cat_index)),
]
pipe=Pipeline(estimators)
pipe

Pipeline(memory=None,
     steps=[('new_ordianlencoder', New_OrdinalEncoder()), ('onehotencoder', OneHotEncoder(categorical_features=[0, 1, 2, 3, 6, 7], categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True))])

In [6]:
# Fit the whole pipeline (ordinal encoder, then one-hot encoder) on the training set only.
pipe.fit(train)



Pipeline(memory=None,
     steps=[('new_ordianlencoder', New_OrdinalEncoder()), ('onehotencoder', OneHotEncoder(categorical_features=[0, 1, 2, 3, 6, 7], categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True))])

In [7]:
# Encode the held-out test set with the fitted pipeline. Values not seen during
# fit get an extra ordinal code from the first step, and the one-hot step was
# created with handle_unknown='ignore'.
pipe.transform(test)

<69140x4199 sparse matrix of type '<class 'numpy.float64'>'
	with 553112 stored elements in COOrdinate format>

In [8]:
# _BaseEncoder is a private class, so `from sklearn.preprocessing import _BaseEncoder` does not work; import it from the private sklearn.preprocessing._encoders module instead.
from sklearn.preprocessing._encoders import _BaseEncoder
class new_OrdinalEncoder(_BaseEncoder):
    """Ordinal encoder for DataFrame columns that tolerates unseen categories.

    Each selected column is mapped to integer codes ``0..k-1`` following the
    sorted order returned by ``np.unique``. At transform time a value that was
    not present during ``fit`` receives the extra code ``k`` instead of
    raising an error. This version derives from scikit-learn's
    ``_BaseEncoder``.

    Parameters
    ----------
    cat_index : 'all' or sequence of int, default 'all'
        Positional indices of the columns to encode. ``'all'`` selects every
        column of the frame passed to ``fit``.

    Attributes
    ----------
    dicts : dict
        ``{column position: {category: code}}``, rebuilt on every ``fit``.
    cat_index_ : list of int
        Column positions resolved during the last ``fit``. The ``cat_index``
        parameter itself is never modified by fitting, so refitting on a
        frame with a different number of columns works as expected.
    """

    def __init__(self,cat_index='all'):
        # Per-column {category: code} mappings, filled in by fit().
        self.dicts={}
        # cat_index is the categorical feature index list ('all' = every column).
        self.cat_index=cat_index

    def _resolve_index(self,df):
        """Return the positional column indices to encode for ``df``."""
        # The isinstance guard keeps an array-like cat_index from being
        # compared element-wise against the string 'all'.
        if isinstance(self.cat_index,str) and self.cat_index=='all':
            return list(range(df.shape[1]))
        return list(self.cat_index)

    def fit(self,df,*y):
        """Learn one category -> code mapping per selected column.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data; columns are addressed by position via ``iloc``.
        *y : ignored
            Accepted so the encoder can be used as a pipeline step.

        Returns
        -------
        self
        """
        index=self._resolve_index(df)
        # Rebuild from scratch so a refit never keeps stale mappings.
        self.dicts={
            feat:{value:code for code,value in enumerate(np.unique(df.iloc[:,feat]))}
            for feat in index
        }
        self.cat_index_=index
        return self

    def fit_transform(self,df,*y):
        """Fit on ``df`` and return its encoded copy (``df`` is not modified)."""
        return self.fit(df,*y).transform(df)

    def transform(self,df):
        """Return a copy of ``df`` with the fitted columns replaced by codes.

        Raises
        ------
        AttributeError
            If the encoder has not been fitted yet.
        """
        if not hasattr(self,'cat_index_'):
            raise AttributeError(
                'This new_OrdinalEncoder instance is not fitted yet; '
                'call fit or fit_transform first.')
        df_output=df.copy()
        for feat in self.cat_index_:
            dic=self.dicts[feat]
            df_output.iloc[:,feat]=df.iloc[:,feat].apply(self.unknown_value,args=(dic,))
        return df_output

    def unknown_value(self,value,dic):
        """Return the code of ``value``; unknown values get the new integer ``len(dic)``."""
        try:
            return dic[value]
        except (KeyError,TypeError):
            # KeyError: category unseen during fit. TypeError: unhashable value.
            # Both are deliberately mapped to the extra "unknown" code.
            return len(dic)

In [9]:
from sklearn.pipeline import Pipeline

# Same two steps as before, now built with the _BaseEncoder-derived encoder:
# ordinal codes first, then one-hot encoding of the same columns.
estimators=[
    ('new_ordianlencoder',new_OrdinalEncoder(cat_index)),
    ('onehotencoder',OneHotEncoder(handle_unknown='ignore',categorical_features=cat_index)),
]
pipe=Pipeline(estimators)
pipe

Pipeline(memory=None,
     steps=[('new_ordianlencoder', new_OrdinalEncoder(cat_index=[0, 1, 2, 3, 6, 7])), ('onehotencoder', OneHotEncoder(categorical_features=[0, 1, 2, 3, 6, 7], categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True))])

In [10]:
# Fit the whole pipeline (ordinal encoder, then one-hot encoder) on the training set only.
pipe.fit(train)



Pipeline(memory=None,
     steps=[('new_ordianlencoder', new_OrdinalEncoder(cat_index=[0, 1, 2, 3, 6, 7])), ('onehotencoder', OneHotEncoder(categorical_features=[0, 1, 2, 3, 6, 7], categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True))])

In [11]:
# Encode the held-out test set with the fitted pipeline. Values not seen during
# fit get an extra ordinal code from the first step, and the one-hot step was
# created with handle_unknown='ignore'.
pipe.transform(test)

<69140x4199 sparse matrix of type '<class 'numpy.float64'>'
	with 553112 stored elements in COOrdinate format>