# Sklearn LabelEncoder: keep encoded values when encoding a new dataframe

Based on an answer posted on Stack Overflow:
https://stackoverflow.com/questions/58754795/sklearn-labelencoder-keep-encoded-values-when-encoding-new-dataframe

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

In [2]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Label encoder that keeps existing codes and extends them on new data.

    Every distinct value of a column is mapped to an integer code, numbered
    in order of first appearance.  ``transform`` does not fail on values that
    were absent during ``fit``: it gives them the next free codes and stores
    them, so a value keeps the same code across successive data frames.

    Pass individual data frame columns (``pandas.Series``) to ``fit`` and
    ``transform``, not whole data frames.

    Attributes (set by ``fit``, extended by ``transform``):
        code_dict: dict mapping each known category to its integer code.
        max_code: largest code assigned so far; -1 while no category is known.
    """

    def fit(self, df_tr, y=None):
        """Learn one integer code per distinct value of the training column.

        Args:
            df_tr: column (``pandas.Series``) holding the training categories.
            y: ignored; accepted so the inherited ``fit_transform`` and
                pipelines may pass a target.

        Returns:
            The fitted encoder itself.
        """
        # Read the categories from the argument (not from a notebook global)
        # so the encoder works on whichever column it is handed.
        self.code_dict = {
            category: code for code, category in enumerate(df_tr.unique())
        }
        self.__max_code__()
        return self

    def transform(self, df_test):
        """Encode a column, assigning fresh codes to unseen categories.

        Note that this updates ``code_dict`` and ``max_code`` in place when
        the column contains categories that are not known yet.

        Args:
            df_test: column (``pandas.Series``) to encode.

        Returns:
            The column with every value replaced by its integer code.
        """
        # Keep the unseen categories in order of first appearance so the
        # codes they receive are deterministic (a set would hand them out in
        # hash order, which can differ between interpreter runs).
        new_cat = [
            category
            for category in df_test.unique()
            if category not in self.code_dict
        ]
        if new_cat:
            first_free = self.max_code + 1
            self.code_dict.update(
                {
                    category: first_free + offset
                    for offset, category in enumerate(new_cat)
                }
            )
            self.__max_code__()
        return df_test.map(self.code_dict)

    def __max_code__(self):
        """Refresh ``max_code`` from ``code_dict`` and return the encoder."""
        # default=-1 keeps an encoder fitted on an empty column usable: the
        # first category seen later then receives code 0.
        self.max_code = max(self.code_dict.values(), default=-1)
        return self

# Example

In [4]:
# Build a small example frame: 'IP' holds five letters drawn at random from
# A-D, 'Counts' holds five random integers from 10 to 19. No seed is set, so
# the values change from run to run.
df = pd.DataFrame({'IP': np.random.choice(list('ABCD'), size=5),
                   'Counts': np.random.randint(10, 20, size=5)})
df

Unnamed: 0,IP,Counts
0,C,18
1,B,17
2,D,13
3,A,11
4,D,17


In [5]:
# Fit the encoder on the 'IP' column, then display the learned
# category -> code mapping.
ip_encoder = CustomLabelEncoder()
ip_encoder.fit(df['IP'])
ip_encoder.code_dict

{'C': 0, 'B': 1, 'D': 2, 'A': 3}

In [6]:
# Overwrite the 'IP' column with its integer codes and display the result.
df['IP'] = ip_encoder.transform(df['IP'])
df

Unnamed: 0,IP,Counts
0,0,18
1,1,17
2,2,13
3,3,11
4,2,17


In [8]:
# Second example frame: 'IP' is drawn from D, E, F. Only D was among the
# letters offered to the first frame, so any E or F is new to the encoder.
df_1 = pd.DataFrame({'IP': np.random.choice(list('DEF'), size=5),
                   'Counts': np.random.randint(10, 20, size=5)})
df_1

Unnamed: 0,IP,Counts
0,E,13
1,D,14
2,F,10
3,F,11
4,D,13


In [9]:
# Reuse the already fitted encoder: categories present in code_dict keep
# their codes, and transform adds new codes for the remaining ones.
df_1['IP'] = ip_encoder.transform(df_1['IP'])
df_1

Unnamed: 0,IP,Counts
0,4,13
1,2,14
2,5,10
3,5,11
4,2,13
