# Sklearn Labelencoder keep encoded values when encoding new dataframe

Based on an answer posted on StackOverflow:
https://stackoverflow.com/questions/58754795/sklearn-labelencoder-keep-encoded-values-when-encoding-new-dataframe.

In [2]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

In [3]:
class TTLabelEncoder(BaseEstimator, TransformerMixin):
    """Transform data frame columns with different categorical values
    in training and test data
    
    Pass individual data frame columns to the class instance"""
    
    def __init__(self):
        self.code_dic = None
        self.max_code = None
        self.fitted = False
    
    def fit(self, df):
        self.code_dict = dict(zip(df.unique(),
                                  np.arange(len(df.unique()))))
        self.__max_code__()
        self.fitted = True
        return self
    
    def transform(self, df):
        assert self.fitted == True, 'Fit the data before transforming.'
        new_cat = set(df.unique()).difference(set(self.code_dict.keys()))
        if new_cat:
            new_codes = dict(zip(new_cat, 
                     np.arange(len(new_cat)) + self.max_code + 1))
            self.code_dict.update(new_codes)
            self.__max_code__()
        return df.map(self.code_dict)
    
    def __max_code__(self):
        self.max_code = max(self.code_dict.values())
        return self
    
    def fit_transform(self, df):
        if self.fitted == False:
            self.fit(df)
        df = self.transform(df)
        return df

# Example

In [20]:
df_1 = pd.DataFrame({'IP': np.random.choice(list('ABCD'), size=5),
                   'Counts': np.random.randint(10, 20, size=5)})
print(df_1)

  IP  Counts
0  B      14
1  D      10
2  A      10
3  C      12
4  B      19


In [21]:
ip_encoder = TTLabelEncoder()
ip_encoder.fit(df_1['IP'])
ip_encoder.code_dict

{'B': 0, 'D': 1, 'A': 2, 'C': 3}

In [24]:
df_1['IP'] = ip_encoder.transform(df_1['IP'])
print(df_1)

   IP  Counts
0   0      14
1   1      10
2   2      10
3   3      12
4   0      19


In [30]:
df_2 = pd.DataFrame({'IP': np.random.choice(list('DEF'), size=5),
                   'Counts': np.random.randint(10, 20, size=5)})
print(df_2)

  IP  Counts
0  E      11
1  F      12
2  D      15
3  F      17
4  E      18


In [31]:
df_2['IP'] = ip_encoder.transform(df_2['IP'])
print(df_2)

   IP  Counts
0   4      11
1   5      12
2   1      15
3   5      17
4   4      18


In [34]:
df_3 = pd.DataFrame({'IP': np.random.choice(list('XYZ'), size=5),
                     'Counts': np.random.randint(40, 100, size=5)})
print(df_3)

  IP  Counts
0  Z      81
1  X      62
2  X      54
3  Y      92
4  Y      42


In [35]:
df_3['IP'] = ip_encoder.fit_transform(df_3['IP'])
print(df_3)

   IP  Counts
0   8      81
1   7      62
2   7      54
3   6      92
4   6      42


In [45]:
df_4 = pd.DataFrame({'IP': np.random.choice(list('TUV'), size=5),
                     'Counts': np.random.randint(40, 100, size=5)})
print(df_4)

  IP  Counts
0  T      73
1  U      41
2  V      90
3  T      59
4  U      54


In [46]:
ip_encoder_2 = TTLabelEncoder()
df_4['IP'] = ip_encoder_2.fit_transform(df_4['IP'])
print(df_4)

   IP  Counts
0   0      73
1   1      41
2   2      90
3   0      59
4   1      54


In [48]:
ip_encoder_2.code_dict

{'T': 0, 'U': 1, 'V': 2}