# TrainTestLabelencoder - Keep Encoded Values when Encoding a New Data Frame

Based on an answer posted on StackOverflow:
https://stackoverflow.com/questions/58754795/sklearn-labelencoder-keep-encoded-values-when-encoding-new-dataframe.

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

In [2]:
class TTLabelEncoder(BaseEstimator, TransformerMixin):
    """Transform data frame columns with different categorical values
    in training and test data. TT stands for Train-Test.
    
    Pass individual data frame columns to the class instance."""
    
    def __init__(self):
        self.code_dict = None
        self.inv_code_dict = None
        self.max_code = None
        self.fitted = False
    
    def fit(self, df):
        self.code_dict = dict(zip(df.unique(),
                                  np.arange(len(df.unique()))))
        self.__max_code__()
        self.fitted = True
        return self
    
    def transform(self, df):
        self.check_fit_status()
        new_cat = set(df.unique()).difference(set(self.code_dict.keys()))
        if new_cat:
            new_codes = dict(zip(new_cat, 
                     np.arange(len(new_cat)) + self.max_code + 1))
            self.code_dict.update(new_codes)
            self.__max_code__()
        return df.map(self.code_dict)

    def fit_transform(self, df):
        if self.fitted == False:
            self.fit(df)
        df = self.transform(df)
        return df
    
    def inverse_transform(self, df):
        self.check_fit_status()
        self.inv_code_dict = {v:k for k, v in self.code_dict.items()}
        df = df.map(self.inv_code_dict)
        return df
    
    def check_fit_status(self):
        assert self.fitted == True, 'Fit the data before transforming.'
        return self
    
    def __max_code__(self):
        self.max_code = max(self.code_dict.values())
        return self

## Example 1

In [16]:
df_1 = pd.DataFrame({'IP': np.random.choice(list('ABCD'), size=5),
                   'Counts': np.random.randint(10, 20, size=5)})
print(df_1)

  IP  Counts
0  D      16
1  C      15
2  A      15
3  C      19
4  B      13


In [17]:
ip_encoder = TTLabelEncoder()
ip_encoder.fit(df_1['IP'])
ip_encoder.code_dict

{'D': 0, 'C': 1, 'A': 2, 'B': 3}

In [18]:
df_1['IP'] = ip_encoder.transform(df_1['IP'])
print(df_1)

   IP  Counts
0   0      16
1   1      15
2   2      15
3   1      19
4   3      13


In [19]:
df_2 = pd.DataFrame({'IP': np.random.choice(list('DEF'), size=5),
                   'Counts': np.random.randint(10, 20, size=5)})
print(df_2)

  IP  Counts
0  E      19
1  D      17
2  F      18
3  E      18
4  D      14


In [20]:
df_2['IP'] = ip_encoder.transform(df_2['IP'])
print(df_2)

   IP  Counts
0   5      19
1   0      17
2   4      18
3   5      18
4   0      14


In [24]:
ip_encoder.code_dict

{'D': 0, 'C': 1, 'A': 2, 'B': 3, 'F': 4, 'E': 5}

In [25]:
df_2['IP'] = ip_encoder.inverse_transform(df_2['IP'])
print(df_2)

  IP  Counts
0  E      19
1  D      17
2  F      18
3  E      18
4  D      14


## Example 2

In [21]:
df_3 = pd.DataFrame({'IP': np.random.choice(list('XYZ'), size=5),
                     'Counts': np.random.randint(40, 100, size=5)})
print(df_3)

  IP  Counts
0  Z      68
1  X      89
2  Z      57
3  Y      74
4  Z      75


In [22]:
ip_encoder_2 = TTLabelEncoder()
df_3['IP'] = ip_encoder_2.fit_transform(df_3['IP'])
print(df_3)

   IP  Counts
0   0      68
1   1      89
2   0      57
3   2      74
4   0      75


In [28]:
df_4 = pd.DataFrame({'IP': np.random.choice(list('TUV'), size=5),
                     'Counts': np.random.randint(40, 100, size=5)})
print(df_4)

  IP  Counts
0  T      68
1  V      70
2  U      61
3  U      48
4  U      98


In [29]:
df_4['IP'] = ip_encoder_2.fit_transform(df_4['IP'])
print(df_4)

   IP  Counts
0   3      68
1   4      70
2   5      61
3   5      48
4   5      98


In [30]:
ip_encoder_2.code_dict

{'Z': 0, 'X': 1, 'Y': 2, 'T': 3, 'V': 4, 'U': 5}