In [1]:
import pandas as pd
import numpy as np

In [9]:
# Five sample weather observations; np.nan marks a reading that is missing.
df = pd.DataFrame(
    dict(
        WindGustDir=['WNW', 'NW', 'SW', 'N', 'NNW'],
        MinTemp=[5.8, 13.4, 8.4, 15.0, 4.3],
        MaxTemp=[11.9, 23.8, 18.1, 30.8, 17.1],
        Evaporation=[8.0, np.nan, np.nan, np.nan, 5],
        Sunshine=[np.nan, np.nan, 9.8, np.nan, 10.2],
    )
)

df

Unnamed: 0,WindGustDir,MinTemp,MaxTemp,Evaporation,Sunshine
0,WNW,5.8,11.9,8.0,
1,NW,13.4,23.8,,
2,SW,8.4,18.1,,9.8
3,N,15.0,30.8,,
4,NNW,4.3,17.1,5.0,10.2


In [10]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

class LabelEncoderExt(object):
    """
    LabelEncoder wrapper that tolerates labels it was not fitted on.

    It differs from LabelEncoder by reserving an extra class, 'Unknown', during fit.
    At transform time every value that is not one of the fitted classes is replaced by
    'Unknown' before being encoded, so unseen labels receive the id of the 'Unknown'
    class instead of being passed to the wrapped encoder as-is.

    Attributes:
        label_encoder: the wrapped LabelEncoder instance.
        classes_: the fitted classes (including 'Unknown'); only set after fit().
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """
        Fit the encoder on all unique values of data_list plus the 'Unknown' class.

        :param data_list: An iterable of labels (typically strings)
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        Encode data_list, mapping every value that was not seen in fit to 'Unknown'.

        :param data_list: An iterable of labels to encode
        :return: whatever the wrapped encoder's transform returns for the cleaned labels
        """
        # Membership is tested on the original values (not on a numpy-coerced copy), so
        # unseen values of any hashable type are replaced, and the whole input is handled
        # in a single pass instead of one list rebuild per unseen value.
        known_labels = set(self.label_encoder.classes_)
        new_data_list = [x if x in known_labels else 'Unknown' for x in data_list]

        return self.label_encoder.transform(new_data_list)

In [11]:
from pickle import load

# SECURITY NOTE: unpickling can execute arbitrary code, so only load these artifacts
# when they come from a trusted source.
# Both objects are indexed by column name where they are used later in this notebook.
# The context managers close each file handle as soon as its object has been loaded.
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = load(scaler_file)

with open('labelencoder.pkl', 'rb') as encoder_file:
    le = load(encoder_file)

In [12]:
# Show the frame once more as a baseline before its columns are overwritten further down.
df

Unnamed: 0,WindGustDir,MinTemp,MaxTemp,Evaporation,Sunshine
0,WNW,5.8,11.9,8.0,
1,NW,13.4,23.8,,
2,SW,8.4,18.1,,9.8
3,N,15.0,30.8,,
4,NNW,4.3,17.1,5.0,10.2


In [13]:
# Overwrite every column of df in place with its transformed values: the wind-direction
# column goes through its label encoder (after a cast to str), every other column through
# the transformer stored under its name in `scaler`.
for column_name in df.columns:
    column = df[column_name]
    if column_name != 'WindGustDir':
        # The transformer is given a 2-D (n_rows, 1) array.
        values_2d = np.array(column).reshape(-1, 1)
        df[column_name] = scaler[column_name].transform(values_2d)
    else:
        df[column_name] = le[column_name].transform(column.astype('str'))

In [14]:
# Display the frame after every column has been replaced by its transformed values.
df

Unnamed: 0,WindGustDir,MinTemp,MaxTemp,Evaporation,Sunshine
0,15,0.343164,0.296223,0.098522,
1,7,0.546917,0.532803,,
2,12,0.412869,0.419483,,0.7
3,3,0.589812,0.671968,,
4,6,0.302949,0.399602,0.061576,0.728571
