In [1]:
%%writefile rare_values.py

import pandas as pd
# import numpy as np
# from warnings import warn

# 2018.11.07 Created by Eamon.Zhang
# 2018.11.12 change into fit() transform() format

class GroupingRareValues():
    """
    Group infrequent labels of the selected columns into one 'rare' label.

    Parameters
    ----------
    mapping : list of dict, optional
        A previously learned mapping. Each entry is a dict with the keys
        'col' (column name), 'mapping' (pd.Series indexed by original label,
        holding the replacement label) and 'data_type' (dtype of the column
        at fit time). When a mapping is already present, fit() keeps it
        instead of learning a new one.
    cols : list, optional
        Columns to process. When None, every column of the frame passed to
        fit() is processed.
    threshold : float, default 0.01
        Minimum relative frequency (label count / number of rows) a label
        needs in order to be kept; labels below it are replaced by 'rare'.
    """

    def __init__(self, mapping=None, cols=None, threshold=0.01):
        self.cols = cols
        self.mapping = mapping
        self._dim = None  # number of columns seen by fit(); None until fitted
        self.threshold = threshold

    def fit(self, X, y=None, **kwargs):
        """Learn, per column, which labels are frequent enough to be kept.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]
            Training data.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        self : encoder
            Returns self.
        """

        self._dim = X.shape[1]

        _, categories = self.grouping(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold
        )
        self.mapping = categories
        return self

    def transform(self, X):
        """Replace labels in X according to the fitted mapping.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]

        Returns
        -------
        X : a transformed deep copy of the input; the input is not modified.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or if X does not have the
            same number of columns as the frame used in fit().
        """

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        #  make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.grouping(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold
        )

        return X

    def grouping(self, X_in, threshold, mapping=None, cols=None):
        """
        Apply an existing mapping (transform) or learn a new one (fit).

        Parameters
        ----------
        X_in : pandas.DataFrame
            Input data; it is deep-copied and never modified.
        threshold : float
            Minimum relative frequency for a label to be kept.
        mapping : list of dict, optional
            When given, it is applied to the copy of X_in and returned as is.
        cols : list, optional
            Columns to learn a mapping for; None means all columns of X_in.
            Only used when `mapping` is None.

        Returns
        -------
        (X, mapping_out) : the (possibly transformed) copy of X_in and the
            mapping that was applied or learned.
        """

        X = X_in.copy(deep=True)

        if mapping is not None:  # transform
            for entry in mapping:
                column = entry.get('col')  # get the column name
                # Series.map leaves NaN for any value that is not a key of
                # the fitted mapping (labels unseen during fit, missing values).
                X[column] = X[column].map(entry['mapping'])
            return X, mapping

        # fit
        if cols is None:
            # Default constructor arguments: fall back to every column
            # instead of failing on `for col in None`.
            cols = list(X.columns)

        n_rows = len(X)
        mapping_out = []
        for col in cols:
            freq = X[col].value_counts() / n_rows
            # Computed once per column rather than once per label.
            frequent = freq[freq >= threshold].index
            col_mapping = pd.Series(
                {k: (k if k in frequent else 'rare') for k in freq.index}
            )
            mapping_out.append({'col': col, 'mapping': col_mapping, 'data_type': X[col].dtype})

        return X, mapping_out



#==============================================================================
# def rare_imputation(X_train, X_test, variable):
#     
#     # find the most frequent category
#     frequent_cat = X_train.groupby(variable)[variable].count().sort_values().tail(1).index.values[0]
#     
#     # find rare labels
#     temp = X_train.groupby([variable])[variable].count()/np.float(len(X_train))
#     rare_cat = [x for x in temp.loc[temp<0.05].index.values]
#     
#     # create new variables, with Rare labels imputed
#     
#     # by the most frequent category
#     X_train[variable+'_freq_imp'] = np.where(X_train[variable].isin(rare_cat), frequent_cat, X_train[variable])
#     X_test[variable+'_freq_imp'] = np.where(X_test[variable].isin(rare_cat), frequent_cat, X_test[variable])
#     
#     # by adding a new label 'Rare'
#     X_train[variable+'_rare_imp'] = np.where(X_train[variable].isin(rare_cat), 'Rare', X_train[variable])
#     X_test[variable+'_rare_imp'] = np.where(X_test[variable].isin(rare_cat), 'Rare', X_test[variable])
#==============================================================================

# 2018.11.26 created by Eamon.Zhang
class ModeImputation():
    """
    Replace infrequent labels of the selected columns by the most frequent label.

    Parameters
    ----------
    mapping : list of dict, optional
        A previously learned mapping. Each entry is a dict with the keys
        'col' (column name), 'mapping' (pd.Series indexed by original label,
        holding the replacement label) and 'data_type' (dtype of the column
        at fit time). When a mapping is already present, fit() keeps it
        instead of learning a new one.
    cols : list, optional
        Columns to process. When None, every column of the frame passed to
        fit() is processed.
    threshold : float, default 0.01
        Minimum relative frequency (label count / number of rows) a label
        needs in order to be kept; labels below it are replaced by the
        column's mode.
    """

    def __init__(self, mapping=None, cols=None, threshold=0.01):
        self.cols = cols
        self.mapping = mapping
        self._dim = None  # number of columns seen by fit(); None until fitted
        self.threshold = threshold

    def fit(self, X, y=None, **kwargs):
        """Learn, per column, which labels are frequent enough to be kept.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]
            Training data.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        self : encoder
            Returns self.
        """

        self._dim = X.shape[1]

        _, categories = self.impute_with_mode(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold
        )
        self.mapping = categories
        return self

    def transform(self, X):
        """Replace labels in X according to the fitted mapping.

        Parameters
        ----------
        X : pandas.DataFrame, shape = [n_samples, n_features]

        Returns
        -------
        X : a transformed deep copy of the input; the input is not modified.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or if X does not have the
            same number of columns as the frame used in fit().
        """

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        #  make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.impute_with_mode(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold
        )

        return X

    def impute_with_mode(self, X_in, threshold, mapping=None, cols=None):
        """
        Apply an existing mapping (transform) or learn a new one (fit) in
        which rare labels are replaced by the column's most frequent label.

        Parameters
        ----------
        X_in : pandas.DataFrame
            Input data; it is deep-copied and never modified.
        threshold : float
            Minimum relative frequency for a label to be kept.
        mapping : list of dict, optional
            When given, it is applied to the copy of X_in and returned as is.
        cols : list, optional
            Columns to learn a mapping for; None means all columns of X_in.
            Only used when `mapping` is None.

        Returns
        -------
        (X, mapping_out) : the (possibly transformed) copy of X_in and the
            mapping that was applied or learned.
        """

        X = X_in.copy(deep=True)

        if mapping is not None:  # transform
            for entry in mapping:
                column = entry.get('col')  # get the column name
                # Series.map leaves NaN for any value that is not a key of
                # the fitted mapping (labels unseen during fit, missing values).
                X[column] = X[column].map(entry['mapping'])
            return X, mapping

        # fit
        if cols is None:
            # Default constructor arguments: fall back to every column
            # instead of failing on `for col in None`.
            cols = list(X.columns)

        n_rows = len(X)
        mapping_out = []
        for col in cols:
            freq = X[col].value_counts() / n_rows
            # mode() is empty for a column without any non-null value; in
            # that case `freq` is empty too and the placeholder is never used.
            modes = X[col].mode()
            most_frequent = modes.iloc[0] if len(modes) else None
            # Computed once per column rather than once per label.
            frequent = freq[freq >= threshold].index
            col_mapping = pd.Series(
                {k: (k if k in frequent else most_frequent) for k in freq.index}
            )
            mapping_out.append({'col': col, 'mapping': col_mapping, 'data_type': X[col].dtype})

        return X, mapping_out

Writing rare_values.py


### Imports

In [2]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os
# plt.style.use('seaborn-colorblind')
# %matplotlib inline
import rare_values as ra

## Load Dataset

In [3]:
# Only these Titanic columns are read from the CSV.
use_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Survived']

# Look at the label distributions of Pclass and SibSp:
# - SibSp has the values 3/8/5 that occur rarely (each under 2% of the rows)
# - Pclass has 3 values, none of which is under 20%
data = pd.read_csv(
    'https://raw.githubusercontent.com/daniel-dc-cd/feature-engineering-and-feature-selection/master/data/titanic.csv',
    usecols=use_cols,
)

# Print the relative frequency of every label in the two columns.
for i in ('Pclass', 'SibSp'):
    print('Variable', i, 'label proportion:')
    print(data[i].value_counts().div(len(data)))

Variable Pclass label proportion:
3    0.551066
1    0.242424
2    0.206510
Name: Pclass, dtype: float64
Variable SibSp label proportion:
0    0.682379
1    0.234568
2    0.031425
4    0.020202
3    0.017957
8    0.007856
5    0.005612
Name: SibSp, dtype: float64


## Grouping into one new category
Grouping the observations that show rare labels into a unique category ('rare')

In [4]:
# Create the rare-label grouper for Pclass and SibSp and fit it on the data.
# With threshold=0.01, labels seen in fewer than 1% of the rows are mapped to 'rare'.
enc = ra.GroupingRareValues(cols=['Pclass','SibSp'],threshold=0.01).fit(data)

In [5]:
# Inspect the learned mapping.
# SibSp: values 5 & 8 are mapped to 'rare' because each appears in fewer than 1% of the rows (threshold=0.01).
# Pclass: every label is above the threshold, so nothing changes.
print(enc.mapping)

[{'col': 'Pclass', 'mapping': 3    3
1    1
2    2
dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0       0
1       1
2       2
4       4
3       3
8    rare
5    rare
dtype: object, 'data_type': dtype('int64')}]


In [6]:
# Apply the fitted mapping. transform() works on a deep copy, so `data` itself is left unchanged.
data2 = enc.transform(data)

In [7]:
# Check the result: the SibSp labels 8 and 5 should now be counted together under 'rare'.
print(data2.SibSp.value_counts())

0       608
1       209
2        28
4        18
3        16
rare     12
Name: SibSp, dtype: int64


## Mode Imputation
Replacing the rare label by most frequent label

In [8]:
# Create the mode imputer for Pclass and SibSp and fit it on the data.
# With threshold=0.01, labels seen in fewer than 1% of the rows are replaced by the column's most frequent label.
enc = ra.ModeImputation(cols=['Pclass','SibSp'],threshold=0.01).fit(data)

In [9]:
# Inspect the learned mapping.
# SibSp: the rare values 5 & 8 are mapped to 0, because 0 is the most frequent label.
# Pclass: every label is above the threshold, so nothing changes.
print(enc.mapping)

[{'col': 'Pclass', 'mapping': 3    3
1    1
2    2
dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0    0
1    1
2    2
4    4
3    3
8    0
5    0
dtype: int64, 'data_type': dtype('int64')}]


In [10]:
# Apply the fitted mapping. transform() works on a deep copy, so `data` itself is left unchanged.
data3 = enc.transform(data)

In [11]:
# Check the result: the SibSp labels 8 and 5 should no longer appear; their rows are counted under 0.
print(data3.SibSp.value_counts())

0    620
1    209
2     28
4     18
3     16
Name: SibSp, dtype: int64
