# Turnover_modelling_FW.ipynb
> Project: **ABI Turnover**  
> Turnover Process Phase: **3**  
> Author: **Varun V**  
> Location: **GCC**  
> Team: **People Analytics**

In [2]:
## importing the relevant packages:

# clear the workspace
#%reset -f

# print list of files in directory
import os
print(os.listdir())

# the base packages
import collections # for the Counter function
import csv # for reading/writing csv files
import pandas as pd, numpy as np, time, gc, bisect, re
from datetime import datetime as dt

# the various packages/modules used across processing (sklearn), modelling (lightgbm) and bayesian optimization (hyperopt, bayes_opt)
import sklearn
from sklearn import metrics, preprocessing
import sklearn.decomposition as decomposition
from sklearn.model_selection import cross_val_score
from sklearn.base import TransformerMixin
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
import category_encoders as ce
from scipy.stats import truncnorm

# the modelling packages and related
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV

# hyperopt modules
#from bayes_opt import BayesianOptimization
from tqdm import tqdm
from hyperopt import hp, tpe, STATUS_OK, Trials, space_eval, rand
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample

# number of hyperopt evaluations per tuning run (passed to fmin as max_evals further down)
MAX_EVALS = 5
randomseed = 5 # the value for the random state used at various points in the pipeline
pd.options.display.max_rows = 1000 # show up to 1000 rows in cell output rather than the truncated list
pd.options.display.max_columns = 200 # show up to 200 columns in cell output

# to display multiple outputs in a cell without using print/display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
## HELPER FUNCTIONS CLASS ##

class helper_funcs():
    """Stateless helper functions used across the pipeline.

    Every helper is a ``staticmethod`` so it can be called directly on the
    class (``helper_funcs.freq_count(...)``), which is how the rest of the
    notebook uses them, as well as on an instance.
    """

    def __init__(self):
        """ helper functions used across the pipeline """

    ## find and append multiple dataframes of the type specified in string
    @staticmethod
    def append_datasets(cols_to_remove, string=('TRAIN', 'VALID')):
        """Read every matching CSV from the output folder and stack them.

        Args:
            cols_to_remove: column names (lower-cased) dropped from each file
                right after its header has been lower-cased.
            string: a file-name prefix, or an iterable of prefixes; files in
                '/mnt/datalake/OUTPUT/' whose name starts with any of them
                are loaded.

        Returns:
            A single DataFrame with all matching files appended row-wise
            (columns are sorted and the index is rebuilt).

        Raises:
            ValueError: if no file matches the prefix(es).
        """
        # str.startswith accepts a str or a tuple of str, never a list
        prefixes = string if isinstance(string, str) else tuple(string)

        # NOTE: dbutils is the Databricks notebook global; it is not imported here
        file_names = {f.name: f.path for f in dbutils.fs.ls('/mnt/datalake/OUTPUT/')}
        matching = [name for name in file_names if name.startswith(prefixes)]
        if not matching:
            raise ValueError('no files in /mnt/datalake/OUTPUT/ start with %r' % (string,))

        chars_to_remove = [' ', '.', '(', ')', '__', '-']
        frames = []
        for file_name in matching:
            # strip only a literal trailing ".csv" (the dot must not act as a regex wildcard)
            df_name = re.sub(pattern=r'\.csv$', repl='', string=file_name)
            print(df_name)
            df = pd.read_csv(('/dbfs/mnt/datalake/OUTPUT/') + file_name,
                             na_values=['No Data', ' ', 'UNKNOWN', '', np.nan, np.inf])
            df.columns = df.columns.str.lower()
            df.drop(cols_to_remove, axis=1, inplace=True)
            for char in chars_to_remove:
                # regex=False: '.', '(' and ')' must be replaced literally on every pandas version
                df.columns = df.columns.str.strip().str.lower().str.replace(char, '_', regex=False)
            frames.append(df)
        return pd.concat(frames, axis=0, sort=True, ignore_index=True)

    ## datetime feature engineering
    @staticmethod
    def datetime_feats(train, valid):
        """Replace every column whose name contains 'date' with calendar features.

        For each such column ``c`` (taken from ``train``) the columns
        ``c_dayofyear``, ``c_month``, ``c_quarter`` and ``c_year`` are appended
        and ``c`` itself is removed. The input frames are not modified.

        Returns:
            The transformed ``(train, valid)`` pair.
        """
        cols = [s for s in train.columns.values if 'date' in s]
        print('datetime feature engineering is happening ...', '\n')

        # derive the datetime features for a single date column
        def dt_feats(df, col):
            parsed = pd.to_datetime(df[col])
            # drop first: this returns a new frame, so the caller's frame stays untouched
            df = df.drop([col], axis=1)
            # further components (day, dayofweek, week, ...) are available on .dt if needed
            df[col + '_dayofyear'] = parsed.dt.dayofyear
            df[col + '_month'] = parsed.dt.month
            df[col + '_quarter'] = parsed.dt.quarter
            df[col + '_year'] = parsed.dt.year
            return df

        # loop over all raw date columns
        for date_col in cols:
            train = dt_feats(train, date_col)
            valid = dt_feats(valid, date_col)
        return train, valid

    ## function to get frequency count of elements in a vector/list
    @staticmethod
    def freq_count(input_vector):
        """Return a ``collections.Counter`` of the elements in ``input_vector``."""
        return collections.Counter(input_vector)

    # removing near zero variance columns
    @staticmethod
    def variance_threshold_selector(train, valid, threshold):
        """Keep only the columns whose variance in ``train`` exceeds ``threshold``.

        The selector is fitted on ``train`` only; the same columns are then
        selected from ``valid`` by name, so a different column order in
        ``valid`` cannot pick the wrong features.

        Returns:
            The reduced ``(train, valid)`` pair.
        """
        print('input data shape is: ', train.shape, '\n')
        selector = VarianceThreshold(threshold)
        selector.fit(np.asanyarray(train))
        kept_cols = train.columns[selector.get_support(indices=True)]
        X = train[kept_cols]
        Y = valid[kept_cols]
        print('output data shape is: ', X.shape, '\n')
        return X, Y

In [4]:
## MISSING VALUE IMPUTATION CLASS ##

class DataFrameImputer(TransformerMixin):
    """Column-wise imputer: mode for object columns, mean for everything else."""

    def __init__(self):
        """Impute missing values.
        Columns of dtype object are imputed with the most frequent value 
        in column.
        Columns of other types are imputed with mean of column.
        """

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        Sets ``self.fill`` (a Series indexed by column name) and
        ``self.n_missing_`` (number of missing cells seen in ``X``).
        A column with no observed values gets NaN as its fill value, i.e. it
        is left un-imputed instead of raising.
        """
        fill_values = []
        for c in X.columns:
            column = X[c]
            if column.dtype == np.dtype('O'):
                counts = column.value_counts()
                # value_counts is empty when the whole column is missing
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(column.mean())
        self.fill = pd.Series(fill_values, index=X.columns)
        self.n_missing_ = int(X.isnull().sum().sum())
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing cells replaced by the fitted per-column values."""
        return X.fillna(self.fill)

    def num_missing(self, X=None):
        """Count missing cells.

        Args:
            X: optional DataFrame to inspect; when omitted, the count recorded
               for the frame passed to ``fit`` is returned.

        Raises:
            AttributeError: if ``X`` is omitted and ``fit`` has not been called.
        """
        if X is not None:
            return int(X.isnull().sum().sum())
        return self.n_missing_

In [5]:
## CATEGORICAL ENCODERS CLASS ##

class categ_encoders(object):
    """Thin wrapper that fits a category_encoders encoder on train and applies it to both sets."""

    def ce_encodings(self, train_df, valid_df, encoding):
        """Encode the categorical frames with the encoder selected by ``encoding``.

        Args:
            train_df: categorical training columns; the encoder is fitted on these only.
            valid_df: categorical validation columns, transformed with the fitted encoder.
            encoding: one of 'bne' (base-4), 'be' (binary), 'he' (hashing),
                'oe' (ordinal) or 'ohe' (BaseNEncoder with base=1).

        Returns:
            ``(train_enc, valid_enc)``; the fitted encoder is kept on ``self.enc``.

        Raises:
            ValueError: if ``encoding`` is not one of the supported keys.
        """
        # factories (not instances) so only the requested encoder is constructed
        encoder_factories = {
            'bne': lambda: ce.BaseNEncoder(base=4),
            'be': lambda: ce.BinaryEncoder(),
            'he': lambda: ce.HashingEncoder(drop_invariant=True),
            'oe': lambda: ce.OrdinalEncoder(),
            # NOTE(review): base=1 is used here as the one-hot variant - confirm
            # against the installed category_encoders version
            'ohe': lambda: ce.BaseNEncoder(base=1),
        }
        if encoding not in encoder_factories:
            raise ValueError('unknown encoding %r; expected one of %s'
                             % (encoding, sorted(encoder_factories)))

        print(str(encoding) + ' encoding is happening ...', '\n')
        enc = encoder_factories[encoding]()
        enc.fit(train_df)
        train_enc = enc.transform(train_df)
        valid_enc = enc.transform(valid_df)
        print('category encoding completed', '\n')
        self.enc = enc
        return train_enc, valid_enc

In [7]:
class main(object):
    """Data-preparation pipeline: load, clean, impute, encode and prune features."""

    def __init__(self):
        """ nothing to initialise; run-time artefacts are attached by prepare() """

    @staticmethod
    def _to_numeric_or_keep(column):
        """Return ``column`` converted to a numeric dtype, or unchanged if it cannot be parsed."""
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    def prepare(self, cols_to_remove, response='label', id_col='employee_personnal_number_pa',
                missing_threshold=0.6, variance_threshold=0.1, corr_threshold=0.9):
        """Build the modelling matrices from the TRAIN/VALID extracts.

        Steps: append the raw files, derive datetime features, drop sparse
        columns/rows, impute, ordinal-encode categoricals, remove low-variance
        features and remove highly correlated features.

        Args:
            cols_to_remove: raw columns dropped while reading the files.
            response: name of the target column.
            id_col: name of the employee identifier column (saved, then dropped).
            missing_threshold: minimum share of non-missing values a column
                (and a training row) needs in order to be kept.
            variance_threshold: features with training variance not above
                this value are removed.
            corr_threshold: a feature is removed when its absolute correlation
                with an earlier feature exceeds this value.

        Returns:
            ``(train, valid, y_train, y_valid)``.

        Side effects:
            Writes 'test_dfs.csv' (validation ids and labels); sets the class
            attributes ``removed_cols``, ``missing_threshold``,
            ``validation_labels`` and ``cor_dropped_vars`` and the instance
            attribute ``enc``.
        """
        # read in the train and validation datasets
        # clean column names and remove unwanted columns
        # append the (multiple?) train datasets into a single one (simple appending for now)
        print('1. Appending the multiple train/valid datasets in the working directory \n')
        train = helper_funcs.append_datasets(string='TRAIN_201801_1_', cols_to_remove=cols_to_remove)
        valid = helper_funcs.append_datasets(string='VALID_201802_201808', cols_to_remove=cols_to_remove)
        main.removed_cols = cols_to_remove ## attribute

        # reorder columns so that they are in the same order (impacts nothing but to be foolproof)
        valid = valid[train.columns]

        # creating the datetime features from date columns (works only for cols with date in header, modify for other cases)
        print('2. Datetime features are being created for the columns (which have "date" in their column name) \n')
        train, valid = helper_funcs.datetime_feats(train, valid)

        # missing value threshold control (for both rows and columns)
        # ceil keeps the original "count >= mt * n" semantics while handing dropna the int it expects
        mt = missing_threshold
        print(train.shape, '\n')
        train = train.dropna(thresh=int(np.ceil(mt * train.shape[0])), axis=1)
        train = train.dropna(thresh=int(np.ceil(mt * train.shape[1])), axis=0)
        print(train.shape, '\n')
        valid = valid[train.columns].dropna(thresh=int(np.ceil(mt * valid.shape[0])), axis=1)
        train = train[valid.columns]
        main.missing_threshold = mt ## attribute

        # rows were dropped above, so rebuild a clean 0..n-1 index
        # (this also yields fresh frames rather than slices of the originals)
        train = train.reset_index(drop=True)
        valid = valid.reset_index(drop=True)

        # save the global ids for mapping later (forward looking)
        valid_ids = valid[[id_col, response]]
        main.validation_labels = valid_ids ## attribute
        valid_ids.to_csv('test_dfs.csv', index=False)

        # the class balance in the training dataset for the response
        print(helper_funcs.freq_count(train[response]), '\n')
        # creating the response vector
        y_train = train[response].values
        y_valid = valid[response].values

        # drop the id and the response from the feature matrices
        train = train.drop([id_col, response], axis=1)
        valid = valid.drop([id_col, response], axis=1)

        #######################################################################################################
        ## MISSING VALUE IMPUTATION ##
        #######################################################################################################
        # fill values are learnt on train only and re-used for valid
        miss_enc = DataFrameImputer()
        miss_enc.fit(X=train)
        train = miss_enc.transform(train)
        valid = miss_enc.transform(valid)
        print('missing value treatment completed ...', '\n')
        #######################################################################################################

        #######################################################################################################
        ## ENCODING ##
        #######################################################################################################
        cat_columns = train.select_dtypes(include=['object']).columns.values
        print(cat_columns)
        # keep the original column order: a set difference would give a different
        # feature order (and different correlation drops) from run to run
        cat_lookup = set(cat_columns)
        num_cols = [c for c in train.columns if c not in cat_lookup]
        train_cat, train_num = train[cat_columns], train[num_cols]
        valid_cat, valid_num = valid[cat_columns], valid[num_cols]

        ce_ins = categ_encoders()
        train_cat, valid_cat = ce_ins.ce_encodings(train_cat, valid_cat, encoding='oe')
        self.enc = ce_ins

        train = pd.concat([train_cat.reset_index(drop=True), train_num], axis=1)
        valid = pd.concat([valid_cat.reset_index(drop=True), valid_num], axis=1)

        # same effect as to_numeric(errors='ignore') without the deprecated option
        train = train.apply(main._to_numeric_or_keep)
        valid = valid.apply(main._to_numeric_or_keep)
        print('encoding completed ...', '\n')
        #need to fix below part (store the categorical classes for remapping during interpretation)
        #main.categorical_dict = categorical_names ## attribute
        #######################################################################################################

        #######################################################################################################
        ## VARIANCE THRESHOLD FEATURE SELECTION ##
        #######################################################################################################
        train, valid = helper_funcs.variance_threshold_selector(train=train, valid=valid,
                                                                threshold=variance_threshold)
        #######################################################################################################

        #######################################################################################################
        ## CORRELATION ANALYSIS ##
        #######################################################################################################
        # remove highly correlated features to reduce further computation time
        print('correlation analysis is happening ...', '\n')
        # Create correlation matrix
        corr_matrix = train.corr().abs()
        # Select upper triangle of correlation matrix (builtin bool: the np.bool alias no longer exists)
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        # Find feature columns correlated above the threshold with an earlier column
        to_drop = [column for column in upper.columns if any(upper[column] > corr_threshold)]
        # Drop features
        train = train.drop(to_drop, axis=1)
        valid = valid.drop(to_drop, axis=1)
        print('correlation analysis completed ...', '\n')
        main.cor_dropped_vars = to_drop ## attribute
        #######################################################################################################

        return train, valid, y_train, y_valid

In [8]:
# instantiate the data-preparation pipeline (no work happens until prepare() is called)
main_object = main()

In [9]:
# run the full preparation: returns the processed train/valid feature frames and their response vectors
train, valid, y_train, y_valid = main_object.prepare(cols_to_remove=['ohd', 'manager_s_ohd', 'employee_age', 'manager_s_age', 'yearmonth'], id_col='employee_personnal_number_pa')

# RF and GBC from sklearn

In [11]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#model = RandomForestClassifier(n_estimators=300, n_jobs=-1, max_depth=10)
# subsample < 1 makes the booster stochastic, so seed it with the pipeline-wide
# randomseed to make the baseline metrics below reproducible
model = GradientBoostingClassifier(max_depth=10, n_estimators=100, learning_rate=0.1, subsample=0.8,
                                   random_state=randomseed)
model.fit(train, y_train)
# pred[:, 1] holds the predicted probability of the positive class
pred = model.predict_proba(valid)

# label as positive when the probability exceeds the 0.1 cut-off
predict=np.where(pred[:,1]>0.1,1, 0)
# bare expressions: each value is displayed because ast_node_interactivity is set to "all"
roc_auc_score(y_score=pred[:,1], y_true=y_valid)
recall_score(y_pred=predict, y_true=y_valid)
accuracy_score(y_pred=predict, y_true=y_valid)
precision_score(y_pred=predict, y_true=y_valid)

# LOGIT

In [13]:
## SIMPLE LOGIT MODEL CLASS ##

class logit(object):
  """Logistic-regression baseline: grid-searched fit, threshold selection and evaluation.

  Attributes set while running:
    penalty, C, score: best hyper-parameters and their mean CV score (logit_cv).
    model, valid, y_valid: fitted model and the evaluation data (main).
    pred: positive-class probabilities on ``valid`` (predict).
    thresh: decision threshold actually used (thresh_predict).
    pred_labels: 0/1 predictions at ``thresh`` (predict).
  """

  def __init__(self):
    """ class for implementing logistic regression """
    return None

  def logit_cv(self, train, y_train, scorer = 'roc_auc'):
    """Grid-search penalty and C with 5-fold CV, optimising ``scorer``.

    Stores the winning ``penalty`` and ``C`` and the mean cross-validated
    ``score`` of that winning configuration on the instance.
    """
    modelCV = LogisticRegression(solver='liblinear')
    # Create regularization penalty and strength hyperparameter space
    penalty = ['l1', 'l2']
    C = np.logspace(0, 5, 5)
    hyperparameters = dict(C=C, penalty=penalty)

    # Create grid search using 5-fold cross validation; select on the requested
    # metric instead of the estimator's default accuracy
    clf = GridSearchCV(modelCV, hyperparameters, cv=5, verbose=1, scoring=scorer)
    # Fit grid search
    model_fit = clf.fit(train, y_train)

    # View best hyperparameters
    best_params = model_fit.best_params_
    print('Best Penalty:', best_params['penalty'])
    print('Best C:', best_params['C'])
    # best_score_ is the mean CV score of the selected configuration, so the
    # reported number belongs to the model that is actually used afterwards
    print("5-fold cross validation average %s: %.3f" % (scorer, model_fit.best_score_))

    self.penalty = best_params['penalty']
    self.C = best_params['C']
    self.score = model_fit.best_score_

    return None

  def main(self, train, y_train, valid, y_valid, thresh = 0, scorer = 'roc_auc'):
    """Tune, refit on the full training data and evaluate on ``valid``.

    Args:
      thresh: decision threshold; 0 means "search for the best one" (see opt_thresh).
      scorer: sklearn scoring name used during the grid search.
    """
    self.logit_cv(train, y_train, scorer)
    model = LogisticRegression(penalty = self.penalty, C = self.C, solver = 'liblinear')
    model.fit(train, y_train)

    self.model = model
    self.valid = valid
    self.y_valid = y_valid
    self.predict(thresh)
    return None

  def predict(self, thresh):
    """Score ``self.valid`` with ``self.model`` and print the evaluation metrics.

    The 0/1 predictions are stored on ``self.pred_labels``. They are not
    stored under the name ``predict`` because that would replace this method
    on the instance and make a second call fail.
    """
    y_valid = self.y_valid

    pred = self.model.predict_proba(self.valid)[:, 1]
    self.pred = pred

    # resolves thresh == 0 into an optimised threshold
    self.thresh_predict(thresh)
    thresh = self.thresh

    labels = np.where(pred > thresh, 1, 0)
    self.pred_labels = labels
    # print the various evaluation metrics
    print('auc: ', roc_auc_score(y_score=pred, y_true=y_valid))
    print('recall: ', recall_score(y_pred=labels, y_true=y_valid))
    print('precision: ', precision_score(y_pred=labels, y_true=y_valid))
    print('f1: ', f1_score(y_pred=labels, y_true=y_valid))
    print('accuracy score: ', accuracy_score(y_pred=labels, y_true=y_valid))

    return None

  def get_truncated_normal(self, mean=0, sd=1, low=0, upp=10):
    """Return a frozen scipy truncated-normal distribution on [low, upp]."""
    return truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

  def best_thresh_score(self, yp, yt):
    """Score a labelling: 0.6*recall + 0.4*accuracy, or 0 unless both exceed 0.6."""
    rc = recall_score(y_pred=yp, y_true=yt)
    ac = accuracy_score(y_pred=yp, y_true=yt)
    if rc > 0.6 and ac > 0.6:
      return 0.6*rc + 0.4*ac
    return 0

  def opt_thresh(self, pred, random_state=None):
    """Pick the decision threshold with the highest best_thresh_score.

    1000 candidate thresholds are drawn from a truncated normal on [0, 0.4]
    and each is evaluated against ``self.y_valid``.

    Args:
      pred: positive-class probabilities aligned with ``self.y_valid``.
      random_state: optional seed/generator for the candidate draw; the
        default (None) keeps the draw unseeded.

    Returns:
      The first sampled threshold that reaches the highest score.
    """
    y_valid = self.y_valid
    X = self.get_truncated_normal(mean=0.1, sd=0.2, low=0, upp=0.4)
    Y = X.rvs(1000, random_state=random_state)

    cols = ['thresh', 'recall', 'precision', 'f1', 'acc', 'score'] #score = (0.6*recall + 0.4*acc)
    thresh_grid = []
    for i in Y:
      labels = np.where(pred > i, 1, 0)
      thresh_grid.append([i, recall_score(y_pred=labels, y_true=y_valid),
                          precision_score(y_pred=labels, y_true=y_valid),
                          f1_score(y_pred=labels, y_true=y_valid),
                          accuracy_score(y_pred=labels, y_true=y_valid),
                          self.best_thresh_score(yp=labels, yt=y_valid)])
    thresh_grid = pd.DataFrame(thresh_grid, columns=cols)
    # label-based lookup of the best row (no positional indexing on a labelled Series)
    return thresh_grid.loc[thresh_grid['score'].idxmax(), 'thresh']

  def thresh_predict(self, thresh):
    """Store the threshold to use on ``self.thresh``; 0 triggers opt_thresh on ``self.pred``."""
    if thresh == 0:
      thresh = self.opt_thresh(self.pred)
    self.thresh = thresh
    return None

In [14]:
# tune, fit and evaluate the logistic regression; thresh is left at its default of 0,
# which makes the class search for a decision threshold itself
log_results = logit()
log_results.main(train, y_train, valid, y_valid)

# RANDOM FOREST

In [16]:
# random forest class for tuning

class rf_model:
    """Random-forest pipeline: hyperopt tuning, training, cross-validation and scoring."""

    # hyper-parameters that must reach sklearn as Python ints; the search
    # space yields floats (quniform) and numpy integers (choice over arange)
    _INT_PARAMS = ('max_depth', 'n_estimators')

    def __init__(self, train, y_train, valid, y_valid):
        """ this class initializes some functions used in the random forest pipeline """
        self.train = train
        self.y_train = y_train
        self.valid = valid
        self.y_valid = y_valid

    @classmethod
    def _with_int_params(cls, params):
        """Return a copy of ``params`` with the integer-valued hyper-parameters cast to int."""
        fixed = dict(params)
        for name in cls._INT_PARAMS:
            if name in fixed:
                fixed[name] = int(fixed[name])
        return fixed

    def rf_score(self, params):
        """hyperopt objective: 1 - recall of 5-fold cross-validated predictions.

        Relies on the module-level ITERATION counter that optimize() resets.
        """
        global ITERATION
        ITERATION += 1

        # Make sure parameters that need to be integers are integers
        params = self._with_int_params(params)

        rf_results = RandomForestClassifier(**params, random_state=randomseed)
        cv_labels = sklearn.model_selection.cross_val_predict(rf_results, self.train, self.y_train, cv=5, verbose=False)
        recall = sklearn.metrics.recall_score(y_pred=cv_labels, y_true=self.y_train)
        return {'loss': (1 - recall), 'status': STATUS_OK, 'params': params, 'iteration': ITERATION}

    def optimize(self):
        """Run MAX_EVALS rounds of TPE search.

        Returns:
            ``(best_params, trials)``. ``trials`` and the ITERATION counter
            are also published as module-level globals.
        """
        # Keep track of evals
        global ITERATION
        ITERATION = 0

        global trials
        trials = Trials()

        n_features = self.train.shape[1]
        # candidates are 2 .. n_features/5; max(3, ...) keeps at least one
        # candidate when the frame has fewer than 15 features
        max_features_upper = max(3, n_features // 5)
        space = {
            'max_depth' : hp.quniform('max_depth', 5, 10, 1),
            'max_features': hp.choice('max_features', range(2, max_features_upper)),
            'criterion': hp.choice('criterion', ['gini', 'entropy']),
            'n_estimators': hp.choice('n_estimators', np.arange(200, 1000))}

        # Run optimization
        # NOTE(review): some hyperopt releases expect np.random.default_rng(...) for
        # rstate instead of a RandomState - confirm against the installed version
        best = fmin(fn = self.rf_score, space = space, algo = tpe.suggest,
            max_evals = MAX_EVALS, trials = trials, rstate = np.random.RandomState(randomseed))
        # fmin returns indices for hp.choice entries; space_eval maps them back to values
        best_params = space_eval(space, best)
        return best_params, trials

    def rf_train(self, best_params):
        """Fit a seeded forest with ``best_params`` on the stored training data and return it."""
        model = RandomForestClassifier(random_state = randomseed)
        # space_eval hands max_depth back as a float; cast before passing it on
        model.set_params(**self._with_int_params(best_params))
        model.fit(self.train, self.y_train)
        return model

    def rf_predict(self, X_test, y_test, model, mode = "validate", thresh = 0.12):
        """Score ``X_test`` with ``model``.

        Args:
            X_test: features to score; None falls back to the stored validation frame.
            y_test: true labels (used in "validate" mode); None falls back to
                the stored validation labels.
            model: fitted classifier exposing predict_proba.
            mode: "validate" prints metrics and returns the full tuple; any
                other value returns the probabilities only.
            thresh: probability cut-off for the positive class.

        Returns:
            ``(pred, predict, tn, fp, fn, tp)`` in "validate" mode, else ``pred``.
        """
        features = self.valid if X_test is None else X_test
        pred = model.predict_proba(features)[:, 1]
        if mode != "validate":
            return pred

        y_true = self.y_valid if y_test is None else y_test
        predict = np.where(pred > thresh, 1, 0)
        recall = sklearn.metrics.recall_score(y_pred=predict, y_true=y_true)
        precision = sklearn.metrics.precision_score(y_pred=predict, y_true=y_true)
        f1 = sklearn.metrics.f1_score(y_pred=predict, y_true=y_true)
        auc_score = roc_auc_score(y_true, pred)
        conf_matrix = sklearn.metrics.confusion_matrix(y_pred=predict, y_true=y_true)
        tn, fp, fn, tp = conf_matrix.ravel()
        print(conf_matrix, '\n')
        print('recall score is: ', recall)
        print('precision score is: ', precision)
        print('f1_score is: ', f1)
        print('accuracy score: ', sklearn.metrics.accuracy_score(y_true=y_true, y_pred=predict))
        print('The final AUC after taking the best params and num_rounds when it stopped is {:.4f}.'.format(auc_score), '\n')
        return pred, predict, tn, fp, fn, tp

    def rf_cv(self, X_train, y_train, best):
        """Print 5-fold cross-validated recall/precision/f1/accuracy for the params in ``best``.

        ``X_train``/``y_train`` default to the stored training data when None.
        The forest is seeded like the trained model so the printed numbers are
        reproducible.
        """
        features = self.train if X_train is None else X_train
        target = self.y_train if y_train is None else y_train

        params = self._with_int_params(best)
        params.setdefault('random_state', randomseed)
        model = RandomForestClassifier(**params, verbose=False)
        cv_labels = sklearn.model_selection.cross_val_predict(model, features, target, cv=5)
        print('recall: ', sklearn.metrics.recall_score(y_pred=cv_labels, y_true=target))
        print('precision: ', sklearn.metrics.precision_score(y_pred=cv_labels, y_true=target))
        print('f1: ', sklearn.metrics.f1_score(y_pred=cv_labels, y_true=target))
        print('accuracy: ', sklearn.metrics.accuracy_score(y_pred=cv_labels, y_true=target))
        return None

In [17]:
# bind the prepared train/validation data to the random-forest pipeline
rf = rf_model(train, y_train, valid, y_valid)

# calling the randomforest function and returning the best model
best, trials = rf.optimize()
# the objective's loss is 1 - recall, so 1 - error is reported here as a recall figure
# (presumably the best trial's - confirm against hyperopt's average_best_error docs)
print(1 - trials.average_best_error(), '\n')

# refit on the full training data with the selected hyper-parameters
model = rf.rf_train(best)

In [18]:
# cv results: prints 5-fold cross-validated recall, precision, f1 and accuracy for the best params
rf.rf_cv(train, y_train, best)

In [19]:
# predicting using the best random forest model on the validation set
# returns the positive-class probabilities, the thresholded labels and the confusion-matrix cells
rf_pred, rf_predict, tn, fp, fn, tp = rf.rf_predict(X_test=valid, model=model, y_test=y_valid, mode='validate')

In [20]:
# print the various evaluation metrics for the tuned random forest
# (rf_pred / rf_predict come from rf.rf_predict above; the bare pred / predict
# names still hold the earlier GradientBoosting outputs and must not be used here)
print('auc: ', roc_auc_score(y_score=rf_pred, y_true=y_valid))
print('recall: ', recall_score(y_pred=rf_predict, y_true=y_valid))
print('precision: ', precision_score(y_pred=rf_predict, y_true=y_valid))
print('f1: ', f1_score(y_pred=rf_predict, y_true=y_valid))
print('accuracy score: ', accuracy_score(y_pred=rf_predict, y_true=y_valid))

In [21]:
from treeinterpreter import treeinterpreter as ti

# decompose the model's predictions on the validation set into three parts
# (presumably prediction = bias + sum of per-feature contributions - confirm against treeinterpreter docs)
prediction, bias, contributions = ti.predict(model, valid)

In [22]:
# explanation per instance
# shows the first validation row (index 0); the trailing index 1 selects the second class column

print("Prediction", prediction[0][1])
print("Bias (trainset prior)", bias[0][1])
print("Feature contributions:")
# pair each feature's contribution with its column name
for c, feature in zip(contributions[0][:,1], 
                             valid.columns.values):
    print(feature, c)

In [23]:
# function to bucket sparse levels in categorical features to the 'others' category as well as handle new values in the valid df

from sklearn.base import TransformerMixin, BaseEstimator
from collections import defaultdict

class CategoryGrouper(BaseEstimator, TransformerMixin):  
    """A transformer for combining low count observations for categorical features.
    This transformer will preserve category values that are above a certain threshold, while bucketing together all the other values. This will fix issues where new data may have an unobserved category value that the training data did not have.
    """
    
    def __init__(self, threshold=0.05):
        """ Initialize method.
        Args: threshold (float): The threshold to apply the bucketing when categorical values drop below that threshold.
        """
        # column name -> list of category values to preserve (filled by fit)
        self.d = defaultdict(list)
        self.threshold = threshold

    def transform(self, X, **transform_params):
        """Transforms X with new buckets.
        Args: X (obj): The dataset to pass to the transformer.
        Returns: A copy of X where every value that was not preserved during
            fit (rare, unseen or missing) is replaced by 'others'. Columns
            that were not seen during fit are mapped to 'others' entirely.
        """
        X_copy = X.copy()
        for col in X_copy.columns:
            # build the lookup once per column; .get avoids adding keys to the
            # fitted dictionary for columns that were never fitted
            preserved = set(self.d.get(col, ()))
            X_copy[col] = X_copy[col].apply(
                lambda x, preserved=preserved: x if x in preserved else 'others')
        return X_copy

    def fit(self, X, y=None, **fit_params):
        """ Fits transformer over X.
        Builds a dictionary of lists where the lists are category values of the
        column key for preserving, since they meet the threshold. The share of
        a value is its count divided by the total number of rows in X.
        """
        df_rows = len(X.index)
        # start from a clean slate so a refit does not keep columns from an earlier fit
        self.d = defaultdict(list)
        for col in X.columns:
            shares = X[col].value_counts() / df_rows
            self.d[col] = shares[shares >= self.threshold].index.tolist()
        return self

In [24]:
# dfs with 100 elements in cat1 and cat2
# note how df_test has elements 'g' and 't' in the respective categories (unknown values)
# in df_train, 'd', 'e' and 'f' each cover 3-4% of cat1, i.e. less than the grouper's default 5% threshold
df_train = pd.DataFrame({'cat1': ['a'] * 20 + ['b'] * 30 + ['c'] * 40 + ['d'] * 3 + ['e'] * 4 + ['f'] * 3,
                         'cat2': ['z'] * 25 + ['y'] * 25 + ['x'] * 25 + ['w'] * 20 +['v'] * 5})
df_test = pd.DataFrame({'cat1': ['a'] * 10 + ['b'] * 20 + ['c'] * 5 + ['d'] * 50 + ['e'] * 10 + ['g'] * 5,
                        'cat2': ['z'] * 25 + ['y'] * 55 + ['x'] * 5 + ['w'] * 5 + ['t'] * 10})

In [25]:
# fit on the training frame only, then apply the same preserved-value lists to both frames
catgrouper = CategoryGrouper()
catgrouper.fit(df_train)
df_test_transformed = catgrouper.transform(df_test)
df_train_transformed = catgrouper.transform(df_train)

# bare expression: displays the transformed training frame in the cell output
df_train_transformed