In [2]:
## importing the relevant packages:

# clear the workspace
#%reset -f

# print list of files in directory
import os
print(os.listdir())

# the base packages
import collections # for the Counter function
import csv # for reading/writing csv files
import pandas as pd, numpy as np, time, gc, bisect, re
from datetime import datetime as dt

# the various packages/modules used across processing (sklearn), modelling (lightgbm) and bayesian optimization (hyperopt, bayes_opt)
import sklearn
from sklearn import metrics, preprocessing
import sklearn.decomposition as decomposition
from sklearn.model_selection import cross_val_score
from sklearn.base import TransformerMixin
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
import category_encoders as ce
from scipy.stats import truncnorm

# the modelling packages and related
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV

# hyperopt modules
#from bayes_opt import BayesianOptimization
from tqdm import tqdm
from hyperopt import hp, tpe, STATUS_OK, Trials, space_eval, rand
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample

# pipeline-wide constants
MAX_EVALS = 5  # NOTE(review): presumably the hyperopt evaluation budget - confirm against the fmin call
randomseed = 5  # random state reused at various points in the pipeline

# raise the pandas display limits so cell outputs are shown in full rather than as a truncated list
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 200)

# to display multiple outputs in a cell without using print/display
# (with "all", every expression statement in a cell is echoed, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
## HELPER FUNCTIONS CLASS ##

class helper_funcs():
    """Namespace of stateless helper functions used across the pipeline.

    Every helper is a static method, so it can be called either on the class
    (``helper_funcs.freq_count(values)``) or on an instance.
    """

    def __init__(self):
        """ helper functions used across the pipeline """

    ## datetime feature engineering
    @staticmethod
    def datetime_feats(train, valid):
        """Replace every raw date column by derived calendar features.

        A column is treated as a date column when its name contains 'date'.
        For each such column the day of year, month, quarter and year are
        appended as ``<col>_dayofyear``, ``<col>_month``, ``<col>_quarter``
        and ``<col>_year``, and the raw column is dropped.

        Args:
            train: training DataFrame; its column names decide which columns
                are processed.
            valid: validation DataFrame; must contain the same date columns.

        Returns:
            Tuple ``(train, valid)`` of transformed frames. The frames passed
            in are not modified.
        """
        cols = [s for s in train.columns.values if 'date' in s]
        print('datetime feature engineering is happening ...', '\n')

        # nested function to derive the various datetime features for a given date column
        def dt_feats(df, col):
            stamps = pd.to_datetime(df[col])
            # Only these four features are generated; other `.dt` attributes
            # (day, dayofweek, days_in_month, week, weekday, ...) are currently disabled.
            feats = {
                col + '_dayofyear': stamps.dt.dayofyear,
                col + '_month': stamps.dt.month,
                col + '_quarter': stamps.dt.quarter,
                col + '_year': stamps.dt.year,
            }
            # drop() and assign() both return new frames, so the caller's frame stays untouched
            return df.drop([col], axis=1).assign(**feats)

        # loop function over all raw date columns
        for col in cols:
            train = dt_feats(train, col)
            valid = dt_feats(valid, col)
        return train, valid

    ## function to get frequency count of elements in a vector/list
    @staticmethod
    def freq_count(input_vector):
        """Return a ``collections.Counter`` with the count of each element of ``input_vector``."""
        return collections.Counter(input_vector)

    # removing near zero variance columns
    @staticmethod
    def variance_threshold_selector(train, valid, threshold):
        """Keep only the columns retained by a ``VarianceThreshold`` fitted on ``train``.

        The selector is fitted on ``train`` only; the retained column
        positions are then applied to both frames. Selection on ``valid`` is
        positional, so ``valid`` is assumed to have its columns in the same
        order as ``train``.

        Args:
            train: DataFrame used to fit the selector.
            valid: DataFrame reduced to the same column positions.
            threshold: variance threshold passed to ``VarianceThreshold``.

        Returns:
            Tuple ``(X, Y)``: the reduced train and valid frames.
        """
        print('input data shape is: ', train.shape, '\n')
        selector = VarianceThreshold(threshold=threshold)
        selector.fit(np.asanyarray(train))
        # compute the retained positions once and reuse them for both frames
        kept = selector.get_support(indices=True)
        X = train[train.columns[kept]]
        Y = valid[valid.columns[kept]]
        print('output data shape is: ', X.shape, '\n')
        return X, Y

In [4]:
## MISSING VALUE IMPUTATION CLASS ##

class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for pandas DataFrames.

    ``fit`` learns one fill value per column and stores them in ``self.fill``
    (a Series indexed by column name); ``transform`` applies them with
    ``DataFrame.fillna``.
    """

    def __init__(self):
        """Impute missing values.
        Columns of dtype object are imputed with the most frequent value 
        in column.
        Columns of other types are imputed with mean of column.
        """

    def fit(self, X, y=None):
        """Learn the per-column fill values from ``X``.

        Args:
            X: DataFrame to learn the fill values from. It is only read.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            self, with ``self.fill`` set.
        """
        fill_values = []
        for c in X:
            column = X[c]
            if column.dtype == np.dtype('O'):
                counts = column.value_counts()
                # value_counts() skips missing values, so an all-missing column
                # yields an empty result; fall back to NaN (fillna then leaves
                # that column unchanged) instead of raising IndexError.
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(column.mean())
        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the fitted fill values."""
        return X.fillna(self.fill)

    @staticmethod
    def num_missing(X):
        """Return the number of missing values in the pandas object ``X``.

        Static so that it works both as ``DataFrameImputer.num_missing(series)``
        (e.g. inside ``df.apply``) and as ``imputer.num_missing(series)``.
        The previous instance-method form called ``isnull`` on the imputer
        itself, which has no such method.
        """
        return int(np.asarray(X.isnull()).sum())

In [23]:
# transformer that buckets sparse levels of categorical features into the 'others' category and maps values not kept during fit (e.g. new values in the valid df) to 'others' as well

from sklearn.base import TransformerMixin, BaseEstimator
from collections import defaultdict

class CategoryGrouper(BaseEstimator, TransformerMixin):  
    """A transformer for combining low count observations for categorical features.
    This transformer will preserve category values that are above a certain threshold, while bucketing together all the other values. This will fix issues where new data may have an unobserved category value that the training data did not have.
    """
    
    def __init__(self, threshold=0.05):
        """ Initialize method.
        Args: threshold (float): Minimum share of rows (level count / total rows) a category value needs in order to be preserved; values below it are bucketed.
        """
        # column name -> list of category values preserved for that column (filled by fit)
        self.d = defaultdict(list)
        self.threshold = threshold

    def transform(self, X, **transform_params):
        """Transforms X with new buckets.
        Args: X (obj): The dataset to pass to the transformer.
        Returns: A copy of X in which every value not preserved during fit is replaced by 'others'. Columns that were not seen during fit have no preserved values, so they become 'others' entirely.
        """
        X_copy = X.copy()
        for col in X_copy.columns:
            # .get() instead of indexing: looking up an unseen column must not
            # insert a new key into the fitted mapping.
            preserved = self.d.get(col, [])
            X_copy[col] = X_copy[col].apply(lambda x: x if x in preserved else 'others')
        return X_copy

    def fit(self, X, y=None, **fit_params):
        """ Fits transformer over X.
        Builds a dictionary of lists where the lists are category values of the
        column key for preserving, since they meet the threshold.
        Any mapping learned by a previous fit is discarded first.
        Returns: self.
        """
        # start from a clean mapping so a refit does not keep columns from an earlier fit
        self.d = defaultdict(list)
        df_rows = len(X.index)
        for col in X.columns:
            # share of rows taken by each level of the column
            shares = X.groupby(col).size() / df_rows
            self.d[col] = shares[shares >= self.threshold].index.tolist()
        return self

In [24]:
# Two toy frames with 100 rows each and two categorical columns (cat1, cat2).
# df_test deliberately contains the levels 'g' (cat1) and 't' (cat2), which never occur in df_train.
# Each string below is expanded by list() into one single-character level per row.
df_train = pd.DataFrame({
    'cat1': list('a' * 20 + 'b' * 30 + 'c' * 40 + 'd' * 3 + 'e' * 4 + 'f' * 3),
    'cat2': list('z' * 25 + 'y' * 25 + 'x' * 25 + 'w' * 20 + 'v' * 5),
})
df_test = pd.DataFrame({
    'cat1': list('a' * 10 + 'b' * 20 + 'c' * 5 + 'd' * 50 + 'e' * 10 + 'g' * 5),
    'cat2': list('z' * 25 + 'y' * 55 + 'x' * 5 + 'w' * 5 + 't' * 10),
})

In [25]:
# Demo: learn the preserved levels from the training frame only (default threshold of 0.05),
# then apply that same fitted mapping to both the test and the training frame.
catgrouper = CategoryGrouper()
catgrouper.fit(df_train)
df_test_transformed = catgrouper.transform(df_test)
df_train_transformed = catgrouper.transform(df_train)

# bare expression: shows the transformed training frame as the cell output
df_train_transformed