### This is my library of Python functions to speed up standard ML analysis of tabular data.

Author: Mykola Pinchuk

Started on 05/21/2022

In [1]:
def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: 
    fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)"""
    
    # set df_pred to None if it does not exist
    if (cat_features is not None):
        if (cat_fill=='mode'):

            df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
            df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
            if (df_pred is not None):
                df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])

        if (cat_fill=='missing'):

            df_train[cat_features] = df_train[cat_features].fillna(value='missing')
            df_test[cat_features] = df_test[cat_features].fillna(value='missing')
            if (df_pred is not None):
                df_pred[cat_features] = df_pred[cat_features].fillna(value='missing')
        
    if (num_fill=='median'):
        df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
        df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
        if (df_pred is not None):
            df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())    
    
    if (cat_features is not None):
        all_good = (
        (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
        (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()))
        if (all_good):
            print('Missing values imputed successfully')
        else:
            print('There are still some missing values...')
    else:
        all_good = (
        (np.prod(df_train[num_features].shape)==df_train[num_features].count().sum()) and 
        (np.prod(df_test[num_features].shape) == df_test[num_features].count().sum()))
        if (all_good):
            print('Missing values imputed successfully')
        else:
            print('There are still some missing values...')
# END

    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])"""
    # set df_pred to None if it does not exist
    
    columns_before = df_train.shape[1]
    
    for feature_name in features:
        
        if df_train[feature_name].count()==df_train.shape[0]:
            continue
        
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
            
        columns_after = df_train.shape[1]
            
    print(columns_after-columns_before, ' dummy features added')
# END
   

def discretize_mp_i1(df_train, df_test, df_pred, feature, ntiles, delete_feature=False):
    """This function divides a continuous feature into quantile groups.
    Example: discretize_mp_i1(X_train, X_test, X_pred, 'Age', 15)"""
    # set df_pred to None if it does not exist
    _,bin = pd.qcut(df_train[feature], ntiles, retbins = True, labels = False, duplicates = 'drop')
    df_train[feature+'Ntile'] = pd.cut(df_train[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    df_test[feature+'Ntile'] = pd.cut(df_test[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    if (df_pred is not None):
        df_pred[feature+'Ntile'] = pd.cut(df_pred[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    if (delete_feature==True):
        df_train.drop(columns=[feature], inplace=True)
        df_test.drop(columns=[feature], inplace=True)
        df_pred.drop(columns=[feature], inplace=True)
    print('Discretized ',feature, ' into ', len(bin)-1, ' bins')
# END


def log_transformer_mp_i1(df_train, df_test, df_pred=None, feature_subset=False, max_skew=3):
    """This function divides a continuous feature into quantile groups.
    Example: log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)"""
    # set df_pred to None if it does not exist
    if (feature_subset==False):
        features_totransform = df_train.columns
    else:
        features_totransform = feature_subset.copy()
    skewed_vars = list(df_train.skew()[(df_train.skew())>max_skew].index)
    for col in list(set(skewed_vars)&set(features_totransform)):
        df_train[col] = np.log1p(df_train[col])
        df_test[col] = np.log1p(df_test[col])
        if (df_pred is not None):
            df_pred[col] = np.log1p(df_pred[col])
    print('Skewed columns log-transformed: ', list(set(skewed_vars)&set(features_totransform)))
# END
    
    
def add_dummyfeatures(df_train, df_test, df_pred, feature_dict):
    """This function adds dummy feature when some feature is equal to value, specified in a dictionary.
    Example: add_dummyfeatures(X_train, X_test, X_pred, {'RoomService':0, 'Spa':0, 'VRDeck':0, 'ShoppingMall':0})"""
    input_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    for i in range(len(list(feature_dict.items()))):
        feature,value = list(feature_dict.keys())[i], list(feature_dict.values())[i]
        df_train.loc[df_train[feature]==value,(str(feature)+str(value))]=1
        df_train.loc[df_train[feature]!=value,(str(feature)+str(value))]=0
        df_test.loc[df_test[feature]==value,(str(feature)+str(value))]=1
        df_test.loc[df_test[feature]!=value,(str(feature)+str(value))]=0
        df_pred.loc[df_pred[feature]==value,(str(feature)+str(value))]=1
        df_pred.loc[df_pred[feature]!=value,(str(feature)+str(value))]=0
    output_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    print(output_dimensions-input_dimensions, ' variables created') 
# END

In [2]:
# need to add function to add dummy feature where continuous feature is equal to some number, e.g., 0.
# see SpaceTitanic for raw code.

# add function to delete columns from all datasets

In [3]:
help(fillna_mp_i1)


Help on function fillna_mp_i1 in module __main__:

fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode')
    This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: 
    fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)

