In [1]:
# Render a button that shows/hides every code-input cell (`div.input`) of the notebook.
# `code_show` starts as true, so the document-ready call to code_toggle() hides the
# inputs as soon as the page loads; every later click on the button flips the state.
# The script uses `$`, i.e. it expects jQuery to be present in the rendered page.
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [2]:
# Render a link that shows/hides the stderr output blocks (`div.output_stderr`).
# `code_show_err` starts as false, so the document-ready call takes the show() branch:
# stderr output is still visible right after loading and is hidden by the first click.
HTML('''<script>
code_show_err=false; 
function code_toggle_err() {
 if (code_show_err){
 $('div.output_stderr').hide();
 } else {
 $('div.output_stderr').show();
 }
 code_show_err = !code_show_err
} 
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')

In [3]:
# hide warning messages
import warnings
warnings.filterwarnings('ignore')

<a id="desc"></a>

<center>
<h1><span style="font-size:48px;font-family:Times New Roman,Times,serif"><tt>Part II: Data Preprocessing</tt></span></h1>
</center>

![Imgur](https://i.imgur.com/WADp795.png)
<center>
<h1><span style="font-size:48px;font-family:Times New Roman,Times,serif"><tt>EXTREME GRADIENT BOOSTING PIPELINE</tt></span></h1>
</center>
<br>
![Imgur](https://i.imgur.com/dwEyicL.png)
<center>
<h1><span style="font-size:6px;font-family:Times New Roman,Times,serif"><tt>image: Shutterstock</tt></span></h1>
</center>
<br>



### Author: Jan Erish Baluca
[**LinkedIn**](https://www.linkedin.com/in/jan-erish-baluca-099569103/)  
[**Portfolio on Github**](https://github.com/JanErish/Portfolio_von_Jan)  

**Data**:  
* Credit card transactions made by European cardholders over two days in September 2013.  
* All the features are numeric as a result of PCA (Principal Component Analysis) transformations (except 'Time' and 'Amount').  
- _Due to confidentiality issues, the original features and background information are not available._  
- **'Class'** is the target variable. The value **1** is for cases of fraud and **0** for otherwise.  

[_Dataset Source: Kaggle_](https://www.kaggle.com/mlg-ulb/creditcardfraud/data)

# Table of Contents
## Part 2: Extreme Gradient Boosting: Preprocessing Pipeline   
1. [Imputation and Encoding](#impute)  
2. [Resampling](#sample)  
3. [Feature Scaling](#scale)  
4. [Feature Extraction](#extract)  
5. [Feature Selection](#select)  
6. [Verdict](#verdict)  
7. [Applying to test set/new data](#test)  

# Introduction:

* The following is a data preprocessing pipeline I authored powered by Extreme Gradient Boosting.  
* The following preprocessing steps are meant to be housed within a custom sklearn-compatible estimator for:
    * fitting to and transforming training data within the estimator
    * transforming test data for .predict() and .predict_proba() using statistics learned from the training data  
* The next step in the estimator's pipeline is **Hyperparameter Tuning**, which will be in Part 3.
------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#3366cc"> The main goal is to be able to let the following algorithms handle decision-making on which methods to use for every step in pre-processing with just a few lines of code and minimal effort for maximum performance.  </span>

In [4]:
# import packages

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

import xgboost as xgb
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder

import numbers

from sklearn.decomposition import FactorAnalysis,FastICA,IncrementalPCA,KernelPCA,\
LatentDirichletAllocation,MiniBatchDictionaryLearning,MiniBatchSparsePCA,NMF,PCA,SparsePCA,\
TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.cross_decomposition import PLSRegression
from collections import Counter

# importing sklearn metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, hamming_loss,\
jaccard_similarity_score, log_loss, matthews_corrcoef, precision_score, recall_score, zero_one_loss,\
explained_variance_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score

# importing imblearn resampling classes
from imblearn.under_sampling import ClusterCentroids,CondensedNearestNeighbour,EditedNearestNeighbours,\
RepeatedEditedNearestNeighbours,AllKNN,InstanceHardnessThreshold,NearMiss,NeighbourhoodCleaningRule,OneSidedSelection,\
RandomUnderSampler,TomekLinks
from imblearn.over_sampling import ADASYN,RandomOverSampler,SMOTE
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.ensemble import BalanceCascade,BalancedBaggingClassifier,EasyEnsemble


from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from imblearn.pipeline import make_pipeline
import operator

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, minmax_scale, RobustScaler, \
                                    MaxAbsScaler, QuantileTransformer
from sklearn import tree


from colorama import Fore, Style


warnings.filterwarnings('ignore')



In [5]:
""" Importing the dataset"""

# The CSV is read through a relative path, i.e. from the notebook's working directory.
df = pd.read_csv('creditcard.csv')
# 'Time' is discarded before any preprocessing takes place.
df = df.drop('Time', axis=1)
# NOTE(review): reset_index() without drop=True inserts the previous row index as a new
# 'index' column; that column is not removed afterwards, so it ends up in X as a
# feature — confirm this is intended.
df.reset_index(inplace=True)
# Name of the dependent-variable column.
target = 'Class'


# Adding random missing values to both independent and dependent variables
import random
def add_random_na(row):
    """Return a copy of ``row`` with a random number of entries replaced by NaN.

    Meant to be used with ``DataFrame.apply(add_random_na, axis=1)``.

    The number of draws is ``random.randint(0, len(row) - 2)`` and every draw
    picks a position uniformly (with replacement), so at most ``len(row) - 2``
    distinct entries become NaN. Rows with fewer than two entries are returned
    unchanged.

    Parameters
    ----------
    row : pandas Series
        One row of the DataFrame.

    Returns
    -------
    pandas Series
        Same index as ``row``. Returning a Series (rather than a bare ndarray)
        lets ``DataFrame.apply`` rebuild a DataFrame with the original column
        labels on every pandas version.
    """
    n_values = len(row)
    # Collect the hit positions in a separate mask instead of writing through
    # ``row.values``, which would silently modify the caller's data.
    hit = np.zeros(n_values, dtype=bool)
    for _ in range(random.randint(0, max(n_values - 2, 0))):
        hit[random.randint(0, n_values - 1)] = True
    # mask() upcasts integer rows to float when needed, so NaN can always be stored.
    return row.mask(hit)
# Seed the stdlib RNG that add_random_na draws from, so the injected missing-value
# pattern (and every output derived from it) is the same on each Restart & Run All.
random.seed(69)
df = df.apply(add_random_na, axis=1)


# Separate the features from the target; y is kept as a one-column DataFrame.
X = df.drop(target, axis=1)
y = pd.DataFrame(df[target])
print(f'\033[1m{Fore.RED}X with randomly generated missing values:{Style.RESET_ALL}\033[0m')
X.head(10)

[1m[31mX with randomly generated missing values:[0m[0m


Unnamed: 0,index,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,,,-0.072781,,1.378155,,,,,0.363787,...,0.251412,,,,,0.128539,-0.189115,,-0.021053,
1,1.0,,0.266151,0.16648,0.448154,0.060018,-0.082361,,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,,,-1.340163,1.773209,,,1.800499,,,-1.514654,...,,,0.771679,,,-0.327642,,,,378.66
3,3.0,,,,,-0.010309,1.247203,0.237609,,,...,,,0.005274,,-1.175575,,,0.062723,,
4,,,,,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,,,,,0.141267,-0.20601,0.502292,,,
5,,-0.425966,,1.141109,-0.168252,0.420987,-0.029728,,0.260314,,...,,,,-0.026398,,-0.232794,,,,
6,6.0,,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.219633,-0.167716,-0.27071,-0.154104,,0.750137,,0.034507,0.005168,4.99
7,7.0,-0.644269,,1.07438,-0.492199,,0.428118,1.120631,,0.615375,...,,,-1.015455,0.057504,,,-0.051634,-1.206921,-1.085339,40.8
8,8.0,-0.894286,0.286157,-0.113192,,2.669599,,0.370145,0.851084,-0.392048,...,,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,
9,9.0,-0.338262,,,,,,0.651583,,,...,0.203711,-0.246914,-0.633753,-0.120794,-0.38505,,0.094199,,0.083076,3.68


In [6]:
print(f'\033[1m{Fore.RED}Target class values:{Style.RESET_ALL}\033[0m')
y.iloc[:,0].unique()

[1m[31mTarget class values:[0m[0m


array([  0.,  nan,   1.])

<span style="font-size:30px;font-weight:bold;color:#3366cc">
class&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Nullifier
    </span> 

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Attributes:  </span>

### .null_values: list
* list of values that do not belong to a fitted array's valid values

### .null_dictionary
* a dictionary of values considered null for each column of the fitted DataFrame

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Methods:  </span>

### .find/ .finder(a, valid_values)
Lists values that do not belong to the provided valid values for an array.  
**Parameters:**
* a: *array-like, shape (n_samples,)*  
    * array or DataFrame column for which null values will be found  
* valid_values: _list_
    * list of values considered valid.
    * values that do not belong to this list will be included to .null_values
**Returns:**
* null_values: *list*  
    * list of values to be considered null  
 
### .nullify(a)
Converts values in null_values into NaNs. 
**Parameters:**
* a: *array-like, shape (n_samples,)*  
    * array or DataFrame column in which every value listed in .null_values is replaced with NaN  
    * takes no valid_values argument; it relies on the .null_values collected by a previous .find / .finder call
**Returns:**
* a: *array-like, shape (n_samples,)*  
    * array with nullified invalid values

### .find_nullify(a, valid_values)
Efficiently finds null_values and converts them to NaNs.
**Returns:**
* a: *array-like, shape (n_samples,)*  
    * array with nullified invalid values

### .df_nullifier(df, valid_dictionary)
Converts values in null_values into NaNs. 
**Parameters:**
* df: *pandas DataFrame, shape (n_samples, n_features)*  
    * DataFrame whose columns will be nullified using the dictionary of valid values  
* valid_dictionary: _dictionary_
    * a dictionary comprising of the following key/value pairs:  
        **Key** =  column name  
        **Value** = list of valid values for the column.  
**Returns:**
* df: *pandas DataFrame, shape (n_samples, n_features)*  
    * DataFrame with null values turned into NaNs  
    
### .df_null_dictionary(df, valid_dictionary)
Returns a dictionary of values considered null for each DataFrame column  
**Parameters:**  
* df: *pandas DataFrame, shape (n_samples, n_features)*  
    * DataFrame whose columns will be nullified using the dictionary of valid values  
* valid_dictionary: _dictionary_
    * a dictionary comprising of the following key/value pairs:  
        **Key** =  column name  
        **Value** = list of valid values for the column.    
        
**Returns:**  
* null_dictionary: *dictionary*  
    * a dictionary comprising of the following key/value pairs:  
        **Key** =  column name  
        **Value** = list of null values for the column.  

In [7]:
class Nullifier:
    """Finds values that are not in a list of valid values and turns them into NaN.

    Attributes
    ----------
    null_values : list
        Distinct invalid values found by the most recent ``find``/``finder`` call.
        Starts out empty. NaN is reported as an invalid value whenever it is
        present and not listed among the valid values.
    null_dictionary : dict
        ``{column name: list of invalid values}`` built by ``df_null_dictionary``.
    """

    def __init__(self):
        # Initialised here so nullify()/finder() are usable before any find() call.
        self.null_values = []
        self.null_dictionary = {}

    def find(self, a, valid_values):
        """Store in ``self.null_values`` the distinct values of ``a`` not in ``valid_values``.

        Parameters
        ----------
        a : array-like or pandas Series/DataFrame
            Only the first column is inspected when ``a`` is two-dimensional.
        valid_values : list
            Values considered valid.

        Returns
        -------
        self
        """
        a = pd.DataFrame(a)
        if a.shape[1] == 0:
            # Nothing to inspect: report "no invalid values" instead of failing.
            self.null_values = []
            return self
        first_column = a.iloc[:, 0]
        # Select by boolean position rather than by dropping index labels, so
        # duplicated index labels cannot remove rows that are actually invalid.
        is_valid = first_column.isin(valid_values).values
        self.null_values = list(first_column[~is_valid].unique())
        return self

    def finder(self, a, valid_values):
        """Run ``find`` and return the resulting list of invalid values."""
        self.find(a, valid_values)
        return self.null_values

    def nullify(self, a):
        """Return ``a`` as a DataFrame with every value in ``self.null_values`` set to NaN."""
        a = pd.DataFrame(a)
        if len(self.null_values) != 0:
            for null_val in self.null_values:
                a = a.replace([null_val], np.nan)
        return a

    def find_nullify(self, a, valid_values):
        """Find the invalid values of ``a`` and return ``a`` with them replaced by NaN."""
        self.find(a, valid_values)
        return self.nullify(a)

    # DataFrame nullifier
    def df_nullifier(self, df, valid_dictionary):
        """Replace invalid values with NaN, column by column.

        Parameters
        ----------
        df : pandas DataFrame
            Modified in place and also returned.
        valid_dictionary : dict
            Key = column name, value = list of valid values for that column.

        Returns
        -------
        pandas DataFrame
            ``df`` itself, with invalid values turned into NaN.
        """
        for col, valid_vals in valid_dictionary.items():
            # find_nullify returns a one-column DataFrame; assign its single column.
            df[col] = self.find_nullify(df[col], valid_vals).iloc[:, 0]
        return df

    def df_null_dictionary(self, df, valid_dictionary):
        """Return a dictionary of values considered null for each DataFrame column.

        ``valid_dictionary`` maps column name -> list of valid values; the result
        maps column name -> list of invalid values and is also stored in
        ``self.null_dictionary``.
        """
        self.null_dictionary = {}
        for col, valid_vals in valid_dictionary.items():
            self.null_dictionary[col] = self.finder(df[col], valid_vals)
        return self.null_dictionary

<span style="font-size:30px;font-weight:bold;color:#3366cc">
class&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;MCAR_Dropper</span><i><span style="font-size:23px;font-weight:bold;color:#3366cc">(null_values=None)</span></i>  
 
This class automatically drops rows with missing data for a categorical target/dependent variable (y) under the assumption that the data is Missing Completely at Random (MCAR). The `null_values` parameter accepts a list of label values that are treated as null; rows carrying any of these labels are dropped as well.

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Parameters/Attributes:  </span>

### .null_values: list
* list of values that do not belong to a y's valid values


------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Methods:  </span>

### .fit_transform(X, y)  

**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
    
**Returns:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features whose rows with missing target values have been dropped
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target whose rows with missing target values have been dropped

In [8]:
class MCAR_Dropper:
    """This transformer is not compatible with sklearn Pipeline.

    This class automatically drops rows with missing data for a categorical target/dependent variable
    under the assumption that the data is Missing Completely at Random (MCAR).

    Parameters
    ----------
    null_values : list or None
        Label values that are treated as null in addition to NaN; rows whose
        target (first column of y) equals any of them are dropped as well.
    """
    def __init__(self, null_values=None):
        self.null_values = null_values

    def fit(self, X, y):
        """Nothing is learned; present only for a fit/transform style interface."""
        return self

    def transform(self, X, y):
        """Drop from X and y every row whose target is missing.

        Parameters
        ----------
        X : array-like or pandas DataFrame, shape (n_samples, n_features)
        y : array-like, pandas Series or DataFrame with n_samples rows
            X and y must be row-aligned; rows are matched by position.

        Returns
        -------
        (X, y)
            When ``null_values`` is empty/None and y holds no NaN, the inputs
            are returned untouched. Otherwise both are returned as DataFrames
            whose index has been reset to 0..n-1.
        """
        y_frame = pd.DataFrame(y)
        # A row is missing when any column of y is NaN in it.
        drop_mask = y_frame.isnull().any(axis=1).values.astype(bool)

        has_null_labels = self.null_values is not None and len(self.null_values) != 0
        if has_null_labels and y_frame.shape[1] != 0:
            # Labels explicitly declared null are looked up in the first column of y.
            drop_mask = drop_mask | y_frame.iloc[:, 0].isin(list(self.null_values)).values

        if not has_null_labels and not drop_mask.any():
            return X, y

        # Filter by position (boolean mask) rather than by index label, so frames
        # with duplicated index labels lose only the rows that are really missing.
        keep = ~drop_mask
        X_nonull = pd.DataFrame(X).loc[keep].reset_index(drop=True)
        y_nonull = y_frame.loc[keep].reset_index(drop=True)
        return X_nonull, y_nonull

    def fit_transform(self, X, y):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X, y)``."""
        self.fit(X, y)
        return self.transform(X, y)

<span style="font-size:25px;font-weight:bold;color:#b22222"> Nullifier + MCAR_Dropper: dropping rows with missing y values  </span>

In [65]:
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [9]:
# Every label other than 0/1 found in y is handed to MCAR_Dropper as a null label;
# rows with a missing target are then removed from X and y together.
invalid_labels = Nullifier().finder(a=y, valid_values=[1, 0])
mcar_dropper = MCAR_Dropper(null_values=invalid_labels)
X_mcar, y_mcar = mcar_dropper.fit_transform(X, y)
print(f'\033[1m{Fore.RED}Checking for null target values:{Style.RESET_ALL}\033[0m')
y_mcar.isnull().sum()

[1m[31mChecking for null target values:[0m[0m


Class    0
dtype: int64

In [62]:
print(f'\033[1m{Fore.RED}X (MCAR) missing value count:{Style.RESET_ALL}\033[0m')
X_mcar.isnull().sum()

[1m[31mX (MCAR) missing value count:[0m[0m


index     57030
V1        57210
V2        57025
V3        57078
V4        57078
V5        56949
V6        57025
V7        57027
V8        56953
V9        57032
V10       56740
V11       57085
V12       57040
V13       56821
V14       57103
V15       56974
V16       57101
V17       56842
V18       56982
V19       56960
V20       56827
V21       57136
V22       57053
V23       56922
V24       56630
V25       56732
V26       57084
V27       56691
V28       56878
Amount    56858
dtype: int64

In [10]:
# Hold out 20% of the rows as the test set. The split is stratified on the target
# and uses a fixed random_state, so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_mcar,
    y_mcar,
    test_size=0.2,
    random_state=69,
    stratify=y_mcar,
)

<a id="impute"></a>

![Imgur](https://i.imgur.com/gwV4Msa.png)
<center>
<h1><span style="font-size:48px;font-family:Times New Roman,Times,serif"><tt>Imputation and Encoding</tt></span></h1>
</center>
<br>
![Imgur](https://i.imgur.com/xlClQAG.png)




<span style="font-size:30px;font-weight:bold;color:#3366cc">
class&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Imputation_Nation</span><i><span style="font-size:16px;font-weight:bold;color:#3366cc">(impute_y=False, objective="reg:linear", missing_values='NaN', num_imput_strat='mean', cat_imput_strat='most_frequent',  imputation_axis=0, Cat_mask = None, Num_cols = None, X_fillers=[], y_filler=None, classification_objectives=None)</span></i>  
 
This class handles the imputation of missing values and the encoding and creation of dummy variables for categorical variables. If the target variable is passed as y, it will also be imputed when impute_y=True; otherwise, y passes through untouched.  

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Parameters:  </span>

### impute_y: boolean, default=False
* whether or not y will also undergo imputation and encoding

### objective: str, default='reg:linear'
* refers to the xgboost objective being used
* if objective is a classification objective, and impute_y is True, y will be handled as categorical data

### missing_values: str, default='NaN'
* identifies what will be considered as missing values  

### num_imput_strat: str, default='mean'
* 'mean', 'median', or 'most_frequent'
* determines how numerical variables will be imputed using the sklearn Imputer module

### Cat_mask: boolean mask, default=None
* allows user to pass a mask over the columns of X (one entry per column) marking the features which will be considered categorical
* if left None, categorical nature of features will be determined through dtype

### Num_cols: list, default=None
* allows user to pass a list of column names which will be considered numerical
* if left None, numerical nature of features will be determined through dtype

### X_fillers: pandas DataFrame, default=[]
* allows user to pass a DataFrame of features and their respective values used to fill in NaNs
* sourced from previous use of fit_transform and most useful for imputing test or new data

### y_filler: pandas DataFrame, default=None
* allows user to pass a value used to fill in NaNs in the target y  

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Attributes:  </span>

### X_fillers: pandas DataFrame, default=[]
* DataFrame of features and their respective values used to fill in NaNs
* sourced from previous use of fit_transform and most useful for imputing test or new data

### y_filler: pandas DataFrame, default=None
* value used to fill in NaNs in the target y  
  
------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Methods:  </span>

### .fit_transform(X, y=None)  

**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
    
**Returns:**
* X: *pandas DataFrame, shape (n_samples, n_features)*  
    * imputed and encoded independent variables / features
* y: *pandas DataFrame, shape (n_samples,)*    
    * imputed or untouched dependent variable

In [11]:
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [12]:
# Imputer Class

class Imputation_Nation:
    """This class handles the imputation of missing values
    and the encoding and creation of dummy variables for categorical variables.

    If the target variable is passed as y, it will also be imputed if impute_y= True.
    Otherwise, y will pass through as untouched.

    Parameters
    ----------
    impute_y : bool
        Whether y is imputed as well.
    objective : str
        xgboost objective; y is treated as categorical when the objective is
        listed in ``classification_objectives``, as numeric otherwise.
    missing_values : str or number
        Marker for missing entries. NaN/None entries are always treated as
        missing; entries equal to this marker are treated as missing too.
    num_imput_strat : {'mean', 'median', 'most_frequent'}
        Statistic used to fill numeric columns (and a numeric y).
    cat_imput_strat : str
        Stored for interface compatibility; categorical columns are always
        filled with their most frequent value.
    imputation_axis : int
        Stored for interface compatibility; statistics are always computed
        per column.
    Cat_mask : boolean mask over the columns of X, or None
        Marks the categorical columns; object-dtype columns are used when None.
    Num_cols : list of column names, or None
        Numeric columns; every non-object column that is not categorical is
        used when None.
    X_fillers : pandas DataFrame indexed by column name with a 'Fills' column
        Fill values to reuse (typically the ``X_fillers`` attribute of an
        instance that already transformed the training data).
    y_filler : scalar, one-element list, or DataFrame with a 'Fills' column
        Fill value to reuse for y.
    classification_objectives : list of str, or None
        Objectives that make y categorical; the class-level list when None.

    Attributes
    ----------
    X_fillers : pandas DataFrame
        Fill value per column, accumulated across ``transform`` calls.
    y_filler : str, list or None
        Fill value learned for y: the label in text form for a categorical y,
        a one-element list for a numeric y.
    variables : list of str
        Names of the configuration attributes set by ``__init__``.

    Notes
    -----
    ``fit`` learns nothing. Fill values are learned inside ``transform`` for
    every column that has no stored fill value yet, and are reused afterwards.
    """

    classification_objectives = ['reg:logistic', 'binary:logistic', 'binary:logitraw',
                                 'multi:softmax', 'multi:softprob', 'rank:pairwise']

    # Class-level overrides: when one of these is set to something other than
    # None on the class, it takes precedence over the constructor argument.
    missing_values = None
    num_imput_strat = None
    cat_imput_strat = None
    imputation_axis = None
    objective = None
    impute_y = None
    X_fillers = None
    y_filler = None
    Cat_mask = None
    Num_cols = None

    # The X_fillers default list is never mutated, only replaced on the instance.
    def __init__(self, impute_y=False, objective="reg:linear", missing_values='NaN', num_imput_strat='mean',
                 cat_imput_strat='most_frequent', imputation_axis=0,
                 Cat_mask=None, Num_cols=None, X_fillers=[], y_filler=None,
                 classification_objectives=None):
        passed = {
            'classification_objectives': classification_objectives,
            'missing_values': missing_values,
            'num_imput_strat': num_imput_strat,
            'cat_imput_strat': cat_imput_strat,
            'imputation_axis': imputation_axis,
            'objective': objective,
            'impute_y': impute_y,
            'X_fillers': X_fillers,
            'y_filler': y_filler,
            'Cat_mask': Cat_mask,
            'Num_cols': Num_cols,
        }
        self.variables = list(passed)
        for name, value in passed.items():
            class_level = getattr(type(self), name, None)
            if name == 'classification_objectives':
                # An explicitly passed list wins; otherwise copy the class default
                # so that instances never share (and cannot corrupt) the same list.
                chosen = list(class_level if value is None else value)
            elif class_level is not None:
                chosen = class_level
            else:
                chosen = value
            setattr(self, name, chosen)

    def fit(self, X, y=None):
        """No-op kept for the fit/transform interface; see the class notes."""
        return self

    def _missing_mask(self, series):
        """Boolean array marking the entries of ``series`` that count as missing."""
        mask = series.isnull().values
        marker = self.missing_values
        if marker is None or (isinstance(marker, float) and np.isnan(marker)):
            return mask
        if isinstance(marker, str) and pd.api.types.is_numeric_dtype(series):
            # A text marker such as 'NaN' cannot occur inside a numeric column.
            return mask
        return mask | (series == marker).values

    def _has_X_filler(self, col):
        """True when a stored fill value exists for column ``col``."""
        fillers = self.X_fillers
        return isinstance(fillers, pd.DataFrame) and len(fillers) != 0 and col in fillers.index

    def _stored_y_filler(self, col):
        """Return the stored fill value for y, or None when there is none.

        Accepts every form y_filler can take: a scalar, a one-element
        list/array, or a DataFrame with a 'Fills' column indexed by column name.
        """
        filler = self.y_filler
        if filler is None:
            return None
        if isinstance(filler, pd.DataFrame):
            return filler.loc[col, 'Fills'] if len(filler) != 0 else None
        if np.ndim(filler) > 0:
            values = np.asarray(filler, dtype=object).ravel()
            return values[0] if values.size else None
        return filler

    def _numeric_fill_value(self, observed):
        """Statistic of the non-missing values according to ``num_imput_strat``."""
        if len(observed) == 0:
            return np.nan
        strategy = self.num_imput_strat
        if strategy == 'mean':
            return observed.mean()
        if strategy == 'median':
            return observed.median()
        if strategy == 'most_frequent':
            # mode() is sorted, so ties resolve to the smallest value.
            return observed.mode().iloc[0]
        raise ValueError("num_imput_strat must be 'mean', 'median' or 'most_frequent', got %r"
                         % (strategy,))

    def _impute_categorical_target(self, series, col):
        """Fill a categorical y with its most frequent label (or the stored filler).

        Numeric labels are returned as ints, any other labels as strings.
        """
        missing = self._missing_mask(series)
        observed = series[~missing]
        y_is_numeric = pd.api.types.is_numeric_dtype(series) or any(
            isinstance(value, numbers.Number) for value in observed.unique())

        # Labels are handled in text form, which is also how the filler is stored.
        as_text = series.apply(str)
        if missing.any():
            filler = self._stored_y_filler(col)
            if filler is None:
                if len(observed) == 0:
                    raise ValueError("cannot impute y: every target value is missing "
                                     "and no y_filler was supplied")
                filler = as_text[~missing].mode().iloc[0]
                self.y_filler = filler
            as_text = as_text.where(~missing, str(filler))

        if y_is_numeric:
            return [int(float(label)) for label in as_text]
        return as_text

    def _impute_numeric_target(self, series, col):
        """Fill a numeric y with the stored filler or the ``num_imput_strat`` statistic."""
        missing = self._missing_mask(series)
        filler = self._stored_y_filler(col)
        if filler is None:
            filler = self._numeric_fill_value(series[~missing])
            if pd.isnull(filler):
                # No observed value to learn from: leave y as it is.
                return series
            self.y_filler = [filler]
        if missing.any():
            return series.where(~missing, filler)
        return series

    def transform(self, X, y=None):
        """Impute and encode X (and y when ``impute_y`` is True).

        Parameters
        ----------
        X : array-like or pandas DataFrame, shape (n_samples, n_features)
        y : array-like or pandas DataFrame, shape (n_samples,), optional

        Returns
        -------
        X_full : pandas DataFrame
            Imputed features with categorical columns one-hot encoded
            (first level dropped).
        y_full : pandas DataFrame
            Only returned when y was given; imputed when ``impute_y`` is True.

        The inputs are copied first, so the caller's X and y are left unchanged.
        """
        X_full = pd.DataFrame(X).copy()
        y_full = pd.DataFrame(y).copy() if y is not None else None

        # identify categorical columns
        if self.Cat_mask is None:
            Cat_mask = (X_full.dtypes == object).values
        else:
            Cat_mask = self.Cat_mask
        Cat_cols = X_full.columns[Cat_mask].tolist()

        # identify numeric columns
        if self.Num_cols is None:
            # Categorical columns are excluded: they no longer exist under their
            # own name once they have been one-hot encoded.
            Num_cols = [col for col in X_full.select_dtypes(exclude=['object']).columns.tolist()
                        if col not in Cat_cols]
        else:
            Num_cols = self.Num_cols

        fill_cols = []
        fill_values = []

        # Encoding and imputing categorical variables
        for col in Cat_cols:
            column = X_full[col]
            missing = self._missing_mask(column)
            if self._has_X_filler(col):
                filler = self.X_fillers.loc[col, 'Fills']
            else:
                modes = column[~missing].mode()
                if len(modes) == 0:
                    # Column without a single observed value: nothing to learn.
                    continue
                filler = modes.iloc[0]
                fill_cols.append(col)
                fill_values.append(filler)
            if missing.any():
                X_full[col] = column.where(~missing, filler)
        if len(Cat_cols) != 0:
            X_full = pd.get_dummies(X_full, drop_first=True, columns=Cat_cols)

        # Imputing numeric columns
        for col in Num_cols:
            column = X_full[col]
            missing = self._missing_mask(column)
            if self._has_X_filler(col):
                filler = self.X_fillers.loc[col, 'Fills']
            else:
                filler = self._numeric_fill_value(column[~missing])
                if pd.isnull(filler):
                    continue
                fill_cols.append(col)
                fill_values.append(filler)
            if missing.any():
                X_full[col] = column.where(~missing, filler)

        if y_full is not None and self.impute_y == True:
            target_col = y_full.columns[0]
            if self.objective in self.classification_objectives:
                y_full[target_col] = self._impute_categorical_target(y_full[target_col], target_col)
            else:
                y_full[target_col] = self._impute_numeric_target(y_full[target_col], target_col)

        # Add the newly learned fill values to the stored ones. Fillers that were
        # only reused in this call are kept, so transforming new data does not
        # discard what was learned from the training data.
        learned = pd.DataFrame({'Fills': fill_values}, index=pd.Index(fill_cols, name='Columns'))
        previous = self.X_fillers
        if isinstance(previous, pd.DataFrame) and len(previous) != 0:
            if len(learned) != 0:
                self.X_fillers = pd.concat([previous, learned])
        else:
            self.X_fillers = learned

        if y_full is not None:
            return X_full, y_full
        return X_full

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X, y)``."""
        self.fit(X, y)
        return self.transform(X, y)


<span style="font-size:30px;font-weight:bold;color:#b22222"> Imputation_Nation in action  </span>

In [13]:
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [14]:
# Impute and encode the training split; y goes through the classification path
# because 'binary:logistic' is one of the classification objectives.
I_N = Imputation_Nation(impute_y=True, objective="binary:logistic")
X_imputed, y_imputed = I_N.fit_transform(X=X_train, y=y_train)
print(f'\033[1m{Fore.RED}Imputed and Encoded X_train:{Style.RESET_ALL}\033[0m')
X_imputed.head(10)

[1m[31mImputed and Encoded X_train:[0m[0m


Unnamed: 0,index,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
45924,71071.0,1.112634,0.008148,0.000802,2.514559,-0.313505,-0.117546,0.005794,0.056953,0.002071,...,-0.175697,-0.04588,-0.127814,0.142695,0.0008,0.000854,3.6e-05,0.023632,0.030464,0.77
170525,263822.0,-0.741988,0.724629,1.105168,-0.015564,0.999639,-0.002993,0.874461,-0.270365,-0.782157,...,-0.00148,0.002016,0.895954,-0.554227,0.067597,0.000854,3.6e-05,0.034668,0.063369,87.959687
56081,142772.993236,0.002235,0.008148,0.000802,0.421228,-0.005245,-0.002993,-0.397842,0.002292,0.002071,...,-0.00148,-0.015377,0.000876,-0.000661,-0.412952,0.000854,0.435571,-0.020457,-0.000223,76.46
120027,142772.993236,2.282386,0.008148,-0.984139,-1.812335,-1.029801,-0.002993,-1.443545,0.089758,0.002071,...,-0.401575,-0.143415,0.000876,0.207734,0.0008,-0.228512,-0.189058,0.013076,-0.054427,19.2
117252,181424.0,1.602153,-2.090925,-0.308115,-0.735557,-0.005245,-0.002993,-1.280689,0.204274,0.002071,...,0.425196,0.435741,0.646779,-0.000661,0.0008,-0.593891,-0.248057,-0.018549,-0.000223,268.0
73480,113572.0,-2.909004,1.469046,-0.919853,1.543391,-1.633809,2.489002,0.644544,0.002292,0.786929,...,-0.651084,0.047422,1.250971,0.007136,-0.868204,-1.286972,-0.256837,-0.614046,-0.25863,292.17
173065,267768.0,0.114134,0.945923,-0.515894,0.001551,1.060809,-0.345297,0.876227,0.065495,-0.253679,...,0.037631,-0.307573,-0.794852,0.066814,0.239695,-0.419018,0.112853,0.214251,0.071237,10.11
51174,142772.993236,1.42092,0.008148,-0.265795,0.001551,-0.490289,-0.554512,0.005794,-0.031047,-1.000422,...,0.007863,-0.22064,-0.922325,0.025935,-0.578219,0.360725,-0.490796,-0.001058,-0.006215,24.0
182463,142772.993236,0.002235,0.008148,0.000802,0.001551,-0.005245,-1.1679,0.005794,0.002292,0.262477,...,-0.172705,0.002016,0.000876,-0.000661,0.0008,-0.532645,3.6e-05,-0.001058,-0.000223,4.49
33249,51575.0,1.056573,0.405634,0.562085,2.55057,-0.005245,-0.512296,0.276847,-0.185107,0.002071,...,0.031035,0.102413,0.145298,-0.13919,0.414416,0.553505,0.058215,-0.021225,0.034642,75.31


In [48]:
print(f'\033[1m{Fore.RED}Imputed X_train missing value count:{Style.RESET_ALL}\033[0m')
X_imputed.isnull().sum()

[1m[31mImputed X_train missing value count:[0m[0m


index     0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64

<span style="font-size:16px;font-weight:bold;color:#b22222"> Filler values collected from X_train:  </span>

In [15]:
I_N.X_fillers

Unnamed: 0_level_0,Fills
Columns,Unnamed: 1_level_1
index,142772.993236
V1,0.002235
V2,0.008148
V3,0.000802
V4,0.001551
V5,-0.005245
V6,-0.002993
V7,0.005794
V8,0.002292
V9,0.002071


<span style="font-size:16px;font-weight:bold;color:#b22222"> Collected filler for missing y values:  </span>

In [16]:
I_N.y_filler

<a id="sample"></a>

![Imgur](https://i.imgur.com/OwNPE6P.png)
<center>
<h1><span style="font-size:48px;font-family:Times New Roman,Times,serif"><tt>Dealing with Imbalance: Resampling</tt></span></h1>
</center>
<br>
![Imgur](https://i.imgur.com/YOQuMbt.png)


<span style="font-size:30px;font-weight:bold;color:#3366cc">
class&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;HyperSampler</span><i><span style="font-size:16px;font-weight:bold;color:#3366cc">(sampler="HyperSampler", best_params={}, booster='gbtree', objective='reg:logistic', scorer='Auto', test_samplers=[], excluded_samplers=[], additional_samplers=[], set_sampler_params={})</span></i>  
 
"""This transformer is not compatible with the current sklearn Pipeline.
    
        Options for resampling methods (parameter sampler='') are the following:
            Binary_Undersampler, ClusterCentroids, CondensedNearestNeighbour, EditedNearestNeighbours,
            RepeatedEditedNearestNeighbours, AllKNN, InstanceHardnessThreshold, NearMiss, NeighbourhoodCleaningRule, 
            OneSidedSelection, RandomUnderSampler, TomekLinks, ADASYN, RandomOverSampler, SMOTE, SMOTEENN, 
            SMOTETomek, BalanceCascade, BalancedBaggingClassifier, EasyEnsemble
.
        
        If sampler = 'HyperSampler',
            every resampling method will be applied and tested
            to determine which method best contributes to model performance.
            
            best_params = {dictionary of best parameters for model tuning continuity}
            
            scorer = pass desired Sklearn.metrics scorer as a string
                passing scorer='Auto' entails the use of built-in XGBoost evaluation metrics
            
            The resampling method that contributes to the best score will be applied to the data to be returned.
            
            HyperSampler.resampler = chosen resampling class assigned after instantiation 
                                                                        or after fit() if sampler = 'HyperSampler'
        
        This resampling transformer works only with any of the three boosters as base learners:
        -gbtree
        -gblinear
        -dart
        
        This class requires that X (features or independent variables) has already been encoded and Imputed. 
        
        This class also requires the target (y) to also be passed.
        
        If target = 'insert target name' is passed, X = dataframe disregarding position of target column.
        If the fit, sample, and fit_sample methods are given one positional argument (X) and target=None, 
            the class assumes that it is a DataFrame with the last
            column as the target variable and the rest as the features."""

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Parameters:  </span>

### sampler: str, default='HyperSampler'
* If 'HyperSampler', .fit() will perform tests to determine which imblearn resampling method returns the best performance.  
* also used to specify a single imblearn resampling method

### best_params: dict, default={}
* dictionary of parameters to be used as kwargs for XGBoost models to be used for evaluating performance

### booster: str, default='gbtree'
* specifies the xgboost booster to be used for the evaluation models ('gbtree','gblinear', or 'dart')  

### objective: str, default='reg:logistic'
* specifies the xgboost objective to be used for the evaluation models
* see XGBoost documentation for more details

### scorer: str, default='Auto'
* determines the model evaluation metric used
* if 'Auto', built-in XGBoost metrics will be used
* otherwise, existing sklearn metrics can be passed as strings

### test_samplers: list, default=[]
* allows user to pass a list of resampling methods that will be tested
* if left empty, a default list will be used

### excluded_samplers: list, default=[]
* allows user to pass a list of resampling methods that will be excluded from testing

### set_sampler_params: dictionary, default={}
* allows user to pass a dictionary for setting parameters of resampling methods
* if left empty, a default dictionary will be used
* resampling methods not built into the class can be added this way

### additional_samplers: list, default=[]
* new resampling methods introduced through set_sampler_params can be added to the list of methods to be tested


------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Attributes:  </span>

### scoring_functions: dictionary
* dictionary of metrics available for use

### classification_objectives: list
* list of XGBoost objectives treated as classification objectives

### regression_objectives: list
* list of XGBoost objectives treated as regression objectives

### the_higher_the_better: list
* list of metrics of which an increase means improvement in performance

### the_lower_the_better: list
* list of metrics of which a decrease means improvement in performance

### need_encoded_y: list
* list of metrics that require an encoded y
* automatically dealt with by the class

### the_lower_the_better: list
* list of metrics of which a decrease means improvement in performance

### sampler_params: dictionary
* default dictionary of parameters used for resampling methods

### sampler: str
* name of the specified or determined resampling method

### resampler: obj
* resampler object
* fitted to X (or also y) after .fit()
* should NOT be used to transform test set or new data

### performance_scores: pandas DataFrame
* results of testing

### best_perf: pandas DataFrame
* highest performer/s

### best_sampler: str
* name of resampling method deemed best


------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Methods:  </span>

### .fit(X, y=None, target=None)  
Either fits the pre-determined resampling method to the data or performs tests first before fitting the best method to the data.  
**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
* target:*string* (optional)  
    * name of target column  

### .sample(X, y=None, target=None)  
Using statistics learned from fitted data, returns resampled data.
**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
* target:*string* (optional)  
    * name of target column  

**Returns:**
* X_resampled: *pandas DataFrame, shape (n_samples, n_features)*  
    * resampled or untouched independent variables / features
* y_resampled: *pandas DataFrame, shape (n_samples,)*    
    * resampled or untouched dependent variable

### .fit_sample(X, y=None, target=None)  
Streamlines fit() and sample() methods.

In [17]:
# Render a button that shows/hides the raw code cells by calling the code_toggle() JavaScript
# function injected by the first cell of the notebook.
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [18]:
class Binary_Undersampler:
    """Randomly undersample the larger class of a binary target down to a 1:1 ratio.

    This transformer is not compatible with the current sklearn Pipeline.

    This class is applicable only on data for binary classification. The target
    (first column of ``y``) must either be encoded as 0/1 or be given as strings
    (object dtype); string labels are mapped to 0 and 1 in sorted order for the
    purpose of counting the classes. Whichever class has fewer rows is kept in
    full, and an equally sized random subset of the other class is drawn without
    replacement. Rows whose target is neither of the two classes are dropped.

    Parameters
    ----------
    random_state : int or None, default=None
        Seed for the random draw. With ``None`` the global ``np.random`` state is
        used, so a preceding ``np.random.seed(...)`` is still honoured.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state

    def fit(self, X, y):
        """Do nothing and return ``self``; present for API symmetry with other samplers."""
        return self

    def sample(self, X, y):
        """Return the undersampled ``(X, y)`` as a pair of numpy arrays.

        Parameters
        ----------
        X : array-like or DataFrame, shape (n_samples, n_features)
        y : array-like, Series or DataFrame, shape (n_samples,) or (n_samples, 1)

        Returns
        -------
        X_resampled : ndarray, shape (n_kept, n_features)
        y_resampled : ndarray, shape (n_kept, 1)
            Minority rows first, followed by the randomly drawn majority rows.
            Original labels are returned untouched.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        X = pd.DataFrame(X)
        y = pd.DataFrame(y)
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of rows "
                             "(got {} and {}).".format(len(X), len(y)))

        target = y.iloc[:, 0]
        if target.dtype == object:
            # Encode string labels as integer codes in sorted order (what LabelEncoder
            # produces) on a separate array, so the caller's y is never modified.
            _, codes = np.unique(target.values.astype(str), return_inverse=True)
            codes = np.asarray(codes).ravel()
        else:
            codes = target.values

        # Work with row POSITIONS rather than index labels: X and y frequently carry
        # different indexes (e.g. a shuffled DataFrame paired with a plain array).
        positive_positions = np.flatnonzero(codes == 1)
        negative_positions = np.flatnonzero(codes == 0)

        # "<=" keeps already balanced data working instead of leaving the names unbound.
        if len(positive_positions) <= len(negative_positions):
            minority_positions, majority_positions = positive_positions, negative_positions
        else:
            minority_positions, majority_positions = negative_positions, positive_positions

        if self.random_state is None:
            chooser = np.random
        else:
            chooser = np.random.RandomState(self.random_state)
        drawn_majority = chooser.choice(majority_positions, len(minority_positions), replace=False)
        keep = np.concatenate([minority_positions, drawn_majority]).astype(np.intp)

        return np.array(X.iloc[keep, :]), np.array(y.iloc[keep, :])

    def fit_sample(self, X, y):
        """Call ``fit`` then ``sample`` on the same data and return the resampled pair."""
        self.fit(X, y)
        return self.sample(X, y)

# Checking for class imbalance

In [19]:
# Row counts of features and target, followed by the per-class tally of the target.
print(f"Length of X: {len(X_imputed)}")
print(f"Length of y: {len(y_imputed)}")
imbalance_header = f'\033[1m{Fore.RED}Class Imbalance:{Style.RESET_ALL}\033[0m'
print(imbalance_header)
print(Counter(y_imputed.iloc[:,0]))

Length of X: 147296
Length of y: 147296
[1m[31mClass Imbalance:[0m[0m
Counter({0: 147046, 1: 250})


## Binary_Undersampler in action

In [20]:
# Undersample the imputed training data and confirm that both classes now have equal counts.
X_undersampled, y_undersampled = Binary_Undersampler().fit_sample(X_imputed, y_imputed)
print(f"Length of undersampled X_train: {len(X_undersampled)}")
print(f"Length of undersampled y_train: {len(y_undersampled)}")
balance_header = f'\033[1m{Fore.RED}Class Balance:{Style.RESET_ALL}\033[0m'
print(balance_header)
print(Counter(pd.DataFrame(y_undersampled).iloc[:,0]))

Length of undersampled X_train: 500
Length of undersampled y_train: 500
[1m[31mClass Balance:[0m[0m
Counter({1: 250, 0: 250})


In [21]:
# Resampling Class

class HyperSampler:
    """This transformer is not compatible with the current sklearn Pipeline.
    
        Options for resampling methods methods (parameter sampler='') are the following:
            Binary_Undersampler, ClusterCentroids, CondensedNearestNeighbour, EditedNearestNeighbours,
            RepeatedEditedNearestNeighbours, AllKNN, InstanceHardnessThreshold, NearMiss, NeighbourhoodCleaningRule, 
            OneSidedSelection, RandomUnderSampler, TomekLinks, ADASYN, RandomOverSampler, SMOTE, SMOTEENN, 
            SMOTETomek, BalanceCascade, BalancedBaggingClassifier, EasyEnsemble
.
        
        If sampler = 'HyperSampler',
            every resampling method will be applied and tested
            to determine which method best contributes to model performance.
            
            best_params = {dictionary of best parameters for model tuning continuity}
            
            scorer = pass desired Sklearn.metrics scorer as a string
                passing scorer='Auto' entails the use of built-in XGBoost evaluation metrics
            
            The resampling method that contributes to the best score will be applied to the data to be returned.
            
            HyperSampler.resampler = chosen resampling class assigned after instantiation 
                                                                        or after fit() if sampler = 'HyperSampler'
        
        This feature selection transformer works only with any of the three boosters as base learners:
        -gbtree
        -gblinear
        -dart
        
        This class requires that X (features or independent variables) has already been encoded and Imputed. 
        
        This class also requires the target (y) to also be passed.
        
        If target = 'insert target name' is passed, X = dataframe disregarding position of target column.
        If the fit, sample, and fit_sample methods are given one positional argument (X) and target=None, 
            the class assumes that it is a DataFrame with the last
            column as the target variable and the rest as the features."""
    
    # Compatible model evaluation metrics: Sklearn metrics
    scoring_functions = {'accuracy_score':accuracy_score,'f1_score':f1_score, 'hamming_loss':hamming_loss,
        'jaccard_similarity_score':jaccard_similarity_score, 'log_loss':log_loss, 'matthews_corrcoef':matthews_corrcoef,
                 'precision_score':precision_score, 'recall_score':recall_score, 'zero_one_loss':zero_one_loss,
                'explained_variance_score':explained_variance_score, 'mean_absolute_error':mean_absolute_error,
                 'mean_squared_error':mean_squared_error, 'mean_squared_log_error':mean_squared_log_error,
                 'median_absolute_error':median_absolute_error, 'r2_score':r2_score}

    classification_objectives = ['reg:logistic','binary:logistic', 'binary:logitraw',
                                'multi:softmax', 'multi:softprob', 'rank:pairwise' ]
    regression_objectives = ['reg:linear','count:poisson','reg:gamma', 'reg:tweedie']
    
    # metrics with which higher value = higher model performance
    the_higher_the_better = ['accuracy_score','f1_score','jaccard_similarity_score',
                                  'precision_score','recall_score',
                                'explained_variance_score','r2_score','Best test auc',
                                'Best test ndcg','Best test map']
    # metrics with which lower value = higher model performance
    the_lower_the_better = ['hamming_loss', 'log_loss','matthews_corrcoef','zero_one_loss','mean_absolute_error',
                           'mean_squared_error','mean_squared_log_error','median_absolute_error',
                           'Best test error','Best test rmse','Best test mae','Best test log loss',
                           'Best test merror','Best test mlogloss']
    
    # metrics that require encoded target variable
    need_encoded_y = ['recall_score','precision_score','f1_score']

    # parameters to inherited from mother class
    sampler='N/A'
    best_params='N/A'
    booster='N/A'
    objective='N/A'
    scorer='N/A'
    test_samplers='N/A'
    excluded_samplers='N/A'
    additional_samplers='N/A'
    set_sampler_params='N/A'
    
    # arguments for classes
    estimator='N/A'
    voting='N/A'
    n_jobs='N/A'
    return_indices='N/A'
    n_neighbors='N/A'
    n_seeds_S='N/A'
    kind_sel='N/A'
    max_iter='N/A'
    allow_minority='N/A'
    ratio='N/A'
    cv='N/A'
    version='N/A'
    ver3_samp_ngh='N/A'
    n_neighbors_ver3='N/A'
    threshold_cleaning='N/A'
    replacement='N/A'
    k_neighbors='N/A'
    m_neighbors='N/A'
    out_step='N/A'
    kind='N/A'
    svm_estimator='N/A'
    smote='N/A'
    enn='N/A'
    kind_smote='N/A'
    kind_enn='N/A'
    tomek='N/A'
    n_max_subset='N/A'
    classifier='N/A'
    n_estimators='N/A'
    max_samples='N/A'
    max_features='N/A'
    bootstrap='N/A'
    bootstrap_features='N/A'
    oob_score='N/A'
    warm_start='N/A'
    verbose='N/A'
    n_subsets='N/A'

#     **kwargs
    
    random_state='N/A'
    test_size='N/A'
    
    def __init__(self, sampler="HyperSampler", best_params={},
                 booster='gbtree', objective='reg:logistic', scorer='Auto',
                 test_samplers=[],excluded_samplers=[],additional_samplers=[],set_sampler_params={},
                 # parameters for samplers
                 estimator=None,voting='auto',n_jobs=1, return_indices=False,n_seeds_S=1,
                 kind_sel='all',max_iter=100,allow_minority=False,ratio='auto',cv=5,version=1,
                 ver3_samp_ngh=None,n_neighbors_ver3=3,threshold_cleaning=0.5,replacement=False,k=None,n_neighbors=5,
                 k_neighbors=5,m=None,m_neighbors=10,out_step=0.5,kind='regular',svm_estimator=None,smote=None,
                 enn=None,kind_smote=None,kind_enn=None, tomek=None,n_max_subset=None,
                 classifier=None,n_estimators=10,max_samples=1.0,max_features=1.0,bootstrap=True,bootstrap_features=False,
                 oob_score=False,warm_start=False,verbose=0,n_subsets=10,
                # parameters for train/test split
                 random_state=69, test_size=0.2,
                # values passed for these parameters will never be used unless: class.parameter is made None
                scoring_functions=None, classification_objectives=None, need_encoded_y=None,
                 the_higher_the_better=None, the_lower_the_better=None):
        
        # assigning parameters as instance variables
        varses = list(vars(HyperSampler).keys())
        self.variables = varses[len(varses) - varses[::-1].index('__doc__') : varses.index('__init__')]
        class_name = "HyperSampler"+"."
        for v in self.variables:
            # if the class variable for the argument is not empty, assign its value as the instance variable
            if eval("%s != 'N/A'" % (class_name+v)) is True:
                exec("self.%s = %s" % (v,class_name+v))
            # if the class variable is empty, assign to instance the value passed as argument during instantiation
            else:
                exec("self.%s = %s" % (v, v))
        
        # determining the model build used for feature selection
        if self.booster == 'gbtree':
            if self.objective in HyperSampler.classification_objectives:
                self.best_params['objective'] = self.objective
                if 'booster' in list(self.best_params.keys()):
                    del self.best_params['booster']
                self.model = xgb.XGBClassifier(**self.best_params)

            elif self.objective in HyperSampler.regression_objectives:
                self.best_params['objective'] = self.objective
                if 'booster' in list(self.best_params.keys()):
                    del self.best_params['booster']
                self.model = xgb.XGBRegressor(**self.best_params)
        elif self.booster == 'dart' or self.booster == 'gblinear':
            self.best_params['booster'] = self.booster
            self.best_params['objective'] = self.objective
            
        # ASSIGNING SELF.BASE_ESTIMATOR
        self.estimator=None
        self.base_estimator=None
        self.svm_estimator=None
        
        self.sampler_params={'Binary_Undersampler':[Binary_Undersampler,{}],
                'ClusterCentroids':[ClusterCentroids,{'ratio':self.ratio,'random_state':self.random_state,
                                            'estimator':self.estimator,'voting':self.voting,'n_jobs':self.n_jobs}],
                  'CondensedNearestNeighbour':[CondensedNearestNeighbour,{'ratio':self.ratio,
                                            'return_indices':self.return_indices,'random_state':self.random_state,
                                    'n_neighbors':self.n_neighbors,
                                    'n_seeds_S':self.n_seeds_S,'n_jobs':self.n_jobs}],
            'EditedNearestNeighbours':[EditedNearestNeighbours,{'ratio':self.ratio,'return_indices':self.return_indices,
                                    'random_state':self.random_state,
                                    'n_neighbors':self.n_neighbors,'kind_sel':self.kind_sel,'n_jobs':self.n_jobs}],
            'RepeatedEditedNearestNeighbours':[RepeatedEditedNearestNeighbours,{'ratio':self.ratio,
                    'return_indices':self.return_indices,'random_state':self.random_state,
                                            'n_neighbors':self.n_neighbors,'max_iter':self.max_iter,
                                                'kind_sel':self.kind_sel,'n_jobs':self.n_jobs}],
             'AllKNN':[AllKNN,{'ratio':self.ratio,'return_indices':self.return_indices,'random_state':self.random_state,
                               'n_neighbors':self.n_neighbors,'kind_sel':self.kind_sel,
                               'allow_minority':self.allow_minority,'n_jobs':self.n_jobs}],
            'InstanceHardnessThreshold':[InstanceHardnessThreshold,{'estimator':self.estimator,'ratio':self.ratio,
                                    'return_indices':self.return_indices,'random_state':self.random_state,
                                    'cv':self.cv,'n_jobs':self.n_jobs}],
             'NearMiss':[NearMiss,{'ratio':self.ratio,'return_indices':self.return_indices,'random_state':self.random_state,
                                   'version':self.version,'n_neighbors':self.n_neighbors,
                                    'ver3_samp_ngh':self.ver3_samp_ngh,'n_neighbors_ver3':self.n_neighbors_ver3,
                                   'n_jobs':self.n_jobs}],
             'NeighbourhoodCleaningRule':[NeighbourhoodCleaningRule,{'ratio':self.ratio,
                                    'return_indices':self.return_indices,
                                    'random_state':self.random_state,
                                    'n_neighbors':self.n_neighbors,'kind_sel':self.kind_sel,
                                    'threshold_cleaning':self.threshold_cleaning,'n_jobs':self.n_jobs}],
             'OneSidedSelection':[OneSidedSelection,{'ratio':self.ratio,'return_indices':self.return_indices,
                                    'random_state':self.random_state,
                                    'n_neighbors':self.n_neighbors,'n_seeds_S':self.n_seeds_S,'n_jobs':self.n_jobs}],
            'RandomUnderSampler':[RandomUnderSampler,{'ratio':self.ratio,'return_indices':self.return_indices,
                                    'random_state':self.random_state,'replacement':self.replacement}],
             'TomekLinks':[TomekLinks,{'ratio':self.ratio,'return_indices':self.return_indices,
                                    'random_state':self.random_state,'n_jobs':self.n_jobs}],
             'ADASYN':[ADASYN,{'ratio':self.ratio,'random_state':self.random_state,
                                   'n_neighbors':self.n_neighbors,'n_jobs':self.n_jobs}],
            'RandomOverSampler':[RandomOverSampler,{'ratio':self.ratio,'random_state':self.random_state}],
             'SMOTE':[SMOTE,{'ratio':self.ratio,'random_state':self.random_state,
                                 'k_neighbors':self.k_neighbors,'m_neighbors':self.m_neighbors,
                                 'out_step':self.out_step,'kind':self.kind,'svm_estimator':self.svm_estimator,
                                 'n_jobs':self.n_jobs}],
             'SMOTEENN':[SMOTEENN,{'ratio':self.ratio,'random_state':self.random_state,'smote':self.smote,
                                   'enn':self.enn,'out_step':self.out_step,
                                   'kind_smote':self.kind_smote,
                                   'n_neighbors':self.n_neighbors,'kind_enn':self.kind_enn,'n_jobs':self.n_jobs}],
             'SMOTETomek':[SMOTETomek,{'ratio':self.ratio,'random_state':self.random_state,'smote':self.smote,
                                    'tomek':self.tomek,'out_step':self.out_step,
                                    'kind_smote':self.kind_smote,'n_jobs':self.n_jobs}],
            'BalanceCascade':[BalanceCascade,{'ratio':self.ratio,'return_indices':self.return_indices,
                                    'random_state':self.random_state,'n_max_subset':self.n_max_subset,
                                    'classifier':self.classifier,'estimator':self.estimator}],
             'EasyEnsemble':[EasyEnsemble,{'ratio':self.ratio,'return_indices':self.return_indices,
                                    'random_state':self.random_state,'replacement':self.replacement,
                                    'n_subsets':self.n_subsets}]}
        if len(self.set_sampler_params) !=0:
            for key,value in self.set_sampler_params.items():
                self.sampler_params[key] = [value[0],value[1]]
        
        # Assigning the resampler if sampler (sampling method) is not "Hyperscale"
        if self.sampler is not 'HyperSampler':
            self.resampler = self.sampler_params[self.sampler][0](**self.sampler_params[self.sampler][1])
            self.performance_scores = None
            self.best_perf = None
            self.best_sampler = None
    
    def fit(self, X, y=None, target=None):
        if y is None:
            if self.target is None:
                y = pd.DataFrame(X.iloc[:,-1])
                X = pd.DataFrame(X.drop(X.columns[[-1,]], axis=1))
            else:
                y = pd.DataFrame(X[self.target])
                X = X.drop(target, axis =1)
        else:
            X = pd.DataFrame(X)
            y = pd.DataFrame(y)
        
        if self.sampler == 'HyperSampler':
            # Encode labels if string when needed
            if ((self.booster == 'dart' or self.booster == 'gblinear')\
                            and self.objective in HyperSampler.classification_objectives)\
                                                                    or self.scorer in self.need_encoded_y:
                if y.iloc[:,0].dtype == object:
                    Target_is_string = True
                else:
                    Target_is_string = False

                if Target_is_string == True:
                    L_E = LabelEncoder()
                    L_E = L_E.fit(y.iloc[:,0].astype(str))
                    y.iloc[:,0] = L_E.transform(y.iloc[:,0].astype(str))

            # fit model on all training data            
            X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                test_size=self.test_size, random_state=self.random_state, stratify=y)

            if len(self.test_samplers) != 0:
                samplers = self.test_samplers
            else:
                samplers = ['None','Binary_Undersampler','ClusterCentroids', 'CondensedNearestNeighbour',
                        'EditedNearestNeighbours','RepeatedEditedNearestNeighbours', 'AllKNN',
                        'InstanceHardnessThreshold', 'NearMiss', 'NeighbourhoodCleaningRule', 'OneSidedSelection',
                        'RandomUnderSampler', 'TomekLinks', 'ADASYN', 'RandomOverSampler', 'SMOTE', 'SMOTEENN', 
                        'SMOTETomek']#, 'BalanceCascade', 'EasyEnsemble']
            
            if len(self.additional_samplers) != 0:
                samplers = samplers + self.additional_samplers
            
            if len(self.excluded_samplers) !=0:
                for excluded in self.excluded_samplers:
                    if excluded in samplers:
                        samplers.remove(excluded)
            
            if self.objective not in ['binary:logistic','binary:logitraw']\
                                and 'Binary_Undersampler' in samplers:
                samplers.remove('Binary_Undersampler')
            
            Scores = []
            for sampler in samplers:
                if sampler is 'None':
                    X_train_resampled, y_train_resampled = X_train, y_train.values.ravel()

                else:
                    sampler = self.sampler_params[sampler][0](**self.sampler_params[sampler][1])
                    X_train_resampled, y_train_resampled = sampler.fit_sample(X_train, y_train.values.ravel())
                    X_train_resampled = pd.DataFrame(X_train_resampled, columns=X.columns)
                
                if self.booster is 'gbtree':
                    if self.objective in HyperSampler.classification_objectives:
                        estimator = xgb.XGBClassifier
                    if self.objective in HyperSampler.regression_objectives:
                        estimator = xgb.XGBRegressor
                    selection_model = estimator(**self.best_params)
                    selection_model.fit(X_train_resampled, y_train_resampled,
                                       eval_set=[(X_test,y_test.values.ravel())], verbose=False)
                    # eval model
                    # using built-in evaluation metrics automatically matched with objective
                    if self.scorer == 'Auto':
                        result = selection_model.evals_result()
                        scorer_used = list(result['validation_0'].keys())[0]
                        #score = np.mean(result['validation_1'][scorer_used])
                        score = result['validation_0'][scorer_used][-1]
                        Scores.append(score)

                    # using Sklearn metrics
                    else:
                        scoring_function = HyperSampler.scoring_functions[self.scorer]
                        y_pred = selection_model.predict(X_test)
                        if y_pred.dtype != object:
                            predictions = [round(value) for value in y_pred]
                            y_test_used = [round(value) for value in y_test.values.ravel()]
                        else:
                            predictions = y_pred
                            y_test_used = y_test.values.ravel()
                        score = scoring_function(y_test_used, predictions)
                        if self.scorer == 'accuracy_score':
                            score = score*100.00
                        Scores.append(score)
                        
                elif self.booster == 'dart' or self.booster == 'gblinear':
                    dtrain_resampled = xgb.DMatrix(data=X_train_resampled, label=y_train_resampled)
                    dtest = xgb.DMatrix(data=X_test, label=y_test.values.ravel())

                    # eval model
                    # using built-in evaluation metrics automatically matched with objective
                    if self.scorer == 'Auto':
                        result = {}
                        selection_model = xgb.train(dtrain=dtrain_resampled, params=self.best_params,
                                                    evals=[(dtest, 'eval')], evals_result=result,
                                                   verbose_eval=False)
                        scorer_used = list(result['eval'].keys())[0]
                        #score = np.mean(result['eval'][scorer_used])
                        score = result['eval'][scorer_used][-1]
                        Scores.append(score)

                    # using Sklearn metrics
                    else:
                        scoring_function = HyperSampler.scoring_functions[self.scorer]

                        selection_model = xgb.train(dtrain=dtrain_resampled, params=self.best_params)
                        y_pred = selection_model.predict(dtest)
                        if y_pred.dtype != object:
                            predictions = [round(value) for value in y_pred]
                            y_test_used = [round(value) for value in y_test.values.ravel()]
                        else:
                            predictions = y_pred
                            y_test_used =  y_test.values.ravel()
                        score = scoring_function(y_test_used, predictions)
                        if self.scorer == 'accuracy_score':
                            score = score*100.00
                        Scores.append(score)

            # building table of performance scores
            if self.scorer == 'Auto':
                #self.scorer = 'Average test ' + scorer_used
                self.scorer = 'Best test ' + scorer_used
            self.performance_scores = pd.DataFrame()
            self.performance_scores['Samplers'] = samplers
            self.performance_scores[self.scorer] = Scores

            # Best resampler: returns max possible model performance 
            if self.scorer in HyperSampler.the_higher_the_better:
                self.best_perf = self.performance_scores[self.performance_scores[self.scorer]\
                                                         == max(self.performance_scores[self.scorer])]
            elif self.scorer in HyperSampler.the_lower_the_better:
                self.best_perf = self.performance_scores[self.performance_scores[self.scorer]\
                                                         == min(self.performance_scores[self.scorer])]
                
            # assigning the best-performing resampler as self.resampler
            self.best_sampler = self.best_perf.iloc[0,0]
            if self.best_sampler is not 'None':
                self.resampler = self.sampler_params[self.best_sampler][0](**self.sampler_params[self.best_sampler][1])
            else:
                self.resampler = None
            
            # reverse label encoding
            if ((self.booster == 'dart' or self.booster == 'gblinear')\
                            and self.objective in HyperSampler.classification_objectives)\
                                                                    or self.scorer in self.need_encoded_y:
                if Target_is_string == True:
                    y.iloc[:,0] = L_E.inverse_transform(y.iloc[:,0])
            
            return self
            
        else:
            return self
    
    def sample(self, X, y=None, target=None):
        """Resample the data with the sampler stored on this instance.

        Parameters
        ----------
        X : pandas DataFrame or array-like
            Features. When ``y`` is None, ``X`` must be a DataFrame that also
            contains the target column.
        y : array-like or pandas DataFrame, optional
            Target. When omitted, the target is taken from ``X``.
        target : str, optional
            Name of the target column inside ``X`` (only used when ``y`` is
            None). When both ``y`` and ``target`` are None, the last column of
            ``X`` is treated as the target.

        Returns
        -------
        (X_resampled, y_resampled) : tuple of pandas DataFrames
            The resampled data, or the untouched data when ``best_sampler``
            equals the string 'None'.
        """
        if y is None:
            if target is None:
                # no explicit target: the last column is the target
                y = pd.DataFrame(X.iloc[:,-1])
                X = pd.DataFrame(X.drop(X.columns[[-1,]], axis=1))
            else:
                y = pd.DataFrame(X[target])
                X = X.drop(target, axis =1)
        else:
            X = pd.DataFrame(X)
            y = pd.DataFrame(y)

        # Compare by value. The previous `is not 'None'` only worked when the
        # string happened to be interned by the interpreter.
        if self.best_sampler != 'None':
            # `fit_sample` was removed from recent imbalanced-learn releases in
            # favour of `fit_resample`; fall back to `fit_sample` for samplers
            # that only expose the older name.
            resample = getattr(self.resampler, 'fit_resample', None)
            if resample is None:
                resample = self.resampler.fit_sample
            X_resampled, y_resampled = resample(X, y.values.ravel())
            return pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=y.columns)
        else:
            return pd.DataFrame(X), pd.DataFrame(y)
        
    def fit_sample(self, X, y=None, target=None):
        """Run fit() and then sample() on the same inputs.

        The arguments are forwarded unchanged to both calls; the return value
        is whatever sample() returns.
        """
        shared_kwargs = {'target': target}
        self.fit(X, y, **shared_kwargs)
        resampled = self.sample(X, y, **shared_kwargs)
        return resampled


<span style="font-size:30px;font-weight:bold;color:#b22222"> HyperSampler in action  </span>

In [22]:
# Render a button that calls the code_toggle() JavaScript function to show or hide the code cells.
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [23]:
# Let HyperSampler score its candidate resamplers (none excluded) with recall
# and resample the imputed training data with the one it selects.
HS = HyperSampler(
    booster='gbtree',
    objective='binary:logistic',
    scorer='recall_score',
    excluded_samplers=[],
)
HS.fit(X_imputed, y_imputed)
X_resampled, y_resampled = HS.sample(X_imputed, y_imputed)

# Report the resampled sizes and the resulting class balance.
print("Length of resampled X_train: {}".format(len(X_resampled)))
print("Length of resampled y_train: {}".format(len(y_resampled)))
print(f'\033[1m{Fore.RED}Resampling Results:{Style.RESET_ALL}\033[0m')
print(Counter(y_resampled.values.ravel()))
warnings.filterwarnings('ignore')



Length of resampled X_train: 500
Length of resampled y_train: 500
[1m[31mResampling Results:[0m[0m
Counter({0: 250, 1: 250})


In [24]:
# Bold red heading, then the table with one score per tested sampler (built by HS.fit()).
print(f'\033[1m{Fore.RED}Performance scores of resampling methods:{Style.RESET_ALL}\033[0m')
HS.performance_scores

[1m[31mPerformance scores of resampling methods:[0m[0m


Unnamed: 0,Samplers,recall_score
0,,0.68
1,Binary_Undersampler,0.42
2,ClusterCentroids,0.96
3,CondensedNearestNeighbour,0.86
4,EditedNearestNeighbours,0.7
5,RepeatedEditedNearestNeighbours,0.68
6,AllKNN,0.68
7,InstanceHardnessThreshold,0.82
8,NearMiss,0.94
9,NeighbourhoodCleaningRule,0.68


In [25]:
# best_perf keeps the row(s) of performance_scores that reach the best score for the chosen metric.
print(f'\033[1m{Fore.RED}Best Resampling Method:{Style.RESET_ALL}\033[0m')
HS.best_perf

[1m[31mBest Resampling Method:[0m[0m


Unnamed: 0,Samplers,recall_score
2,ClusterCentroids,0.96


<a id="scale"></a>

![Imgur](https://i.imgur.com/TTh0Ez6.png)
<center>
<h1><span style="font-size:48px;font-family:Times New Roman,Times,serif"><tt>Feature Scaling</tt></span></h1>
</center>
<br>
![Imgur](https://i.imgur.com/qe9mKrF.png)




<span style="font-size:30px;font-weight:bold;color:#3366cc">
class&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Feature_Hyperscaler</span><i><span style="font-size:16px;font-weight:bold;color:#3366cc">(scale='Hyperscale', best_params={}, booster='gbtree', objective='reg:logistic', scorer='accuracy_score', set_scaler_params={})</span></i>  
 
"""This transformer is not compatible with the current sklearn Pipeline.
    
        Options for scaling methods (parameter "scale=") are the following:
        StandardScaler, MinMaxScaler, Normalizer, RobustScaler, MaxAbsScaler, and QuantileTransformer.
        
        If scale = 'Hyperscale',
            every scaling method will be applied and tested
            to determine which method best contributes to model performance.
            
            best_params = {dictionary of best parameters for model tuning continuity}
            
            scorer = pass desired Sklearn.metrics scorer as a string
                passing scorer='Auto' entails the use of built-in XGBoost evaluation metrics
            
            The scaling method that contributes to the best score will be applied to the data to be returned.
        
        This feature scaling transformer works with any of the three boosters as base learners:
        -gbtree
        -gblinear
        -dart
        
        This class requires that X (features or independent variables) has already been encoded and Imputed. 
        
        Tuning requires the target (y) to also be passed.
        If target = 'insert target name' is passed, X = dataframe disregarding position of target column.
        If the fit_transform method is given one positional argument (X) and target=None, 
            the class assumes that it is a DataFrame with the last
            column as the target variable and the rest as the features."""

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Parameters:  </span>

### scale: str, default='Hyperscale'
* If 'Hyperscale', fit() will perform tests to determine which sklearn feature scaling method returns the best performance.  
* also used to specify a single sklearn feature scaling method

### best_params: dict, default={}
* dictionary of parameters to be used as kwargs for XGBoost models to be used for evaluating performance

### booster: str, default='gbtree'
* specifies the xgboost booster to be used for the evaluation models ('gbtree','gblinear', or 'dart')  

### objective: str, default='reg:logistic'
* specifies the xgboost objective to be used for the evaluation models
* see XGBoost documentation for more details

### scorer: str, default='accuracy_score'
* determines the model evaluation metric used
* if 'Auto', built-in XGBoost metrics will be used
* otherwise, existing sklearn metrics can be passed as strings

### set_scaler_params: dictionary, default={}
* allows user to pass a dictionary for setting parameters of feature scaling methods
* if left empty, a default dictionary will be used


------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Attributes:  </span>

### scoring_functions: dictionary
* dictionary of metrics available for use

### classification_objectives: list
* list of XGBoost objectives treated as classification objectives

### regression_objectives: list
* list of XGBoost objectives treated as regression objectives

### the_higher_the_better: list
* list of metrics of which an increase means improvement in performance

### the_lower_the_better: list
* list of metrics of which a decrease means improvement in performance

### need_encoded_y: list
* list of metrics that require an encoded y
* handled automatically by the class

### the_lower_the_better: list
* list of metrics of which a decrease means improvement in performance

### scaler_params: dictionary
* default dictionary of parameters used for feature scaling methods

### scale: str
* name of the feature scaling method to apply, or 'Hyperscale' to let fit() determine it

### scaler: obj
* scaler object
* fitted to X after .fit()
* should be used later to also transform test set or new data

### performance_scores: pandas DataFrame
* results of testing

### best_perf: pandas DataFrame
* highest performer/s

### best_scaler: str
* name of the feature scaling method deemed best


------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Methods:  </span>

### .fit(X, y=None, target=None)  
Either fits the pre-determined scaling method to the data or performs tests first before fitting the best method to the data.  
**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
* target:*string* (optional)  
    * name of target column  

### .transform(X, y=None, target=None)  
Using statistics learned from fitted data, returns scaled data.
**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
* target:*string* (optional)  
    * name of target column  

**Returns:**
* X_scaled: *pandas DataFrame, shape (n_samples, n_features)*  
    * rescaled or untouched independent variables / features
* y: *pandas DataFrame, shape (n_samples,)*    
    * untouched dependent variable

### .fit_transform(X, y=None, target=None)  
Streamlines fit() and transform() methods.

In [26]:
# Render a button that calls the code_toggle() JavaScript function to show or hide the code cells.
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [27]:
# Feature Scaler

class Feature_Hyperscaler:
    """Apply one feature scaling method, or search for the best one with XGBoost.

        This transformer is not compatible with the current sklearn Pipeline.

        Options for scaling methods (parameter "scale=") are the keys of
        ``scaler_params``: StandardScaler, MinMaxScaler, Normalizer, RobustScaler,
        MaxAbsScaler and QuantileTransformer, plus any key added through
        ``set_scaler_params``. An unknown name raises KeyError.

        If scale = 'Hyperscale',
            fit() holds out ``test_size`` of the data, trains one XGBoost model on
            the unscaled data ('None') and one per candidate scaler
            (StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler,
            QuantileTransformer) and keeps the best-scoring candidate.
            Ties go to the first candidate in that order, so 'None' wins a
            complete tie.

            best_params = {dictionary of best parameters for model tuning continuity}
                The dictionary is copied; the caller's object is never modified.

            scorer = pass desired Sklearn.metrics scorer as a string
                passing scorer='Auto' entails the use of built-in XGBoost evaluation metrics.
                After such a fit, ``scorer`` holds the label 'Best test <metric>'.

            The scaling method that contributes to the best score will be applied to the data to be returned.

        This feature scaling transformer works with any of the three boosters as base learners:
        -gbtree
        -gblinear
        -dart

        This class requires that X (features or independent variables) has already been encoded and Imputed.

        Tuning requires the target (y) to also be passed.
        If target = 'insert target name' is passed, X = dataframe disregarding position of target column.
        If a method is given one positional argument (X) and target=None,
            the class assumes that it is a DataFrame with the last
            column as the target variable and the rest as the features.

        Attributes set by fit() in 'Hyperscale' mode: ``performance_scores``
        (one row per candidate), ``best_perf`` (row/s with the best score),
        ``best_scaler`` (name of the winner) and ``scaler`` (the winner fitted
        on all of X, or None when 'None' wins)."""

    # Compatible model evaluation metrics: Sklearn metrics
    scoring_functions = {'accuracy_score':accuracy_score,'f1_score':f1_score, 'hamming_loss':hamming_loss,
        'jaccard_similarity_score':jaccard_similarity_score, 'log_loss':log_loss, 'matthews_corrcoef':matthews_corrcoef,
                 'precision_score':precision_score, 'recall_score':recall_score, 'zero_one_loss':zero_one_loss,
                'explained_variance_score':explained_variance_score, 'mean_absolute_error':mean_absolute_error,
                 'mean_squared_error':mean_squared_error, 'mean_squared_log_error':mean_squared_log_error,
                 'median_absolute_error':median_absolute_error, 'r2_score':r2_score}

    classification_objectives = ['reg:logistic','binary:logistic', 'binary:logitraw',
                                'multi:softmax', 'multi:softprob', 'rank:pairwise' ]
    regression_objectives = ['reg:linear','count:poisson','reg:gamma', 'reg:tweedie']

    # metrics with which higher value = higher model performance
    # (matthews_corrcoef belongs here: +1 is a perfect prediction, -1 an inverse one)
    the_higher_the_better = ['accuracy_score','f1_score','jaccard_similarity_score',
                                  'matthews_corrcoef','precision_score','recall_score',
                                'explained_variance_score','r2_score','Best test auc',
                                'Best test ndcg','Best test map']
    # metrics with which lower value = higher model performance
    # ('Best test logloss' is the label produced from XGBoost's 'logloss' metric;
    #  the older 'Best test log loss' spelling is kept for compatibility)
    the_lower_the_better = ['hamming_loss', 'log_loss','zero_one_loss','mean_absolute_error',
                           'mean_squared_error','mean_squared_log_error','median_absolute_error',
                           'Best test error','Best test rmse','Best test mae','Best test log loss',
                           'Best test logloss','Best test merror','Best test mlogloss']

    # metrics that require encoded target variable
    need_encoded_y = ['recall_score','precision_score','f1_score']

    # 'N/A' is a sentinel: the value passed to __init__ is used. Setting any of
    # these class attributes to something else overrides the constructor argument.
    set_scaler_params='N/A'
    scale='N/A'
    best_params='N/A'
    booster='N/A'
    objective='N/A'
    scorer='N/A'
    copy='N/A'
    with_mean='N/A'
    with_std='N/A'
    feature_range='N/A'
    norm='N/A'
    with_centering='N/A'
    with_scaling='N/A'
    quantile_range='N/A'
    n_quantiles='N/A'
    output_distribution='N/A'
    ignore_implicit_zeros='N/A'
    subsample='N/A'
    random_state='N/A'
    test_size='N/A'

    def __init__(self, scale="Hyperscale", best_params={},
                 booster='gbtree', objective='reg:logistic', scorer='accuracy_score',
                 set_scaler_params={},
                 copy=True, with_mean=True, with_std=True,
                feature_range=(0, 1),
                 norm='l2',
                 with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0),
                 n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False,
                 subsample=100000, random_state=69, test_size=0.2,
                # values passed for these parameters will never be used unless: class.parameter is made None
                scoring_functions=None, classification_objectives=None, need_encoded_y=None,
                 the_higher_the_better=None, the_lower_the_better=None):
        """Store the configuration and build the table of available scalers.

        scale: 'Hyperscale' (search during fit) or a key of ``scaler_params``.
        best_params: kwargs for the XGBoost evaluation models (copied).
        booster / objective: XGBoost booster and objective of those models.
        scorer: sklearn metric name, or 'Auto' for XGBoost's built-in metric.
        set_scaler_params: {name: [scaler_class, kwargs_dict]} entries that
            replace or extend the default scaler table.
        copy ... random_state: kwargs forwarded to the sklearn scalers.
        test_size: share of the data held out when searching.
        """
        # must stay the first statement: snapshot of the constructor arguments
        passed = dict(locals())

        # names of the class attributes declared between the docstring and __init__
        varses = list(vars(Feature_Hyperscaler).keys())
        self.variables = varses[len(varses) - varses[::-1].index('__doc__') : varses.index('__init__')]
        for name in self.variables:
            class_value = getattr(Feature_Hyperscaler, name)
            if (class_value != 'N/A') is True:
                # the class attribute is set: it takes precedence over the argument
                setattr(self, name, class_value)
            else:
                # sentinel: use the value passed during instantiation
                setattr(self, name, passed[name])

        # private copy: the dict is edited below and must not leak into the
        # caller's object or into the shared default argument
        self.best_params = dict(self.best_params or {})

        # determining the model build used for scoring the scalers
        if self.booster == 'gbtree':
            if self.objective in Feature_Hyperscaler.classification_objectives:
                self.best_params['objective'] = self.objective
                self.best_params.pop('booster', None)
                self.model = xgb.XGBClassifier(**self.best_params)
            elif self.objective in Feature_Hyperscaler.regression_objectives:
                self.best_params['objective'] = self.objective
                self.best_params.pop('booster', None)
                self.model = xgb.XGBRegressor(**self.best_params)
        elif self.booster == 'dart' or self.booster == 'gblinear':
            self.best_params['booster'] = self.booster
            self.best_params['objective'] = self.objective

        # name -> [scaler class, constructor kwargs]
        self.scaler_params={'StandardScaler':[StandardScaler,{'copy':self.copy,
                                              'with_mean':self.with_mean, 'with_std':self.with_std}],
                          'MinMaxScaler':[MinMaxScaler,{'feature_range':self.feature_range, 'copy':self.copy}],
                          'Normalizer':[Normalizer,{'norm':self.norm, 'copy':self.copy}],
                            'RobustScaler':[RobustScaler,{'with_centering':self.with_centering,
                                            'with_scaling':self.with_scaling,
                                             'quantile_range':self.quantile_range, 'copy':self.copy}],
                           'MaxAbsScaler':[MaxAbsScaler,{'copy':self.copy}],
                           'QuantileTransformer':[QuantileTransformer,{'n_quantiles':self.n_quantiles,
                                                  'output_distribution':self.output_distribution,
                                                  'ignore_implicit_zeros':self.ignore_implicit_zeros,
                                                  'subsample':self.subsample,'random_state':self.random_state}]}
        for key, value in (self.set_scaler_params or {}).items():
            self.scaler_params[key] = [value[0], value[1]]

        # Assigning the scaler if scale is not "Hyperscale"
        # (value comparison; `is not` depended on string interning)
        if self.scale != 'Hyperscale':
            self.scaler = self.scaler_params[self.scale][0](**self.scaler_params[self.scale][1])
            self.performance_scores = None
            self.best_perf = None
            self.best_scaler = None

    @staticmethod
    def _split_features_target(X, y, target):
        """Return (X, y) as DataFrames, taking y out of X when y is None."""
        if y is None:
            if target is None:
                # no explicit target: the last column is the target
                y = pd.DataFrame(X.iloc[:,-1])
                X = pd.DataFrame(X.drop(X.columns[[-1,]], axis=1))
            else:
                y = pd.DataFrame(X[target])
                X = pd.DataFrame(X.drop(target, axis =1))
        else:
            X = pd.DataFrame(X)
            y = pd.DataFrame(y)
        return X, y

    def _sklearn_score(self, y_test, y_pred):
        """Score predictions with the sklearn metric named by ``self.scorer``."""
        scoring_function = Feature_Hyperscaler.scoring_functions[self.scorer]
        if y_pred.dtype != object:
            # NOTE(review): rounding is applied to every numeric prediction,
            # including regression outputs - kept as in the original behavior.
            predictions = [round(value) for value in y_pred]
            y_test_used = [round(value) for value in y_test.iloc[:,0]]
        else:
            predictions = y_pred
            y_test_used = y_test.iloc[:,0]
        score = scoring_function(y_test_used, predictions)
        if self.scorer == 'accuracy_score':
            # accuracy is reported as a percentage
            score = score*100.00
        return score

    def _evaluate_scaled_split(self, X_train_scaled, X_test_scaled, y_train, y_test, auto_metric):
        """Train one evaluation model and return (score, built-in metric name or None)."""
        if self.booster == 'gbtree':
            estimator = None
            if self.objective in Feature_Hyperscaler.classification_objectives:
                estimator = xgb.XGBClassifier
            if self.objective in Feature_Hyperscaler.regression_objectives:
                estimator = xgb.XGBRegressor
            if estimator is None:
                raise ValueError("objective %r is neither in classification_objectives "
                                 "nor in regression_objectives" % (self.objective,))
            selection_model = estimator(**self.best_params)
            selection_model.fit(X_train_scaled, y_train.values.ravel(),
                               eval_set=[(X_test_scaled,y_test.values.ravel())], verbose=False)
            if auto_metric:
                # built-in evaluation metric automatically matched with objective;
                # the score is the value after the last boosting round
                result = selection_model.evals_result()
                scorer_used = list(result['validation_0'].keys())[0]
                return result['validation_0'][scorer_used][-1], scorer_used
            y_pred = selection_model.predict(X_test_scaled)
            return self._sklearn_score(y_test, y_pred), None

        if self.booster == 'dart' or self.booster == 'gblinear':
            dtrain_scaled = xgb.DMatrix(data=X_train_scaled, label=y_train)
            dtest_scaled = xgb.DMatrix(data=X_test_scaled, label=y_test)
            if auto_metric:
                result = {}
                xgb.train(dtrain=dtrain_scaled, params=self.best_params,
                          evals=[(dtest_scaled, 'eval')], evals_result=result,
                          verbose_eval=False)
                scorer_used = list(result['eval'].keys())[0]
                return result['eval'][scorer_used][-1], scorer_used
            selection_model = xgb.train(dtrain=dtrain_scaled, params=self.best_params)
            y_pred = selection_model.predict(dtest_scaled)
            return self._sklearn_score(y_test, y_pred), None

        raise ValueError("booster must be 'gbtree', 'dart' or 'gblinear', got %r" % (self.booster,))

    def fit(self, X, y=None, target=None):
        """Fit the configured scaler, or search for the best scaler and fit it.

        X: features (DataFrame or array-like); must contain the target when y is None.
        y: target (optional).
        target: name of the target column in X (only used when y is None).

        Returns self. Raises ValueError when the metric is not listed in
        ``the_higher_the_better`` / ``the_lower_the_better`` or when the
        booster/objective is not supported.
        """
        # `target` is the method argument; the class has no `self.target`
        X, y = self._split_features_target(X, y, target)

        if self.scale != 'Hyperscale':
            # Fitting the pre-determined scaler
            self.scaler.fit(X)
            return self

        # A previous fit() with scorer='Auto' replaced self.scorer by its
        # 'Best test <metric>' label; treat that label as 'Auto' so that
        # fit() can be called again.
        auto_metric = self.scorer == 'Auto' or str(self.scorer).startswith('Best test ')

        # Encode string labels when needed - on a copy, so the caller's y is
        # never modified (and nothing has to be decoded afterwards).
        needs_encoding = ((self.booster == 'dart' or self.booster == 'gblinear')
                          and self.objective in Feature_Hyperscaler.classification_objectives) \
                         or self.scorer in self.need_encoded_y
        y_model = y
        if needs_encoding and y.iloc[:,0].dtype == object:
            labels = y.iloc[:,0].astype(str)
            L_E = LabelEncoder().fit(labels)
            y_model = y.copy()
            y_model[y_model.columns[0]] = L_E.transform(labels)

        # Stratify only for classification objectives: stratifying on a
        # continuous regression target makes train_test_split fail.
        if self.objective in Feature_Hyperscaler.classification_objectives:
            stratify = y_model
        else:
            stratify = None
        X_train, X_test, y_train, y_test = train_test_split(X, y_model,
                                            test_size=self.test_size, random_state=self.random_state,
                                            stratify=stratify)

        scales = ['None','StandardScaler','MinMaxScaler',#'Normalizer',
                  'RobustScaler','MaxAbsScaler','QuantileTransformer']
        Scores = []
        scorer_used = None
        for scale in scales:
            if scale == 'None':
                X_train_scaled = X_train
                X_test_scaled = X_test
            else:
                # the candidate is fitted on the training split only
                scaler = self.scaler_params[scale][0](**self.scaler_params[scale][1])
                scaler.fit(X_train)
                X_train_scaled = scaler.transform(X_train)
                X_test_scaled = scaler.transform(X_test)
            score, scorer_used = self._evaluate_scaled_split(X_train_scaled, X_test_scaled,
                                                             y_train, y_test, auto_metric)
            Scores.append(score)

        # building table of performance scores
        if auto_metric:
            self.scorer = 'Best test ' + scorer_used
        self.performance_scores = pd.DataFrame()
        self.performance_scores['Scalers'] = scales
        self.performance_scores[self.scorer] = Scores

        # Best scaler: returns max possible model performance
        score_column = self.performance_scores[self.scorer]
        if self.scorer in Feature_Hyperscaler.the_higher_the_better:
            best_score = max(score_column)
        elif self.scorer in Feature_Hyperscaler.the_lower_the_better:
            best_score = min(score_column)
        else:
            raise ValueError("cannot rank scalers: metric %r is neither in the_higher_the_better "
                             "nor in the_lower_the_better" % (self.scorer,))
        self.best_perf = self.performance_scores[score_column == best_score]

        # assigning the best-performing scaler as scaler (first row wins ties)
        self.best_scaler = self.best_perf.iloc[0,0]
        if self.best_scaler != 'None':
            self.scaler = self.scaler_params[self.best_scaler][0](**self.scaler_params[self.best_scaler][1])
            # Fitting the scaler on all of X
            self.scaler.fit(X)
        else:
            self.scaler = None

        return self

    def transform(self, X, y=None, target=None):
        """Scale X with the fitted scaler and return (X_scaled, y) as DataFrames.

        X, y and target are interpreted as in fit(). X is returned unscaled
        when the search selected 'None'; y is never modified.
        """
        X, y = self._split_features_target(X, y, target)
        if self.best_scaler != 'None':
            X_scaled = pd.DataFrame(self.scaler.transform(X), columns=X.columns)
            return X_scaled, pd.DataFrame(y)
        else:
            return pd.DataFrame(X), pd.DataFrame(y)

    def fit_transform(self, X, y=None, target=None):
        """Streamline fit() and transform() on the same inputs."""
        self.fit(X,y, target=target)
        return self.transform(X, y, target=target)


<span style="font-size:30px;font-weight:bold;color:#b22222"> Feature_Hyperscaler in action  </span>

In [28]:
# Render a button that calls the code_toggle() JavaScript function to show or hide the code cells.
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [29]:
# Search for the scaler that gives the best recall on the resampled data,
# then apply it (fit() returns the fitted instance, so the calls are chained).
FHS = Feature_Hyperscaler(
    booster='gbtree',
    objective='binary:logistic',
    scorer='recall_score',
)
X_scaled, y_scaled = FHS.fit(X_resampled, y_resampled).transform(X_resampled, y_resampled)

print(f'\033[1m{Fore.RED}Scaled X_train:{Style.RESET_ALL}\033[0m')
X_scaled.head(10)

[1m[31mScaled X_train:[0m[0m


Unnamed: 0,index,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,152930.253012,0.552952,0.036295,-0.152578,0.273752,0.205888,0.057598,-0.145254,-0.114393,1.242381,...,-0.10666,-0.007431,0.124388,0.048337,-0.030963,-0.172096,0.118541,-0.024249,-0.020175,56.313754
1,43355.565217,-0.382524,-0.008179,0.350821,0.061924,-0.290693,0.17748,-0.043012,0.096769,-0.076569,...,0.076488,0.03317,-0.096371,-0.025172,0.011808,0.112465,0.018228,-0.007483,0.00184,113.313522
2,253331.879464,0.060237,0.021565,-0.511447,-0.236409,0.169967,0.006083,0.09486,0.020066,-0.015625,...,-0.052852,0.012124,0.011813,0.051461,0.025971,-0.114771,-0.018566,-0.054684,0.004101,82.084517
3,81717.660622,-0.035027,-0.066331,0.510752,0.131712,-0.212058,-0.021352,-0.08636,0.030674,-0.037935,...,0.047827,-0.022004,0.001229,-0.069526,0.049565,0.111146,0.0308,-0.014216,0.016189,81.272664
4,197407.898123,0.202502,-0.064094,-0.53584,-0.181412,0.192768,-0.036185,0.118597,-0.042756,-0.066639,...,-0.044944,0.041093,0.100095,0.0431,0.017195,-0.08037,-0.03493,0.022542,-0.017435,97.96713
5,18027.37037,-0.10603,-0.016584,0.494407,0.076138,-0.325488,0.079424,-0.126039,0.081186,-0.190041,...,0.069033,0.039784,0.032,-0.035586,0.021745,0.119676,-0.019521,-0.011712,0.007288,105.614976
6,142772.716562,0.030177,0.080735,0.040203,-0.012049,0.033646,-0.021354,-0.021753,0.011134,0.010417,...,-0.019619,-0.006042,-0.000722,0.00122,0.001536,0.004685,-0.000676,-0.000622,0.000895,59.971786
7,233146.284653,0.227286,-0.052158,-0.564621,-0.161485,0.184585,-0.131005,-0.047801,-0.040014,-0.013413,...,-0.011769,0.009494,0.06801,0.034763,-0.029526,-0.120826,-0.002214,0.000408,0.000178,79.378424
8,111023.021898,-0.286304,0.12114,0.416509,0.116303,-0.199018,0.059974,-0.061782,0.071597,-0.191857,...,0.008555,-0.03634,-0.123752,-0.027434,0.021983,0.071632,0.049134,-0.002538,0.00614,83.565483
9,277146.5,0.066163,0.296455,-0.586701,-0.089188,0.278897,-0.104586,0.092952,-0.013246,0.019366,...,0.009869,-0.016126,0.075743,0.027539,-0.020466,-0.129376,0.01207,-0.020405,-0.012373,69.95642


In [30]:
# Bold red heading, then the table with one score per tested scaling method (built by FHS.fit()).
print(f'\033[1m{Fore.RED}Performance scores of scaling methods:{Style.RESET_ALL}\033[0m')
FHS.performance_scores

[1m[31mPerformance scores of scaling methods:[0m[0m


Unnamed: 0,Scalers,recall_score
0,,0.96
1,StandardScaler,0.96
2,MinMaxScaler,0.96
3,RobustScaler,0.96
4,MaxAbsScaler,0.96
5,QuantileTransformer,0.96


In [31]:
# best_perf keeps every row of performance_scores that reaches the best score, so ties show several rows.
print(f'\033[1m{Fore.RED}Best scaling method:{Style.RESET_ALL}\033[0m')
FHS.best_perf

[1m[31mBest scaling method:[0m[0m


Unnamed: 0,Scalers,recall_score
0,,0.96
1,StandardScaler,0.96
2,MinMaxScaler,0.96
3,RobustScaler,0.96
4,MaxAbsScaler,0.96
5,QuantileTransformer,0.96


<a id="extract"></a>

![Imgur](https://i.imgur.com/3Mk1UST.png)
<center>
<h1><span style="font-size:48px;font-family:Times New Roman,Times,serif"><tt>Feature Extraction</tt></span></h1>
</center>
<br>
![Imgur](https://i.imgur.com/K19NTwq.png)


<span style="font-size:30px;font-weight:bold;color:#3366cc">
class&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Feature_Hyperextractor</span><i><span style="font-size:16px;font-weight:bold;color:#3366cc">(extract="Hyperextractor", best_params={}, booster='gbtree', objective='reg:logistic', scorer='Auto', test_extractors=[], excluded_extractors=[], additional_extractors=[], set_extract_params={}, NLP=False, Sparse=False)</span></i>  
 
"""This transformer is not compatible with the current sklearn Pipeline.
    
        Options for feature extraction methods (parameter extract='') are the following:
            FactorAnalysis, FastICA, IncrementalPCA, KernelPCA, LatentDirichletAllocation, 
            MiniBatchDictionaryLearning, MiniBatchSparsePCA, NMF, PCA, SparsePCA,
            TruncatedSVD, LinearDiscriminantAnalysis.
        
        If extract = 'Hyperextractor',
            every extraction method will be applied and tested
            to determine which method best contributes to model performance.
            
            best_params = {dictionary of best parameters for model tuning continuity}
            
            scorer = pass desired Sklearn.metrics scorer as a string
                passing scorer='Auto' entails the use of built-in XGBoost evaluation metrics
            
            The extraction method that contributes to the best score will be applied to the data to be returned.
            
            Feature_Hyperextractor.extractor = chosen extraction class assigned after instantiation 
                                                                        or after fit() if extract = 'Hyperextractor'
        
        This feature extraction transformer works with any of the three boosters as base learners:
        -gbtree
        -gblinear
        -dart
        
        This class requires that X (features or independent variables) has already been encoded and Imputed. 
        
        This class also requires the target (y) to also be passed.
        
        If target = 'insert target name' is passed, X = dataframe disregarding position of target column.
        If the fit, transform, and fit_transform methods are given one positional argument (X) and target=None, 
            the class assumes that it is a DataFrame with the last
            column as the target variable and the rest as the features."""

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Parameters:  </span>

### extract: str, default='Hyperextractor'
* If 'Hyperextractor', .transform() will perform tests to determine which sklearn feature extraction method returns the best performance.  
* also used to specify a single sklearn feature extraction method

### best_params: dict, default={}
* dictionary of parameters to be used as kwargs for XGBoost models to be used for evaluating performance

### booster: str, default='gbtree'
* specifies the xgboost booster to be used for the evaluation models ('gbtree','gblinear', or 'dart')  

### objective: str, default='reg:logistic'
* specifies the xgboost objective to be used for the evaluation models
* see XGBoost documentation for more details

### scorer: str, default='Auto'
* determines the model evaluation metric used
* if 'Auto', built-in XGBoost metrics will be used
* otherwise, existing sklearn metrics can be passed as strings

### test_extractors: list, default=[]
* allows user to pass a list of feature extraction methods that will be tested
* if left empty, a default list will be used

### excluded_extractors: list, default=[]
* allows user to pass a list of feature extraction methods that will be excluded from testing

### set_extract_params: dictionary, default={}
* allows user to pass a dictionary for setting parameters of feature extraction methods
* if left empty, a default dictionary will be used
* feature extraction methods not built into the class can be added this way

### additional_extractors: list, default=[]
* new feature extraction methods introduced through set_extract_params can be added to the list of methods to be tested

### NLP: boolean, default=False
* if True, adds methods used for Natural Language Processing (NLP) to list of methods to be tested

### Sparse: boolean, default=False
* if True, adds methods used for processing sparse data to list of methods to be tested

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Attributes:  </span>

### scoring_functions: dictionary
* dictionary of metrics available for use

### classification_objectives: list
* list of XGBoost objectives treated as classification objectives

### regression_objectives: list
* list of XGBoost objectives treated as regression objectives

### the_higher_the_better: list
* list of metrics of which an increase means improvement in performance

### the_lower_the_better: list
* list of metrics of which a decrease means improvement in performance

### need_encoded_y: list
* list of metrics that require an encoded y
* automatically dealt with by the class

### the_lower_the_better: list
* list of metrics of which a decrease means improvement in performance

### extract_params: dictionary
* default dictionary of parameters used for feature extraction methods

### extract: str
* name of the specified or automatically determined feature extraction method

### extractor: obj
* extractor object
* fitted to X (or also y) after .fit()
* should be used later to transform the test set or new data

### performance_scores: pandas DataFrame
* results of testing

### best_perf: pandas DataFrame
* highest performer/s

### best_extract: str
* name of feature extraction method deemed best


------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Methods:  </span>

### .fit(X, y=None, target=None)  
Either fits the pre-determined feature extraction method to the data or performs tests first before fitting the best method to the data.  
**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
* target:*string* (optional)  
    * name of target column  

### .transform(X, y=None, target=None)  
Using statistics learned from fitted data, returns extracted data.
**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
* target:*string* (optional)  
    * name of target column  

**Returns:**
* X_extracted: *pandas DataFrame, shape (n_samples, n_features)*  
    * extracted or untouched independent variables / features
* y_extracted: *pandas DataFrame, shape (n_samples,)*    
    * untouched dependent variable

### .fit_transform(X, y=None, target=None)  
Streamlines fit() and transform() methods.

In [32]:
# Render a button that calls the code_toggle() JavaScript function to show/hide the raw code inputs
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [33]:
# Dimensionality Reduction Class

class Feature_Hyperextractor:
    """Choose, fit and apply a scikit-learn feature extraction method for an XGBoost model.

    This transformer is not compatible with the current sklearn Pipeline
    (its transform() returns both X and y).

    Options for feature extraction methods (parameter ``extract``) are the following:
        FactorAnalysis, FastICA, IncrementalPCA, KernelPCA, LatentDirichletAllocation,
        MiniBatchDictionaryLearning, MiniBatchSparsePCA, NMF, PCA, SparsePCA,
        TruncatedSVD, LinearDiscriminantAnalysis

    If ``extract='Feature_Hyperextractor'`` (the default), fit() tries every candidate
    method (plus 'None', i.e. no extraction) on a train/test split, trains an XGBoost
    model on each result and keeps the method with the best score.

    Parameters
    ----------
    extract : str
        'Feature_Hyperextractor' to search for the best method, or the name of a single
        method (a key of ``extract_params``) to use directly.
    best_params : dict
        Keyword arguments for the XGBoost evaluation models. A private copy is taken, so
        the caller's dict (and the shared default) is never modified.
    booster : str
        'gbtree', 'gblinear' or 'dart'.
    objective : str
        XGBoost objective; must appear in ``classification_objectives`` or
        ``regression_objectives`` when ``booster='gbtree'``.
    scorer : str
        'Auto' to use the first built-in XGBoost evaluation metric reported for the test
        split, or a key of ``scoring_functions``.
    test_extractors : list
        Names of the methods to test; when empty a default list is used.
    excluded_extractors : list
        Names of methods that must not be tested.
    additional_extractors : list
        Extra method names (typically registered through ``set_extract_params``) to test.
    set_extract_params : dict
        ``{name: [extractor_class, kwargs_dict]}`` entries that override or extend
        ``extract_params``.
    NLP, Sparse : bool
        Unless exactly True, the NLP-oriented (LatentDirichletAllocation, NMF) or sparse
        (MiniBatchDictionaryLearning, MiniBatchSparsePCA, SparsePCA) methods are skipped.
    random_state, test_size
        Forwarded to train_test_split (random_state also to the extractors).
    The remaining keyword arguments are forwarded to the extractor constructors as laid
    out in ``extract_params``.

    Any class attribute below whose value is not the 'N/A' placeholder takes precedence
    over the corresponding constructor argument.

    Attributes set by fit()
    -----------------------
    extractor : fitted extractor object, or None when 'None' performed best.
    performance_scores : pandas DataFrame with one row per tested method (search mode).
    best_perf : rows of ``performance_scores`` holding the best score (search mode).
    best_extract : name of the first best-scoring method (search mode).
    scorer : when the built-in metric is used, renamed to 'Best test <metric>'.

    This class requires that X has already been encoded and imputed.

    If ``target='name'`` is passed and y is omitted, the target column is taken from X by
    name. If only X is passed, the last column of X is taken as the target.
    """

    # Compatible model evaluation metrics: Sklearn metrics
    scoring_functions = {'accuracy_score':accuracy_score,'f1_score':f1_score, 'hamming_loss':hamming_loss,
        'jaccard_similarity_score':jaccard_similarity_score, 'log_loss':log_loss, 'matthews_corrcoef':matthews_corrcoef,
                 'precision_score':precision_score, 'recall_score':recall_score, 'zero_one_loss':zero_one_loss,
                'explained_variance_score':explained_variance_score, 'mean_absolute_error':mean_absolute_error,
                 'mean_squared_error':mean_squared_error, 'mean_squared_log_error':mean_squared_log_error,
                 'median_absolute_error':median_absolute_error, 'r2_score':r2_score}

    classification_objectives = ['reg:logistic','binary:logistic', 'binary:logitraw',
                                'multi:softmax', 'multi:softprob', 'rank:pairwise' ]
    regression_objectives = ['reg:linear','count:poisson','reg:gamma', 'reg:tweedie']

    # metrics with which higher value = higher model performance
    # (matthews_corrcoef is a correlation: +1 is a perfect prediction, so it belongs here)
    the_higher_the_better = ['accuracy_score','f1_score','jaccard_similarity_score',
                                  'precision_score','recall_score','matthews_corrcoef',
                                'explained_variance_score','r2_score','Best test auc',
                                'Best test ndcg','Best test map']
    # metrics with which lower value = higher model performance
    # ('Best test logloss' matches the metric name XGBoost actually reports; the spaced
    #  spelling is kept for backward compatibility)
    the_lower_the_better = ['hamming_loss', 'log_loss','zero_one_loss','mean_absolute_error',
                           'mean_squared_error','mean_squared_log_error','median_absolute_error',
                           'Best test error','Best test rmse','Best test mae','Best test log loss',
                           'Best test logloss','Best test merror','Best test mlogloss']

    # metrics that require encoded target variable
    need_encoded_y = ['recall_score','precision_score','f1_score']


    # parameters to be inherited from a mother class: 'N/A' means "use the constructor argument"
    extract='N/A'
    best_params='N/A'
    booster='N/A'
    objective='N/A'
    scorer='N/A'
    test_extractors='N/A'
    excluded_extractors='N/A'
    additional_extractors='N/A'
    set_extract_params='N/A'
    NLP='N/A'
    Sparse='N/A'

    # arguments for the extractor classes
    Dirichlet_n_components='N/A'
    TruncatedSVD_n_components='N/A'
    n_components='N/A'
    tol='N/A'
    copy='N/A'
    max_iter='N/A'
    noise_variance_init='N/A'
    svd_method='N/A'
    iterated_power='N/A'
    FastICA_algorithm='N/A'
    TruncatedSVD_algorithm='N/A'
    whiten='N/A'
    fun='N/A'
    fun_args='N/A'
    w_init='N/A'
    batch_size='N/A'
    kernel='N/A'
    gamma='N/A'
    degree='N/A'
    coef0='N/A'
    kernel_params='N/A'
    alpha='N/A'
    fit_inverse_transform='N/A'
    eigen_solver='N/A'
    remove_zero_eig='N/A'
    copy_X='N/A'
    n_jobs='N/A'
    doc_topic_prior='N/A'
    topic_word_prior='N/A'
    learning_method='N/A'
    learning_decay='N/A'
    learning_offset='N/A'
    evaluate_every='N/A'
    total_samples='N/A'
    perp_tol='N/A'
    mean_change_tol='N/A'
    max_doc_update_iter='N/A'
    verbose='N/A'
    n_topics='N/A'
    n_iter='N/A'
    fit_algorithm='N/A'
    shuffle='N/A'
    dict_init='N/A'
    transform_algorithm='N/A'
    transform_n_nonzero_coefs='N/A'
    transform_alpha='N/A'
    split_sign='N/A'
    ridge_alpha='N/A'
    callback='N/A'
    method='N/A'
    init='N/A'
    NMF_solver='N/A'
    LDA_solver='N/A'
    beta_loss='N/A'
    l1_ratio='N/A'
    svd_solver='N/A'
    U_init='N/A'
    V_init='N/A'
    shrinkage='N/A'
    priors='N/A'
    store_covariance='N/A'

#     **kwargs

    random_state='N/A'
    test_size='N/A'

    def __init__(self, extract="Feature_Hyperextractor", best_params={},
                 booster='gbtree', objective='reg:logistic', scorer='Auto',
                 test_extractors=[], excluded_extractors=[],additional_extractors=[],set_extract_params={},
                 NLP=False,Sparse=False,

                 # parameters for extractors
                Dirichlet_n_components=10, TruncatedSVD_n_components=2,TruncatedSVD_algorithm='randomized',
                 n_components=None,tol=0.01,copy=True,max_iter=1000,noise_variance_init=None,svd_method='randomized',
                iterated_power=3,FastICA_algorithm='parallel',whiten=True,fun='logcosh',fun_args=None,w_init=None,
                batch_size=None,kernel='linear',gamma=None,degree=3,coef0=1,kernel_params=None,alpha=1,
                fit_inverse_transform=False,eigen_solver='auto',remove_zero_eig=False,copy_X=True,n_jobs=1,
                doc_topic_prior=None,topic_word_prior=None,learning_method=None,learning_decay=0.7,
                learning_offset=10,evaluate_every=-1,total_samples=1000000,perp_tol=0.1,mean_change_tol=0.001,
                max_doc_update_iter=100,verbose=0,n_topics=None,n_iter=1000,fit_algorithm='lars',shuffle=True,
                dict_init=None,transform_algorithm='omp',transform_n_nonzero_coefs=None,transform_alpha=None,
                split_sign=False,ridge_alpha=0.01,callback=None,method='lars',init=None,NMF_solver='cd',
                beta_loss='frobenius',l1_ratio=0,svd_solver='auto',U_init=None,V_init=None,LDA_solver='svd',
                shrinkage=None,priors=None,store_covariance=False,

                # parameters for train/test split
                 random_state=69, test_size=0.2,
                # values passed for these parameters will never be used unless: class.parameter is made None
                scoring_functions=None, classification_objectives=None, need_encoded_y=None,
                 the_higher_the_better=None, the_lower_the_better=None):
        """Store the configuration and, unless in search mode, build the requested extractor.

        See the class docstring for the meaning of the arguments.
        """
        # Snapshot of the constructor arguments; must be taken before any other local exists.
        init_args = locals()

        # Every class attribute declared between the docstring and __init__ is a setting.
        class_attrs = list(vars(Feature_Hyperextractor).keys())
        self.variables = class_attrs[class_attrs.index('__doc__') + 1:class_attrs.index('__init__')]
        for name in self.variables:
            class_value = getattr(Feature_Hyperextractor, name)
            if (class_value != 'N/A') is True:
                # a value set on the class overrides the constructor argument
                setattr(self, name, class_value)
            else:
                # placeholder on the class: use the value passed at instantiation
                setattr(self, name, init_args[name])

        # Work on a private copy: the dict is edited below and must not leak into the
        # caller's dict or into the shared mutable default argument.
        self.best_params = dict(self.best_params)

        # determining the model build used for evaluation
        if self.booster == 'gbtree':
            if self.objective in Feature_Hyperextractor.classification_objectives:
                model_class = xgb.XGBClassifier
            elif self.objective in Feature_Hyperextractor.regression_objectives:
                model_class = xgb.XGBRegressor
            else:
                model_class = None
            if model_class is not None:
                self.best_params['objective'] = self.objective
                # the sklearn wrappers are used with their default booster here
                self.best_params.pop('booster', None)
                self.model = model_class(**self.best_params)
        elif self.booster == 'dart' or self.booster == 'gblinear':
            self.best_params['booster'] = self.booster
            self.best_params['objective'] = self.objective

        # name -> [extractor class, constructor kwargs]
        self.extract_params = {
            'FactorAnalysis': [FactorAnalysis, {
                'n_components': self.n_components, 'tol': self.tol,
                'copy': self.copy, 'max_iter': self.max_iter, 'random_state': self.random_state,
                'noise_variance_init': self.noise_variance_init,
                'svd_method': self.svd_method, 'iterated_power': self.iterated_power}],
            'FastICA': [FastICA, {
                'n_components': self.n_components, 'algorithm': self.FastICA_algorithm,
                'whiten': self.whiten, 'fun': self.fun, 'fun_args': self.fun_args,
                'max_iter': self.max_iter, 'tol': self.tol, 'w_init': self.w_init,
                'random_state': self.random_state}],
            'IncrementalPCA': [IncrementalPCA, {
                'n_components': self.n_components, 'whiten': self.whiten,
                'copy': self.copy, 'batch_size': self.batch_size}],
            'KernelPCA': [KernelPCA, {
                'n_components': self.n_components, 'kernel': self.kernel, 'gamma': self.gamma,
                'degree': self.degree, 'coef0': self.coef0, 'kernel_params': self.kernel_params,
                'alpha': self.alpha, 'fit_inverse_transform': self.fit_inverse_transform,
                'eigen_solver': self.eigen_solver, 'tol': self.tol, 'max_iter': self.max_iter,
                'remove_zero_eig': self.remove_zero_eig, 'random_state': self.random_state,
                'copy_X': self.copy_X, 'n_jobs': self.n_jobs}],
            'LatentDirichletAllocation': [LatentDirichletAllocation, {
                'n_components': self.Dirichlet_n_components,
                'doc_topic_prior': self.doc_topic_prior, 'topic_word_prior': self.topic_word_prior,
                'learning_method': self.learning_method, 'learning_decay': self.learning_decay,
                'learning_offset': self.learning_offset, 'max_iter': self.max_iter,
                'batch_size': self.batch_size, 'evaluate_every': self.evaluate_every,
                'total_samples': self.total_samples, 'perp_tol': self.perp_tol,
                'mean_change_tol': self.mean_change_tol,
                'max_doc_update_iter': self.max_doc_update_iter,
                'n_jobs': self.n_jobs, 'verbose': self.verbose,
                'random_state': self.random_state, 'n_topics': self.n_topics}],
            'MiniBatchDictionaryLearning': [MiniBatchDictionaryLearning, {
                'n_components': self.n_components,
                'alpha': self.alpha, 'n_iter': self.n_iter, 'fit_algorithm': self.fit_algorithm,
                'n_jobs': self.n_jobs, 'batch_size': self.batch_size, 'shuffle': self.shuffle,
                'dict_init': self.dict_init, 'transform_algorithm': self.transform_algorithm,
                'transform_n_nonzero_coefs': self.transform_n_nonzero_coefs,
                'transform_alpha': self.transform_alpha, 'verbose': self.verbose,
                'split_sign': self.split_sign, 'random_state': self.random_state}],
            'MiniBatchSparsePCA': [MiniBatchSparsePCA, {
                'n_components': self.n_components, 'alpha': self.alpha,
                'ridge_alpha': self.ridge_alpha, 'n_iter': self.n_iter, 'callback': self.callback,
                'batch_size': self.batch_size, 'verbose': self.verbose, 'shuffle': self.shuffle,
                'n_jobs': self.n_jobs, 'method': self.method, 'random_state': self.random_state}],
            'NMF': [NMF, {
                'n_components': self.n_components, 'init': self.init, 'solver': self.NMF_solver,
                'beta_loss': self.beta_loss, 'tol': self.tol, 'max_iter': self.max_iter,
                'random_state': self.random_state, 'alpha': self.alpha, 'l1_ratio': self.l1_ratio,
                'verbose': self.verbose, 'shuffle': self.shuffle}],
            'PCA': [PCA, {
                'n_components': self.n_components, 'copy': self.copy, 'whiten': self.whiten,
                'svd_solver': self.svd_solver, 'tol': self.tol, 'iterated_power': self.iterated_power,
                'random_state': self.random_state}],
            'SparsePCA': [SparsePCA, {
                'n_components': self.n_components, 'alpha': self.alpha,
                'ridge_alpha': self.ridge_alpha, 'max_iter': self.max_iter, 'tol': self.tol,
                'method': self.method, 'n_jobs': self.n_jobs, 'U_init': self.U_init, 'V_init': self.V_init,
                'verbose': self.verbose, 'random_state': self.random_state}],
            'TruncatedSVD': [TruncatedSVD, {
                'n_components': self.TruncatedSVD_n_components,
                'algorithm': self.TruncatedSVD_algorithm,
                'n_iter': self.n_iter, 'random_state': self.random_state, 'tol': self.tol}],
            'LinearDiscriminantAnalysis': [LinearDiscriminantAnalysis, {
                'solver': self.LDA_solver,
                'shrinkage': self.shrinkage, 'priors': self.priors, 'n_components': self.n_components,
                'store_covariance': self.store_covariance, 'tol': self.tol}],
        }

        # user-supplied entries override the defaults or register new extractors
        for key, value in self.set_extract_params.items():
            self.extract_params[key] = [value[0], value[1]]

        # Build the extractor right away when a specific method was requested
        if self.extract != 'Feature_Hyperextractor':
            self.extractor = self.extract_params[self.extract][0](**self.extract_params[self.extract][1])
            self.performance_scores = None
            self.best_perf = None
            self.best_extract = None

    @staticmethod
    def _split_features_target(X, y, target):
        """Return (X, y); when y is None, take it from X by name or from the last column."""
        if y is not None:
            return X, y
        if target is None:
            y = pd.DataFrame(X.iloc[:, -1])
            X = pd.DataFrame(X.drop(X.columns[[-1]], axis=1))
        else:
            y = pd.DataFrame(X[target])
            X = pd.DataFrame(X.drop(target, axis=1))
        return X, y

    @staticmethod
    def _fit_extractor(extractor, name, X, y):
        """Fit one extractor; only LinearDiscriminantAnalysis is given the target."""
        if name == 'LinearDiscriminantAnalysis':
            extractor.fit(X, y.values.ravel())
        else:
            extractor.fit(X)
        return extractor

    def _candidate_extractors(self):
        """Return a fresh list of the method names to test (never the caller's own list)."""
        if len(self.test_extractors) != 0:
            candidates = list(self.test_extractors)
        else:
            candidates = ['None','FactorAnalysis', 'FastICA', 'IncrementalPCA', 'KernelPCA',
                          'LatentDirichletAllocation','MiniBatchDictionaryLearning','MiniBatchSparsePCA',
                          'NMF','PCA', 'SparsePCA','TruncatedSVD', 'LinearDiscriminantAnalysis']
        candidates = candidates + list(self.additional_extractors)

        skipped = list(self.excluded_extractors)
        if self.NLP is not True:
            skipped += ['LatentDirichletAllocation', 'NMF']
        if self.Sparse is not True:
            skipped += ['MiniBatchDictionaryLearning', 'MiniBatchSparsePCA', 'SparsePCA']
        return [name for name in candidates if name not in skipped]

    def _sklearn_score(self, y_true, y_pred):
        """Score predictions with the sklearn metric named by self.scorer."""
        scoring_function = Feature_Hyperextractor.scoring_functions[self.scorer]
        if y_pred.dtype != object:
            # numeric outputs are rounded on both sides before scoring
            predictions = [round(value) for value in y_pred]
            y_true = [round(value) for value in y_true]
        else:
            predictions = y_pred
        score = scoring_function(y_true, predictions)
        if self.scorer == 'accuracy_score':
            score = score*100.00
        return score

    def _search_best_extractor(self, X, y):
        """Score every candidate method on a hold-out split and fit the winner on all of X."""
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y)
        y_train_flat = y_train.values.ravel()
        y_test_flat = y_test.values.ravel()

        extractors = self._candidate_extractors()
        if len(extractors) == 0:
            raise ValueError("No feature extraction method left to test.")

        # TruncatedSVD cannot keep 2 components out of 2 features
        if len(pd.DataFrame(X).columns) == 2:
            self.extract_params['TruncatedSVD'][1]['n_components'] = 1

        # self.scorer is renamed to 'Best test <metric>' after a search with the built-in
        # metric; recognising that name keeps a repeated fit() on the same instance working.
        use_builtin_metric = self.scorer == 'Auto' or str(self.scorer).startswith('Best test ')
        scorer_used = None

        Scores = []
        for name in extractors:
            if name == 'None':
                X_train_extracted, X_test_extracted = X_train, X_test
            else:
                candidate = self.extract_params[name][0](**self.extract_params[name][1])
                self._fit_extractor(candidate, name, X_train, y_train)
                X_train_extracted = candidate.transform(X_train)
                X_test_extracted = candidate.transform(X_test)

            if self.booster == 'gbtree':
                if self.objective in Feature_Hyperextractor.classification_objectives:
                    estimator = xgb.XGBClassifier
                elif self.objective in Feature_Hyperextractor.regression_objectives:
                    estimator = xgb.XGBRegressor
                else:
                    raise ValueError(f"Unsupported objective for booster 'gbtree': {self.objective!r}")
                selection_model = estimator(**self.best_params)
                selection_model.fit(X_train_extracted, y_train_flat,
                                    eval_set=[(X_test_extracted, y_test_flat)], verbose=False)
                if use_builtin_metric:
                    # built-in evaluation metric automatically matched with the objective
                    result = selection_model.evals_result()
                    scorer_used = list(result['validation_0'].keys())[0]
                    score = result['validation_0'][scorer_used][-1]
                else:
                    score = self._sklearn_score(y_test_flat, selection_model.predict(X_test_extracted))

            elif self.booster == 'dart' or self.booster == 'gblinear':
                dtrain_extracted = xgb.DMatrix(data=X_train_extracted, label=y_train_flat)
                dtest_extracted = xgb.DMatrix(data=X_test_extracted, label=y_test_flat)
                if use_builtin_metric:
                    result = {}
                    selection_model = xgb.train(dtrain=dtrain_extracted, params=self.best_params,
                                                evals=[(dtest_extracted, 'eval')], evals_result=result,
                                                verbose_eval=False)
                    scorer_used = list(result['eval'].keys())[0]
                    score = result['eval'][scorer_used][-1]
                else:
                    selection_model = xgb.train(dtrain=dtrain_extracted, params=self.best_params)
                    score = self._sklearn_score(y_test_flat, selection_model.predict(dtest_extracted))
            else:
                raise ValueError(f"Unsupported booster: {self.booster!r}")

            Scores.append(score)

        # building table of performance scores
        if use_builtin_metric:
            self.scorer = 'Best test ' + scorer_used
        self.performance_scores = pd.DataFrame()
        self.performance_scores['Extractors'] = extractors
        self.performance_scores[self.scorer] = Scores

        # Best extract: rows reaching the best possible score for this metric
        score_column = self.performance_scores[self.scorer]
        if self.scorer in Feature_Hyperextractor.the_higher_the_better:
            best_score = max(score_column)
        elif self.scorer in Feature_Hyperextractor.the_lower_the_better:
            best_score = min(score_column)
        else:
            raise ValueError(
                f"Cannot rank extractors: it is not known whether {self.scorer!r} should be "
                "maximised or minimised. Add it to Feature_Hyperextractor.the_higher_the_better "
                "or Feature_Hyperextractor.the_lower_the_better.")
        self.best_perf = self.performance_scores[score_column == best_score]

        # the first best-performing method becomes self.extractor, refitted on all the data
        self.best_extract = self.best_perf.iloc[0, 0]
        if self.best_extract != 'None':
            self.extractor = self.extract_params[self.best_extract][0](**self.extract_params[self.best_extract][1])
            self._fit_extractor(self.extractor, self.best_extract, X, y)
        else:
            self.extractor = None

    def fit(self, X, y=None, target=None):
        """Fit the requested extractor, or search for the best one and fit it.

        Parameters
        ----------
        X : pandas DataFrame of features (including the target column when y is None).
        y : pandas DataFrame holding the target in its first column, optional.
        target : str, optional
            Name of the target column in X, used only when y is None. When both y and
            target are None the last column of X is taken as the target.

        Returns
        -------
        self
        """
        X, y = self._split_features_target(X, y, target)

        if self.extract != 'Feature_Hyperextractor':
            self._fit_extractor(self.extractor, self.extract, X, y)
            return self

        # Encode string labels when the evaluation model or the metric needs numeric classes
        needs_encoded_y = (((self.booster == 'dart' or self.booster == 'gblinear')
                            and self.objective in Feature_Hyperextractor.classification_objectives)
                           or self.scorer in self.need_encoded_y)
        label_encoder = None
        if needs_encoded_y and y.iloc[:, 0].dtype == object:
            label_encoder = LabelEncoder()
            label_encoder = label_encoder.fit(y.iloc[:, 0].astype(str))
            y.iloc[:, 0] = label_encoder.transform(y.iloc[:, 0].astype(str))

        try:
            self._search_best_extractor(X, y)
        finally:
            # y was encoded in place: always hand the original labels back to the caller
            if label_encoder is not None:
                y.iloc[:, 0] = label_encoder.inverse_transform(y.iloc[:, 0])

        return self

    def transform(self, X, y=None, target=None):
        """Apply the fitted extractor to X.

        Parameters are interpreted as in fit().

        Returns
        -------
        (X_extracted, y) : tuple of pandas DataFrames
            X_extracted has columns 'F0', 'F1', ...; when 'None' was the best method the
            features are returned unchanged. y is returned untouched.
        """
        X, y = self._split_features_target(X, y, target)
        if self.best_extract != 'None':
            X_extracted = pd.DataFrame(self.extractor.transform(X))
            X_extracted.columns = ['F'+str(col) for col in X_extracted.columns]
            return X_extracted, pd.DataFrame(y)
        return pd.DataFrame(X), pd.DataFrame(y)

    def fit_transform(self, X, y=None, target=None):
        """Call fit() and then transform() with the same arguments."""
        self.fit(X,y, target=target)
        return self.transform(X, y, target=target)



<span style="font-size:30px;font-weight:bold;color:#b22222"> Feature_Hyperextractor in action  </span>

In [34]:
# Render a button that calls the code_toggle() JavaScript function to show/hide the raw code inputs
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [35]:
# Search mode (no `extract` given): every remaining extractor is scored by recall on a
# hold-out split; KernelPCA is left out of the search.
F_E = Feature_Hyperextractor(
    booster='gbtree',
    objective='binary:logistic',
    scorer='recall_score',
    excluded_extractors=['KernelPCA'],
)

# Fit the search on the scaled data, then project the same data with the winning extractor
X_extracted, y_extracted = F_E.fit_transform(X_scaled, y_scaled)

print(f'\033[1m{Fore.RED}Features Engineered from X_train:{Style.RESET_ALL}\033[0m')
X_extracted.head(10)

[1m[31mFeatures Engineered from X_train:[0m[0m


Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29
0,0.266251,-0.118402,-0.547101,-0.007242,0.208805,0.033638,0.027600,0.086557,0.709156,-0.002445,...,0.242455,0.370284,-0.141288,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.170784,-0.092613,-0.797599,0.117353,0.172764,0.246944,0.162560,-0.137314,-0.024216,0.037240,...,-0.034937,0.014576,0.006718,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.582985,-0.063014,-0.298098,-0.058132,0.179771,0.096694,0.272291,-0.044316,0.058963,-0.089999,...,0.028158,0.036751,0.003005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.667678,-0.113884,-0.722696,0.074993,0.163021,0.253945,0.136000,-0.163521,0.024748,0.073749,...,-0.043993,-0.034997,-0.002822,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.849560,-0.063234,-0.435702,-0.031894,0.166287,0.081211,0.265267,0.000376,0.061249,-0.013678,...,0.025152,-0.004624,0.001506,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-1.502955,-0.107799,-0.866191,0.114362,0.169591,0.249154,0.113528,-0.145230,-0.020457,0.161684,...,-0.021707,-0.058405,-0.002340,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.133038,-0.117657,-0.567499,0.027753,0.176908,0.163342,0.187957,-0.089073,0.076717,0.014318,...,0.003420,0.025551,-0.009429,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.318257,-0.071643,-0.345541,-0.052795,0.179769,0.083850,0.293792,-0.042039,0.080866,-0.025244,...,0.027648,0.034718,-0.009345,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,-0.283348,-0.103015,-0.641795,0.073479,0.182611,0.246341,0.157299,-0.171478,-0.005351,0.005604,...,-0.087518,-0.000048,-0.013193,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.895305,-0.068355,-0.230015,-0.083006,0.186769,0.061792,0.261119,-0.055599,0.060888,-0.086904,...,-0.020194,0.054259,0.004991,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Display the score table stored on the fitted extractor (one row per extraction method tried)
print(f'\033[1m{Fore.RED}Performance scores of Feature Extraction methods:{Style.RESET_ALL}\033[0m')
F_E.performance_scores

[1m[31mPerformance scores of Feature Extraction methods:[0m[0m


Unnamed: 0,Extractors,recall_score
0,,0.96
1,FactorAnalysis,0.98
2,FastICA,0.98
3,IncrementalPCA,0.96
4,PCA,0.96
5,TruncatedSVD,0.82
6,LinearDiscriminantAnalysis,0.9


In [37]:
# Display the row(s) of the score table that the extractor recorded as best performing
print(f'\033[1m{Fore.RED}Best Feature Extraction Method:{Style.RESET_ALL}\033[0m')
F_E.best_perf

[1m[31mBest Feature Extraction Method:[0m[0m


Unnamed: 0,Extractors,recall_score
1,FactorAnalysis,0.98
2,FastICA,0.98


<a id="select"></a>

![Imgur](https://i.imgur.com/JTRAAnP.png)
<center>
<h1><span style="font-size:48px;font-family:Times New Roman,Times,serif"><tt>Feature Selection</tt></span></h1>
</center>
<br>
![Imgur](https://i.imgur.com/qmnAFp0.png)

<span style="font-size:30px;font-weight:bold;color:#3366cc">
class&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Feature_Selector</span><i><span style="font-size:16px;font-weight:bold;color:#3366cc">(booster='gbtree', objective='reg:logistic', random_state=69, test_size=0.2, best_params={}, scorer='Auto')</span></i>  
 
"""This transformer is not compatible with the current sklearn Pipeline.
    
    This feature selection transformer works only with the following boosters as base learners:
        -gbtree
        -dart
                                                                            
    Sklearn.metrics scoring functions can be passed to parameter:
        scorer = 'scoring_function'
    
    Built-in XGBoost evaluation metrics will be used by passing:
        scorer = 'Auto'                                                    
    
    This class requires that X (features or independent variables) has already been encoded and Imputed.
    
    Tuning requires the target (y) to also be passed.
        If target = 'insert target name' is passed, X = dataframe disregarding position of target column.
        If the fit_transform method is given one positional argument (X) and target=None, 
            the class assumes that it is a DataFrame with the last
            column as the target variable and the rest as the features."""

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Parameters:  </span>


### best_params: dict, default={}
* dictionary of parameters to be used as kwargs for XGBoost models to be used for evaluating performance

### booster: str, default='gbtree'
* specifies the xgboost booster to be used for the evaluation models ('gbtree' or 'dart'; 'gblinear' is not supported)  

### objective: str, default='reg:logistic'
* specifies the xgboost objective to be used for the evaluation models
* see XGBoost documentation for more details

### scorer: str, default='Auto'
* determines the model evaluation metric used
* if 'Auto', built-in XGBoost metrics will be used
* otherwise, existing sklearn metrics can be passed as strings

### random_state: int, default=69
* random_state for train_test_split

### test_size: float, default=0.2
* test_size for train_test_split
------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Attributes:  </span>

### scoring_functions: dictionary
* dictionary of metrics available for use

### classification_objectives: list
* list of XGBoost objectives treated as classification objectives

### regression_objectives: list
* list of XGBoost objectives treated as regression objectives

### the_higher_the_better: list
* list of metrics of which an increase means improvement in performance

### the_lower_the_better: list
* list of metrics of which a decrease means improvement in performance

### need_encoded_y: list
* list of metrics that require an encoded y
* automatically dealt with by the class

### best_threshold: pandas DataFrame
* the row/s of best_perf that use the smallest number of features

### performance_scores: pandas DataFrame
* results of testing

### best_perf: pandas DataFrame
* highest performer/s

### best_n: int
* number of top features deemed best

### selected_columns: list of str
* list of names of features deemed best
* used later for feature selection on test set or new data

------------------------------------------------------------------------
<span style="font-size:30px;font-weight:bold;color:#b22222"> Methods:  </span>

### .fit(X, y=None, target=None)  
Performs tests to determine which features to keep.  
**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
* target:*string* (optional)  
    * name of target column  

### .transform(X, y=None, target=None)  
Using results of tests, returns selected features.
**Parameters:**
* X: *array-like or pandas DataFrame, shape (n_samples, n_features)*  
    * independent variables / features 
* y: *array-like or pandas DataFrame, shape (n_samples,)*    
    * dependent variable / target  
* target:*string* (optional)  
    * name of target column  

**Returns:**
* X_selected: *pandas DataFrame, shape (n_samples, n_selected_features)*  
    * selected independent variables / features
* y_selected: *pandas DataFrame, shape (n_samples,)*    
    * untouched dependent variable

### .fit_transform(X, y=None, target=None)  
Streamlines fit() and transform() methods.

In [38]:
# Render a button that calls the notebook's code_toggle() JavaScript helper to show or hide the code cells
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [39]:
# # Feature Selection Class # #

class Feature_Selector:
    """Keep the smallest set of top-ranked features that reaches the best evaluation score.

    An XGBoost model ranks the features by importance. Candidate cut-offs along that
    ranking are each evaluated on a held-out split, and the smallest number of features
    that reaches the best score is remembered for use in transform().

    This transformer is not compatible with the current sklearn Pipeline.

    This feature selection transformer works only with the following boosters as base learners:
        -gbtree
        -dart

    Sklearn.metrics scoring functions can be passed to parameter:
        scorer = 'scoring_function'

    Built-in XGBoost evaluation metrics will be used by passing:
        scorer = 'Auto'

    This class requires that X (features or independent variables) has already been encoded and Imputed.

    Tuning requires the target (y) to also be passed.
        If target = 'insert target name' is passed, X = dataframe disregarding position of target column.
        If the fit_transform method is given one positional argument (X) and target=None,
            the class assumes that it is a DataFrame with the last
            column as the target variable and the rest as the features.

    NOTE: every class attribute defined between this docstring and __init__ is copied onto
    each instance by __init__ (see self.variables), so do not add unrelated class
    attributes in that region.
    """

    # Compatible model evaluation metrics: Sklearn metrics
    scoring_functions = {'accuracy_score':accuracy_score,'f1_score':f1_score, 'hamming_loss':hamming_loss,
        'jaccard_similarity_score':jaccard_similarity_score, 'log_loss':log_loss, 'matthews_corrcoef':matthews_corrcoef,
                 'precision_score':precision_score, 'recall_score':recall_score, 'zero_one_loss':zero_one_loss,
                'explained_variance_score':explained_variance_score, 'mean_absolute_error':mean_absolute_error,
                 'mean_squared_error':mean_squared_error, 'mean_squared_log_error':mean_squared_log_error,
                 'median_absolute_error':median_absolute_error, 'r2_score':r2_score}

    classification_objectives = ['reg:logistic','binary:logistic', 'binary:logitraw',
                                'multi:softmax', 'multi:softprob', 'rank:pairwise' ]
    regression_objectives = ['reg:linear','count:poisson','reg:gamma', 'reg:tweedie']

    # metrics with which higher value = higher model performance
    # (matthews_corrcoef belongs here: +1 is a perfect prediction, -1 total disagreement)
    the_higher_the_better = ['accuracy_score','f1_score','jaccard_similarity_score',
                                  'matthews_corrcoef','precision_score','recall_score',
                                'explained_variance_score','r2_score','Best test auc',
                                'Best test ndcg','Best test map']
    # metrics with which lower value = higher model performance
    # ('Best test logloss' is the key XGBoost actually reports; the spaced variant is kept for compatibility)
    the_lower_the_better = ['hamming_loss', 'log_loss','zero_one_loss','mean_absolute_error',
                           'mean_squared_error','mean_squared_log_error','median_absolute_error',
                           'Best test error','Best test rmse','Best test mae','Best test log loss',
                           'Best test logloss','Best test merror','Best test mlogloss']

    # metrics that require encoded target variable
    need_encoded_y = ['recall_score','precision_score','f1_score']

    # 'N/A' marks a setting that must come from the constructor argument of the same name
    booster='N/A'
    best_params='N/A'
    objective='N/A'
    random_state='N/A'
    test_size='N/A'
    scorer='N/A'

    def __init__(self, booster='gbtree', objective='reg:logistic', 
                 random_state=69, test_size=0.2, best_params={}, scorer='Auto',
                
                 # values passed for these parameters will never be used unless: class.parameter is made None
                 scoring_functions=None, classification_objectives=None, regression_objectives=None,
                the_higher_the_better=None, the_lower_the_better=None, need_encoded_y=None):
        """Store the settings and, for the gbtree booster, build the ranking model.

        Parameters
        ----------
        booster : str
            'gbtree' or 'dart'.
        objective : str
            XGBoost objective; decides between XGBClassifier and XGBRegressor.
        random_state : int
            Seed forwarded to train_test_split.
        test_size : float
            Hold-out fraction forwarded to train_test_split.
        best_params : dict
            Extra XGBoost parameters. A private copy is taken, so the dict passed in
            (or the shared default) is never modified.
        scorer : str
            'Auto' for XGBoost's built-in metric, or a key of scoring_functions.
            Unknown values fall back to 'Auto'.
        """
        # snapshot of the call arguments; must be taken before any other local is created
        passed_arguments = dict(locals())

        # assigning parameters as instance variables
        varses = list(vars(Feature_Selector).keys())
        self.variables = varses[len(varses) - varses[::-1].index('__doc__') : varses.index('__init__')]
        for name in self.variables:
            class_value = getattr(Feature_Selector, name)
            if isinstance(class_value, str) and class_value == 'N/A':
                # the class variable is empty: use the value passed during instantiation
                setattr(self, name, passed_arguments[name])
            else:
                # the class variable holds a real value: it takes precedence
                setattr(self, name, class_value)

        # private copy: the overrides below must not leak into the shared default dict
        # or into the dictionary owned by the caller
        self.best_params = dict(self.best_params or {})

        # overriding colsample parameters
        self.best_params['colsample_bytree'] = 1
        self.best_params['colsample_bylevel'] = 1

        # determining the model build used for feature selection
        if self.booster == 'gbtree':
            if self.objective in Feature_Selector.classification_objectives:
                model_class = xgb.XGBClassifier
            elif self.objective in Feature_Selector.regression_objectives:
                model_class = xgb.XGBRegressor
            else:
                model_class = None
            if model_class is not None:
                self.best_params['objective'] = self.objective
                self.best_params.pop('booster', None)
                self.model = model_class(**self.best_params)
        elif self.booster == 'dart':
            self.best_params['booster'] = self.booster
            self.best_params['objective'] = self.objective

        # making sure there is a scorer
        if self.scorer not in Feature_Selector.the_higher_the_better\
                            and self.scorer not in Feature_Selector.the_lower_the_better:
            self.scorer = 'Auto'

        # results are filled in by fit(); always defined so they can be inspected beforehand
        self.performance_scores = None
        self.best_perf = None
        self.best_threshold = None

    @staticmethod
    def _split_target(X, y, target):
        """Return (X, y); when y is None the target column is taken out of X."""
        if y is None:
            if target is None:
                # no target named: the last column is the target
                y = pd.DataFrame(X.iloc[:,-1])
                X = pd.DataFrame(X.drop(X.columns[[-1,]], axis=1))
            else:
                y = pd.DataFrame(X[target])
                X = X.drop(target, axis =1)
        return X, y

    def _sklearn_score(self, y_test, y_pred):
        """Score predictions with the sklearn metric named by self.scorer."""
        scoring_function = Feature_Selector.scoring_functions[self.scorer]
        if y_pred.dtype != object:
            # numeric output is rounded on both sides before it is handed to the metric
            predictions = [round(value) for value in y_pred]
            y_test_used = [round(value) for value in y_test.iloc[:,0]]
        else:
            predictions = y_pred
            y_test_used = y_test.iloc[:,0]
        score = scoring_function(y_test_used, predictions)
        if self.scorer == 'accuracy_score':
            score = score*100.00
        return score

    def _scan_gbtree(self, X_train, X_test, y_train, y_test, use_builtin_metric):
        """Evaluate every importance value of the gbtree model as a selection threshold.

        Returns (thresholds, feature counts, scores, name of the built-in metric or None).
        """
        self.test_model = self.model
        self.test_model.fit(X_train, y_train.values.ravel())

        if self.objective in Feature_Selector.regression_objectives:
            estimator = xgb.XGBRegressor
        else:
            estimator = xgb.XGBClassifier

        thresholds, counts, scores = [], [], []
        scorer_used = None
        # Fit model using each importance as a threshold
        for thresh in sorted(self.test_model.feature_importances_):
            # select features using threshold
            selection = SelectFromModel(self.test_model, threshold=thresh, prefit=True)
            select_X_train = selection.transform(X_train)
            select_X_test = selection.transform(X_test)
            # train model
            selection_model = estimator(**self.best_params)
            selection_model.fit(select_X_train, y_train.values.ravel(),
                               eval_set=[(select_X_test,y_test.values.ravel())], verbose=False)

            if use_builtin_metric:
                # built-in evaluation metric automatically matched with objective; last round's value
                result = selection_model.evals_result()
                scorer_used = list(result['validation_0'].keys())[0]
                score = result['validation_0'][scorer_used][-1]
            else:
                score = self._sklearn_score(y_test, selection_model.predict(select_X_test))

            scores.append(score)
            thresholds.append(thresh)
            counts.append(select_X_train.shape[1])
        return thresholds, counts, scores, scorer_used

    def _scan_dart(self, X_train, X_test, y_train, y_test, use_builtin_metric):
        """Evaluate the top-1, top-2, ... top-k features ranked by the dart booster's f-score.

        Returns (thresholds, feature counts, scores, name of the built-in metric or None,
        ranked feature names).
        """
        dtrain = xgb.DMatrix(data=X_train, label=y_train)

        # train the dart booster model
        xg_reg = xgb.train(dtrain=dtrain, params=self.best_params, num_boost_round=10)
        importance = xg_reg.get_fscore()
        importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
        importance = pd.DataFrame(importance)

        thresholds, counts, scores = [], [], []
        scorer_used = None
        if importance.empty:
            return thresholds, counts, scores, scorer_used, []
        ranked_features = importance.iloc[:,0]

        # upper bound is inclusive so that the complete ranked set is evaluated as well
        for value in range(1, len(importance) + 1):
            columns = ranked_features[:value]
            dtrain_trim = xgb.DMatrix(data=X_train.loc[:,columns], label=y_train)
            dtest_trim = xgb.DMatrix(data=X_test.loc[:,columns], label=y_test)

            if use_builtin_metric:
                # built-in evaluation metric automatically matched with objective; last round's value
                result = {}
                selection_model = xgb.train(dtrain=dtrain_trim, params=self.best_params,
                                            evals=[(dtest_trim, 'eval')], evals_result=result,
                                           verbose_eval=False)
                scorer_used = list(result['eval'].keys())[0]
                score = result['eval'][scorer_used][-1]
            else:
                selection_model = xgb.train(dtrain=dtrain_trim, params=self.best_params)
                score = self._sklearn_score(y_test, selection_model.predict(dtest_trim))

            scores.append(score)
            counts.append(len(columns))
            thresholds.append(importance.iloc[value-1,1])
        return thresholds, counts, scores, scorer_used, list(ranked_features)

    def fit(self, X, y=None, target=None):
        """Run the selection tests and remember which features to keep.

        Parameters
        ----------
        X : pandas DataFrame
            Features; also contains the target column when y is None.
        y : pandas DataFrame, optional
            Target in its first column.
        target : str, optional
            Name of the target column inside X (used only when y is None).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the booster is unsupported, nothing could be evaluated, or the metric
            used has no known direction (higher/lower is better).
        """
        X, y = self._split_target(X, y, target)

        # Encode labels if string when needed. The encoding is done on a private copy so
        # the caller's y is never modified, even if fitting fails half-way.
        needs_encoded_y = ((self.booster == 'dart' or self.booster == 'gblinear')
                           and self.objective in Feature_Selector.classification_objectives)\
                          or self.scorer in self.need_encoded_y
        y_fit = y
        if needs_encoded_y and y.iloc[:,0].dtype == object:
            encoded = LabelEncoder().fit_transform(y.iloc[:,0].astype(str))
            y_fit = pd.DataFrame({y.columns[0]: encoded}, index=y.index)

        # stratifying only makes sense for class labels; a continuous target cannot be stratified
        is_classification = self.objective in Feature_Selector.classification_objectives
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_fit, test_size=self.test_size, random_state=self.random_state,
            stratify=y_fit if is_classification else None)

        # 'Auto' (and the 'Best test ...' name left behind by a previous fit) means:
        # use XGBoost's built-in metric instead of a sklearn scoring function
        use_builtin_metric = self.scorer not in Feature_Selector.scoring_functions

        ranked_features = None
        if self.booster == 'gbtree':
            Threshold, ns, Scores, scorer_used = self._scan_gbtree(
                X_train, X_test, y_train, y_test, use_builtin_metric)
        elif self.booster == 'dart':
            Threshold, ns, Scores, scorer_used, ranked_features = self._scan_dart(
                X_train, X_test, y_train, y_test, use_builtin_metric)
        else:
            raise ValueError("Feature_Selector supports only the 'gbtree' and 'dart' boosters, got %r"
                             % (self.booster,))

        if not Scores:
            raise ValueError("No candidate feature subset could be evaluated.")

        # building table of performance scores
        if use_builtin_metric:
            self.scorer = 'Best test ' + scorer_used
        self.performance_scores = pd.DataFrame()
        self.performance_scores['Threshold'] = Threshold
        self.performance_scores['n'] = ns
        self.performance_scores[self.scorer] = Scores

        # Best cut-off of top features: minimum number giving the max possible model performance
        metric_values = self.performance_scores[self.scorer]
        if self.scorer in Feature_Selector.the_higher_the_better:
            best_value = metric_values.max()
        elif self.scorer in Feature_Selector.the_lower_the_better:
            best_value = metric_values.min()
        else:
            raise ValueError("Cannot tell whether higher or lower is better for metric %r"
                             % (self.scorer,))
        self.best_perf = self.performance_scores[metric_values == best_value]
        self.best_threshold = self.best_perf[
            self.best_perf['n'] == self.best_perf['n'].min()].drop_duplicates()

        # store the best number of features
        self.best_n = self.best_threshold.n.iloc[0]

        # saving selected columns for use on .transform()
        if self.booster == 'gbtree':
            self.selected_columns = X.columns[np.argsort(self.test_model.feature_importances_)\
                                          [-(self.best_n):]]
        else:
            self.selected_columns = ranked_features[:self.best_n]

        return self

    def transform(self, X, y=None, target=None):
        """Return (selected features, untouched target), both as DataFrames.

        X, y and target are interpreted exactly as in fit(); fit() must have been called.
        """
        X, y = self._split_target(X, y, target)
        # Drop unwanted features
        X = X[self.selected_columns]

        return pd.DataFrame(X), pd.DataFrame(y)

    def fit_transform(self, X, y=None, target=None):
        """Call fit() and then transform() on the same data."""
        self.fit(X, y, target)
        return self.transform(X, y, target)


<span style="font-size:30px;font-weight:bold;color:#b22222"> Feature_Selector in action  </span>

In [40]:
# Rank the extracted features and keep the smallest subset with the best recall;
# fit() returns the selector itself, so it can be chained onto the constructor
feat_selector = Feature_Selector(
    booster='gbtree',
    objective='binary:logistic',
    scorer='recall_score',
).fit(X_extracted, y_extracted)

X_selected, y_selected = feat_selector.transform(X_extracted, y_extracted)

print(f'\033[1m{Fore.RED}Features Selected from X_train:{Style.RESET_ALL}\033[0m')
X_selected.head(10)

[1m[31mFeatures Selected from X_train:[0m[0m


Unnamed: 0,F12,F2,F13,F8,F1
0,0.141758,-0.547101,0.118107,0.709156,-0.118402
1,0.122173,-0.797599,-0.088937,-0.024216,-0.092613
2,0.044194,-0.298098,-0.092114,0.058963,-0.063014
3,0.085675,-0.722696,-0.045364,0.024748,-0.113884
4,0.073172,-0.435702,-0.139703,0.061249,-0.063234
5,0.206897,-0.866191,-0.125036,-0.020457,-0.107799
6,0.041691,-0.567499,-0.066664,0.076717,-0.117657
7,0.050382,-0.345541,-0.113288,0.080866,-0.071643
8,0.033116,-0.641795,-0.015484,-0.005351,-0.103015
9,-0.089322,-0.230015,-0.110771,0.060888,-0.068355


In [41]:
# Display the table built by fit(): one row per tested threshold with the feature count and score
print(f'\033[1m{Fore.RED}Performance Scores for different feature selections:{Style.RESET_ALL}\033[0m')
feat_selector.performance_scores

[1m[31mPerformance Scores for different feature selections:[0m[0m


Unnamed: 0,Threshold,n,recall_score
0,0.0,30,0.96
1,0.0,30,0.96
2,0.0,30,0.96
3,0.0,30,0.96
4,0.0,30,0.96
5,0.0,30,0.96
6,0.0,30,0.96
7,0.0,30,0.96
8,0.008547,22,0.96
9,0.014245,21,0.96


In [42]:
# Display the row(s) of the performance table that reached the best score
print(f'\033[1m{Fore.RED}Best number of top features:{Style.RESET_ALL}\033[0m')
feat_selector.best_perf

[1m[31mBest number of top features:[0m[0m


Unnamed: 0,Threshold,n,recall_score
25,0.076923,5,0.98


<a id="verdict"></a>
<h1><span style="font-size:48px;font-family:Times New Roman,Times,serif"><tt>VERDICT:</tt></span></h1>


<span style="font-size:30px;font-weight:bold;color:#3366cc">The final tests reached a </span><span style="font-size:30px;font-weight:bold;color:#b22222"> recall score of 98% </span><span style="font-size:30px;font-weight:bold;color:#3366cc"> even before Hyperparameter Tuning has been performed!!! </span>

<a id="test"></a>
<h1><span style="font-size:35px;font-family:Times New Roman,Times,serif"><tt>Applying preprocessing statistics learned from X_train to X_test:</tt></span></h1>

In [63]:
# Render a button that calls the notebook's code_toggle() JavaScript helper to show or hide the code cells
HTML('''<form action="javascript:code_toggle()"><input type="submit" value="Click here to view/hide the raw code."></form>''')

In [64]:
# Step 1 - imputation: reuse the fillers stored on the imputer fitted earlier (I_N)
X_test_processing = Imputation_Nation(X_fillers=I_N.X_fillers).fit_transform(X_test)

# Step 2 - scaling: only when a scaler was kept on FHS
if FHS.scaler is not None:
    X_test_processing = FHS.scaler.transform(X_test_processing)

# Step 3 - extraction: only when an extractor was kept on F_E; columns are named F0, F1, ...
if F_E.extractor is not None:
    X_test_processing = pd.DataFrame(F_E.extractor.transform(X_test_processing))
    X_test_processing.columns = [f'F{col}' for col in X_test_processing.columns]

# Step 4 - selection: keep exactly the columns chosen by the fitted selector
X_test_processing = pd.DataFrame(X_test_processing).loc[:, feat_selector.selected_columns]

print(f'\033[1m{Fore.RED}Preprocessed X_test ready to use for prediction:{Style.RESET_ALL}\033[0m')
X_test_processing.head(10)

[1m[31mPreprocessed X_test ready to use for prediction:[0m[0m


Unnamed: 0,F12,F2,F13,F8,F1
0,1.498119,-0.563105,-0.771356,0.407072,-0.155116
1,-0.562073,-0.549863,0.188044,-0.152795,-0.173757
2,-0.340447,-0.257558,0.095124,0.213554,-0.13676
3,0.23911,-0.264196,-0.030727,0.14699,-0.139136
4,-0.016318,-0.885908,0.016873,-0.164912,-0.130859
5,-0.087781,-0.901105,-0.661568,-0.200278,-0.208367
6,-0.52104,-0.733932,0.106796,0.116841,-0.193123
7,-0.606422,-0.519089,-0.831701,-0.466483,-0.090066
8,0.384194,-0.570086,-0.881533,0.310844,-0.147488
9,0.699253,-0.45253,-0.248756,0.314546,0.045898


## Image Sources:  
* [Imputation and Encoding Header](https://healthunlimitedbiz.files.wordpress.com/2015/07/puzzle-w-missing-pieces.jpg)  
* [Resampling Header](http://www.guoguiyan.com/laboratory-wallpapers/68519704.html)  
- [Feature Scaling Header](https://www.wzwlh.com/wealth-builder/)  
- [Feature Extraction Header](https://dop4.deviantart.com/art/Human-Transmutation-Circle-299877050)  
- [Feature Selection Header](https://www.bayarea.com/uncategorized/5869/)  
  