In [1]:
# Imports
from typing import List

import pandas as pd

This notebook will provide: 
 - data explonatory 
 - data insights
 - features selection 
 - transforming dataset into required format (for further modelS applies)

### TODO
- find empty columns <-- there is no missing values

## Assumed steps
#### Benchmark
    - dataset_v0
        # EVALUATE 
        - [catboost_v0] untuned classifier on dataset_v0 data set (~75% accuracy)
        - [catboost_v1] tuned classifier on dataset_v0 data set
        
####  Data set Preprocessing
    - dataset_v1
        - data cleaning
            - ignore the tuples 
            - missing values
                - fill with mean | meadian | custom
            - elinimate noise data 
                - binning method & regression & clustering 
        - data transformation
            - normalization
            - attribute selection
            - discretization
        - data reduction
        

### CAVEATS 
- Imbalanced target (700 vs 300)
- Odd mean loan of creditors age of > 65 


### Steps
1. Rename columns names for explicit visual understanding of features  

In [215]:
class Transformer:
    @staticmethod
    def rename_columns(dataset: pd.DataFrame):
        """Rename dataframe columns names.
        Notes:
            target column names based on provided data_description.txt
        Returns:
            pd.DataFrame - dataframe with renamed columns 
        """
    
        target_columns = {
            'X01': 'account_status',
            'X06': 'account_savings',
            'X02': 'credit_duration',
            'X03': 'credit_history',
            'X04': 'credit_purpose',
            'X05': 'credit_amount',
            'X07': 'employment_status',
            'X17': 'employment_description',
            'X08': 'income_installment_rate',
            'X09': 'gender_status',
            'X10': 'credit_guarantors',
            'X11': 'residence',
            'X12': 'owned_property',
            'X13': 'age',
            'X14': 'installment_plans',
            'X15': 'accomondation_type',
            'X16': 'credit_existing_number',
            'X18': 'liable_maintain',
            'X19': 'phone_number',
            'X20': 'foreign_worker',
            'Y': 'y'
        }
        return dataset.rename(columns=target_columns)    
    
    @staticmethod
    def categorical_columns(dataset: pd.DataFrame):
        """Within dataset find categorical columns
        
        Returns:
          List: list of names of categorical columns.
        """
        categorical_columns_indexes = []
        
        columns = dataset.columns
        num_columns = dataset._get_numeric_data().columns # get numerical columns
        
        categorical_columns = sorted(list(set(columns) - set(num_columns)))
        categorical_columns_indexes = [dataset.columns.get_loc(column) for column in categorical_columns]
        return categorical_columns, categorical_columns_indexes
          
    @staticmethod
    def train_test_split(dataset: pd.DataFrame, test_size: int, shuffle=False):
        from sklearn.model_selection import train_test_split
        print(f'Splitting dataset {dataset.shape} with test_size :{test_size}')
        X = dataset.drop(columns=['y'])
        y = dataset['y']
        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=shuffle, test_size=test_size)
        print(f'X_train: {X_train.shape}')
        print(f'y_train: {y_train.shape}')
        print(f'X_test: {X_test.shape}')
        print(f'y_test: {y_test.shape}')
        return X_train, X_test, y_train, y_test
    
    @staticmethod
    def custom_split(dataset: pd.DataFrame, test_percentage: int):
        size = dataset.shape[0]
        train_ratio = (size * test_percentage) // 100
        test_ratio = size - train_ratio

        X_train = pd.DataFrame(dataset[:train_ratio, :-1])
        X_test = pd.DataFrame(dataset[:test_ratio, :-1])

        y_train = dataset[:train_ratio, -1]
        y_test = dataset[:test_ratio, -1]
        return X_train, X_test, y_train, y_test
    
    @staticmethod
    def data_cleaning(dataframe: pd.DataFrame):
        """Applies follwoing transformations:   
            - data cleaning
                - ignore the tuples # when the dataset is large and multiple values are missing within a tuple.
                - missing values
                    - fill with mean | meadian | custom
                - encoding
        """
        categorical_columns = [column for column in dataset_v1.columns if dataset_v1[f'{column}'].dtype == 'object']
    
        for column_name in categorical_columns: 
            labels = dataframe[f'{column_name}'].astype('category').cat.categories.tolist()
            replace_map_comp = {f'{column_name}' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
            dataframe.replace(replace_map_comp, inplace=True)
        return dataframe
    
    @staticmethod
    def data_transformation(dataframe: pd.DataFrame, features=20):
        """Applies follwoing transformations:
            - data transformation
                - normalization # preprocessing.MinMaxScaler()
                - attribute selection # SelectKBest(chi2, k=features)
                - discretization # muted for now 
        """
        from sklearn import preprocessing
        from sklearn.feature_selection import chi2
        from sklearn.feature_selection import SelectKBest
        original_columns = ['account_status', 'credit_duration', 'credit_history', 'credit_purpose', 'credit_amount', 'account_savings',
                            'employment_status', 'income_installment_rate', 'gender_status','credit_guarantors', 'residence', 'owned_property', 'age',
                            'installment_plans', 'accomondation_type', 'credit_existing_number', 'employment_description',
                            'liable_maintain', 'phone_number', 'foreign_worker', 'y'
                           ]
        
        values = dataframe.values #returns a numpy array
        min_max_scaler = preprocessing.MinMaxScaler()
        x_scaled = min_max_scaler.fit_transform(values)
        buff_df = pd.DataFrame(x_scaled)
        
        buff_df.columns = original_columns # cast back original column namings 
        
        X = buff_df.drop(columns=['y'], axis=1)
        y = buff_df['y']
        
        dataframe = SelectKBest(chi2, k=features).fit_transform(X, y)
        return dataframe
    

In [216]:
class Models:
    @staticmethod
    def _catboost_classifier(X_train, X_test, y_train, y_test, cat_features, **kwargs):
        """CatBoost Classifier.
        
        Notes:
          - cat_features: CatBoost model requires list of indexes which denote categorical columns;
        """
        
        from catboost import CatBoostClassifier, Pool
        from sklearn.metrics import accuracy_score
        
        train_data = Pool(
            data=X_train.values,
            label=y_train,
            cat_features=cat_features
        )
        model = CatBoostClassifier(iterations=30)
        model.fit(train_data)
        predictions = model.predict(X_test.values)
        accuracy = accuracy_score(y_test, predictions)
        return accuracy
    
    @staticmethod
    def _xgboost_classifier(X_train, X_test, y_train, y_test, **kwargs):
        """Apply XGBoost Classifier.
        Notes:
          When using XGBoost we need to convert categorical variables into numeric.
        """
        from xgboost import XGBClassifier
        from sklearn.metrics import accuracy_score
        
        model = XGBClassifier()
        model.fit(X_train, y_train, verbose=True)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        return accuracy
        

## Entry Point

In [221]:
models = Models()
transformer = Transformer()


row_data = pd.read_csv('./dataset/project_data.csv', delimiter=';')
# print(row_data)


# # Benchmark
dataset_v0 = transformer.rename_columns(row_data)
categorical_columns, categorical_columns_indexes = transformer.categorical_columns(dataset_v0) 
X_train_v0, X_test_v0, y_train_v0, y_test_v0 = transformer.train_test_split(dataset_v0,
                                                                            test_size=.3,
                                                                            shuffle=True
                                                                           )

# - dataset_v0 && - [catboost_v0] untuned classifier on dataset_v0 data set (~75% accuracy)
dataset_v0_catboost_v0 = models._catboost_classifier(X_train_v0,
                                                     X_test_v0,
                                                     y_train_v0,
                                                     y_test_v0,
                                                     cat_features=categorical_columns_indexes
                                                    )

# - dataset_v0 && - [catboost_v1] tuned classifier on dataset_v0 data set (~__% accuracy)
# @TODO fulfill


# Beating Benchmark
dataset_v1 = transformer.rename_columns(row_data)
dataset_v1 = transformer.data_cleaning(dataset_v1)
dataset_v1 = transformer.data_transformation(dataset_v1, features=5) # return ndarray
X_train_v1, X_test_v1, y_train_v1, y_test_v1 = transformer.custom_split(dataset_v1,
                                                                        test_percentage=30
                                                                       )


# - dataset_v1 && - [catboost_v0] untuned classifier 
dataset_v1_catboost_v0 = models._catboost_classifier(X_train_v1,
                                                     X_test_v1,
                                                     y_train_v1,
                                                     y_test_v1,
                                                     cat_features=[]
                                                    )

# - dataset_v1 && - [xgboost_classifier_v0] untuned classifier
dataset_v1_xgboost_classifier_v0 = models._xgboost_classifier(X_train_v1,
                                                              X_test_v1,
                                                              y_train_v1,
                                                              y_test_v1
                                                             )

Splitting dataset (1000, 21) with test_size :0.3
X_train: (700, 20)
y_train: (700,)
X_test: (300, 20)
y_test: (300,)
Learning rate set to 0.22043
0:	learn: 0.6143646	total: 4.14ms	remaining: 120ms
1:	learn: 0.5655417	total: 7.62ms	remaining: 107ms
2:	learn: 0.5255207	total: 11.1ms	remaining: 100ms
3:	learn: 0.5003964	total: 14.4ms	remaining: 93.6ms
4:	learn: 0.4860743	total: 18.5ms	remaining: 92.7ms
5:	learn: 0.4671726	total: 21.8ms	remaining: 87.4ms
6:	learn: 0.4503195	total: 25.2ms	remaining: 82.7ms
7:	learn: 0.4395020	total: 28.4ms	remaining: 78ms
8:	learn: 0.4348984	total: 31.3ms	remaining: 73.1ms
9:	learn: 0.4243975	total: 34.2ms	remaining: 68.5ms
10:	learn: 0.4171205	total: 37.6ms	remaining: 64.9ms
11:	learn: 0.4071788	total: 40.5ms	remaining: 60.7ms
12:	learn: 0.4006725	total: 43.8ms	remaining: 57.3ms
13:	learn: 0.3961232	total: 46.8ms	remaining: 53.5ms
14:	learn: 0.3920035	total: 49.9ms	remaining: 49.9ms
15:	learn: 0.3870953	total: 53.1ms	remaining: 46.5ms
16:	learn: 0.3780961	

In [222]:
print(dataset_v0_catboost_v0, dataset_v1_catboost_v0, dataset_v1_xgboost_classifier_v0)

0.7533333333333333 0.9628571428571429 0.9671428571428572
