In [49]:
# Imports
from typing import List

import pandas as pd

This notebook provides:
 - exploratory data analysis
 - data insights
 - feature selection
 - transformation of the dataset into the format required by the models applied later

### TODO
- find empty columns 

### CAVEATS 
- Imbalanced target (700 vs 300)
- Odd mean loan amount for creditors older than 65


In [3]:
# Read the semicolon-separated project dataset; the bare name on the last
# line renders the frame as the cell output.
row_data = pd.read_csv('./dataset/project_data.csv', sep=';')
row_data

Unnamed: 0,X01,X02,X03,X04,X05,X06,X07,X08,X09,X10,...,X12,X13,X14,X15,X16,X17,X18,X19,X20,Y
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,A42,1736,A61,A74,3,A92,A101,...,A121,31,A143,A152,1,A172,1,A191,A201,1
996,A11,30,A32,A41,3857,A61,A73,4,A91,A101,...,A122,40,A143,A152,1,A174,1,A192,A201,1
997,A14,12,A32,A43,804,A61,A75,4,A93,A101,...,A123,38,A143,A152,1,A173,1,A191,A201,1
998,A11,45,A32,A43,1845,A61,A73,4,A93,A101,...,A124,23,A143,A153,1,A173,1,A192,A201,2


### Steps
1. Rename the columns so that each feature has an explicit, human-readable name

In [93]:

class Transformer:
    """Stateless helpers that rename, inspect and split the dataset."""

    @staticmethod
    def rename_columns(dataset: pd.DataFrame) -> pd.DataFrame:
        """Rename dataframe columns names.

        Notes:
            target column names based on provided data_description.txt
        Args:
            dataset: frame whose columns use the raw ``X01`` ... ``X20`` / ``Y`` codes.
        Returns:
            pd.DataFrame - dataframe with renamed columns; ``dataset`` itself
            is not modified.
        """
        target_columns = {
            'X01': 'account_status',
            'X02': 'credit_duration',
            'X03': 'credit_history',
            'X04': 'credit_purpose',
            'X05': 'credit_amount',
            'X06': 'account_savings',
            'X07': 'employment_status',
            'X08': 'income_installment_rate',
            'X09': 'gender_status',
            'X10': 'credit_guarantors',
            'X11': 'residence',
            'X12': 'owned_property',
            'X13': 'age',
            'X14': 'installment_plans',
            # NOTE(review): spelling kept as-is; changing it would change the
            # output schema that other cells may rely on.
            'X15': 'accomondation_type',
            'X16': 'credit_existing_number',
            'X17': 'employment_description',
            'X18': 'liable_maintain',
            'X19': 'phone_number',
            'X20': 'foreign_worker',
            'Y': 'y'
        }
        return dataset.rename(columns=target_columns)

    @staticmethod
    def categorical_columns(dataset: pd.DataFrame) -> tuple:
        """Within dataset find categorical columns

        A column is treated as categorical when its dtype is not numeric
        (``pd.api.types.is_numeric_dtype`` is False).

        Args:
            dataset: frame to inspect.
        Returns:
          tuple: ``(names, indexes)`` where ``names`` is the sorted list of
          categorical column names and ``indexes`` holds the position of each
          of those names in ``dataset.columns`` (same order as ``names``).
          The positions refer to ``dataset`` as passed in; they must be
          recomputed if columns are added or removed before them.
        """
        non_numeric = {
            column
            for column, dtype in dataset.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dtype)
        }
        categorical_columns = sorted(non_numeric)
        # Positions are looked up in the frame that was passed in, not in any
        # notebook-level variable.
        categorical_columns_indexes = [
            dataset.columns.get_loc(column) for column in categorical_columns
        ]
        return categorical_columns, categorical_columns_indexes

    @staticmethod
    def cast_categorical_column(column: pd.Series):
        """Placeholder: not implemented yet, currently does nothing and returns None."""
        pass

    @staticmethod
    def train_test_split(dataset: pd.DataFrame, test_size: float, shuffle=False, random_state=None):
        """Split ``dataset`` into train/test features and target.

        The target is the ``'y'`` column; every other column is a feature.
        Shapes of the input and of the four resulting parts are printed.

        Args:
            dataset: frame containing a ``'y'`` column.
            test_size: forwarded to scikit-learn's ``train_test_split``.
            shuffle: forwarded to scikit-learn's ``train_test_split``.
            random_state: forwarded to scikit-learn's ``train_test_split``;
                the default ``None`` leaves a shuffled split unseeded.
        Returns:
            tuple: ``(X_train, X_test, y_train, y_test)``.
        """
        # Local import under an alias so it is not confused with this method.
        from sklearn.model_selection import train_test_split as sk_train_test_split
        print(f'Splitting dataset {dataset.shape} with test_size :{test_size}')
        X = dataset.drop(columns=['y'])
        y = dataset['y']
        X_train, X_test, y_train, y_test = sk_train_test_split(
            X, y, shuffle=shuffle, test_size=test_size, random_state=random_state
        )
        print(f'X_train: {X_train.shape}')
        print(f'y_train: {y_train.shape}')
        print(f'X_test: {X_test.shape}')
        print(f'y_test: {y_test.shape}')
        return X_train, X_test, y_train, y_test

class Models:
    """Model runners used by the notebook."""

    @staticmethod
    def _catboost_classifier(X_train, X_test, y_train, y_test, indexes):
        """Fit a bare CatBoost classifier and score it on the test split.

        Notes:
          - indexes: passed to the training Pool as ``cat_features``; CatBoost
            requires the list of column indexes that denote categorical columns.
          - the model is built with ``iterations=30`` and otherwise default
            settings.

        Returns:
          accuracy of the predictions for ``X_test`` against ``y_test``.
        """
        from catboost import CatBoostClassifier, Pool
        from sklearn.metrics import accuracy_score

        # Features are handed over as plain arrays, so categorical columns are
        # identified by position only.
        training_pool = Pool(data=X_train.values, label=y_train, cat_features=indexes)
        classifier = CatBoostClassifier(iterations=30)
        classifier.fit(training_pool)
        return accuracy_score(y_test, classifier.predict(X_test.values))
        

## Entry Point

In [92]:
def benchmark(dataframe : pd.DataFrame):
    """Get a benchmark accuracy on the unpreprocessed dataset.

    Splits ``dataframe`` 80/20 with shuffling, fits the bare CatBoost
    classifier on the train part and prints and returns its accuracy on the
    test part.

    Relies on notebook-level names defined outside this function:
    ``transformer``, ``models`` and ``categorical_columns_indexes``. The
    indexes must describe the categorical feature positions of ``dataframe``.
    No seed is passed to the shuffled split here.

    Args:
        dataframe: dataset containing the features and the ``'y'`` target.
    Returns:
        the accuracy returned by the classifier helper.
    """
    # Split the frame that was passed in, not the notebook-global `df`.
    X_train, X_test, y_train, y_test = transformer.train_test_split(
        dataframe, test_size=.2, shuffle=True
    )
    accuracy = models._catboost_classifier(
        X_train,
        X_test,
        y_train,
        y_test,
        categorical_columns_indexes,
    )

    print(f'Benchmark accuracy: {accuracy}')
    return accuracy

    
# Helper objects used as notebook-level globals; benchmark() reads them by name.
transformer = Transformer()
models = Models()

# `row_data` is the raw frame loaded in the data-loading cell.
df = transformer.rename_columns(row_data)
# Names and positions of the non-numeric columns of the renamed frame.
categorical_columns, categorical_columns_indexes = transformer.categorical_columns(df) 

# Last expression of the cell, so the returned accuracy is shown as its output.
benchmark(dataframe=df)

Splitting dataset (1000, 21) with test_size :0.2
X_train: (800, 20)
y_train: (800,)
X_test: (200, 20)
y_test: (200,)
Learning rate set to 0.233364
0:	learn: 0.6094262	total: 3.97ms	remaining: 115ms
1:	learn: 0.5612022	total: 7.91ms	remaining: 111ms
2:	learn: 0.5232553	total: 11.5ms	remaining: 104ms
3:	learn: 0.5026108	total: 14.5ms	remaining: 94.2ms
4:	learn: 0.4875634	total: 17.9ms	remaining: 89.6ms
5:	learn: 0.4659295	total: 21.4ms	remaining: 85.6ms
6:	learn: 0.4533570	total: 25.1ms	remaining: 82.4ms
7:	learn: 0.4367730	total: 28.7ms	remaining: 78.8ms
8:	learn: 0.4178797	total: 32.7ms	remaining: 76.3ms
9:	learn: 0.4103726	total: 36.1ms	remaining: 72.1ms
10:	learn: 0.4052194	total: 39.3ms	remaining: 67.9ms
11:	learn: 0.3994428	total: 42.2ms	remaining: 63.3ms
12:	learn: 0.3922787	total: 45.9ms	remaining: 60ms
13:	learn: 0.3874081	total: 48.7ms	remaining: 55.7ms
14:	learn: 0.3710495	total: 51.4ms	remaining: 51.4ms
15:	learn: 0.3634918	total: 54.5ms	remaining: 47.7ms
16:	learn: 0.3551922

0.755