In [373]:
import configparser
import json

In [361]:
# NOTE(review): configparser parses INI-style files, yet the target here is a .yaml file.
# The lookup in the next cell returns the YAML body as one raw string, so nested keys
# are not reachable this way — confirm which config format is intended.
config = configparser.ConfigParser()
# read() returns the list of files it managed to parse; that list is this cell's output.
config.read("../config_example.yaml")

['../config_example.yaml']

In [364]:
# The value comes back as a single multi-line string holding the raw YAML text,
# not as a nested mapping of paths and names.
config['mysection']['dataset_creation']

"\napplication_records_fp: 'credit_card_data/application_record.csv'\ncredit_records_fp: 'credit_card_data/credit_record.csv'\n\noutput_path: 'processed_data'\ndataset_name: 'labelled_dataset'"

In [354]:
# Printing a ConfigParser only shows the default object repr, not the parsed sections.
print(config)

<configparser.ConfigParser object at 0x7fc7be45ed30>


In [375]:
# Load the JSON variant of the config. This rebinds `config`, which held a
# ConfigParser in the earlier cells, to the object parsed from the JSON file.
with open('../config_example.json', 'r') as config_file:
    config = json.load(config_file)

In [378]:
# With the JSON-loaded config, nested keys resolve down to the individual file path.
config['dataset_creation']['application_records_fp']

'credit_card_data/application_record.csv'

In [328]:
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler

In [23]:
import yaml

In [31]:
from datetime import datetime

# Timestamp taken when this cell runs.
# NOTE(review): `now` is not referenced by any other cell visible in this notebook.
now = datetime.now()

In [384]:
# Load the credit history table (columns ID, MONTHS_BALANCE, STATUS per the preview).
# NOTE(review): the path is hard-coded relative to the working directory, while the config
# files list 'credit_card_data/credit_record.csv' — confirm which location is intended.
credit_rec = pd.read_csv('credit_record.csv')

In [145]:
# Preview the first rows: the same ID appears on several rows, one per MONTHS_BALANCE value.
credit_rec.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [385]:
# Load the applicant attribute table.
# NOTE(review): the path is hard-coded relative to the working directory, while the config
# files list 'credit_card_data/application_record.csv' — confirm which location is intended.
app_rec = pd.read_csv('application_record.csv')

In [126]:
# Preview the first applicant rows and their column set.
app_rec.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [321]:
# FLAG_MOBIL has min == max == 1 and std == 0, i.e. it is constant across all rows;
# the feature pipeline drops it in its 'Redundant Columns' step.
app_rec.FLAG_MOBIL.describe()

count    438557.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: FLAG_MOBIL, dtype: float64

In [320]:
# Distinct values per column. ID has 438510 distinct values against 438557 rows
# (see app_rec.shape), so some IDs occur more than once.
app_rec.nunique()

ID                     438510
CODE_GENDER                 2
FLAG_OWN_CAR                2
FLAG_OWN_REALTY             2
CNT_CHILDREN               12
AMT_INCOME_TOTAL          866
NAME_INCOME_TYPE            5
NAME_EDUCATION_TYPE         5
NAME_FAMILY_STATUS          5
NAME_HOUSING_TYPE           6
DAYS_BIRTH              16379
DAYS_EMPLOYED            9406
FLAG_MOBIL                  1
FLAG_WORK_PHONE             2
FLAG_PHONE                  2
FLAG_EMAIL                  2
OCCUPATION_TYPE            18
CNT_FAM_MEMBERS            13
dtype: int64

In [11]:
# True marks every repeat of an ID after its first occurrence.
credit_rec.ID.duplicated()

0    False
1     True
2     True
3     True
4    False
Name: ID, dtype: bool

In [302]:
from sklearn.base import BaseEstimator, TransformerMixin

In [342]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import datetime
import os

def read_config(filepath):
    """Load the JSON document at ``filepath`` through ``pd.read_json``.

    The result is whatever pandas builds from the file (a DataFrame with the
    default reader settings), not a plain ``dict``.
    """
    parsed = pd.read_json(filepath)
    return parsed


def read_file(filepath):
    """Read a tabular file into a DataFrame, picking the reader from the extension.

    Parameters
    ----------
    filepath : str
        Path to the file. The text after the final ``.`` selects the reader
        (case-insensitively): ``csv`` -> ``pd.read_csv``,
        ``parquet`` -> ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The loaded table.

    Raises
    ------
    ValueError
        If the extension is neither ``csv`` nor ``parquet``.
    """
    extension = filepath.split('.')[-1].lower()

    if extension == 'csv':
        return pd.read_csv(filepath)

    if extension == 'parquet':
        return pd.read_parquet(filepath)

    # This branch used to print a message and fall through to `return data`
    # with `data` never assigned, which surfaced as an UnboundLocalError.
    raise ValueError(f'Unrecognised filetype: {filepath!r}')


class upload_dataset():
    """Persists a finished dataset to disk as a timestamped CSV file."""

    def upload(self, final_dataset, config):
        """Write ``final_dataset`` into the configured output directory.

        Parameters
        ----------
        final_dataset : pandas.DataFrame
            The frame to persist.
        config : mapping
            Must provide ``config['dataset_creation']['dataset_name']`` and
            ``config['dataset_creation']['output_path']``.

        Returns
        -------
        str
            Path of the file that was written.
        """
        # Imported locally under a private alias: other cells bind the bare name
        # `datetime` both to the class (`from datetime import datetime`) and to the
        # module (`import datetime`), so the global name cannot be relied upon.
        from datetime import datetime as _datetime

        settings = config['dataset_creation']
        output_dir = settings['output_path']

        name = settings['dataset_name']
        name += _datetime.now().strftime("%Y%m%d-%H%M%S")

        # Create the target directory on first use; an empty path means the
        # current working directory and needs no creation.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # NOTE(review): the file name carries no extension, so read_file() cannot
        # infer its type later — consider appending '.csv'.
        destination = os.path.join(output_dir, name)

        # The original called `pd.to_csv(os.join(...))`: neither attribute exists
        # and the dataset itself was never written.
        final_dataset.to_csv(destination)
        return destination


class NameDropper(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a fixed list of columns by name."""

    def __init__(self, cols):
        # Column labels removed by transform().
        self.cols = cols

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; returns the transformer itself."""
        return self

    def transform(self, data):
        """Return ``data`` without the columns named in ``self.cols``."""
        remaining = data.drop(self.cols, axis=1)
        return remaining


class DataLabeler(BaseEstimator, TransformerMixin):
    """Collapse the STATUS rows of each ID into a single binary ``label``.

    Every STATUS code is first mapped to a debt bucket through ``debt_def``;
    the rows per (ID, bucket) are counted and the counts are compared pairwise
    to decide the label.

    Parameters
    ----------
    debt_def : dict
        Maps raw STATUS codes to 'Good_Debt', 'Neutral_Debt' or 'Bad_Debt'.
    """

    # Buckets the labelling rules compare; each must exist as a count column.
    _BUCKETS = ('Good_Debt', 'Neutral_Debt', 'Bad_Debt')

    def __init__(self, debt_def = {'C': 'Good_Debt', 'X': 'Good_Debt', '0': 'Good_Debt',
                                   '1': 'Neutral_Debt', '2': 'Neutral_Debt',
                                   '3': 'Bad_Debt', '4': 'Bad_Debt', '5': 'Bad_Debt'}):
        # The default dict is shared between instances; this class only reads it.
        self.debt_def = debt_def

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; returns the transformer itself."""
        return self

    def transform(self, data):
        """Return one row per ID with the bucket counts and a ``label`` column.

        ``label`` is 1.0, 0.0 or NaN. The rules are applied in sequence and later
        ones overwrite earlier ones, so the effective outcome is:

        * 0 when Bad_Debt outnumbers Good_Debt or Neutral_Debt;
        * otherwise 1 when Good_Debt or Neutral_Debt outnumbers Bad_Debt;
        * NaN when all three counts are equal.

        NOTE(review): the ``Neutral > Good -> 0`` rule never decides the final
        label because a later rule always overwrites it. Confirm that is intended.
        """
        data = data.replace({'STATUS': self.debt_def})

        debt_counts = data.value_counts(subset=['ID', 'STATUS']).unstack(fill_value=0)

        # A bucket that never occurs in the data has no column after unstack();
        # add it as all-zero so the comparisons below (and any later step that
        # drops the bucket columns) do not fail with a KeyError.
        for bucket in self._BUCKETS:
            if bucket not in debt_counts.columns:
                debt_counts[bucket] = 0

        good = debt_counts['Good_Debt']
        neutral = debt_counts['Neutral_Debt']
        bad = debt_counts['Bad_Debt']

        # Same rules, same order as before; the order matters because of overwriting.
        rules = [
            (good > neutral, 1),
            (good > bad, 1),
            (neutral > good, 0),
            (neutral > bad, 1),
            (bad > good, 0),
            (bad > neutral, 0),
        ]

        # Create the column up front so it exists even when no rule matches.
        debt_counts['label'] = float('nan')
        for mask, value in rules:
            debt_counts.loc[mask, 'label'] = value

        return debt_counts.reset_index()


class OneHotEncode(BaseEstimator, TransformerMixin):
    """Pipeline step that swaps the listed columns for their dummy indicators."""

    def __init__(self, cols):
        # Columns handed to pd.get_dummies and then removed.
        self.cols = cols

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; returns the transformer itself."""
        return self

    def transform(self, data):
        """Append the dummy columns (aligned on the index) and drop the originals."""
        indicators = pd.get_dummies(data[self.cols])
        return data.join(indicators).drop(columns=self.cols)


class LabelJoiner(BaseEstimator, TransformerMixin):
    """Pipeline step that attaches a label table to the incoming frame by ID."""

    def __init__(self, labels):
        # Frame merged onto the data; it must carry an 'ID' column.
        self.labels = labels

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; returns the transformer itself."""
        return self

    def transform(self, data):
        """Left-merge ``self.labels`` on 'ID', keeping every row of ``data``."""
        merged = data.merge(right=self.labels, how='left', on='ID')
        return merged
    
class ColumnMapper(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces the values of one column through a mapping.

    Parameters
    ----------
    col : label
        Column whose values are replaced.
    mapper : dict
        Passed to ``Series.replace``; values missing from it are left unchanged.
    """

    def __init__(self, col, mapper):
        self.mapper = mapper
        self.col = col

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; returns the transformer itself."""
        return self

    def transform(self, data):
        """Return a copy of ``data`` with ``self.col`` mapped.

        Works on a copy: assigning into ``data`` directly would silently change
        the frame the caller passed in.
        """
        mapped = data.copy()
        mapped[self.col] = mapped[self.col].replace(self.mapper)
        return mapped
    
class MinMaxScaler(BaseEstimator, TransformerMixin):
    """Min-max scale one column using bounds taken from the training rows.

    NOTE(review): this class has the same name as
    ``sklearn.preprocessing.MinMaxScaler``, which another cell imports; once this
    definition runs it replaces that binding in the notebook namespace.

    Parameters
    ----------
    col : label
        Column to scale.
    """

    def __init__(self, col):
        self.col = col

    def fit(self, data, y=None):
        """Record the min and max of ``self.col`` over the training rows.

        Training rows are those with ``train == 1``. When ``data`` has no
        ``train`` column, all rows are used instead of failing.
        """
        if 'train' in data.columns:
            training = data[data['train'] == 1]
        else:
            training = data

        self.max_value = training[self.col].max()
        self.min_value = training[self.col].min()
        return self

    def transform(self, data):
        """Return a copy of ``data`` with ``self.col`` rescaled by the fitted bounds.

        Values outside the fitted range fall outside [0, 1].
        """
        value_range = self.max_value - self.min_value

        # A constant training column gives a zero range; dividing by it would
        # yield NaN/inf. Fall back to a unit range so such values map to 0.
        if value_range == 0:
            value_range = 1

        # Work on a copy so the caller's frame is not modified in place.
        scaled = data.copy()
        scaled[self.col] = (scaled[self.col] - self.min_value) / value_range
        return scaled
    
class DropDuplicates(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps the first row for each value of one column."""

    def __init__(self, col):
        # Column whose repeated values identify duplicate rows.
        self.col = col

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; returns the transformer itself."""
        return self

    def transform(self, data):
        """Return ``data`` without rows that repeat an earlier ``self.col`` value."""
        key_columns = [self.col]
        deduplicated = data.drop_duplicates(subset=key_columns, keep='first')
        return deduplicated
    
    
class TrainTestSplit(BaseEstimator, TransformerMixin):
    """Flag each row as training, held-out or unlabelled in a ``train`` column.

    ``train`` is 1 for training rows, 0 for held-out labelled rows and -1 for
    rows whose ``label`` is missing.

    Parameters
    ----------
    training_size : float, default 0.3
        Probability that a labelled row is flagged ``train == 1``.
    random_state : int or None, default None
        Seed for a dedicated ``numpy.random.RandomState``. With ``None`` the
        global ``numpy.random`` state is used, as before.
    """

    def __init__(self, training_size = 0.3, random_state = None):
        self.training_size = training_size
        self.random_state = random_state

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; returns the transformer itself."""
        return self

    def transform(self, data):
        """Return ``data`` sorted by ``label`` with the ``train`` flag added."""
        # Imported here because no visible cell imports numpy before this class
        # uses it.
        import numpy as np

        # Labelled rows first, NaN labels last: the padding below relies on it,
        # so the NaN position is spelled out instead of left to the default.
        data = data.sort_values('label', ascending=True, na_position='last')

        num_labelled = int(data['label'].notnull().sum())
        num_unlabelled = len(data) - num_labelled

        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # `p` lists the probabilities of drawing 0 and 1 respectively. The
        # previous order gave value 1 a probability of 1 - training_size, the
        # opposite of what the parameter name says.
        training = rng.choice(
            a=[0, 1],
            size=num_labelled,
            p=[1 - self.training_size, self.training_size],
        )

        # Unlabelled rows sit at the tail after the sort and are marked -1.
        training = np.pad(training, (0, num_unlabelled), constant_values=-1)

        data['train'] = training
        return data

In [256]:
# Number of rows that carry a label.
# NOTE(review): `labelled_set` is not defined in any cell visible in this notebook;
# this cell depends on leftover kernel state.
labelled_set.label.notnull().sum()

36457

In [257]:
# Share of labelled rows with label == 1 (mean() skips NaN); the value is about 0.995,
# so the classes are heavily imbalanced.
labelled_set.label.mean()

0.9951723948761555

In [270]:
# Sorting by label puts the 0.0 rows first and the rows with a missing label last.
labelled_set.sort_values('label', ascending=True)

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,...,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,label
43615,F,N,Y,2,135000.0,Civil marriage,With parents,-9862,-1868,1,...,0,0,0,0,0,0,0,1,0,0.0
434593,F,N,N,2,180000.0,Single / not married,House / apartment,-16611,-8369,1,...,0,0,0,1,0,0,0,0,1,0.0
434589,M,N,N,1,157500.0,Married,House / apartment,-16424,-1458,1,...,0,0,0,1,0,0,0,0,1,0.0
434588,M,N,N,1,157500.0,Married,House / apartment,-16424,-1458,1,...,0,0,0,1,0,0,0,0,1,0.0
45029,F,Y,N,0,56250.0,Married,House / apartment,-23773,365243,1,...,1,0,0,0,0,0,0,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,M,N,Y,0,135000.0,Separated,House / apartment,-22717,365243,1,...,1,0,0,0,0,0,0,0,1,
438553,F,N,N,0,103500.0,Single / not married,House / apartment,-15939,-3007,1,...,0,0,0,1,0,0,0,0,1,
438554,F,N,N,0,54000.0,Single / not married,With parents,-8169,-372,1,...,0,0,0,0,0,1,0,0,0,
438555,F,N,Y,0,72000.0,Married,House / apartment,-21673,365243,1,...,1,0,0,0,0,0,0,0,1,


In [162]:
from sklearn.pipeline import Pipeline

In [392]:
pipe_credit = Pipeline(steps=[
    # Aggregate the STATUS rows of each ID into bucket counts plus a label.
    ('Generate Labels', DataLabeler()),
    # Remove the intermediate bucket counts again.
    (
        'Drop Debt Columns',
        NameDropper(cols=['Bad_Debt', 'Good_Debt', 'Neutral_Debt']),
    ),
])

In [393]:
# Run the labelling pipeline on the credit history; the result has one row per ID.
labels = pipe_credit.fit_transform(credit_rec)

In [394]:
# Display the ID / label table produced above.
labels

STATUS,ID,label
0,5001711,1.0
1,5001712,1.0
2,5001713,1.0
3,5001714,1.0
4,5001715,1.0
...,...,...
45980,5150482,1.0
45981,5150483,1.0
45982,5150484,1.0
45983,5150485,1.0


In [381]:
pipe = Pipeline(steps=[
    # Attach the per-ID label, then flag rows through the `train` column.
    ('Join Labels', LabelJoiner(labels=labels)),
    ('Test Train Split', TrainTestSplit()),
    # Scaling follows the split because the scaler fits on rows with train == 1.
    ('Scale Income', MinMaxScaler('AMT_INCOME_TOTAL')),
    (
        'One Hot Encode',
        OneHotEncode(cols=[
            'NAME_INCOME_TYPE',
            'NAME_EDUCATION_TYPE',
            'NAME_FAMILY_STATUS',
            'NAME_HOUSING_TYPE',
        ]),
    ),
    # Binary text flags become 0/1.
    ('Gender Map', ColumnMapper(col='CODE_GENDER', mapper={'M':1, 'F':0})),
    ('Car Map', ColumnMapper(col='FLAG_OWN_CAR', mapper={'Y':1, 'N':0})),
    ('Realty Map', ColumnMapper(col='FLAG_OWN_REALTY', mapper={'Y':1, 'N':0})),
    ('Redundant Columns', NameDropper(cols=['FLAG_MOBIL'])),
    ('Drop Duplicates', DropDuplicates(col='ID')),
])

In [386]:
# Fit and apply the feature pipeline to the applicant table.
result = pipe.fit_transform(app_rec)

In [387]:
# Display the transformed frame: scaled income, 0/1 flags and one-hot columns.
result

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_WORK_PHONE,FLAG_PHONE,...,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents
43615,5125934,0,0,1,2,0.069767,-9862,-1868,1,0,...,0,0,0,0,0,0,0,0,0,1
434593,5054525,0,0,0,2,0.098837,-16611,-8369,0,0,...,0,0,1,0,0,1,0,0,0,0
434589,5054068,1,0,0,1,0.084302,-16424,-1458,0,0,...,1,0,0,0,0,1,0,0,0,0
434588,5054063,1,0,0,1,0.084302,-16424,-1458,0,0,...,1,0,0,0,0,1,0,0,0,0
45029,5135673,0,1,0,0,0.018895,-23773,365243,0,1,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,6840104,1,0,1,0,0.069767,-22717,365243,0,0,...,0,1,0,0,0,1,0,0,0,0
438553,6840222,0,0,0,0,0.049419,-15939,-3007,0,0,...,0,0,1,0,0,1,0,0,0,0
438554,6841878,0,0,0,0,0.017442,-8169,-372,1,0,...,0,0,1,0,0,0,0,0,0,1
438555,6842765,0,0,1,0,0.029070,-21673,365243,0,0,...,1,0,0,0,0,1,0,0,0,0


In [250]:
# (rows, columns) of the applicant table.
app_rec.shape

(438557, 18)

In [251]:
# (rows, columns) of the label table; it has far fewer rows than app_rec.
labels.shape

(45985, 2)