# Applying data minimization to loans data

In this tutorial we will show how to perform data minimization for ML models using the minimization module.

This will be demonstarted using the Loans dataset.

## Load and preprocess data

In [18]:
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

def modify_specific_features(data):
    data = data.rename(
        columns={"loan_amnt": "loan_amount", "funded_amnt": "funded_amount", "funded_amnt_inv": "investor_funds",
                 "int_rate": "interest_rate", "annual_inc": "annual_income"})

    date_format = '%b-%Y'
    dt_series = pd.to_datetime(data['issue_d'], format=date_format)
    data['year'] = dt_series.dt.year

    dt_series = pd.to_datetime(data['earliest_cr_line'], format=date_format)
    data['earliest_cr_year'] = dt_series.dt.year

    dt_series = pd.to_datetime(data['last_credit_pull_d'], format=date_format)
    data['last_credit_pull_year'] = dt_series.dt.year

    # TODO: Maybe year is not enough, we may want time since last payment? Or some other diff (time between last payment and X)
    dt_series = pd.to_datetime(data['last_pymnt_d'], format=date_format)
    data['last_pymnt_year'] = dt_series.dt.year

    data['interest_rate'] = data['interest_rate'].apply(lambda x: x if type(x) == float else float(x[:-1]))

    data['revol_util'] = data['revol_util'].apply(lambda x: x if type(x) == float else float(x[:-1]))

    data['term'] = data['term'].apply(lambda x: x if type(x) == int else int(x[:-7]))

    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x + 's' if x.endswith('year') else x)
    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x[:-6])
    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 0.5 if '<' in x else x)
    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 10 if '+' in x else x)
    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: int(x) if x.isnumeric() else 0)

    data['zip_code'] = data['zip_code'].apply(lambda x: x if type(x) == int else int(x[:-2]))

    west = ['CA', 'OR', 'UT', 'WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']
    south_west = ['AZ', 'TX', 'NM', 'OK']
    south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN']
    mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']
    north_east = ['CT', 'NY', 'PA', 'NJ', 'RI', 'MA', 'MD', 'VT', 'NH', 'ME']

    data['region'] = np.nan

    def finding_regions(state):
        if state in west:
            return 'West'
        elif state in south_west:
            return 'SouthWest'
        elif state in south_east:
            return 'SouthEast'
        elif state in mid_west:
            return 'MidWest'
        elif state in north_east:
            return 'NorthEast'

    data['region'] = data['addr_state'].apply(finding_regions)
    return data

def fill_missing(data):
    for col in ('dti_joint', 'annual_inc_joint', 'il_util', 'mths_since_rcnt_il', 'open_acc_6m', 'open_il_12m',
                'open_il_24m', 'inq_last_12m', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi',
                'total_cu_tl', 'loan_amount', 'funded_amount', 'investor_funds', 'term', 'interest_rate',
                'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq', 'total_bal_il',
                'tot_coll_amt', 'installment', 'emp_length', 'annual_income', 'zip_code', 'delinq_2yrs',
                'tot_cur_bal', 'total_rev_hi_lim', 'revol_util', 'collections_12_mths_ex_med', 'open_acc',
                'inq_last_6mths', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_rec_int',
                'verification_status_joint', 'acc_now_delinq', 'settlement_amount', 'settlement_percentage',
                'settlement_term', 'dti', 'total_rec_late_fee', 'policy_code', 'chargeoff_within_12_mths', 'total_rec_int',
                'last_credit_pull_year', 'last_pymnt_year', 'delinq_amnt', 'tax_liens', 'year', 'earliest_cr_year'):
        try:
            data[col] = data[col].fillna(0)
        except KeyError:
            print('missing column ' + col)

    for col in ('settlement_status', 'emp_title', 'region'):
        try:
            data[col] = data[col].fillna('NA')
        except KeyError:
            print('missing column ' + col)
            
    return data

def modify_label(data):
    bad_loan = ["Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off", "In Grace Period",
                "Late (16-30 days)", "Late (31-120 days)"]

    data['label'] = np.nan

    def loan_condition(status):
        if status in bad_loan:
            return 0
        else:
            return 1

    data['label'] = data['loan_status'].apply(loan_condition)
    return data

def remove_unwanted_features(data):
    features_to_remove = ['loan_status', 'id', 'member_id', 'url', 'next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint',
              'dti_joint', 'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',
              'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
              'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
              'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',
              'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',
              'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',
              'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
              'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
              'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tot_hi_cred_lim',
              'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'revol_bal_joint', 'sec_app_earliest_cr_line',
              'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il',
              'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med',
              'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status',
              'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date',
              'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest',
              'hardship_payoff_balance_amount', 'hardship_last_payment_amount', 'mths_since_recent_bc','desc', 'emp_title', 'title',
              'issue_d', 'earliest_cr_line', 'last_credit_pull_d',
              'debt_settlement_flag_date', 'settlement_date', 'last_pymnt_d', 'recoveries', 'collection_recovery_fee', 'total_rec_prncp',
              'last_pymnt_year', 'last_credit_pull_year', 'total_pymnt', 'total_pymnt_inv', 'debt_settlement_flag', 'settlement_status',
              'settlement_amount', 'settlement_percentage', 'settlement_term', 'addr_state']
    return data.drop(features_to_remove, axis=1)

def split_data(data, create_validation):
    # divide into 3 datasets for training, validation and test
    stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_set, test_set in stratified.split(data, data['label']):
        train = data.iloc[train_set]
        test = data.iloc[test_set] # 20% of data

    stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=42)
    for train_set, test_set in stratified.split(train, train['label']):
        model_train = data.iloc[train_set] # 40% of data
        generalizer_train = data.iloc[test_set] # 40% of data

    validation = None
    if create_validation:
        stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
        for train_set, test_set in stratified.split(generalizer_train, generalizer_train['label']):
            generalizer_train = data.iloc[train_set]  # 30% of data
            validation = data.iloc[test_set]  # 10% of data
    return train, test, generalizer_train, validation

In [19]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedShuffleSplit

input_file_path = "/Users/abigailt/Desktop/Projects/mlPrivacy Projects/Minimization-Patent-DT/income/data/loan.csv"
dataset = pd.read_csv(input_file_path, low_memory=False)
dataset = shuffle(dataset, random_state=14)

dataset = modify_specific_features(dataset)
dataset = fill_missing(dataset)
dataset = modify_label(dataset)
dataset = remove_unwanted_features(dataset)
        
train, test, generalizer_train, _ = split_data(dataset, False)

x_train = train.drop('label', axis=1)
y_train = train['label']
x_test = test.drop('label', axis=1)
y_test = test['label']

x_train

Unnamed: 0,loan_amount,funded_amount,investor_funds,term,interest_rate,installment,grade,sub_grade,emp_length,home_ownership,...,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,tax_liens,hardship_flag,disbursement_method,year,earliest_cr_year,region
371269,28000,28000,28000.0,60,16.55,689.12,D,D2,0,OWN,...,Individual,0,0,0,0,N,Cash,2015,1997,West
413904,15000,15000,15000.0,36,6.03,456.54,A,A1,1,MORTGAGE,...,Individual,0,0,0,0,N,Cash,2015,2003,NorthEast
390183,8000,8000,8000.0,36,17.86,288.66,D,D5,0,RENT,...,Individual,0,0,0,0,N,Cash,2015,2000,West
171520,4750,4750,4750.0,36,16.99,169.33,D,D3,2,OWN,...,Individual,0,0,0,0,N,Cash,2015,1992,SouthEast
79658,3950,3950,3950.0,36,10.99,129.30,B,B4,4,RENT,...,Individual,0,0,0,0,N,Cash,2015,2010,MidWest
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397229,26500,26500,26500.0,60,8.67,545.87,B,B1,10,MORTGAGE,...,Individual,0,0,0,0,N,Cash,2015,1994,SouthEast
110958,18900,18900,18900.0,36,5.32,569.17,A,A1,0,RENT,...,Individual,0,0,0,0,N,Cash,2015,1999,SouthEast
91253,18000,18000,18000.0,60,11.53,396.14,B,B5,0,RENT,...,Individual,0,0,0,0,N,Cash,2015,1987,NorthEast
61757,14450,14450,14375.0,60,23.99,415.62,F,F3,1,MORTGAGE,...,Individual,0,0,0,0,N,Cash,2015,2006,MidWest


## Train decision tree model

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

features = x_train.columns
categorical_features = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose',
                        'initial_list_status', 'application_type', 'hardship_flag', 'disbursement_method', 'region']
# QI parameter determines which features will be minimized.
QI = ["annual_income", "zip_code", "dti", "last_pymnt_amnt", "total_rec_int"]

numeric_features = [f for f in features if f not in categorical_features]
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
encoded_train = preprocessor.fit_transform(x_train)
model = DecisionTreeClassifier()
model.fit(encoded_train, y_train)

encoded_test = preprocessor.transform(x_test)
print('Base model accuracy: ', model.score(encoded_test, y_test))

Base model accuracy:  0.9442762322041345


## Run minimization
We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data).

In [25]:
from apt.minimization import GeneralizeToRepresentative
from sklearn.model_selection import train_test_split

# default target_accuracy is 0.998
minimizer = GeneralizeToRepresentative(model, 
                                       categorical_features=categorical_features, 
                                       features_to_minimize=QI,
                                       encoder=preprocessor)

# Fitting the minimizar can be done either on training or test data. Doing it with test data is better as the
# resulting accuracy on test data will be closer to the desired target accuracy (when working with training
# data it could result in a larger gap)
# Don't forget to leave a hold-out set for final validation!
generalizer_train_small = generalizer_train[:2000]
x_test_small = x_test[:2000]
y_test_small = y_test[:2000]
X_generalizer_train = generalizer_train_small.drop('label', axis=1)
features_names = features.tolist()

encoded_generalizer_train = preprocessor.transform(X_generalizer_train)
x_train_predictions = model.predict(encoded_generalizer_train)
minimizer.fit(X_generalizer_train, x_train_predictions, features_names=features_names)
transformed = minimizer.transform(x_test_small, features_names=features_names)

encoded_transformed = preprocessor.transform(transformed)
print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test_small))

Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250
Improving accuracy
feature to remove: zip_code
Removed feature: zip_code, new relative accuracy: 0.861250
feature to remove: total_rec_int
Removed feature: total_rec_int, new relative accuracy: 0.912500
feature to remove: last_pymnt_amnt
Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500
feature to remove: dti
Removed feature: dti, new relative accuracy: 0.995000
feature to remove: annual_income
Removed feature: annual_income, new relative accuracy: 1.000000
Accuracy on minimized data:  0.9425


#### Let's see what features were generalized

In [26]:
generalizations = minimizer.generalizations
print(generalizations)

{'ranges': {}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'dti', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'annual_income', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}


We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).

Let's change to a slightly lower target accuracy.

In [28]:
# We allow a 2% deviation in accuracy from the original model accuracy
minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.98, 
                                        categorical_features=categorical_features, 
                                        features_to_minimize=QI,
                                        encoder=preprocessor)

minimizer2.fit(X_generalizer_train, x_train_predictions, features_names=features_names)
transformed2 = minimizer2.transform(x_test_small, features_names=features_names)

encoded_transformed2 = preprocessor.transform(transformed2)
print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test_small))
generalizations2 = minimizer2.generalizations
print(generalizations2)

Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250
Improving accuracy
feature to remove: zip_code
Removed feature: zip_code, new relative accuracy: 0.861250
feature to remove: total_rec_int
Removed feature: total_rec_int, new relative accuracy: 0.912500
feature to remove: last_pymnt_amnt
Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500
Accuracy on minimized data:  0.933
{'ranges': {'annual_income': [10500.0, 18000.0, 28000.0, 35250.0, 36500.0, 37500.0, 40500.0, 43750.0, 48600.0, 49650.0, 50800.0, 51000.0, 53000.0, 54500.0, 55280.0, 56500.0, 56712.5, 61000.0, 66994.5, 69494.5, 70500.0, 75000.0, 82500.0, 84500.0, 90000.0, 91000.0, 95000.0, 127500.0, 135000.0, 141500.0, 179500.0, 297679.5], 'dti': [4.054999828338623, 8.869999885559082, 12.130000114440918, 14.735000133514404, 15.625, 15.84000015258789, 15.984999656677246, 17.34500026702881, 17.664999961853027, 18.95499

Two features are generalized: annual income and debt to income ratio