In [1]:
import pandas as pd
import scipy.stats
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
#Load data
df = pd.read_excel('D:/My stuff/School/Master/Master Thesis/Data/UCI Machine Learning Repository/Default of credit card clients Taiwan/default of credit card clients.xls', skiprows = 1)

In [3]:
#Check shape
df.shape

(30000, 25)

# Data adjustments

In [4]:
#Drop the ID column
df.drop('ID', axis = 1, inplace = True)

In [5]:
#Rename target
df.rename(columns = {'default payment next month' : 'Target'}, inplace = True)

# Drop variables with a low fill rate

In [6]:
#Drop variables with too many missing values
miss_perc = df.isna().sum() / df.shape[0] > 0.2
print(f'Dropped {sum(miss_perc)} columns with percentage of missings higher than 0.2')
df = df.loc[:, np.invert(miss_perc)]

Dropped 0 columns with percentage of missings higher than 0.2


# Retain categorical features with at most two unique values

In [7]:
#Get number of unique values
n_unique = df.nunique() #Get number of unique values (missings disregarded)
cat_to_drop = (n_unique != 2) & (df.dtypes == 'object').values
print(f'Dropping {sum(cat_to_drop)} categorical features with more or less than 2 unique values')
df = df.loc[:, np.invert(cat_to_drop)] #Drop the features

Dropping 0 categorical features with more or less than 2 unique values


In [8]:
#Drop education column (multiple values but integer encoded)
df.drop('EDUCATION', axis = 1, inplace = True)

In [9]:
#Drop marriage column (multiple values but integer encoded)
df.drop('MARRIAGE', axis = 1, inplace = True)

# Create dummy variables

In [10]:
#Create dummy variables
df['SEX'] = df['SEX'].map({2 : 1, 1 : 0}, na_action = 'ignore')
df.rename(columns = {'SEX' : 'FEMALE'}, inplace = True)

# Drop correlated

In [11]:
#Specifications
dep_var = 'Target'
thres = 0.75 #Arbitrary for now

In [12]:
#Calculate the correlations
indep_vars = list(set(df.columns.to_list()) - set([dep_var])) #Specify the list of independent variables
corr_mat = df.corr() #Calculate the correlation matrix
dep_cor = corr_mat[dep_var].copy() #Extract the correlations with the dependent variable
corr_mat.drop(dep_var, axis = 0) #Drop the row with correlations with the dependent variable
corr_mat.drop(dep_var, axis = 1) #Drop the column with the correlations with the dependent variable
corr_mat.values[np.tril_indices_from(corr_mat.values)] = np.nan #Leave only the upper triangle
corr_mat = corr_mat.unstack().dropna().reset_index() #Unstack to a table
corr_mat.columns = ['Var 1', 'Var 2', 'Corr'] #Rename columns for clarity
corr_mat = corr_mat.loc[np.argsort(-corr_mat['Corr'].abs(), ), :].reset_index(drop = True) #Sort in absolute values

In [13]:
#Drop correlated features
n_dropped = 0 #Initiate the number of dropped features
for i in corr_mat.index[corr_mat['Corr'].abs() >= thres]: #Loop through all correlations higher than threshold (in absolute value)
    var1, var2 = corr_mat.loc[i, 'Var 1'], corr_mat.loc[i, 'Var 2'] #Store variable names
    if (var1 not in indep_vars) | (var2 not in indep_vars):
        continue #Skip the iteration if one of the variables has already been disregarded
    var_types = df.dtypes[[var1, var2]].to_list() #Store variable types
    if var_types.count('object') == 1: #If only one of the variables is categorical, retain the numerical one
        var_to_drop = [var1, var2][var_types.index('object')] #Store the categorical variable to drop
        not_dropped_var = list(set([var1, var2]) - set(var_to_drop))[0] #Store the numerical variable to retain
    else:
        min_id = np.argmin(dep_cor[corr_mat.loc[i, ['Var 1', 'Var 2']].values].abs()) #Find the id of the variable with the smallest absolute correlation with the dependent variable
        var_to_drop = corr_mat.loc[i, f'Var {min_id + 1}'] #Get the name of the variable to be dropped
        not_dropped_var = corr_mat.loc[i, f'Var {abs(min_id - 2)}'] #Get the name of the variable that was not dropped (for logging purposes)
    indep_vars.remove(var_to_drop) #Drop the variable
    print(f'Variable {var_to_drop} dropped due to correlation of {corr_mat.loc[i, "Corr"]:.2%} with {not_dropped_var}')
    n_dropped += 1
print(f'Dropped {n_dropped} features due to high correlation')

Variable BILL_AMT2 dropped due to correlation of 95.15% with BILL_AMT1
Variable BILL_AMT6 dropped due to correlation of 94.62% with BILL_AMT5
Variable BILL_AMT5 dropped due to correlation of 94.01% with BILL_AMT4
Variable BILL_AMT4 dropped due to correlation of 92.40% with BILL_AMT3
Variable BILL_AMT3 dropped due to correlation of 89.23% with BILL_AMT1
Variable PAY_5 dropped due to correlation of 81.98% with PAY_4
Variable PAY_4 dropped due to correlation of 77.74% with PAY_3
Variable PAY_3 dropped due to correlation of 76.66% with PAY_2
Dropped 8 features due to high correlation


In [14]:
#Filter out dropped variables
df = df.loc[:, indep_vars + [dep_var]]

In [15]:
#Calculate VIF
vif_dict = {val:round(variance_inflation_factor(df.loc[:, indep_vars].dropna(), idx), 2) for idx, val in enumerate(indep_vars)}
vif_dict

{'BILL_AMT1': 1.96,
 'PAY_AMT4': 1.23,
 'LIMIT_BAL': 3.78,
 'PAY_AMT1': 1.32,
 'PAY_6': 1.67,
 'PAY_AMT6': 1.21,
 'PAY_2': 2.29,
 'FEMALE': 2.22,
 'PAY_0': 1.89,
 'PAY_AMT5': 1.22,
 'PAY_AMT2': 1.24,
 'PAY_AMT3': 1.27,
 'AGE': 3.77}

In [16]:
#Check shape
df.shape

(30000, 14)

# Remove NAs

In [17]:
#Drop NAs
df_no_nas = df.dropna()
print(f'{df_no_nas.shape[0]} out of {df.shape[0]} observations left after removing NAs ({df.shape[0] - df_no_nas.shape[0]} observations dropped)')

30000 out of 30000 observations left after removing NAs (0 observations dropped)


# Remove categorical variables

In [18]:
#Remove categorical variables
df_no_cats = df_no_nas.loc[:, df_no_nas.nunique() > 2].copy()
df_no_cats['Target'] = df_no_nas['Target'] #Retain target

In [19]:
#Check shape
df_no_cats.shape

(30000, 13)

# Get info about final state of the data set

In [20]:
#Check shape
df_no_nas.shape

(30000, 14)

In [21]:
#Get average of the target variable
df_no_nas['Target'].mean()

0.2212

In [22]:
#Check number of categorical variables
cat_vars_final_no = sum(df_no_nas.nunique() <= 2) - 1
cat_vars_final_no

1

In [23]:
#Check number of numerical variables
df_no_nas.shape[1] - cat_vars_final_no - 1

12

# Trim outliers

In [24]:
#Trim outliers
df_no_outs = df_no_nas.copy()
for i in indep_vars:
    mask = (df_no_outs[i] >= np.percentile(df_no_outs[i], 1)) & (df_no_outs[i] <= np.percentile(df_no_outs[i], 99))
    print(f'{i}: {sum(np.invert(mask))} observations dropped')
    df_no_outs = df_no_outs.loc[mask, :]
print(f'Total observations dropped: {df_no_nas.shape[0] - df_no_outs.shape[0]}')

BILL_AMT1: 599 observations dropped
PAY_AMT4: 294 observations dropped
LIMIT_BAL: 148 observations dropped
PAY_AMT1: 290 observations dropped
PAY_6: 128 observations dropped
PAY_AMT6: 286 observations dropped
PAY_2: 144 observations dropped
FEMALE: 0 observations dropped
PAY_0: 53 observations dropped
PAY_AMT5: 281 observations dropped
PAY_AMT2: 278 observations dropped
PAY_AMT3: 275 observations dropped
AGE: 307 observations dropped
Total observations dropped: 3083


# Export

In [25]:
#Export all versions of the data set
df_no_nas.to_csv('D:/My stuff/School/Master/Master Thesis/Data/Final data/CreditCardTaiwan_main.csv', index = False)
df_no_cats.to_csv('D:/My stuff/School/Master/Master Thesis/Data/Final data/CreditCardTaiwan_no_cats.csv', index = False)
df_no_outs.to_csv('D:/My stuff/School/Master/Master Thesis/Data/Final data/CreditCardTaiwan_no_outs.csv', index = False)