In [1]:
import pandas as pd
import scipy.stats
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
#Load data
df = pd.read_csv('D:/My stuff/School/Master/Master Thesis/Data/UCI Machine Learning Repository/South German Credit Data Set/SouthGermanCredit.csv')

In [3]:
#Check shape
df.shape

(1000, 21)

# Data adjustments

In [4]:
#Rename target
df.rename(columns = {'credit_risk' : 'Target'}, inplace = True)

# Drop variables with a low fill rate

In [5]:
#Drop variables with too many missing values
miss_perc = df.isna().sum() / df.shape[0] > 0.2
print(f'Dropped {sum(miss_perc)} columns with percentage of missings higher than 0.2')
df = df.loc[:, np.invert(miss_perc)]

Dropped 0 columns with percentage of missings higher than 0.2


# Retain categorical features with at most two unique values

In [6]:
#Get number of unique values
n_unique = df.nunique() #Get number of unique values (missings disregarded)
cat_to_drop = (n_unique != 2) & (df.dtypes == 'object').values
print(f'Dropping {sum(cat_to_drop)} categorical features with more or less than 2 unique values')
df = df.loc[:, np.invert(cat_to_drop)] #Drop the features

Dropping 14 categorical features with more or less than 2 unique values


# Create dummy variables

In [7]:
#Create dummy variables
for i in df.columns[df.dtypes == 'object']: #Loop through categorical columns
    df[i] = df[i].map({j:k for j, k in zip(df[i].unique(), [0, 1])}, na_action = 'ignore')

# Drop correlated

In [8]:
#Specifications
dep_var = 'Target'
thres = 0.75 #Arbitrary for now

In [9]:
#Calculate the correlations
indep_vars = list(set(df.columns.to_list()) - set([dep_var])) #Specify the list of independent variables
corr_mat = df.corr() #Calculate the correlation matrix
dep_cor = corr_mat[dep_var].copy() #Extract the correlations with the dependent variable
corr_mat.drop(dep_var, axis = 0) #Drop the row with correlations with the dependent variable
corr_mat.drop(dep_var, axis = 1) #Drop the column with the correlations with the dependent variable
corr_mat.values[np.tril_indices_from(corr_mat.values)] = np.nan #Leave only the upper triangle
corr_mat = corr_mat.unstack().dropna().reset_index() #Unstack to a table
corr_mat.columns = ['Var 1', 'Var 2', 'Corr'] #Rename columns for clarity
corr_mat = corr_mat.loc[np.argsort(-corr_mat['Corr'].abs(), ), :].reset_index(drop = True) #Sort in absolute values

In [10]:
#Drop correlated features
n_dropped = 0 #Initiate the number of dropped features
for i in corr_mat.index[corr_mat['Corr'].abs() >= thres]: #Loop through all correlations higher than threshold (in absolute value)
    var1, var2 = corr_mat.loc[i, 'Var 1'], corr_mat.loc[i, 'Var 2'] #Store variable names
    if (var1 not in indep_vars) | (var2 not in indep_vars):
        continue #Skip the iteration if one of the variables has already been disregarded
    var_types = df.dtypes[[var1, var2]].to_list() #Store variable types
    if var_types.count('object') == 1: #If only one of the variables is categorical, retain the numerical one
        var_to_drop = [var1, var2][var_types.index('object')] #Store the categorical variable to drop
        not_dropped_var = list(set([var1, var2]) - set(var_to_drop))[0] #Store the numerical variable to retain
    else:
        min_id = np.argmin(dep_cor[corr_mat.loc[i, ['Var 1', 'Var 2']].values].abs()) #Find the id of the variable with the smallest absolute correlation with the dependent variable
        var_to_drop = corr_mat.loc[i, f'Var {min_id + 1}'] #Get the name of the variable to be dropped
        not_dropped_var = corr_mat.loc[i, f'Var {abs(min_id - 2)}'] #Get the name of the variable that was not dropped (for logging purposes)
    indep_vars.remove(var_to_drop) #Drop the variable
    print(f'Variable {var_to_drop} dropped due to correlation of {corr_mat.loc[i, "Corr"]:.2%} with {not_dropped_var}')
    n_dropped += 1
print(f'Dropped {n_dropped} features due to high correlation')

Dropped 0 features due to high correlation


In [11]:
#Filter out dropped variables
df = df.loc[:, indep_vars + [dep_var]]

In [12]:
#Calculate VIF
vif_dict = {val:round(variance_inflation_factor(df.loc[:, indep_vars].dropna(), idx), 2) for idx, val in enumerate(indep_vars)}
vif_dict

{'people_liable': 1.21,
 'telephone': 1.86,
 'duration': 5.61,
 'foreign_worker': 1.06,
 'age': 3.68,
 'amount': 4.08}

# Remove NAs

In [13]:
#Drop NAs
df_no_nas = df.dropna()
print(f'{df_no_nas.shape[0]} out of {df.shape[0]} observations left after removing NAs ({df.shape[0] - df_no_nas.shape[0]} observations dropped)')

1000 out of 1000 observations left after removing NAs (0 observations dropped)


# Remove categorical variables

In [14]:
#Remove categorical variables
df_no_cats = df_no_nas.loc[:, df_no_nas.nunique() > 2].copy()
df_no_cats['Target'] = df_no_nas['Target'] #Retain target

In [15]:
#Check shape
df_no_cats.shape

(1000, 4)

# Get info about final state of the data set

In [16]:
#Check shape
df_no_nas.shape

(1000, 7)

In [17]:
#Get average of the target variable
df_no_nas['Target'].mean()

0.3

In [18]:
#Check number of categorical variables
cat_vars_final_no = sum(df_no_nas.nunique() <= 2) - 1
cat_vars_final_no

3

In [19]:
#Check number of numerical variables
df_no_nas.shape[1] - cat_vars_final_no - 1

3

# Trim outliers

In [20]:
#Trim outliers
df_no_outs = df_no_nas.copy()
for i in indep_vars:
    mask = (df_no_outs[i] >= np.percentile(df_no_outs[i], 1)) & (df_no_outs[i] <= np.percentile(df_no_outs[i], 99))
    print(f'{i}: {sum(np.invert(mask))} observations dropped')
    df_no_outs = df_no_outs.loc[mask, :]
print(f'Total observations dropped: {df_no_nas.shape[0] - df_no_outs.shape[0]}')

people_liable: 0 observations dropped
telephone: 0 observations dropped
duration: 8 observations dropped
foreign_worker: 0 observations dropped
age: 11 observations dropped
amount: 20 observations dropped
Total observations dropped: 39


# Export

In [21]:
#Export both versions of the data set
df_no_nas.to_csv('D:/My stuff/School/Master/Master Thesis/Data/Final data/SouthGermanCredit_main.csv', index = False)
df_no_cats.to_csv('D:/My stuff/School/Master/Master Thesis/Data/Final data/SouthGermanCredit_no_cats.csv', index = False)
df_no_outs.to_csv('D:/My stuff/School/Master/Master Thesis/Data/Final data/SouthGermanCredit_no_outs.csv', index = False)