In [1]:
import pandas as pd
import scipy.stats
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
#Load data
df = pd.read_csv('D:/My stuff/School/Master/Master Thesis/Data/Kaggle/Give me some credit/cs-training.csv')

In [3]:
#Check shape
df.shape

(150000, 12)

# Data adjustments

In [4]:
#Drop the ID column
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
#Rename columns for clarity
df.rename(columns = {'SeriousDlqin2yrs' : 'Target', 'age' : 'Age'}, inplace = True)

In [6]:
#Drop observation with zero age
df = df.loc[df['Age'] != 0, :]
df.shape #Check shape after removal

(149999, 11)

# Drop drop variables with a low fill rate

In [7]:
#Get percentage of missing values
df.isna().sum() / df.shape[0] #Nothing higher than 20%

Target                                  0.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
Age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.198208
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.026160
dtype: float64

# Remove correlated

In [8]:
#Specifications
dep_var = 'Target'
thres = 0.75

In [9]:
#Calculate the correlations
indep_vars = list(set(df.columns.to_list()) - set([dep_var])) #Specify the list of independent variables
corr_mat = df.corr() #Calculate the correlation matrix
dep_cor = corr_mat[dep_var].copy() #Extract the correlations with the dependent variable
corr_mat.drop(dep_var, axis = 0) #Drop the row with correlations with the dependent variable
corr_mat.drop(dep_var, axis = 1) #Drop the column with the correlations with the dependent variable
corr_mat.values[np.tril_indices_from(corr_mat.values)] = np.nan #Leave only the upper triangle
corr_mat = corr_mat.unstack().dropna().reset_index() #Unstack to a table
corr_mat.columns = ['Var 1', 'Var 2', 'Corr'] #Rename columns for clarity
corr_mat = corr_mat.loc[np.argsort(-corr_mat['Corr'].abs(), ), :].reset_index(drop = True) #Sort in absolute values

In [10]:
#Drop correlated features
n_dropped = 0 #Initiate the number of dropped features
for i in corr_mat.index[corr_mat['Corr'].abs() >= thres]: #Loop through all correlations higher than threshold (in absolute value)
    if (corr_mat.loc[i, 'Var 1'] not in indep_vars) | (corr_mat.loc[i, 'Var 2'] not in indep_vars):
        continue #Skip the iteration if one of the variables has already been disregarded
    min_id = np.argmin(dep_cor[corr_mat.loc[i, ['Var 1', 'Var 2']].values].abs()) #Find the id of the variable with the smallest absolute correlation with the dependent variable
    var_to_drop = corr_mat.loc[i, f'Var {min_id + 1}'] #Get the name of the variable to be dropped
    indep_vars.remove(var_to_drop) #Drop the variable
    not_dropped_var = corr_mat.loc[i, f'Var {abs(min_id - 2)}'] #Get the name of the variable that was not dropped (for logging purposes)
    print(f'Variable {var_to_drop} dropped due to correlation of {corr_mat.loc[i, "Corr"]:.2%} with {not_dropped_var}')
    n_dropped += 1
print(f'Dropped {n_dropped} features due to high correlation')

Variable NumberOfTime60-89DaysPastDueNotWorse dropped due to correlation of 99.28% with NumberOfTimes90DaysLate
Variable NumberOfTimes90DaysLate dropped due to correlation of 98.36% with NumberOfTime30-59DaysPastDueNotWorse
Dropped 2 features due to high correlation


In [11]:
#Filter out dropped variables
df = df.loc[:, indep_vars + [dep_var]]

In [12]:
#Check VIF
{val:round(variance_inflation_factor(df.loc[:, indep_vars].dropna(), idx), 2) for idx, val in enumerate(indep_vars)}

{'RevolvingUtilizationOfUnsecuredLines': 1.0,
 'NumberOfDependents': 1.46,
 'NumberRealEstateLoansOrLines': 2.3,
 'MonthlyIncome': 1.24,
 'DebtRatio': 1.01,
 'Age': 3.91,
 'NumberOfOpenCreditLinesAndLoans': 4.71,
 'NumberOfTime30-59DaysPastDueNotWorse': 1.01}

# Remove NAs

In [13]:
#Drop NAs
df_no_nas = df.dropna()
print(f'{df_no_nas.shape[0]} out of {df.shape[0]} observations left after removing NAs ({df.shape[0] - df_no_nas.shape[0]} observations dropped)')

120268 out of 149999 observations left after removing NAs (29731 observations dropped)


# Remove categorical variables

In [14]:
#Remove categorical variables
df_no_cats = df_no_nas.loc[:, df_no_nas.nunique() > 2].copy()
df_no_cats['Target'] = df_no_nas['Target'] #Retain target

In [15]:
#Check shape
df_no_cats.shape

(120268, 9)

# Get info about final state of the data set

In [16]:
#Check shape
df_no_nas.shape

(120268, 9)

In [17]:
#Get average of the target variable
df_no_nas['Target'].mean()

0.06948648019423288

# Trim outliers

In [18]:
#Trim outliers
df_no_outs = df_no_nas.copy()
for i in indep_vars:
    mask = (df_no_outs[i] >= np.percentile(df_no_outs[i], 1)) & (df_no_outs[i] <= np.percentile(df_no_outs[i], 99))
    print(f'{i}: {sum(np.invert(mask))} observations dropped')
    df_no_outs = df_no_outs.loc[mask, :]
print(f'Total observations dropped: {df_no_nas.shape[0] - df_no_outs.shape[0]}')

RevolvingUtilizationOfUnsecuredLines: 1203 observations dropped
NumberOfDependents: 911 observations dropped
NumberRealEstateLoansOrLines: 663 observations dropped
MonthlyIncome: 1068 observations dropped
DebtRatio: 1164 observations dropped
Age: 1897 observations dropped
NumberOfOpenCreditLinesAndLoans: 1942 observations dropped
NumberOfTime30-59DaysPastDueNotWorse: 1016 observations dropped
Total observations dropped: 9864


# Export

In [19]:
#Export all versions of the data set
df.to_csv('D:/My stuff/School/Master/Master Thesis/Data/Final data/GiveMeSomeCredit_nas.csv', index = False)
df_no_nas.to_csv('D:/My stuff/School/Master/Master Thesis/Data/Final data/GiveMeSomeCredit_main.csv', index = False)
df_no_outs.to_csv('D:/My stuff/School/Master/Master Thesis/Data/Final data/GiveMeSomeCredit_no_outs.csv', index = False)