In [1]:
import math
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, kruskal, pointbiserialr, chi2_contingency, pearsonr, f_oneway, f
from statsmodels.formula.api import ols

  import pandas.util.testing as tm


# Preprocessing

1. Load data

In [2]:
# Read the raw data file: it has no header row and its fields are separated by single spaces.
df_raw = pd.read_csv('german.data', header=None, delimiter=' ')

Specify type of variables (no ordinal treatment because ordering is not clear)

In [3]:
# based on description at https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
# Boolean mask over the 21 raw columns: True at the positions flagged as numeric.
numeric = [position in (1, 4, 7, 10, 12, 15, 17) for position in range(21)]
# Every column that is not numeric is treated as nominal (element-wise negation of the mask).
nominal = np.logical_not(np.array(numeric))

2. Set column names

In [4]:
# Column names, in the same order as the columns of the raw file.
cols = [
    'Status_of_existing_checking_account',
    'Duration_in_month',
    'Credit_history',
    'Purpose',
    'Credit_amount',
    'Savings_accountbonds',
    'Present_employment_since',
    'Installment_rate_in_percentage_of_disposable_income',
    'Personal_status_and_sex',
    'Other_debtorsguarantors',
    'Present_residence_since',
    'Property',
    'Age_in_years',
    'Other_installment_plans',
    'Housing',
    'Number_of_existing_credits_at_this_bank',
    'Job',
    'Number_of_people_being_liable_to_provide_maintenance_for',
    'Telephone',
    'Foreign_worker',
    'Creditworthiness',
]
df_raw.columns = cols
# Stable sort on the target keeps the original row order within each class;
# ignore_index renumbers the rows 0..n-1 afterwards.
df_raw = df_raw.sort_values(by='Creditworthiness', kind='stable', ignore_index=True)

3. Recode nominal variables

In [5]:
# Per-column lookup tables that turn the raw category codes into integers.
# Note that 'Personal_status_and_sex' collapses A91/A93/A94 into 1 and maps A92 to 2.
cleanup_nums = {
    "Status_of_existing_checking_account": {"A11": 0, "A12": 1, "A13": 2, "A14": 3},
    "Credit_history": {"A30": 0, "A31": 1, "A32": 2, "A33": 3, "A34": 4},
    "Purpose": {"A40": 0, "A41": 1, "A42": 2, "A43": 3, "A44": 4, "A45": 5,
                "A46": 6, "A48": 8, "A49": 9, "A410": 10},
    "Savings_accountbonds": {"A61": 1, "A62": 2, "A63": 3, "A64": 4, "A65": 5},
    "Present_employment_since": {"A71": 1, "A72": 2, "A73": 3, "A74": 4, "A75": 5},
    "Personal_status_and_sex": {"A91": 1, "A92": 2, "A93": 1, "A94": 1},
    "Other_debtorsguarantors": {"A101": 1, "A102": 2, "A103": 3},
    "Property": {"A121": 1, "A122": 2, "A123": 3, "A124": 4},
    "Other_installment_plans": {"A141": 1, "A142": 2, "A143": 3},
    "Housing": {"A151": 1, "A152": 2, "A153": 3},
    "Job": {"A171": 1, "A172": 2, "A173": 3, "A174": 4},
    "Telephone": {"A191": 1, "A192": 2},
    "Foreign_worker": {"A201": 1, "A202": 2},
    "Creditworthiness": {1: 0, 2: 1},
}
# Select the nominal columns and recode them in one step, producing a new frame.
df_nom = df_raw.loc[:, list(df_raw.columns[nominal])].replace(cleanup_nums)

4. Min-max normalize numeric variables

In [7]:
# Column-wise min-max scaling of the numeric columns: (x - min) / (max - min).
df_num = df_raw.loc[:, list(df_raw.columns[numeric])]
scaled = (df_num.values - df_num.values.min(axis=0)) / (df_num.values.max(axis=0) - df_num.values.min(axis=0))
# Rebuild a frame around the scaled array, keeping the original column names.
df_num = pd.DataFrame(scaled, columns=df_num.columns)

5. Since the numeric variable 'Number_of_people_being_liable_to_provide_maintenance_for' is dichotomous, it's going to be treated as nominal.

In [8]:
# List the distinct scaled values of the column to check whether it is dichotomous.
df_num['Number_of_people_being_liable_to_provide_maintenance_for'].unique()

array([0., 1.])

In [9]:
# Copy the dichotomous column into the nominal frame as an object-typed column ...
df_nom['Number_of_people_being_liable_to_provide_maintenance_for'] = (
    df_num['Number_of_people_being_liable_to_provide_maintenance_for'].astype('object')
)
# ... and remove it from the numeric frame so it is not analysed twice.
df_num = df_num.drop(columns=['Number_of_people_being_liable_to_provide_maintenance_for'])

6. Merge the two

In [10]:
# Side-by-side (axis=1) concatenation: nominal columns first, then the scaled numeric columns.
df = pd.concat([df_nom,df_num],axis=1)

# Functions

In [11]:
## Anova, r-squared between continuous protected (DV) and independent nominal
def ols_Rsquared(num_cols, full_dataset, nom_cols, alpha=0.005):
  """
  Fit a one-way ANOVA (OLS with one categorical regressor) for every
  numeric/nominal column pair using statsmodels.formula.api.ols.
  ...
  Parameters
  ----------
  num_cols : iterable of str
    names of the numeric columns; each is used as the dependent variable

  full_dataset : pd.DataFrame
    a pd.DataFrame object containing both the numeric and the nominal columns

  nom_cols : iterable of str
    names of the nominal columns; each is used as the categorical regressor

  alpha : float, default 0.005
    upper-tail probability used to look up the F-critical value. The default
    keeps the quantile that was previously hard-coded (1 - .005); pass
    alpha=0.05 to test at the 5% level that pearsonR and cramersv use.

  Returns
  -------
  pd.DataFrame object with the following columns: (1) 'Attribute1', the nominal
  column, (2) 'Attribute2', the numeric column, (3) 'Correlation', the R-squared
  rounded to 3 decimals and stored as a string, (4) 'F-Statistic',
  (5) 'F-critical', (6) 'Significance', which is 'significant' if the
  F-statistic is larger than the F-critical value and 'non-significant' otherwise.

  Notes
  -----
  Column names are pasted into the model formula as-is, so they have to be
  names the formula parser accepts.
  """
  n_obs = len(full_dataset)
  # Number of levels per nominal column, taken from the frame that was passed in
  # (not from a notebook-level global), computed once per column.
  n_levels = {col_nom: len(np.unique(full_dataset[col_nom])) for col_nom in nom_cols}

  fits = []
  for col_num in num_cols:
    for col_nom in nom_cols:
      fit = ols(col_num+' ~ C('+col_nom+')', data = full_dataset).fit()
      k = n_levels[col_nom]
      # One-way ANOVA degrees of freedom: k-1 between groups, n-k within groups.
      f_critical = f.ppf(q=1-alpha, dfn=k-1, dfd=n_obs-k)
      significance = 'significant' if fit.fvalue > f_critical else 'non-significant'
      fits.append([col_nom, col_num, str(round(fit.rsquared,3)), round(fit.fvalue,3), round(f_critical,2), significance])

  return pd.DataFrame(fits, columns=['Attribute1', 'Attribute2','Correlation', 'F-Statistic', 'F-critical', 'Significance'])

## Pearson R between continuous & continuous
def pearsonR(df_num, full_df):
  """
  Compute Pearson's correlation coefficient and its p-value for every ordered
  pair of numeric columns using scipy.stats.pearsonr.
  ...
  Parameters
  ----------
  df_num : iterable of column names
    the numeric columns to compare; it is iterated twice, so every column is
    paired with every column (itself included). Iterating a pd.DataFrame
    yields its column names, so a frame of numeric columns works as well.

  full_df : pd.DataFrame
    a pd.DataFrame object from which the column values are read

  Returns
  -------
  pd.DataFrame object with the following columns: (1,2) 'Attribute1' and
  'Attribute2', the variables compared with each other, (3) 'Correlation',
  Pearson's r rounded to 3 decimals, (4) 'Pi-value', the p-value rounded to
  3 decimals, (5) 'F-critical', not applicable here and filled with the string
  'NaN', (6) 'Significance', 'significant' if the p-value is smaller than 0.05.
  """
  def _compare(left, right):
    # One result row for a single ordered pair of columns.
    r_value, p_value = pearsonr(full_df[left], full_df[right])
    label = 'significant' if p_value < 0.05 else 'non-significant'
    return [left, right, round(r_value,3), round(p_value,3), 'NaN', label]

  rows = [_compare(left, right) for left in df_num for right in df_num]
  return pd.DataFrame(rows, columns=['Attribute1', 'Attribute2','Correlation', 'Pi-value', 'F-critical', 'Significance'])

def cramersv(df_nom, lambda_= None):
  """
  Compute a chi-square based association coefficient for every ordered pair of
  nominal columns using scipy.stats.chi2_contingency.

  When both columns have exactly two distinct values the coefficient is
  sqrt(chi2 / n); otherwise it is Cramer's V,
  sqrt((chi2 / n) / min(rows - 1, cols - 1)) of the contingency table.
  ...
  Parameters
  ----------
  df_nom : pd.DataFrame
    a pd.DataFrame object containing all nominal variables; every column is
    paired with every column (itself included)

  lambda_ : float or str, optional
    forwarded unchanged to chi2_contingency (None selects its default statistic)

  Returns
  -------
  pd.DataFrame object with the following columns: (1,2) 'Attribute1' and
  'Attribute2', the variables compared with each other, (3) 'Correlation', the
  coefficient rounded to 3 decimals, (4) 'Pi-value', the p-value rounded to
  3 decimals, (5) 'F-critical', not applicable here and filled with the string
  'NaN', (6) 'Significance', 'significant' if the p-value is smaller than 0.05.
  """
  n_obs = len(df_nom)
  rows = []
  for left in df_nom:
    for right in df_nom:
      left_levels = len(np.unique(df_nom[left]))
      right_levels = len(np.unique(df_nom[right]))
      table = pd.crosstab(df_nom[left], df_nom[right]).values
      chi2, p_value, _, _ = chi2_contingency(table, lambda_=lambda_)
      if left_levels == 2 and right_levels == 2:
        # Both variables are dichotomous: no normalisation by table size.
        strength = np.sqrt(chi2 / n_obs)
      else:
        # General case: normalise by the smaller table dimension minus one.
        strength = np.sqrt(((chi2 / n_obs)/min(table.shape[0]-1,table.shape[1]-1)))
      label = 'significant' if p_value < 0.05 else 'non-significant'
      rows.append([left, right, round(strength,3), round(p_value,3), 'NaN', label])

  return pd.DataFrame(rows, columns=['Attribute1', 'Attribute2','Correlation', 'Pi-value', 'F-critical','Significance'])

# Weight Matrix

**Pearson's correlation**
1. Continuous/Continuous: Pearson's r, p-value
2. Continuous/nom: OLS, R squared, F-statistic (Pearson's based)
3. Nom/nom (dichotomous & non-dichotomous): Cramer's V (Pearson's based), p-value

In [12]:
## Scenario 1: continuous continuous
con_con = pearsonR(df_num, df) # significance decided from the p-value
## Scenario 2: continuous nominal
con_nom = ols_Rsquared(df_num.columns, df, df_nom.columns) # significance decided from the F-statistic
## Scenario 3: nominal nominal
nom_nom = cramersv(df_nom) # significance decided from the p-value

In [13]:
# Merge the three scenarios in long format, keeping only the columns
# (Attribute1, Attribute2, Correlation, Significance) of each result frame.
merge = pd.concat(
    [frame.iloc[:, [0, 1, 2, -1]] for frame in (con_con, con_nom, nom_nom)],
    ignore_index=True,
)

# Initiate the square matrix with empty strings, so that a pair missing from
# `merge` shows up as a blank cell rather than as uninitialised memory.
mat = pd.DataFrame('', index=np.sort(cols), columns=np.sort(cols))

# Fill the matrix symmetrically and add an asterisk to indicate significance.
for attr1, attr2, corr, signif in merge.itertuples(index=False, name=None):
  # 'Correlation' holds floats and (for the OLS rows) strings, hence float().
  cor = str(round(float(corr), 2))
  if signif == 'significant':
    cor += '*'
  mat.at[attr1, attr2] = cor
  mat.at[attr2, attr1] = cor

In [14]:
# Display the matrix; a trailing '*' marks pairs that were flagged as significant.
mat

Unnamed: 0,Age_in_years,Credit_amount,Credit_history,Creditworthiness,Duration_in_month,Foreign_worker,Housing,Installment_rate_in_percentage_of_disposable_income,Job,Number_of_existing_credits_at_this_bank,Number_of_people_being_liable_to_provide_maintenance_for,Other_debtorsguarantors,Other_installment_plans,Personal_status_and_sex,Present_employment_since,Present_residence_since,Property,Purpose,Savings_accountbonds,Status_of_existing_checking_account,Telephone
Age_in_years,1.0*,0.03,0.03*,0.01*,-0.04,0.0,0.09*,0.06,0.03*,0.15*,0.01*,0.0,0.0,0.03*,0.17*,0.27*,0.05*,0.03*,0.01,0.01,0.02*
Credit_amount,0.03,1.0*,0.04*,0.02*,0.62*,0.0,0.04*,-0.27*,0.11*,0.02,0.0,0.01,0.0,0.01*,0.01,0.03,0.1*,0.14*,0.02*,0.02*,0.08*
Credit_history,0.03*,0.04*,1.0*,0.25*,0.04*,0.07,0.1*,0.01,0.07,0.35*,0.1*,0.09,0.21*,0.12*,0.1*,0.01,0.08,0.17*,0.07,0.14*,0.07
Creditworthiness,0.01*,0.02*,0.25*,1.0*,0.05*,0.08*,0.14*,0.01,0.04,0.0,0.0,0.08*,0.11*,0.07*,0.14*,0.0,0.15*,0.18*,0.19*,0.35*,0.03
Duration_in_month,-0.04,0.62*,0.04*,0.05*,1.0*,0.02*,0.04*,0.07*,0.05*,-0.01,0.0,0.0,0.01,0.01,0.01,0.03,0.09*,0.07*,0.01,0.01*,0.03*
Foreign_worker,0.0,0.0,0.07,0.08*,0.02*,0.99*,0.07,0.01*,0.1*,0.0,0.07*,0.12*,0.04,0.04,0.08,0.0,0.14*,0.17*,0.04,0.08,0.1*
Housing,0.09*,0.04*,0.1*,0.14*,0.04*,0.07,1.0*,0.01,0.13*,0.0,0.13*,0.06,0.09*,0.23*,0.17*,0.09*,0.55*,0.21*,0.04,0.1*,0.12*
Installment_rate_in_percentage_of_disposable_income,0.06,-0.27*,0.01,0.01,0.07*,0.01*,0.01,1.0*,0.01,0.02,0.01,0.0,0.0,0.01,0.02*,0.05,0.0,0.03*,0.0,0.01,0.0
Job,0.03*,0.11*,0.07,0.04,0.05*,0.1*,0.13*,0.01,1.0*,0.0,0.15*,0.08*,0.07,0.09*,0.31*,0.0,0.19*,0.2*,0.07,0.07*,0.43*
Number_of_existing_credits_at_this_bank,0.15*,0.02,0.35*,0.0,-0.01,0.0,0.0,0.02,0.0,1.0*,0.01*,0.0,0.0,0.01*,0.02*,0.09*,0.0,0.02,0.01,0.01,0.0
