In [1]:
import pandas as pd
from scipy.io.arff import loadarff

# Parse the ARFF file; the first element of the returned tuple holds the records.
raw_data = loadarff("dataset_31_credit-g.arff")
df = pd.DataFrame(data=raw_data[0])

# Preview the first five rows (5 is the default for head()).
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,b'<0',6.0,b'critical/other existing credit',b'radio/tv',1169.0,b'no known savings',b'>=7',4.0,b'male single',b'none',...,b'real estate',67.0,b'none',b'own',2.0,b'skilled',1.0,b'yes',b'yes',b'good'
1,b'0<=X<200',48.0,b'existing paid',b'radio/tv',5951.0,b'<100',b'1<=X<4',2.0,b'female div/dep/mar',b'none',...,b'real estate',22.0,b'none',b'own',1.0,b'skilled',1.0,b'none',b'yes',b'bad'
2,b'no checking',12.0,b'critical/other existing credit',b'education',2096.0,b'<100',b'4<=X<7',2.0,b'male single',b'none',...,b'real estate',49.0,b'none',b'own',1.0,b'unskilled resident',2.0,b'none',b'yes',b'good'
3,b'<0',42.0,b'existing paid',b'furniture/equipment',7882.0,b'<100',b'4<=X<7',2.0,b'male single',b'guarantor',...,b'life insurance',45.0,b'none',b'for free',1.0,b'skilled',2.0,b'none',b'yes',b'good'
4,b'<0',24.0,b'delayed previously',b'new car',4870.0,b'<100',b'1<=X<4',3.0,b'male single',b'none',...,b'no known property',53.0,b'none',b'for free',2.0,b'skilled',2.0,b'none',b'yes',b'bad'


In [2]:
# Nominal columns hold `bytes` values (see the b'...' entries in the preview above)
# --> convert them to strings.
# Only values that are actually `bytes` are decoded, so the cell can be re-run
# safely: already-decoded strings (or any other object) pass through unchanged
# instead of raising AttributeError on the missing `.decode` method.
for col, dtype in df.dtypes.items():
    if dtype == object:  # Only object columns can contain bytes values.
        df[col] = df[col].apply(
            lambda x: x.decode("utf-8") if isinstance(x, bytes) else x
        )

In [3]:
# Category labels that actually encode "value unknown", keyed by column.
# Each such label is replaced by None so pandas treats it as missing.
missing_markers = {
    "savings_status": "no known savings",
    "property_magnitude": "no known property",
}

for column, marker in missing_markers.items():
    df[column] = df[column].replace({marker: None})

In [4]:
# Count missing entries per column once, then split the columns into those
# with at least one missing value and those without any.
missing_counts = df.isna().sum()
incomplete_variables = df.columns[missing_counts > 0]
complete_variables = df.columns.drop(incomplete_variables)

# Show the category distribution and the number of gaps for each incomplete variable
for var in incomplete_variables:
    print(df[var].value_counts())
    print(f"{missing_counts[var]} missing values\n")

savings_status
<100           603
100<=X<500     103
500<=X<1000     63
>=1000          48
Name: count, dtype: int64
183 missing values

property_magnitude
car               332
real estate       282
life insurance    232
Name: count, dtype: int64
154 missing values


In [5]:
from scipy.stats import chi2_contingency, kruskal

# Identify which complete variables have a dependency relationship with which incomplete variables.
# For every (incomplete, complete) pair a significance test is run and the complete
# variable is recorded as "dependent" when p < alpha:
# - object-typed complete variable -> G-test on the contingency table
# - any other complete variable    -> Kruskal-Wallis test across the categories
#                                     of the incomplete variable
# NOTE(review): no multiple-testing correction is applied across the pairs.
alpha = .05
dependent_variables = {}
for var1 in incomplete_variables:
    dependent_variables[var1] = []

    print(f"{var1}:")
    for var2 in complete_variables:
        if df[var2].dtype == "object":
            # G-Test (log-likelihood ratio variant of the chi-square test)
            observed = pd.crosstab(df[var1], df[var2])
            _, p, _, _ = chi2_contingency(observed, lambda_="log-likelihood")
        else:
            # Kruskal-Wallis Test: one sample of var2 per observed category of var1.
            # dropna() skips the missing marker whether it is None or NaN; a NaN
            # marker never compares equal to anything and would otherwise hand an
            # empty sample to kruskal().
            samples = [
                df.loc[df[var1] == value, var2]
                for value in df[var1].dropna().unique()
            ]

            _, p = kruskal(*samples)

        if p < alpha:
            dependent_variables[var1].append(var2)
            print(f"- {p:.3f} {var2}")

savings_status:
- 0.000 checking_status
- 0.001 other_parties
- 0.000 class
property_magnitude:
- 0.000 duration
- 0.000 purpose
- 0.000 credit_amount
- 0.016 employment
- 0.000 other_parties
- 0.000 job
- 0.000 own_telephone
- 0.000 foreign_worker
- 0.014 class


In [6]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Missing value estimation


def one_hot_encode(data):
    """Return a copy of `data` with every object-typed column one-hot encoded.

    Non-object columns keep their original relative order and come first.
    The indicator columns follow, grouped per encoded column in the order
    those columns appear in `data`, and are named ``<column>_<value>``.
    A column with exactly two distinct values is reduced to a single
    indicator (its first level is dropped). `data` itself is not modified.
    """
    categorical = [name for name in data.columns if data[name].dtype == 'object']

    # Start with the untouched columns, then append one indicator block per
    # categorical column and join everything in a single concat.
    pieces = [data.drop(columns=categorical)]
    for name in categorical:
        is_binary = data[name].nunique() == 2
        pieces.append(pd.get_dummies(data[name], prefix=name, drop_first=is_binary))

    return pd.concat(pieces, axis=1)


# For each incomplete variable, estimate how well its value can be predicted
# from the complete variables it depends on: a random forest is trained with
# class-balancing sample weights and scored by weighted macro F1 in a
# stratified k-fold cross-validation.
# NOTE(review): `dependent_variables` can contain the `class` column (see the
# dependency output above) -- confirm that using it as a predictor is intended.
random_state = 20 # TODO: set to "None" after optimization
for var in incomplete_variables:
    X = one_hot_encode(df[dependent_variables[var]])

    # Keep only the instances for which var is observed.
    # y.index holds index *labels*, so the matching rows of X are selected
    # with .loc; positional .iloc is only equivalent on a default RangeIndex.
    y = df[var].dropna()
    X = X.loc[y.index]

    # Stratified k-fold cross-validation
    skf = StratifiedKFold(shuffle=True, random_state=random_state)
    scores = {"train": [], "test": []}

    # Determine class and sample weights: value_counts() is sorted by
    # descending frequency, so the most frequent class gets weight 1 and every
    # other class is up-weighted by (majority count / own count).
    class_frequencies = y.value_counts()
    class_weights = (class_frequencies.iloc[0] / class_frequencies).to_dict()
    sample_weights = y.map(class_weights)

    for train_idx, test_idx in skf.split(X, y):
        # Train/test split (split() yields positional indices -> .iloc)
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        w_train = sample_weights.iloc[train_idx]

        X_test = X.iloc[test_idx]
        y_test = y.iloc[test_idx]
        w_test = sample_weights.iloc[test_idx]

        # Train a classifier w/ sample weights
        clf = RandomForestClassifier(max_depth=5, random_state=random_state)
        clf.fit(X_train, y_train, sample_weight=w_train)

        # Evaluate the classifier using f1-score w/ sample weights
        scores["train"].append(
            f1_score(y_train, clf.predict(X_train), average="macro", sample_weight=w_train)
        )
        scores["test"].append(
            f1_score(y_test, clf.predict(X_test), average="macro", sample_weight=w_test)
        )

    print(var)
    print(f"- Train: {np.mean(scores['train']):.3f} ± {np.std(scores['train']):.3f}")
    print(f"- Test: {np.mean(scores['test']):.3f} ± {np.std(scores['test']):.3f}")

savings_status
- Train: 0.368 ± 0.030
- Test: 0.323 ± 0.029
property_magnitude
- Train: 0.633 ± 0.005
- Test: 0.477 ± 0.025
