Semi-Supervised Learning for Credit-Card Approval Project

Introduction: 
The dataset, retrieved from a remote server, is more than 90% unlabeled; at most 10% of the
samples carry a label. We need to predict the bad-client label for over 460000 samples, based on
the personal information each user provided. This notebook prepares the dataset for potential
re-training by using a semi-supervised learning technique.

In [104]:
# Load the raw application and credit-history tables from local CSV files
import pandas, numpy
applications = pandas.read_csv("application_record.csv")
credit_records = pandas.read_csv("credit_record.csv")

In [105]:
# Give both raw tables readable, snake_case column names

# Relabeling the credit records dataset (renamed in place)

CREDIT_MAPPING = {
    "ID": "client_id",
    "STATUS": "due_status",
    "MONTHS_BALANCE": "balance_month"
}
credit_records.rename(columns=CREDIT_MAPPING, inplace=True)

# Relabeling the application records dataset (renamed in place)
APPLICATION_MAPPING = {
    "ID": "client_id",
    "CODE_GENDER": "gender",
    "FLAG_OWN_CAR": "has_car",
    "FLAG_OWN_REALTY": "has_realty",
    "CNT_CHILDREN": "total_children",
    "AMT_INCOME_TOTAL": "annual_income",
    "NAME_INCOME_TYPE": "income_category",
    "NAME_EDUCATION_TYPE": "education_category",
    "NAME_FAMILY_STATUS": "family_status",
    "NAME_HOUSING_TYPE": "living_place",
    "DAYS_BIRTH": "birthday",
    "FLAG_MOBIL": "has_mobile_phone",
    "FLAG_PHONE": "has_phone",
    "FLAG_WORK_PHONE": "has_work_phone",
    "FLAG_EMAIL": "has_email",
    "CNT_FAM_MEMBERS": "family_size",
    "OCCUPATION_TYPE": "job",
    'DAYS_EMPLOYED': 'days_employed',
}

applications.rename(columns=APPLICATION_MAPPING, inplace=True)


Null Values handling (Application Records)

In [106]:
# Count missing values per column of the application records
applications.isnull().sum()

client_id                  0
gender                     0
has_car                    0
has_realty                 0
total_children             0
annual_income              0
income_category            0
education_category         0
family_status              0
living_place               0
birthday                   0
days_employed              0
has_mobile_phone           0
has_work_phone             0
has_phone                  0
has_email                  0
job                   134203
family_size                0
dtype: int64

In [107]:
# Missing occupations become their own explicit category instead of NaN.
missing_job = 'missing'
# Assign the result back: `df[col].fillna(..., inplace=True)` acts on an
# intermediate object and does not update the frame under Copy-on-Write.
applications['job'] = applications['job'].fillna(missing_job)

Null Values Handling (Credit Records)

In [108]:
# Count missing values per column of the credit records
credit_records.isna().sum()

client_id        0
balance_month    0
due_status       0
dtype: int64

Application Record Features

In [109]:
import math 

# Count applications per client BEFORE de-duplicating: once duplicates are
# dropped every client_id occurs exactly once and the count is always 1.
applications_per_client = applications['client_id'].value_counts()

# Keep only the most recent (last) application of every client.
applications = applications.drop_duplicates(subset=["client_id"], keep='last').copy()

# `birthday` is a day offset; its magnitude in whole years is the age.
applications['age'] = applications['birthday'].abs() // 365
# A negative `days_employed` marks a client as currently employed.
applications['employed'] = applications['days_employed'] < 0

# Look the count up by client_id (not by row position) so that each row gets
# the flag belonging to its own client.
applications['exp_applicant'] = applications['client_id'].map(applications_per_client) > 1

applications = applications.drop(columns=['birthday', 'days_employed'])

def create_married_feature(applications: pandas.DataFrame):
    """
    Turn the textual `family_status` column into a boolean `married` column.

    Every status other than 'Single / not married', 'Separated' and 'Widow'
    counts as married. The `family_status` column of the passed frame is
    overwritten in place; the renamed frame is returned as a new object.

    Args:
        applications: pandas.DataFrame object, containing `family_status`

    Returns:
        pandas.DataFrame with `family_status` replaced by `married`
    """
    unmarried_statuses = {'Single / not married', 'Separated', 'Widow'}

    def is_married(status):
        return status not in unmarried_statuses

    applications['family_status'] = applications['family_status'].apply(is_married)
    return applications.rename(columns={'family_status': 'married'})


def create_gender_feature(applications: pandas.DataFrame):
    """
    One-hot encode the `gender` column.

    The indicator columns for 'F' and 'M' are named `Female` and `Male` and
    are appended after the existing columns; `gender` itself is removed.
    The passed frame is left untouched.

    Args:
        applications: pandas.DataFrame object, containing `gender`

    Returns:
        new pandas.DataFrame with the indicator columns instead of `gender`
    """
    gender_flags = pandas.get_dummies(applications['gender'])
    gender_flags = gender_flags.rename(columns={'F': 'Female', 'M': 'Male'})

    remaining_columns = applications.drop(columns=['gender'])
    return pandas.concat([remaining_columns, gender_flags], axis=1)

# Replace family_status with a boolean `married` column, then one-hot encode gender
applications = create_married_feature(applications)
applications = create_gender_feature(applications)

In [110]:
# Collapse the raw due-status codes into a binary overdue flag:
# codes "2"-"5" become 1, codes "0", "1", "X" and "C" become 0.
DUE_DATATYPES = {
    "0": 0,
    "1": 0,
    "2": 1,
    "3": 1,
    "4": 1,
    "5": 1,
    "X": 0,
    "C": 0
}
# Plain item lookup, so an unexpected status code raises KeyError
# instead of silently turning into NaN.
credit_records['due_status'] = credit_records['due_status'].map(DUE_DATATYPES.__getitem__)

Credit Record Features

In [111]:
# Introducing credit record features (bad client status).
# Every per-client value comes from the same groupby, so each row of
# `credit_features` is guaranteed to describe one and the same client.
credit_features = credit_records.groupby('client_id').agg(
    overdues=('due_status', 'sum'),
    earliest_month=('balance_month', 'min'),
).reset_index()

# Length of each client's own history (not the global minimum of the table).
credit_features['credit_window'] = credit_features['earliest_month'].abs()
# A zero-length window would divide by zero; leave the ratio undefined there.
credit_features['overdue_perc'] = numpy.round(
    credit_features['overdues'] / credit_features['credit_window'].replace(0, numpy.nan), 2
)

# Annotating bad client status: at least one overdue month on record.
credit_features['bad_client'] = credit_features['overdues'] > 0
credit_features = credit_features[['client_id', 'bad_client']]

credit_features['bad_client'].value_counts()

bad_client
False    45318
True       667
Name: count, dtype: int64

Merging tables together

In [112]:
# Left join keeps every application; clients without a credit history get NaN in bad_client
merged_dataset = applications.merge(credit_features, on='client_id', how='left')
merged_dataset

Unnamed: 0,client_id,has_car,has_realty,total_children,annual_income,income_category,education_category,married,living_place,has_mobile_phone,...,has_phone,has_email,job,family_size,age,employed,exp_applicant,Female,Male,bad_client
0,5008804,Y,Y,0,427500.0,Working,Higher education,True,Rented apartment,1,...,0,0,missing,2.0,32,True,False,False,True,False
1,5008805,Y,Y,0,427500.0,Working,Higher education,True,Rented apartment,1,...,0,0,missing,2.0,32,True,False,False,True,False
2,5008806,Y,Y,0,112500.0,Working,Secondary / secondary special,True,House / apartment,1,...,0,0,Security staff,2.0,58,True,False,False,True,False
3,5008808,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,False,House / apartment,1,...,1,1,Sales staff,1.0,52,True,False,True,False,False
4,5008809,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,False,House / apartment,1,...,1,1,Sales staff,1.0,52,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438505,6840104,N,Y,0,135000.0,Pensioner,Secondary / secondary special,False,House / apartment,1,...,0,0,missing,1.0,62,False,False,False,True,
438506,6840222,N,N,0,103500.0,Working,Secondary / secondary special,False,House / apartment,1,...,0,0,Laborers,1.0,43,True,False,True,False,
438507,6841878,N,N,0,54000.0,Commercial associate,Higher education,False,With parents,1,...,0,0,Sales staff,1.0,22,True,False,True,False,
438508,6842765,N,Y,0,72000.0,Pensioner,Secondary / secondary special,True,House / apartment,1,...,0,0,missing,2.0,59,False,False,True,False,


Preparing dataset

In [113]:
def encode_bool_features(dataset: pandas.DataFrame):
    """
    Convert the flag columns of `dataset` to booleans, in place.

    Mapping applied to every flag column:
        'Y' or 1 -> True
        'N' or 0 -> False
    Any other value becomes NaN. If at least one of the flag columns is
    missing from `dataset`, nothing is changed.

    Args:
        dataset: pandas.DataFrame object, containing boolean features
    """
    flag_columns = (
        'has_car', 'has_realty', 'has_phone',
        'has_email', 'has_mobile_phone', 'has_work_phone',
    )
    if any(column not in dataset.columns for column in flag_columns):
        return

    truth_table = {'Y': True, 'N': False, 1: True, 0: False}
    for column in flag_columns:
        dataset[column] = dataset[column].map(truth_table)

# Convert the Y/N and 1/0 flag columns of the merged dataset to booleans (in place)
encode_bool_features(merged_dataset)


Semi Supervised Learning Analysis

In [None]:
# Separating data into labeled and unlabeled datasets.
# A row is labeled when the left join found a credit history for the client,
# i.e. when `bad_client` is NOT null.
has_label = merged_dataset['bad_client'].notnull()

labeled_data = merged_dataset[has_label].copy()
unlabeled_data = merged_dataset[~has_label].copy()

In [None]:
# Splitting data into training, validation and test sets
from sklearn.model_selection import train_test_split 

# Labels are cast to bool up front so stratification sees a clean binary target.
Xl_data = labeled_data.drop(columns=['bad_client'])
Yl_data = labeled_data['bad_client'].astype(numpy.bool_)

# Fixed seed keeps the 60/40 stratified split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    Xl_data, Yl_data, stratify=Yl_data, test_size=0.4, random_state=1
)

# Bundling each split as a (features, boolean labels) pair

training_set = (X_train, Y_train.astype(numpy.bool_))
test_set = (X_test, Y_test.astype(numpy.bool_))

In [None]:
# Feature matrix of the unlabeled rows: everything except the (empty) label column
X_unlabeled_data = unlabeled_data.drop(columns=['bad_client'])

Semi Supervised Learning (Learning on Labeled Data)

In [None]:
from sklearn.semi_supervised import LabelPropagation
import sklearn.exceptions, logging
from sklearn.metrics import classification_report

Logger = logging.getLogger(__name__)

# k-nearest-neighbours kernel with 4 neighbours, computed with 2 parallel jobs
prop_params = dict(kernel='knn', n_neighbors=4, n_jobs=2)

# `training_set` is a (features, labels) pair; fit() returns the estimator
# itself, so construction and training can be chained.
prop = LabelPropagation(**prop_params).fit(*training_set)

# Filled by predict_unlabeled_data() below
predicted_labels = pandas.DataFrame()

def predict_unlabeled_data():
    """
    Score the held-out labeled test split with the fitted propagation model.

    Despite its name, this function predicts on `test_set[0]`. It fills the
    module-level `predicted_labels` frame with three columns:
        pos_prob: probability of the second class (column 1 of predict_proba)
        neg_prob: probability of the first class (column 0 of predict_proba)
        value:    predicted label

    Raises:
        RuntimeError: if `prop` has not been fitted yet; the original
            NotFittedError is attached as the cause.
    """
    features = test_set[0]
    try:
        probs = numpy.asarray(prop.predict_proba(features))
        labels = prop.predict(features)
    except sklearn.exceptions.NotFittedError as fit_err:
        # Logged at error level: the notebook cannot continue past this point.
        Logger.error("label propagation model is not fitted: %s", fit_err)
        raise RuntimeError("Failed to label data") from fit_err

    predicted_labels['pos_prob'] = probs[:, 1]
    predicted_labels['neg_prob'] = probs[:, 0]
    predicted_labels['value'] = labels

predict_unlabeled_data()

# target_names are matched to the labels in sorted order. The label is the
# boolean `bad_client`, so False (a good client) comes first and True
# (a bad client) second.
classification_report(
    y_true=test_set[1],
    y_pred=predicted_labels['value'],
    target_names=['good_client', 'bad_client']
)

"""             precision    recall  f1-score   support
  bad_client       0.89      0.95      0.92     14337
  good_client      0.95      0.88      0.91     14336
  
  accuracy                            0.92     28673
  macro avg       0.92      0.92      0.92     28673
  weighted avg    0.92      0.92      0.92     28673
"""

'             precision    recall  f1-score   support\n  bad_client       0.89      0.95      0.92     14337\n  good_client      0.95      0.88      0.91     14336\n  \n  accuracy                            0.92     28673\n  macro avg       0.92      0.92      0.92     28673\n  weighted avg    0.92      0.92      0.92     28673\n'

Predicting Labels

In [None]:
# Probability of the second class (column 1) and the hard prediction
# for every unlabeled row, collected into one frame.
second_class_probability = prop.predict_proba(X_unlabeled_data)[:, 1]
propagated_labels = prop.predict(X_unlabeled_data)

unlabeled_metrics = pandas.DataFrame({
    'prob': second_class_probability,
    'value': propagated_labels,
})

unlabeled_metrics

Unnamed: 0,prob,value
0,0.0,False
1,0.0,False
2,0.0,False
3,0.0,False
4,0.0,False
...,...,...
402048,0.0,False
402049,0.0,False
402050,0.0,False
402051,0.0,False


Setting up missing labels

In [None]:
# Predictions were produced in row order of `unlabeled_data`; give them the
# same index so the assignments below align row by row.
unlabeled_metrics.index = unlabeled_data.index

# assign() builds a new frame instead of writing into a filtered slice of
# `merged_dataset`, which avoids pandas' SettingWithCopy pitfalls.
unlabeled_data = unlabeled_data.assign(bad_client=unlabeled_metrics['value'])

# Only rows that still lack a label are filled; existing labels stay untouched.
merged_dataset.loc[merged_dataset['bad_client'].isnull(), 'bad_client'] = unlabeled_data['bad_client']

In [None]:
# Display the dataset after the missing labels have been filled in
merged_dataset

Unnamed: 0,client_id,has_car,has_realty,income_category,education_category,married,living_place,has_mobile_phone,has_work_phone,has_phone,...,job,employed,exp_applicant,bad_client,Female,Male,total_children,age,annual_income,family_size
12,6153651,True,True,4,1,True,4,True,True,True,...,0,True,False,False,False,True,-0.580439,0.239169,0.757090,-0.215761
26,6153712,False,True,4,3,False,4,True,False,False,...,18,True,False,False,True,False,0.816766,-1.165751,-0.687067,-0.215761
43,6153733,True,True,0,3,True,4,True,False,True,...,4,True,False,False,False,True,-0.580439,1.117244,-0.687067,-0.215761
44,6153734,True,True,0,3,True,4,True,False,True,...,4,True,False,False,False,True,-0.580439,1.117244,-0.687067,-0.215761
45,6153735,True,True,0,3,True,4,True,False,True,...,4,True,False,False,False,True,-0.580439,1.117244,-0.687067,-0.215761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8316,5150482,True,True,4,3,True,4,True,False,False,...,3,True,False,False,True,False,0.816766,-1.253559,-0.274450,0.904770
8317,5150483,True,True,4,3,True,4,True,False,False,...,3,True,False,False,True,False,0.816766,-1.253559,-0.274450,0.904770
8318,5150484,True,True,4,3,True,4,True,False,False,...,3,True,False,False,True,False,0.816766,-1.253559,-0.274450,0.904770
8319,5150485,True,True,4,3,True,4,True,False,False,...,3,True,False,False,True,False,0.816766,-1.253559,-0.274450,0.904770


Splitting data into training, testing and validation sets

In [None]:
from sklearn.model_selection import train_test_split 

x, y = merged_dataset.drop(columns=['bad_client']), merged_dataset['bad_client']

# 50% test, then the remaining half is split again: 25% train / 25% validation.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, stratify=y, test_size=0.5)

x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.5, stratify=y_train, random_state=1)

# axis=1 puts each label next to its own feature row. axis=0 would stack the
# labels underneath the features as extra, mostly-NaN rows.
training_set = pandas.concat([x_train, y_train], axis=1)
testing_set = pandas.concat([x_test, y_test], axis=1)
validation_set = pandas.concat([x_validation, y_validation], axis=1)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

def k_fold_target_encoding(feature_name, dataset):
    """
    Implementation of K Fold Target Encoding

    Every row receives the mean of `bad_client` observed for its category of
    `feature_name`, computed only on the folds the row does NOT belong to,
    so a row's own label never contributes to its encoding.

    Args:
        feature_name: name of the categorical column to encode
        dataset: pandas.DataFrame containing `feature_name` and a fully
            labeled `bad_client` column

    Returns:
        pandas.Series of floats indexed like `dataset`, or None when
        `dataset` is empty
    """
    if not len(dataset): return

    labels = dataset['bad_client'].astype(int)
    # Indexed like the dataset so that `dataset[col] = result` aligns correctly.
    encoded_data = pandas.Series(numpy.nan, index=dataset.index, dtype=float)
    encoder = StratifiedKFold(n_splits=5)

    for train_idx, test_idx in encoder.split(dataset, labels):
        train_categories = dataset[feature_name].iloc[train_idx]
        means = labels.iloc[train_idx].groupby(train_categories).mean()

        held_out_categories = dataset[feature_name].iloc[test_idx]
        # to_numpy(): positional assignment, independent of index labels
        encoded_data.iloc[test_idx] = held_out_categories.map(means).to_numpy()

    # Categories never seen in a fold's training part fall back to the overall rate.
    return encoded_data.fillna(labels.mean())

def encode_job_feature(dataset: pandas.DataFrame):
    """
    Replace the `job` column with its K-fold target encoding, in place.

    Does nothing when `dataset` has no `job` column.

    Args:
        dataset: pandas.DataFrame object, containing job feature
    """
    if 'job' not in dataset.columns:
        return

    dataset['job'] = k_fold_target_encoding(feature_name="job", dataset=dataset)

def encode_education_feature(dataset: pandas.DataFrame):
    """
    Ordinal-encode the `education_category` column, in place.

    Steps:

        1. Rename the raw category labels to cleaned-up, lower-case names
        2. Rank the names from highest to lowest education level
        3. Replace every name by its position in that ranking
           (academic degree first, lower secondary last)

    Does nothing when `dataset` has no `education_category` column.

    Args:
        dataset: pandas.DataFrame object, containing education feature
    """
    if 'education_category' not in dataset.columns:
        return

    cleaned_labels = {
        'Secondary / secondary special': 'special secondary',
        'Incomplete higher': 'incomplete higher education',
        'Academic degree': 'academic degree',
        'Higher education': 'higher education',
        'Lower secondary': 'lower secondary'
    }
    # Highest level first: the position in this list becomes the code.
    ranked_levels = [
        "academic degree",
        "higher education",
        "incomplete higher education",
        "special secondary",
        "lower secondary",
    ]

    dataset['education_category'] = dataset['education_category'].map(cleaned_labels)

    ordinal = OrdinalEncoder(categories=[ranked_levels], dtype=numpy.int8)
    dataset['education_category'] = ordinal.fit_transform(dataset[['education_category']])

def encode_living_place_feature(dataset: pandas.DataFrame):
    """
    Ordinal-encode the `living_place` column, in place.

    Every housing type is replaced by its position in a fixed list that
    starts with 'Co-op apartment' and ends with 'With parents'.
    Does nothing when `dataset` has no `living_place` column.

    Args:
        dataset: pandas.DataFrame object, containing living_place feature
    """
    if 'living_place' not in dataset.columns:
        return

    # The position in this list becomes the code.
    housing_order = [
        'Co-op apartment',
        'Office apartment',
        'Municipal apartment',
        'Rented apartment',
        'House / apartment',
        'With parents',
    ]
    ordinal = OrdinalEncoder(dtype=numpy.int8, categories=[housing_order])
    dataset['living_place'] = ordinal.fit_transform(dataset[['living_place']])


def encode_income_category_feature(dataset: pandas.DataFrame):
    """
    Replace the `income_category` column with its K-fold target encoding,
    in place.

    Does nothing when `dataset` has no `income_category` column.

    Args:
        dataset: pandas.DataFrame object, containing income category feature
    """
    if 'income_category' not in dataset.columns:
        return

    # Encode the income category itself; encoding "job" here would make this
    # column a copy of the job encoding.
    dataset['income_category'] = k_fold_target_encoding(
        feature_name="income_category", dataset=dataset
    )


def encode_dataset(merged_dataset: pandas.DataFrame):
    """
    Run every categorical feature encoder on `merged_dataset`.

    The encoders run in a fixed order (job, education, living place,
    income category) and modify the frame in place; the same frame
    object is returned for convenience.
    """
    encoders = (
        encode_job_feature,
        encode_education_feature,
        encode_living_place_feature,
        encode_income_category_feature,
    )
    for encode in encoders:
        encode(merged_dataset)
    return merged_dataset


Feature Encoding for each dataset

In [None]:
# Each split is encoded on its own. encode_dataset() returns the frame it was
# given, so passing training_set twice would alias validation_set to it.
training_set = encode_dataset(training_set)
testing_set = encode_dataset(testing_set)
validation_set = encode_dataset(validation_set)

Setting Datatypes

In [None]:
def set_datatypes(df):
    """
    Cast the feature columns of `df` to compact, explicit dtypes, in place.

    `job` and `income_category` hold target-encoded class rates between 0 and
    1, so they are kept as floats; an integer cast would truncate them to 0.

    Args:
        df: pandas.DataFrame containing every column listed below

    Raises:
        KeyError: if one of the listed columns is missing
    """
    column_types = {
        # Numeric Data
        'client_id': numpy.int64,
        'total_children': numpy.int8,
        'annual_income': numpy.int64,
        'age': numpy.int8,

        # Boolean Data
        'has_car': numpy.bool_,
        'has_realty': numpy.bool_,
        'has_phone': numpy.bool_,
        'has_mobile_phone': numpy.bool_,
        'married': numpy.bool_,
        'Male': numpy.bool_,
        'Female': numpy.bool_,
        'employed': numpy.bool_,
        'exp_applicant': numpy.bool_,

        # Target-encoded Data (fractions, must stay floating point)
        'income_category': numpy.float64,
        'job': numpy.float64,

        # Ordinal-encoded Data
        'living_place': numpy.int8,
        'education_category': numpy.int8,
    }

    for column, dtype in column_types.items():
        df[column] = df[column].astype(dtype)

# Apply the dtype casts to all three splits (each frame is modified in place)
set_datatypes(training_set)
set_datatypes(testing_set)
set_datatypes(validation_set)

Standardization

In [None]:
# before training models, we need to make sure, that numeric data falls in the same scales 
# otherwise some machine learning algorithms might end up having poor prediction ability 

from sklearn.preprocessing import StandardScaler 

# Columns that get standardized; every remaining column is carried over as is
numeric_features = [
    'total_children',
    'age',
    'annual_income',
    'family_size',
]
other_features = merged_dataset.drop(columns=numeric_features).columns

def scale_numeric_features(feature_dataset: pandas.DataFrame):
    """
    Function scales numeric features using Standard Scaler
    std = 1, mean = 0

    Args:
        feature_dataset: pandas.DataFrame holding only numeric columns

    Returns:
        pandas.DataFrame with the same columns and the same index as
        `feature_dataset`, containing the standardized values
    """
    scaler = StandardScaler()
    # Reuse the index of the frame that was actually passed in, so the result
    # lines up with its source rows whatever dataset the caller provides.
    return pandas.DataFrame(
        scaler.fit_transform(feature_dataset),
        columns=feature_dataset.columns,
        index=feature_dataset.index,
    )

# Standardize the numeric columns of the full merged dataset.
# NOTE(review): the scaler is fitted on all rows of merged_dataset, including
# those that end up in the test and validation splits — confirm this is intended.
numeric_dataset = scale_numeric_features(
    feature_dataset=merged_dataset[numeric_features]
)

In [None]:
# `numeric_dataset` covers every client, so only the rows belonging to each
# split are selected before joining. Concatenating the whole frame would add
# all other clients to the split as rows with NaN features and labels.
training_set = pandas.concat([
    training_set[other_features],
    numeric_dataset.loc[training_set.index]
], axis=1)

testing_set = pandas.concat([
    testing_set[other_features],
    numeric_dataset.loc[testing_set.index]
], axis=1)

validation_set = pandas.concat([
    validation_set[other_features],
    numeric_dataset.loc[validation_set.index]
], axis=1)

Class Imbalance Handling

In [None]:
# Splitting data

from imblearn.over_sampling import SMOTE 


def balance_dataset(encoded_dataset: pandas.DataFrame):
    """
    Oversample the minority class of `encoded_dataset` with SMOTE.

    Args:
        encoded_dataset: fully numeric pandas.DataFrame containing the
            `bad_client` label and a `client_id` column

    Returns:
        pandas.DataFrame with the resampled features and labels,
        sorted by `client_id`
    """
    # NOTE(review): `client_id` stays among the features handed to SMOTE —
    # confirm it is meant to take part in the resampling.
    features = encoded_dataset.drop(columns=['bad_client'])
    target = encoded_dataset['bad_client'].astype(numpy.bool_)

    sampler = SMOTE(k_neighbors=5, random_state=42)
    resampled_features, resampled_target = sampler.fit_resample(features, target)

    combined = pandas.concat([resampled_features, resampled_target], axis=1)
    return combined.sort_values('client_id')


# Only the training split is oversampled. The testing and validation splits
# keep their real class distribution, so metrics measured on them are not
# distorted by synthetic samples.
training_set = balance_dataset(training_set)

Saving new datasets to CSV files

In [None]:
# `test_set` is the (features, labels) tuple from the label-propagation step;
# the frame that belongs in this file is `testing_set`.
training_set.to_csv("training_set.csv")
testing_set.to_csv('testing_set.csv')
validation_set.to_csv("validation_set.csv")
