In [1]:
import pandas, numpy
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# Read both raw CSV tables from the working directory in a single pass
applications, credit_records = (
    pandas.read_csv(csv_path)
    for csv_path in ("./application_record.csv", "./credit_record.csv")
)

In [3]:
# Let's take a look at each dataset individually,
# starting with the first 10 rows of the application records
applications.head(10)

In [4]:
# Let's see what the credit records look like.
# Status codes as noted by the author:
# 1 - due less than 30 days
# 2 - due 30 - 60 days
# 3 - due 60 - 90 days
# 4 - due 90 - 150 days
# 5 - due over 150 days
# NOTE(review): DUE_DATATYPES (defined further down) also handles the codes
# "0", "X" and "C", which are not described here - confirm their meaning and
# the ranges above against the dataset documentation.

credit_records.head(10)

In [5]:
# Relabelling both raw tables with readable snake_case column names
# for better understanding and easier usage.

# Credit records: raw header -> notebook column name
CREDIT_MAPPING = dict(
    ID="client_id",
    STATUS="due_status",
    MONTHS_BALANCE="balance_month",
)
credit_records.rename(columns=CREDIT_MAPPING, inplace=True)

# Application records: raw header -> notebook column name
APPLICATION_MAPPING = dict(
    ID="client_id",
    CODE_GENDER="gender",
    FLAG_OWN_CAR="has_car",
    FLAG_OWN_REALTY="has_realty",
    CNT_CHILDREN="total_children",
    AMT_INCOME_TOTAL="annual_income",
    NAME_INCOME_TYPE="income_category",
    NAME_EDUCATION_TYPE="education_category",
    NAME_FAMILY_STATUS="family_status",
    NAME_HOUSING_TYPE="living_place",
    DAYS_BIRTH="birthday",
    FLAG_MOBIL="has_mobile_phone",
    FLAG_PHONE="has_phone",
    FLAG_WORK_PHONE="has_work_phone",
    FLAG_EMAIL="has_email",
    CNT_FAM_MEMBERS="family_size",
    OCCUPATION_TYPE="job",
    DAYS_EMPLOYED="days_employed",
)

applications.rename(columns=APPLICATION_MAPPING, inplace=True)

In [6]:
# Inspect the column dtypes of the application records and
# count the missing values in every column
applications.info()
applications.isna().sum()

In [7]:
# Let's see how many NaN values we have in the credit records dataset
# (info() prints the dtypes, isna().sum() counts the missing values per column)
credit_records.info() 
credit_records.isna().sum()

In [8]:
# Number of distinct client ids present in the application records
applications['client_id'].nunique()

In [9]:
# Number of distinct client ids present in the credit records
credit_records["client_id"].nunique()

In [10]:
# Let's see how many clients appear in BOTH tables.
# The ids are intersected directly: merging the two full tables only to count
# the unique keys of the result builds a large row-level join for no benefit.
shared_client_mask = applications["client_id"].isin(credit_records["client_id"])
applications.loc[shared_client_mask, "client_id"].nunique()

In [11]:
# Calculating the credit window (observed history length in months) per client

# Smallest and largest balance_month recorded for every client.
# The smallest value is taken as the month the loan was granted; the largest
# one may mark the end of observation or the cancellation of the account.
month_bounds = credit_records.groupby("client_id")["balance_month"].agg(["min", "max"])

customer_credit = (
    (month_bounds["max"] - month_bounds["min"] + 1)  # inclusive month count
    .rename("credit_window")
    .reset_index()
)



In [12]:
# Joining with main credit table and finding credit difference
# Plain-text preview of the credit records before the status codes are converted
print(credit_records)

# Raw status code -> overdue flag (1 = counted as overdue, 0 = not counted).
# Codes "2" to "5" are flagged; "0", "1", "X" and "C" are not.
DUE_DATATYPES = {
    **dict.fromkeys(("0", "1"), 0),
    **dict.fromkeys(("2", "3", "4", "5"), 1),
    **dict.fromkeys(("X", "C"), 0),
}

def convert_due_status(status: str):
    """
    Collapse a raw due-status code into a binary overdue flag.

    Every code higher than 1 is marked as overdue, which indicates that the
    client had a long payment delay and might not be reliable.

    Args:
        status: raw status code. Usually a string ("0".."5", "X", "C");
            plain numeric codes are accepted as well.

    Returns:
        1 when the code is a number greater than 1, otherwise 0
        (this includes non-numeric strings, NaN, None and any other type).
    """
    import numbers  # local import: only needed for the numeric branch

    if isinstance(status, str):
        return 1 if status.isdigit() and int(status) > 1 else 0

    # Numeric codes used to fall through to 0 whatever their value.
    # bool is excluded on purpose; NaN compares False and therefore yields 0.
    if isinstance(status, numbers.Real) and not isinstance(status, bool):
        return 1 if status > 1 else 0

    return 0

# Mapping every monthly record to an overdue flag (1) / not overdue (0).
# The flags live in their own Series instead of overwriting
# credit_records["due_status"]: once the raw codes are replaced by 0/1, running
# this cell again would push those 0/1 values through DUE_DATATYPES a second
# time and mark every record as "not overdue".
# Unknown status codes still raise KeyError.
overdue_flags = credit_records["due_status"].map(lambda item: DUE_DATATYPES[str(item)])

# Overdue metrics per client: number of flagged months
overdue_per_client = (
    overdue_flags.groupby(credit_records["client_id"])
    .sum()
    .to_frame("overdue_frequency")
    .reset_index()
)

customer_credit = customer_credit.merge(overdue_per_client, on="client_id", how="inner")

# The label is derived from the merged frame's own column, so it does not
# depend on customer_credit and overdue_per_client sharing the same row index.
customer_credit["bad_client"] = customer_credit["overdue_frequency"] > 0

# .copy() gives an independent frame that can be modified without
# chained-assignment warnings.
customer_credit = customer_credit[["client_id", "credit_window", "bad_client"]].copy()

def set_datatypes(dataset: pandas.DataFrame):
    """
    Cast the id and credit-window columns to fixed integer widths.

    The frame is modified in place and the same object is returned.

    Args:
        dataset: frame containing "client_id" and "credit_window" columns

    Returns:
        the input frame, with "client_id" as int64 and "credit_window" as int16
    """
    target_dtypes = {
        "client_id": numpy.int64,
        "credit_window": numpy.int16,
    }
    for column_name, column_dtype in target_dtypes.items():
        dataset[column_name] = dataset[column_name].astype(column_dtype)
    return dataset

# Apply the dtype casts; set_datatypes hands back the frame it was given
completed_feature_dataset = customer_credit.pipe(set_datatypes)

# After optimizing our dataset, let's check its characteristics

completed_feature_dataset.info()

In [13]:
# TODO: merge with the main application records using a left join (we want to keep customers that do not have any credit records)

In [14]:
# Let's see how many clients are good (can potentially be considered by the banks) and how many are bad

class_balance_ax = sns.countplot(x="bad_client", data=completed_feature_dataset)
class_balance_ax.set_xlabel("Proportion of good and bad customers. Bad - 1; Good - 0")

In [15]:
# Let's investigate the majority class to get more information about it

# `data` is a second name for the same DataFrame object (no copy is made)
data = completed_feature_dataset

In [16]:
# let's try different techniques for managing imbalanced datasets

In [17]:
# Sampling Techniques for dealing with imbalanced datasets
import typing, logging
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score
import pandas 
import constants 

# Logger shared by the helper functions defined in this notebook
Logger = logging.getLogger(__name__)

def rose_over_sampling(
    X_train: typing.Union[pandas.DataFrame, pandas.Series],
    Y_train: pandas.Series
):
    """
    Balance the classes by random over-sampling (RandomOverSampler, seed 1).

    Args:
        X_train: training features
        Y_train: training target

    Returns:
        (X_resampled, Y_resampled) on success;
        (None, None) when either input is empty or when the sampler raised
        TypeError / ValueError (the failure is logged).
    """
    if not len(X_train) or not len(Y_train):
        return None, None
    try:
        sampler = RandomOverSampler(random_state=1)
        X_resampled, Y_resampled = sampler.fit_resample(X=X_train, y=Y_train)
        return X_resampled, Y_resampled
    except (TypeError, ValueError) as sampling_exception:
        # Logged at ERROR level with the traceback: DEBUG records are not shown
        # under the default logging configuration, which made a failed
        # resampling look like a silent (None, None).
        Logger.exception(
            "Failed to perform ROSE Over Sampling Technique, Exception Arised. [%s]",
            sampling_exception,
        )
        return None, None

def rose_under_sampling(X_train: pandas.DataFrame, Y_train: pandas.Series):
    """
    Balance the classes by random under-sampling (RandomUnderSampler, seed 1).

    Args:
        X_train: training features
        Y_train: training target

    Returns:
        (X_resampled, Y_resampled) on success;
        (None, None) when either input is empty or when the sampler raised
        TypeError / ValueError (the failure is logged).
    """
    if not len(X_train) or not len(Y_train):
        return None, None
    try:
        sampler = RandomUnderSampler(random_state=1)
        X_resampled, Y_resampled = sampler.fit_resample(X=X_train, y=Y_train)
        return X_resampled, Y_resampled
    except (TypeError, ValueError) as sampling_exception:
        # Logged at ERROR level with the traceback: DEBUG records are not shown
        # under the default logging configuration, which made a failed
        # resampling look like a silent (None, None).
        Logger.exception(
            "Failed to perform ROSE Under Sampling Technique, Exception Arised. [%s]",
            sampling_exception,
        )
        return None, None
    

def smote_sampling(
    X_train: typing.Union[pandas.DataFrame, pandas.Series],
    Y_train: typing.Union[pandas.DataFrame, pandas.Series]):
    """
    Balance the classes with SMOTE (seed 1, constants.K_SMOTE_NEIGHBORS neighbours).

    Args:
        X_train: training features
        Y_train: training target

    Returns:
        (X_resampled, Y_resampled) on success;
        (None, None) when either input is empty or when SMOTE raised
        TypeError / ValueError (the failure is logged).
    """
    if not len(X_train) or not len(Y_train):
        return None, None
    try:
        smote_tech = SMOTE(random_state=1, k_neighbors=constants.K_SMOTE_NEIGHBORS)
        X_resampled, Y_resampled = smote_tech.fit_resample(X_train, Y_train)
        return X_resampled, Y_resampled
    except (TypeError, ValueError) as train_exception:
        # Logged at ERROR level with the traceback: DEBUG records are not shown
        # under the default logging configuration, which made a failed
        # resampling look like a silent (None, None).
        Logger.exception(
            "Failed to balance data using SMOTE Technique, exception raised. [%s]",
            train_exception,
        )
        return None, None


In [18]:
# Let's split the data and try out the SMOTE method on it

# Features = every column except the target.
# NOTE(review): X_data still contains "client_id", so the identifier is handed
# to SMOTE as if it were a feature, and a later cell merges the resampled rows
# with `applications` on "client_id". Presumably the generated rows carry ids
# that do not belong to real applicants - verify before relying on that merge.
X_data = data.drop(columns=["bad_client"])
Y_data = data["bad_client"]

from sklearn.model_selection import train_test_split 

# Splitting data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, random_state=1)

# Only the training split is resampled.
# smote_sampling returns (None, None) when the resampling fails.

SM_XR_sampled, SM_YR_sampled = smote_sampling(X_train, Y_train)

In [19]:
# Let's look at the class counts of the resampled target
SM_YR_sampled.value_counts()

In [20]:
# Class distribution of the resampled target.
# The Series is passed as `x` so that its values are counted per class;
# handing a whole frame (old index + target) without naming a variable
# does not produce a per-class count.
sns.countplot(x=SM_YR_sampled)

In [21]:
# Merging with the application records feature dataset

# Resampled features and target are placed side by side first
credit_feature_dataset = pandas.concat(objs=(SM_XR_sampled, SM_YR_sampled), axis="columns")

# Join on the client id (no `how` given, so pandas uses its default join type)
feature_dataset = pandas.merge(credit_feature_dataset, applications, on="client_id")

# Dropping duplicated rows in place
feature_dataset.drop_duplicates(inplace=True)

In [22]:
# Counting the missing values per column before imputing and encoding

feature_dataset.isna().sum()

In [23]:
# Constant imputation for the nominal categorical feature 'job':
# missing occupations get their own explicit category
# (this is not mode imputation).
MISSING_JOB = "missing_job"

# The filled column is assigned back: fillna(..., inplace=True) on
# feature_dataset['job'] acts on an intermediate object and is not guaranteed
# to update feature_dataset itself.
feature_dataset['job'] = feature_dataset['job'].fillna(MISSING_JOB)


In [24]:
# Normalising the Education and Family Status labels to lower case.
# .str.lower() is vectorised and leaves missing values as NaN, whereas calling
# item.lower() per element raises AttributeError on a NaN entry.
feature_dataset['education_category'] = feature_dataset['education_category'].str.lower()
feature_dataset['family_status'] = feature_dataset['family_status'].str.lower()

In [25]:
# Engineering features for application records

import math


def _is_employed(day):
    """A positive day counter is treated as "not employed"."""
    return not day > 0


def _age_in_years(day):
    """Whole years from the absolute day counter (365-day years, rounded down)."""
    return math.floor(abs(day) / 365)


feature_dataset['employed'] = feature_dataset['days_employed'].map(_is_employed)
feature_dataset['age'] = feature_dataset['birthday'].map(_age_in_years)

# The raw day counters are no longer needed
feature_dataset.drop(columns=["days_employed", "birthday"], inplace=True)

In [26]:
# Checking the imputation status: remaining missing values per column
feature_dataset.isna().sum()

In [27]:
# Encoding Binary Features

def encode_binary_features(binary_dataset: pandas.DataFrame):
    """
    Encode binary flag columns as booleans.

    An element becomes True when it equals "Y" or 1 (True counts as 1);
    every other value, including "N", 0 and missing values, becomes False.
    NOTE(review): some flag columns may be stored as 0/1 rather than "Y"/"N" -
    both spellings are handled, confirm against the raw data.

    Binary features are converted because some models do not cope well with
    categorical variables and require a feature transformation.

    Args:
        binary_dataset: input dataset, which contains only binary features

    Returns:
        A new boolean DataFrame with the same index and columns. The input is
        not modified, so the caller has to assign the result back.
    """
    # Vectorised element-wise test. The former row-wise lambda compared a whole
    # row with "Y", whose truth value is ambiguous (ValueError), and the
    # computed result was thrown away.
    return binary_dataset.isin(["Y", 1])
    
def encode_gender_feature(gender_feature: pandas.Series) -> pandas.DataFrame:
    """
    One-hot encode the gender feature.

    Args:
        gender_feature: pandas.Series object holding the client's gender code

    Returns:
        DataFrame with one indicator column per distinct value; the columns
        'F' and 'M' are renamed to 'Female' and 'Male'.
    """
    readable_labels = {'F': 'Female', 'M': 'Male'}
    one_hot = pandas.get_dummies(gender_feature)
    return one_hot.rename(columns=readable_labels)


binary_features = ["has_car", "has_mobile_phone",
"has_phone", "has_email", "has_work_phone", "has_realty"]

# The helper receives a column selection, so feature_dataset itself is never
# changed by the call: the encoded flags have to be written back explicitly.
# Nothing is written when the helper hands back None.
encoded_flags = encode_binary_features(feature_dataset[binary_features])
if encoded_flags is not None:
    feature_dataset[binary_features] = encoded_flags

gender_features = encode_gender_feature(feature_dataset['gender'])


# Dropping gender column. DataFrame.drop returns a new frame, so the result
# must be assigned back - otherwise the raw column stays in the dataset.
feature_dataset = feature_dataset.drop(columns=['gender'])

# combining with encoded gender frame
feature_dataset = pandas.concat([feature_dataset, gender_features], axis=1)

feature_dataset.isna().sum()

In [None]:
# Encoding Multi class categorical features 

In [None]:
# Analyzing Education Category: number of distinct labels

feature_dataset['education_category'].nunique()

In [None]:
# There are 5 unique groups in the Education Category feature.
# Let's review each of them
feature_dataset['education_category'].unique()

In [30]:
# Education data has a natural hierarchy, so we can consider applying an Ordinal Encoder

from sklearn.preprocessing import OrdinalEncoder 

def scale_hierarhical_feature(
    data_series: typing.Union[pandas.Series, pandas.DataFrame],
    categories: typing.List[typing.List[str]],
):
    """
    Function scales a feature using Ordinal Encoding

    Notes:
        1. Feature should have a logical hierarchy, otherwise it would not make sense.
        Common example is school grades: 2, 3, 4, 5, which denote corresponding
        feedback about work in hierarchical order
        2. Codes follow the position inside `categories`: the first listed
        category is encoded as 0, the next one as 1, and so on

    Args:
        1. data_series: pandas.Series (or DataFrame) that should be encoded
        2. categories: one hierarchically ordered list of categories per
        encoded column (from top to bottom), wrapped in an outer list

    Returns:
        pandas.Series with the ordinal-encoded values when a Series or a
        single-column DataFrame was passed, otherwise a pandas.DataFrame;
        index and names are taken from the input

    Raises:
        ValueError: when the input contains null values
    """
    is_series = isinstance(data_series, pandas.Series)
    # The encoder needs 2-D input, so a Series is wrapped into a one-column frame
    frame = data_series.to_frame() if is_series else data_series

    # .to_numpy().any() reduces to one plain boolean for Series and DataFrame
    # alike; `isna().sum() > 0` on a DataFrame is a Series whose truth value
    # is ambiguous.
    if frame.isna().to_numpy().any():
        raise ValueError("Series contains null values")

    scaler = OrdinalEncoder(categories=categories)
    encoded = numpy.asarray(scaler.fit_transform(frame))

    if frame.shape[1] == 1:
        name = data_series.name if is_series else frame.columns[0]
        return pandas.Series(encoded[:, 0], index=frame.index, name=name)
    return pandas.DataFrame(encoded, index=frame.index, columns=frame.columns)

# Updating Education Category Labels

# Current label -> harmonised label
EDUCATION_LABELS = {
    'secondary / secondary special': 'special secondary',
    'incomplete higher': 'incomplete higher education',
    'academic degree': 'academic degree',
    'higher education': 'higher education',
    'lower secondary': 'lower secondary',
}
feature_dataset['education_category'] = feature_dataset['education_category'].map(EDUCATION_LABELS)

# Education levels listed from the top of the hierarchy to the bottom
EDUCATION_ORDER = [
    "academic degree",
    "higher education",
    "incomplete higher education",
    "special secondary",
    "lower secondary",
]

feature_dataset["education_category"] = scale_hierarhical_feature(
    data_series=feature_dataset[['education_category']],
    categories=[EDUCATION_ORDER],
)

feature_dataset['education_category']


In [None]:
# Encoding Family Status categorical feature

In [38]:
# Analyzing Family status unique categories
# (this cell is about family_status, so that is the column to inspect,
# not education_category)

feature_dataset['family_status'].unique()

In [None]:
# Estimating which approach would be the most applicable in our case scenario 

In [37]:
def encode_family_status(feature_series: pandas.Series):
    """
    One-hot encode the family status feature.

    Args:
        feature_series: pandas.Series with the family status labels

    Returns:
        DataFrame with one indicator column per distinct label.
        If the encoding itself fails (TypeError, ValueError, AttributeError),
        the error is logged and a single-column placeholder frame filled with
        "unknown value" is returned; it keeps the index of the input so it can
        still be concatenated column-wise with the source table.

    Raises:
        TypeError: when the series contains null values
    """
    if feature_series.isna().sum() > 0:
        raise TypeError("Feature Series contains null values")
    try:
        return pandas.get_dummies(feature_series)
    except (TypeError, ValueError, AttributeError) as exc:
        Logger.error(exc)
        # numpy.full builds the (n, 1) placeholder; the shape is passed as one tuple
        placeholder = numpy.full((len(feature_series), 1), "unknown value", dtype=object)
        return pandas.DataFrame(
            placeholder,
            index=feature_series.index,
            columns=["unknown value"],
        )

        
encoded_family_status = encode_family_status(feature_dataset['family_status'])


# concatenating tables together
feature_dataset = pandas.concat([feature_dataset, encoded_family_status], axis=1)

# dropping family status column, as we no longer need it
feature_dataset.drop(columns=["family_status"], inplace=True)

# Preview the result. The former lookup feature_dataset[''] raised KeyError,
# since this notebook never creates a column with an empty name.
feature_dataset.head()


In [31]:
# Standardizing Numeric Features

# Every numeric column except the client identifier.
# The id is excluded by name: skipping "the first numeric column" only works
# while client_id happens to be first, and silently drops a real feature otherwise.
# NOTE(review): presumably the positional [1:] was meant to skip client_id - confirm.
numeric_set = [
    column
    for column in feature_dataset.select_dtypes(include='number').columns
    if column != "client_id"
]

numeric_set

In [32]:
from sklearn.preprocessing import StandardScaler 

scaler = StandardScaler()

# The scaled frame keeps feature_dataset's index. feature_dataset had rows
# removed by drop_duplicates without its index being reset, so a fresh 0..n-1
# index on the scaled frame would not line up with it: assigning the columns
# back by index would then misplace values and introduce NaN.
scaled_data = pandas.DataFrame(
    scaler.fit_transform(feature_dataset[numeric_set]),
    columns=numeric_set,
    index=feature_dataset.index,
)

scaled_data.isna().sum()

In [35]:
# Merging back to the main feature table

# Values are written back by position (.to_numpy()): the scaled rows are in the
# same order as feature_dataset's rows, so no index alignment is involved and
# no row can be misplaced or turned into NaN.
feature_dataset[numeric_set] = scaled_data.to_numpy()

# drop=True discards the old row labels instead of inserting them as an extra
# "index" column (which would also make a second run of this cell fail).
feature_dataset.reset_index(drop=True, inplace=True)

feature_dataset.info()

In [36]:
# Display the processed feature table
feature_dataset

In [None]:
# Encoding Multi-class categories

In [None]:
# Encoding 