### Credit Assesment Classification Project

#### Imports

In [304]:
import sklearn
import pandas as pd
import numpy as np
from dateutil import relativedelta

In [305]:
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

  train=pd.read_csv("train.csv")


#### Getting Baseline for Classification

In [306]:
baseline_acc_train=train["Credit_Score"].value_counts()
baseline_acc_train

Credit_Score
Standard    53174
Poor        28998
Good        17828
Name: count, dtype: int64

In [307]:
total_samples = baseline_acc_train.sum()
majority_count = baseline_acc_train.max()
percentage_majority = (majority_count / total_samples) * 100

print("Baseline prediction when guessing majority class for training dataset: {:.2f}%".format(percentage_majority))

Baseline prediction when guessing majority class for training dataset: 53.17%


#### Familiarizing with the data

Fixing the occurences of _ in float/int type objects

In [308]:
train['SSN_Location'] = train['SSN'].str[:3]

In [309]:
for column in ['Annual_Income',"Age", 'Num_of_Loan', 'Num_of_Delayed_Payment', "Changed_Credit_Limit", "Outstanding_Debt", "Amount_invested_monthly","Monthly_Balance","SSN_Location"]:
    train[column] = train[column].str.replace('_', '')

for column in ['Age', 'Num_of_Loan', "SSN_Location"]:
    try:
        train[[column]] = train[[column]].astype(int)
    except:
        pass

for column in train[['Annual_Income', 'Num_of_Delayed_Payment', "Changed_Credit_Limit", "Outstanding_Debt", "Amount_invested_monthly","Monthly_Balance"]]:
    try:
        train[[column]] = train[[column]].astype(float)
    except:
        pass

Dealing with some Na values by filling them with inferred values for the same client and othertimes by changing Nas to different values

In [310]:
train['Monthly_Inhand_Salary'] = train.groupby('Customer_ID')['Monthly_Inhand_Salary'].transform(lambda x: x.fillna(x.median()))
train['Monthly_Inhand_Salary'].fillna(train['Monthly_Inhand_Salary'].median(), inplace=True)

In [311]:
train.sort_values(['Customer_ID', 'Month'], inplace=True)
train['Num_of_Delayed_Payment'].fillna(method='ffill', inplace=True)
train['Num_of_Delayed_Payment'].fillna(train['Num_of_Delayed_Payment'].median(), inplace=True)

In [312]:
train['Num_Credit_Inquiries'].fillna(method='ffill', inplace=True)
train['Num_Credit_Inquiries'].fillna(train['Num_Credit_Inquiries'].median(), inplace=True)

In [313]:
train['Type_of_Loan'].fillna('Missing_Information', inplace=True)

In [314]:
train['Amount_invested_monthly'] = train.groupby('Customer_ID')['Amount_invested_monthly'].transform(lambda x: x.fillna(x.median()))
train["Amount_invested_monthly"].fillna(train['Amount_invested_monthly'].median(), inplace=True)

In [315]:
train['Monthly_Balance'] = train.groupby('Customer_ID')['Monthly_Balance'].transform(
    lambda x: x.fillna(x.median() if x.notna().any() else np.nan)
)

train["Monthly_Balance"].fillna(train['Monthly_Balance'].median(), inplace=True)


In [316]:
train.sort_values(['Customer_ID', 'Month', 'Credit_History_Age'], inplace=True)
train['Credit_History_Age'] = train.groupby('Customer_ID')['Credit_History_Age'].transform(
    lambda x: x.fillna(x.ffill().bfill())
)

def calculate_month_difference(start_month, end_month):
    start_year = 1
    end_year = 1

    if start_month > end_month:
        end_year = 2

    months_diff = abs(start_month - end_month)
    years_diff = end_year - start_year

    return relativedelta.relativedelta(months=months_diff, years=years_diff)

for index, row in train.iterrows():
    if pd.isna(row['Credit_History_Age']):
        client_id = row['Customer_ID']
        month_num = row['Month']
        
        prev_row = train.loc[(train['Customer_ID'] == client_id) & (train['Month'] < month_num), 'Credit_History_Age'].dropna().tail(1)
        next_row = train.loc[(train['Customer_ID'] == client_id) & (train['Month'] > month_num), 'Credit_History_Age'].dropna().head(1)
        
        if not prev_row.empty and not next_row.empty:
            prev_age = pd.to_timedelta(prev_row.values[0])
            next_age = pd.to_timedelta(next_row.values[0])
            
            month_diff = calculate_month_difference(prev_age.months, next_age.months)
            true_age = prev_age + month_diff
            
            train.at[index, 'Credit_History_Age'] = str(true_age.years) + ' Years and ' + str(true_age.months) + ' Months'



train['Credit_History_Age'].fillna(train['Credit_History_Age'].mode()[0], inplace=True)

Fixing invalid values 

In [317]:
min_age = 10
max_age = 150

train.sort_values(by=['Customer_ID', 'Month'], inplace=True)
train.reset_index(drop=True, inplace=True)

for i in range(1, len(train)):
    if train.loc[i, 'Age'] < min_age or train.loc[i, 'Age'] > max_age:
        if train.loc[i, 'Customer_ID'] == train.loc[i-1, 'Customer_ID']:
            train.loc[i, 'Age'] = train.loc[i-1, 'Age']

median_age = train['Age'].median()
for i in range(1, len(train)):
    if train.loc[i, 'Age'] < min_age or train.loc[i, 'Age'] > max_age:
        train.loc[i, 'Age'] = median_age

In [318]:
min_income = 0
max_income = 10
num_changed_values=0

train.sort_values(by=['Customer_ID', 'Month'], inplace=True)
train.reset_index(drop=True, inplace=True)

for i in range(1, len(train)):
    wrong_income=False
    if train.loc[i, 'Customer_ID'] == train.loc[i-1, 'Customer_ID'] and (train.loc[i, 'Annual_Income']*max_income<train.loc[i-1, 'Annual_Income'] or train.loc[i, 'Annual_Income']>max_income*train.loc[i-1, 'Annual_Income']):
        wrong_income=True
    if train.loc[i, 'Annual_Income'] < min_income or wrong_income:
        try:
            if train.loc[i, 'Customer_ID'] == train.loc[i-1, 'Customer_ID'] or train.loc[i, 'Customer_ID'] == train.loc[i+1, 'Customer_ID']:
                if train.loc[i, 'Annual_Income'] < min_income:
                    train.loc[i, 'Annual_Income']=max(train.loc[i-1, 'Annual_Income'],train.loc[i, 'Annual_Income'])
                else:
                    train.loc[i, 'Annual_Income'] = min(train.loc[i-1, 'Annual_Income'],train.loc[i, 'Annual_Income'])
                num_changed_values+=1
        except:
            pass
print(num_changed_values)

986


In [319]:
min_accounts = 0
max_accounts = 100
num_changed_values=0

train.sort_values(by=['Customer_ID', 'Month'], inplace=True)
train.reset_index(drop=True, inplace=True)

for i in range(0, len(train)-1):
    if train.loc[i, 'Num_Bank_Accounts'] < min_accounts or train.loc[i, 'Num_Bank_Accounts'] > max_accounts:
        if train.loc[i, 'Customer_ID'] == train.loc[i+1, 'Customer_ID'] and train.loc[i, 'Num_Bank_Accounts'] != train.loc[i+1, 'Num_Bank_Accounts']:
            train.loc[i, 'Num_Bank_Accounts'] = train.loc[i+1, 'Num_Bank_Accounts']
            num_changed_values+=1
print(num_changed_values)

1103


In [320]:
min_cards = 0
max_cards = 100
num_changed_values=0

train.sort_values(by=['Customer_ID', 'Month'], inplace=True)
train.reset_index(drop=True, inplace=True)

for i in range(0, len(train)-1):
    if train.loc[i, 'Num_Credit_Card'] < min_cards or train.loc[i, 'Num_Credit_Card'] > max_cards:
        if train.loc[i, 'Customer_ID'] == train.loc[i+1, 'Customer_ID'] and train.loc[i, 'Num_Credit_Card'] != train.loc[i+1, 'Num_Credit_Card']:
            train.loc[i, 'Num_Credit_Card'] = train.loc[i+1, 'Num_Credit_Card']
            num_changed_values+=1
print(num_changed_values)

1858


In [321]:
upper_range=np.percentile(train['Interest_Rate'], 98)
lower_range=0

median_value = train['Interest_Rate'].median()
train.loc[(train['Interest_Rate'] > upper_range) | (train['Interest_Rate'] < lower_range), 'Interest_Rate'] = median_value

In [322]:
upper_range=np.percentile(train['Num_of_Loan'], 99.5)
lower_range=0

median_value = train['Num_of_Loan'].median()
train.loc[(train['Num_of_Loan'] > upper_range) | (train['Num_of_Loan'] < lower_range), 'Num_of_Loan'] = median_value

In [323]:
unique_values = set()
for value in train['Type_of_Loan']:
    for sub_value in value.split(', '):
        unique_values.add(sub_value.replace("and ",""))

for value in unique_values:
    train[value] = train['Type_of_Loan'].apply(lambda x: 1 if value in x else 0)

train.drop(columns=["Type_of_Loan"],inplace=True)

In [324]:
min_delays = 0
max_delays = 5
num_changed_values=0

train.sort_values(by=['Customer_ID', 'Month'], inplace=True)
train.reset_index(drop=True, inplace=True)

for i in range(1, len(train)):
    wrong_income=False
    if train.loc[i, 'Customer_ID'] == train.loc[i-1, 'Customer_ID'] and (train.loc[i, 'Num_of_Delayed_Payment']*max_income<train.loc[i-1, 'Num_of_Delayed_Payment'] or train.loc[i, 'Num_of_Delayed_Payment']>max_income*train.loc[i-1, 'Num_of_Delayed_Payment']):
        wrong_income=True
    if train.loc[i, 'Num_of_Delayed_Payment'] < min_income or wrong_income:
        try:
            if train.loc[i, 'Customer_ID'] == train.loc[i-1, 'Customer_ID'] or train.loc[i, 'Customer_ID'] == train.loc[i+1, 'Customer_ID']:
                if train.loc[i, 'Num_of_Delayed_Payment'] < min_income:
                    train.loc[i, 'Num_of_Delayed_Payment']=max(train.loc[i-1, 'Num_of_Delayed_Payment'],train.loc[i, 'Num_of_Delayed_Payment'])
                else:
                    train.loc[i, 'Num_of_Delayed_Payment'] = min(train.loc[i-1, 'Num_of_Delayed_Payment'],train.loc[i, 'Num_of_Delayed_Payment'])
                num_changed_values+=1
        except:
            pass
print(num_changed_values)

3197


In [325]:
upper_range=np.percentile(train['Num_of_Delayed_Payment'], 99.8)
lower_range=0

median_value = train['Num_of_Delayed_Payment'].median()
train.loc[(train['Num_of_Delayed_Payment'] > upper_range) | (train['Num_of_Delayed_Payment'] < lower_range), 'Num_of_Delayed_Payment'] = median_value

In [326]:
num_changed_values=0
train.sort_values(by=['Customer_ID', 'Month'], inplace=True)
train.reset_index(drop=True, inplace=True)

for i in range(0, len(train)-1):
    if train.loc[i, 'Changed_Credit_Limit']=='':
        if train.loc[i, 'Customer_ID'] == train.loc[i+1, 'Customer_ID']:
            train.loc[i, 'Changed_Credit_Limit'] = train.loc[i+1, 'Changed_Credit_Limit']
        else:
            train.loc[i, 'Changed_Credit_Limit'] = 0
        num_changed_values+=1

for i in range(0,len(train)):
    if train.loc[i, 'Changed_Credit_Limit']=='':
        train.loc[i, 'Changed_Credit_Limit'] = 0
        num_changed_values+=1

train[['Changed_Credit_Limit']] = train[['Changed_Credit_Limit']].astype(float)
print(num_changed_values)

2123


In [327]:
upper_range=np.percentile(train['Num_Credit_Inquiries'], 98.3)
lower_range=0

train.sort_values(by=['Customer_ID', 'Month'], inplace=True)
train.reset_index(drop=True, inplace=True)

for i in range(0, len(train)-1):
    if train.loc[i, 'Num_Credit_Inquiries'] < lower_range or train.loc[i, 'Num_Credit_Inquiries'] > upper_range:
        if train.loc[i, 'Customer_ID'] == train.loc[i+1, 'Customer_ID'] and train.loc[i, 'Num_Credit_Inquiries'] != train.loc[i+1, 'Num_Credit_Inquiries']:
            train.loc[i, 'Num_Credit_Inquiries'] = train.loc[i+1, 'Num_Credit_Inquiries']
            num_changed_values+=1
        try:
            if train.loc[i, 'Customer_ID'] == train.loc[i-1, 'Customer_ID'] and train.loc[i, 'Num_Credit_Inquiries'] != train.loc[i-1, 'Num_Credit_Inquiries']:
                train.loc[i, 'Num_Credit_Inquiries'] = train.loc[i-1, 'Num_Credit_Inquiries']
                num_changed_values+=1
        except:
            pass

median_value = train['Num_Credit_Inquiries'].median()
for i in range(0,len(train)-1):
    if train.loc[i, 'Num_Credit_Inquiries'] < lower_range or train.loc[i, 'Num_Credit_Inquiries'] > upper_range:
        train.loc[i,"Num_Credit_Inquiries"] = median_value

print(num_changed_values)

4195


Assuming ______ in Occupation means 'unemployed'

In [328]:
to_replace = '_______'
replacement = 'Unemployed'

train['Occupation'] = train['Occupation'].replace(to_replace, replacement)

Assuming --- in Credit_Mix means 'Missing'

In [338]:
to_replace = '_'
replacement = 'Missing'

train['Credit_Mix'] = train['Credit_Mix'].replace(to_replace, replacement)

Dropping ID, Customer_ID and Name columns. Extracting first 3 values from SSN column, adding column to show the result of classification for the previous loan request made by the client.

In [329]:
train = train.sort_values(["Customer_ID", 'Month'])
train['Previous_Classification'] = ''
for index, row in train.iterrows():
    if index > 0 and train.at[index-1, "Customer_ID"] == row["Customer_ID"]:
        train.at[index, 'Previous_Classification'] = train.at[index-1, 'Credit_Score']
    else:
        train.at[index, 'Previous_Classification'] = "NCR"

train.drop(columns=["ID","Customer_ID", "Name","SSN"],inplace=True)

In [330]:
train.dtypes

Month                        object
Age                           int32
Occupation                   object
Annual_Income               float64
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                   int32
Delay_from_due_date           int64
Num_of_Delayed_Payment      float64
Changed_Credit_Limit        float64
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt            float64
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly     float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                 object
SSN_Location                 object
Home Equity Loan              int64
Credit-Builder Loan           int64
Mortgage Loan                 int64
Auto Loan                   

In [331]:
train

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Credit-Builder Loan,Mortgage Loan,Auto Loan,Student Loan,Debt Consolidation Loan,Missing_Information,Payday Loan,Not Specified,Personal Loan,Previous_Classification
0,April,17,Lawyer,30625.94,2706.161667,6,5,27,2,64,...,1,0,0,0,0,0,0,0,0,NCR
1,August,18,Lawyer,30625.94,2706.161667,6,5,27,2,57,...,1,0,0,0,0,0,0,0,0,Poor
2,February,17,Lawyer,30625.94,2706.161667,6,5,27,2,62,...,1,0,0,0,0,0,0,0,0,Poor
3,January,17,Lawyer,30625.94,2706.161667,6,5,27,2,62,...,1,0,0,0,0,0,0,0,0,Poor
4,July,18,Lawyer,30625.94,2706.161667,6,5,27,2,62,...,1,0,0,0,0,0,0,0,0,Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,January,29,Scientist,41398.44,3749.870000,8,7,13,6,25,...,0,1,1,0,0,0,1,0,0,Standard
99996,July,30,Scientist,41398.44,3749.870000,8,7,13,6,23,...,0,1,1,0,0,0,1,0,0,Standard
99997,June,29,Scientist,41398.44,3749.870000,8,7,13,6,23,...,0,1,1,0,0,0,1,0,0,Good
99998,March,29,Scientist,41398.44,3749.870000,8,7,13,6,25,...,0,1,1,0,0,0,1,0,0,Standard


Dealing with missing/incorrect values

In [342]:
train["Outstanding_Debt"].value_counts()

Outstanding_Debt
460.46     24
1109.03    24
1360.45    24
1151.70    24
51.50      16
           ..
1379.31     8
262.19      8
39.28       8
258.33      8
1701.88     8
Name: count, Length: 12203, dtype: int64

In [343]:
for val in train["Outstanding_Debt"]:
    print(val)

1562.91
1562.91
1562.91
1562.91
1562.91
1562.91
1562.91
1562.91
202.68
202.68
202.68
202.68
202.68
202.68
202.68
202.68
1030.2
1030.2
1030.2
1030.2
1030.2
1030.2
1030.2
1030.2
473.14
473.14
473.14
473.14
473.14
473.14
473.14
473.14
1233.51
1233.51
1233.51
1233.51
1233.51
1233.51
1233.51
1233.51
340.22
340.22
340.22
340.22
340.22
340.22
340.22
340.22
2773.09
2773.09
2773.09
2773.09
2773.09
2773.09
2773.09
2773.09
849.69
849.69
849.69
849.69
849.69
849.69
849.69
849.69
648.36
648.36
648.36
648.36
648.36
648.36
648.36
648.36
869.59
869.59
869.59
869.59
869.59
869.59
869.59
869.59
1852.86
1852.86
1852.86
1852.86
1852.86
1852.86
1852.86
1852.86
665.82
665.82
665.82
665.82
665.82
665.82
665.82
665.82
568.21
568.21
568.21
568.21
568.21
568.21
568.21
568.21
706.96
706.96
706.96
706.96
706.96
706.96
706.96
706.96
4913.15
4913.15
4913.15
4913.15
4913.15
4913.15
4913.15
4913.15
154.96
154.96
154.96
154.96
154.96
154.96
154.96
154.96
1835.67
1835.67
1835.67
1835.67
1835.67
1835.67
1835.67
1835.67


In [334]:
train.isna().sum()

Month                       0
Age                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Changed_Credit_Limit        0
Num_Credit_Inquiries        0
Credit_Mix                  0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Credit_History_Age          0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Payment_Behaviour           0
Monthly_Balance             0
Credit_Score                0
SSN_Location                0
Home Equity Loan            0
Credit-Builder Loan         0
Mortgage Loan               0
Auto Loan                   0
Student Loan                0
Debt Consolidation Loan     0
Missing_Information         0
Payday Loan                 0
Not Specified               0
Personal L

In [335]:
min_age = 10
max_age = 150

# Count and identify invalid values
invalid_values = train.loc[(train['Age'] < min_age) | (train['Age'] > max_age), 'Age']
num_invalid_values = len(invalid_values)

# Print the number of invalid values and the invalid values themselves
print("Number of invalid values:", num_invalid_values)
print("Invalid values:", invalid_values.unique().tolist())

Number of invalid values: 0
Invalid values: []
