# Load Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Path of the loan CSV (presumably a Colab Google Drive mount - confirm before running elsewhere).
DATA_PATH = '/content/drive/MyDrive/data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,The purpose of the loan.,The interest rate of the loan (more risky borrowers are assigned higher interest rates).,The monthly installments owed by the borrower if the loan is funded.,The FICO credit score of the borrower.,The number of days the borrower has had a credit line.,The borrower's number of inquiries by creditors in the last 6 months.,The number of times the borrower had been 30+ days past due on a payment in the past 2 years.,1 if the loan is not fully paid
0,Purpose,Rate,Instalment,Score,Days_Credit,Inquiries,Delay_2yrs,Fully_Paid
1,debt_consolidation,0.1189,829.1,737,5640,0,0,Yes
2,credit_card,0.1071,228.22,707,2760,0,0,Yes
3,debt_consolidation,0.1357,366.86,682,4710,1,0,Yes
4,debt_consolidation,0.1008,162.34,712,2700,1,0,Yes


In [None]:
# The first data row carries the short column names (Purpose, Rate, ...).
# Capture it, drop it from the data while renumbering the rows from 0,
# then install it as the column labels.
header_row = df.iloc[0]
df = df.iloc[1:].reset_index(drop=True)
df.columns = header_row

In [None]:
# Preview again to confirm the short names are now the column labels.
df.head(n=5)

Unnamed: 0,Purpose,Rate,Instalment,Score,Days_Credit,Inquiries,Delay_2yrs,Fully_Paid
0,debt_consolidation,0.1189,829.1,737,5640,0,0,Yes
1,credit_card,0.1071,228.22,707,2760,0,0,Yes
2,debt_consolidation,0.1357,366.86,682,4710,1,0,Yes
3,debt_consolidation,0.1008,162.34,712,2700,1,0,Yes
4,credit_card,0.1426,102.92,667,4066,0,1,Yes




---



# Data Type

In [None]:
# Columns that are treated as numeric features throughout the notebook.
numerical_col = [
    'Rate',
    'Instalment',
    'Score',
    'Days_Credit',
    'Inquiries',
    'Delay_2yrs',
]


def check_datatype(df, numerical_col):
    """Print the dtype of every column named in ``numerical_col``.

    Args:
        df: DataFrame holding the columns to inspect.
        numerical_col: Iterable of column labels to report on.

    Returns:
        None. One ``<label> : <dtype>`` line is written to stdout per column.
    """
    for label in numerical_col:
        print(f"{label} : {df[label].dtype}")

In [None]:
# Report the dtypes of the numeric feature columns as they were parsed.
check_datatype(df=df, numerical_col=numerical_col)

Rate : object
Instalment : object
Score : object
Days_Credit : object
Inquiries : object
Delay_2yrs : object


In [None]:
def convertor(df, numerical_col):
    """Cast the listed columns of ``df`` to float, modifying ``df`` in place.

    Args:
        df: DataFrame whose columns are converted.
        numerical_col: Iterable of column labels to cast.

    Returns:
        None. The converted columns are written back onto ``df``.
    """
    for label in numerical_col:
        as_float = df[label].astype(float)
        df[label] = as_float

In [None]:
# Cast the numeric feature columns to float (mutates df in place).
convertor(df=df, numerical_col=numerical_col)

In [None]:
# Re-check the dtypes of the numeric feature columns.
check_datatype(df=df, numerical_col=numerical_col)

Rate : float64
Instalment : float64
Score : float64
Days_Credit : float64
Inquiries : float64
Delay_2yrs : float64




---



# Target Variable

In [None]:
# List the distinct labels present in the target column.
df.loc[:, 'Fully_Paid'].unique()

array(['Yes', 'No'], dtype=object)

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# Series.map + an explicit astype is used instead of Series.replace because
# replace relied on an implicit object -> int downcast that pandas 2.2
# deprecates; without that downcast the column would stay `object`.
# The 1 -> 1 / 0 -> 0 entries keep this cell safe to re-run on an already
# encoded column. Any other value maps to NaN, which makes astype raise and
# so surfaces unexpected labels instead of passing them through silently.
# NOTE(review): the file's description row reads "1 if the loan is not fully
# paid" while the column is named Fully_Paid - confirm the intended polarity.
df['Fully_Paid'] = df['Fully_Paid'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0}).astype('int64')

In [None]:
# Confirm which distinct values the target column holds after encoding.
df.loc[:, 'Fully_Paid'].unique()

array([1, 0])



---



# Correlation Numerical ~ Target

In [None]:
from scipy.stats import pointbiserialr

numerical_col = ['Rate', 'Instalment','Score','Days_Credit','Inquiries','Delay_2yrs']
target = 'Fully_Paid'

# Point-biserial correlation of each numeric feature with the binary target.
# Maps column name -> correlation coefficient; the p-value is discarded.
correlation = {}

for col in numerical_col:
    # pointbiserialr documents its first argument as the binary variable and
    # its second as the continuous one, so the target goes first. The
    # coefficient itself is symmetric in the two arguments.
    corr, _ = pointbiserialr(df[target], df[col])
    correlation[col] = corr

# One row per feature, ordered from the most positive to the most negative
# correlation.
correlation_df = pd.DataFrame(list(correlation.items()), columns=['Variable', 'Correlation']).sort_values(by='Correlation', ascending=False)

In [None]:
# Display the per-feature correlations with the target, sorted in descending order.
correlation_df

Unnamed: 0,Variable,Correlation
2,Score,0.191202
5,Delay_2yrs,0.110748
3,Days_Credit,0.08281
1,Instalment,-0.082602
4,Inquiries,-0.106988
0,Rate,-0.274115




---



# Correlation Categorical ~ Target

In [None]:
from scipy.stats import chi2_contingency

categorical_col = ['Purpose']
target = 'Fully_Paid'

# Chi-square test of independence between each categorical feature and the
# target. Maps column name -> (chi2 statistic, p-value).
chi2_results = {}

for col in categorical_col:
    # Contingency table: feature categories as rows, target classes as columns.
    crosstab = pd.crosstab(index=df[col], columns=df[target])
    chi2, p, _, _ = chi2_contingency(crosstab)
    chi2_results.update({col: (chi2, p)})

# One row per feature; the statistic and p-value stay together as a single
# tuple in the 'Chi2_p' column.
chi2_rows = list(chi2_results.items())
chi2_result_df = pd.DataFrame(chi2_rows, columns=['Variable', 'Chi2_p'])

In [None]:
# Display the chi-square result per categorical feature as a (statistic, p-value) tuple.
chi2_result_df

Unnamed: 0,Variable,Chi2_p
0,Purpose,"(8.810452832191963, 0.18452203894824112)"




---

