In [1]:
# Import all of the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Create individual columns
Credit_Score = [700,800,900,950]
Income = [50000,60000,70000,80000,90000]
DTI_Ratios = [20,30,40,50]
Employment = ['Employed','Unemployed','Self-Employed','Student']
Loan_Amount = [10000,20000,30000,40000,50000]
Purpose = ['Home Improvement','Business','Education','Personal Loan']
Credit_History = [5,10,15,20]
Open_Accounts = [1,2,3,4]
Utilization = [30,40,50,60]
Delinquencies = [0,1,2,3]
Bankruptcies = [0,1]
Housing = ['Rent','Own']

In [3]:
# Generate synthetic data
creditworthiness = []

for i in range(1,10000):
    credit_score = np.random.randint(1,500)
    income = np.random.randint(1,900)
    dti_ratio = np.random.randint(1,50)
    employment = np.random.choice(Employment)
    loan_amount = np.random.randint(1,500)
    purpose = np.random.choice(Purpose)
    credit_history = np.random.randint(1,20)
    open_accounts = np.random.randint(1,4)
    utilization = np.random.randint(1,60)
    delinquencies = np.random.randint(0,4)
    bankruptcies = np.random.choice(Bankruptcies)
    housing = np.random.choice(Housing)

  # Add all of the columns
    creditworthiness.append([credit_score,income,dti_ratio,employment,loan_amount,purpose,credit_history,open_accounts,
                             utilization,delinquencies,bankruptcies,housing])

In [4]:
# Define the column names
columns = ['Credit_Score', 'Income', 'DTI_Ratios', 'Employment', 'Loan_Amount', 'Purpose',
           'Credit_History', 'Open_Accounts', 'Utilization', 'Delinquencies', 'Bankruptcies', 'Housing']

In [5]:
# Create DataFrame
credit = pd.DataFrame(creditworthiness, columns=columns)

# Save to CSV
credit.to_csv("creditworthiness.csv", index=False) # Changed creditworthiness to credit

In [6]:
# Load the sample dataset
# Observe first five rows
credit = pd.read_csv("creditworthiness.csv")
credit.head()

Unnamed: 0,Credit_Score,Income,DTI_Ratios,Employment,Loan_Amount,Purpose,Credit_History,Open_Accounts,Utilization,Delinquencies,Bankruptcies,Housing
0,268,769,37,Student,415,Education,1,3,47,3,1,Rent
1,90,597,4,Employed,494,Personal Loan,19,1,49,1,0,Own
2,134,249,24,Student,64,Education,7,2,48,0,1,Rent
3,410,864,38,Student,122,Personal Loan,5,1,21,0,1,Rent
4,351,364,6,Self-Employed,433,Education,18,2,30,0,1,Rent


In [7]:
# Observe last five rows
credit.tail()

Unnamed: 0,Credit_Score,Income,DTI_Ratios,Employment,Loan_Amount,Purpose,Credit_History,Open_Accounts,Utilization,Delinquencies,Bankruptcies,Housing
9994,106,817,33,Employed,217,Education,3,1,45,0,0,Own
9995,262,473,44,Unemployed,319,Personal Loan,11,1,25,1,1,Own
9996,243,265,24,Employed,328,Personal Loan,4,2,16,0,1,Rent
9997,142,515,45,Self-Employed,67,Home Improvement,9,3,52,2,1,Rent
9998,82,429,17,Employed,123,Personal Loan,14,1,37,2,0,Own


In [8]:
# Encode all of the categorical variables
for col in credit:
    if credit[col].dtype == 'O':
        codes, _ = pd.factorize(credit[col])
        credit[col] = codes + 1

In [9]:
# Observe first five rows again
credit.head()

Unnamed: 0,Credit_Score,Income,DTI_Ratios,Employment,Loan_Amount,Purpose,Credit_History,Open_Accounts,Utilization,Delinquencies,Bankruptcies,Housing
0,268,769,37,1,415,1,1,3,47,3,1,1
1,90,597,4,2,494,2,19,1,49,1,0,2
2,134,249,24,1,64,1,7,2,48,0,1,1
3,410,864,38,1,122,2,5,1,21,0,1,1
4,351,364,6,3,433,1,18,2,30,0,1,1


In [10]:
# Introduce more complex conditions and randomness
for index in credit.index:

    # Probabilistic approach based on multiple factors
    default_prob = 0.0

    if credit.loc[index, 'Credit_Score'] < 600:
        default_prob += 0.3
    if credit.loc[index, 'DTI_Ratios'] > 40:
        default_prob += 0.2
    if credit.loc[index, 'Delinquencies'] > 1:
        default_prob += 0.25
    if credit.loc[index, 'Bankruptcies'] > 0:
        default_prob += 0.4

    # Add some randomness to the decision
    if np.random.rand() < default_prob:
        credit.loc[index, 'default'] = 1
    else:
        # Assign 0 if the conditions for default are not met
        credit.loc[index, 'default'] = 0

# Fill any remaining NaN values with 0 to be sure
credit['default'] = credit['default'].fillna(0)

In [11]:
# Model Training and Evaluation
X = credit.drop('default', axis=1)
y = credit['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Scale and transform the training and test datasets
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Initialize the model
model = RandomForestClassifier(random_state=42, max_depth=5, n_estimators=50, min_samples_split=5)
model.fit(X_train, y_train)

In [14]:
# Predict and print the performance metric scores
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Accuracy: 0.74
Precision: 0.75
Recall: 0.91
F1-Score: 0.82
