In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

## Importing dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
dataset=pd.read_csv("/content/drive/MyDrive/ETH/dataset/Preprocessed_Missing_dataset.csv")

## Data preparation

In [4]:
# shows count of rows and columns
dataset.shape

(100000, 28)

In [5]:
#shows first few rows of the code
dataset.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,31.94496,22 Years and 2 Months,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629162,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521,Low_spent_Medium_value_payments,331.209863,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45131,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,Good


In [6]:
dataset.Credit_Score.unique()

array(['Good', 'Standard', 'Poor'], dtype=object)

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      100000 non-null  object 
 4   Age                       100000 non-null  int64  
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  float64
 8   Monthly_Inhand_Salary     100000 non-null  float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  int64  
 13  Type_of_Loan              100000 non-null  ob

In [8]:
# Select specific data
columns_to_select = ['Customer_ID','Name', "Age", "Occupation", "Annual_Income", 'Monthly_Inhand_Salary', 'Type_of_Loan', 'Credit_Score' ]

In [9]:
# Select the specified columns
selected_dataset = dataset[columns_to_select]

In [10]:
# explore missing values post missing value fix
selected_dataset.isna().sum()

Unnamed: 0,0
Customer_ID,0
Name,0
Age,0
Occupation,0
Annual_Income,0
Monthly_Inhand_Salary,0
Type_of_Loan,0
Credit_Score,0


In [11]:
df1 = selected_dataset.dropna()

In [12]:
df1.shape

(100000, 8)

In [13]:
sampled_dataset = df1.sample(n=3000, random_state=42)
sampled_dataset.shape

(3000, 8)

In [14]:
# Merge personal data with transaction features
transaction_features=pd.read_csv("/content/drive/MyDrive/ETH/dataset/transaction_features.csv")
transaction_features.head()

Unnamed: 0,wallet_id,num_transactions,total_value,avg_transaction_value,num_unique_addresses,default_address,balance,can_sign,network_id,Credit_Score
0,wallet_0,52,33614.85349,646.43949,13,address_0,87667.45009,True,network_3,Standard
1,wallet_1,93,19746.27262,212.325512,4,address_1,98611.21388,False,network_1,Poor
2,wallet_2,15,3291.867938,219.457863,9,address_2,95710.14627,False,network_1,Poor
3,wallet_3,72,18131.02685,251.819817,8,address_3,15482.12286,True,network_1,Poor
4,wallet_4,61,20956.21622,343.544528,46,address_4,3872.076784,True,network_1,Poor


In [15]:
transaction_features.shape

(50000, 10)

In [16]:
transaction_features = transaction_features.sample(n=3000, random_state=42)

In [17]:
transaction_features.head()

Unnamed: 0,wallet_id,num_transactions,total_value,avg_transaction_value,num_unique_addresses,default_address,balance,can_sign,network_id,Credit_Score
33553,wallet_33553,31,30246.37115,975.689392,34,address_33553,1378.536198,False,network_2,Poor
9427,wallet_9427,94,76005.08024,808.564683,48,address_9427,45694.02298,True,network_3,Poor
199,wallet_199,62,90736.36994,1463.489838,31,address_199,91878.92694,True,network_2,Good
12447,wallet_12447,6,89300.00376,14883.33396,47,address_12447,83485.57706,True,network_3,Standard
39489,wallet_39489,58,7597.199856,130.986204,18,address_39489,87748.05087,False,network_1,Poor


In [18]:
#merged_data = pd.concat([sampled_dataset, transaction_features], axis=0)
merged_data = pd.merge(sampled_dataset, transaction_features, on='Credit_Score', how='inner')

In [19]:
merged_data.head()

Unnamed: 0,Customer_ID,Name,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Type_of_Loan,Credit_Score,wallet_id,num_transactions,total_value,avg_transaction_value,num_unique_addresses,default_address,balance,can_sign,network_id
0,CUS_0x3539,an Arakalid,30,Manager,43391.96,3688.996667,"Auto Loan, Personal Loan, Payday Loan, and Cre...",Good,wallet_199,62,90736.36994,1463.489838,31,address_199,91878.92694,True,network_2
1,CUS_0x3539,an Arakalid,30,Manager,43391.96,3688.996667,"Auto Loan, Personal Loan, Payday Loan, and Cre...",Good,wallet_10822,52,59587.01177,1145.904073,45,address_10822,91685.91451,True,network_2
2,CUS_0x3539,an Arakalid,30,Manager,43391.96,3688.996667,"Auto Loan, Personal Loan, Payday Loan, and Cre...",Good,wallet_43106,61,35139.77548,576.061893,31,address_43106,76836.19147,True,network_3
3,CUS_0x3539,an Arakalid,30,Manager,43391.96,3688.996667,"Auto Loan, Personal Loan, Payday Loan, and Cre...",Good,wallet_34304,78,58420.68586,748.983152,42,address_34304,14842.38851,False,network_1
4,CUS_0x3539,an Arakalid,30,Manager,43391.96,3688.996667,"Auto Loan, Personal Loan, Payday Loan, and Cre...",Good,wallet_12609,39,3573.307925,91.62328,6,address_12609,41807.56692,True,network_3


## Train Test Split

In [20]:
merged_data.shape

(2985510, 17)

In [21]:
print(merged_data.head())

  Customer_ID         Name  Age Occupation  Annual_Income  \
0  CUS_0x3539  an Arakalid   30    Manager       43391.96   
1  CUS_0x3539  an Arakalid   30    Manager       43391.96   
2  CUS_0x3539  an Arakalid   30    Manager       43391.96   
3  CUS_0x3539  an Arakalid   30    Manager       43391.96   
4  CUS_0x3539  an Arakalid   30    Manager       43391.96   

   Monthly_Inhand_Salary                                       Type_of_Loan  \
0            3688.996667  Auto Loan, Personal Loan, Payday Loan, and Cre...   
1            3688.996667  Auto Loan, Personal Loan, Payday Loan, and Cre...   
2            3688.996667  Auto Loan, Personal Loan, Payday Loan, and Cre...   
3            3688.996667  Auto Loan, Personal Loan, Payday Loan, and Cre...   
4            3688.996667  Auto Loan, Personal Loan, Payday Loan, and Cre...   

  Credit_Score     wallet_id  num_transactions   total_value  \
0         Good    wallet_199                62  90736.369940   
1         Good  wallet_10822  

In [22]:
merged_data.columns

Index(['Customer_ID', 'Name', 'Age', 'Occupation', 'Annual_Income',
       'Monthly_Inhand_Salary', 'Type_of_Loan', 'Credit_Score', 'wallet_id',
       'num_transactions', 'total_value', 'avg_transaction_value',
       'num_unique_addresses', 'default_address', 'balance', 'can_sign',
       'network_id'],
      dtype='object')

In [23]:
y = merged_data['Credit_Score']
X = merged_data.drop(columns=['Credit_Score'], axis=1)

In [24]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == 'object':  # Check if the column contains strings
        X[column] = label_encoder.fit_transform(X[column])

In [25]:
y = label_encoder.fit_transform(y)

In [26]:
# splitting dataset into training and test (in ratio 80:20)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=y)

## Model building with RandomForest

In [27]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=30, class_weight='balanced')
rf.fit(X_train, y_train)

## Model performance

In [28]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

predictions = rf.predict(X_test)
print('Classification Report', classification_report(y_test, predictions))
print('\n')
print('Confusion Matrix', confusion_matrix(y_test, predictions))
print('\n')
print('Accuracy Score', accuracy_score(y_test, predictions))

Classification Report               precision    recall  f1-score   support

           0       0.98      0.98      0.98    107802
           1       0.98      0.99      0.99    195468
           2       0.99      0.98      0.99    293832

    accuracy                           0.99    597102
   macro avg       0.98      0.99      0.99    597102
weighted avg       0.99      0.99      0.99    597102



Confusion Matrix [[106003      0   1799]
 [     0 193498   1970]
 [  1798   3036 288998]]


Accuracy Score 0.9855920763956577


In [29]:
# Display the predicted labels
print("Predicted labels:", predictions)

Predicted labels: [1 2 0 ... 1 2 1]


In [30]:
set(predictions)

{0, 1, 2}

## Save model

In [31]:
import joblib

joblib.dump(rf, "/content/drive/MyDrive/ETH/model_credit.joblib")

['/content/drive/MyDrive/ETH/model_credit.joblib']

## loan amount

In [32]:
def calculate_loan_amount(crypto_collateral, credit_score):
    # Loan-to-collateral ratio increases with higher credit scores
    if credit_score >= 750:
        loan_to_collateral_ratio = 0.75  # Can borrow up to 75% of collateral
    elif 700 <= credit_score < 750:
        loan_to_collateral_ratio = 0.6
    elif 650 <= credit_score < 700:
        loan_to_collateral_ratio = 0.5
    elif 600 <= credit_score < 650:
        loan_to_collateral_ratio = 0.4
    else:
        loan_to_collateral_ratio = 0.3  # Lower loan-to-collateral ratio for low credit score

    # Calculate max loan amount
    return crypto_collateral * loan_to_collateral_ratio

## calculate interest rate

In [33]:
def calculate_interest_rate(credit_score, loan_amount, loan_term, collateral=False):
    # Define interest rate tiers based on credit score
    if credit_score >= 750:
        base_rate = 0.05  # Excellent
    elif 700 <= credit_score < 750:
        base_rate = 0.08  # Good
    elif 650 <= credit_score < 700:
        base_rate = 0.12  # Fair
    elif 600 <= credit_score < 650:
        base_rate = 0.18  # Poor
    else:
        base_rate = 0.25  # Very Poor

    # Adjust interest rate based on loan amount, loan term, and collateral
    rate = base_rate

    # Higher loan amounts could lower rates slightly
    if loan_amount > 50000:
        rate -= 0.01  # 1% discount for larger loans

    # Longer loan terms increase rate due to higher risk
    if loan_term > 5:
        rate += 0.02  # Add 2% for terms over 5 years

    # Collateral reduces risk, so we lower the rate
    if collateral:
        rate -= 0.02  # 2% discount for collateral

    # Ensure rate doesn’t go below minimum or above maximum
    rate = max(0.03, min(rate, 0.3))  # Keep rate within 3% to 30%

    return rate

# Example usage
credit_score = 720
loan_amount = 60000
loan_term = 6  # in years
collateral = True

interest_rate = calculate_interest_rate(credit_score, loan_amount, loan_term, collateral)
print("Interest Rate:", interest_rate * 100, "%")

Interest Rate: 7.000000000000001 %
