In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.tree import plot_tree
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Reading the data 

In [2]:
df=pd.read_csv(r'train.csv')
df.head()

  df=pd.read_csv(r'train.csv')


Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


## Data Exploration

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

##### Features that will be deleted due its cardinality issue

In [4]:
df=df.drop(columns=['ID','Customer_ID','Name','SSN'])


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Month                     100000 non-null  object 
 1   Age                       100000 non-null  object 
 2   Occupation                100000 non-null  object 
 3   Annual_Income             100000 non-null  object 
 4   Monthly_Inhand_Salary     84998 non-null   float64
 5   Num_Bank_Accounts         100000 non-null  int64  
 6   Num_Credit_Card           100000 non-null  int64  
 7   Interest_Rate             100000 non-null  int64  
 8   Num_of_Loan               100000 non-null  object 
 9   Type_of_Loan              88592 non-null   object 
 10  Delay_from_due_date       100000 non-null  int64  
 11  Num_of_Delayed_Payment    92998 non-null   object 
 12  Changed_Credit_Limit      100000 non-null  object 
 13  Num_Credit_Inquiries      98035 non-null   fl

##### Working on the mounth feature


In [6]:
df['Month'].value_counts()

Month
January     12500
February    12500
March       12500
April       12500
May         12500
June        12500
July        12500
August      12500
Name: count, dtype: int64

In [7]:
dic_Mounth={'January':1,
    'February':2,
    'March':3,
    'April':4,
    'May':5,
    'June':6,
    'July':7,
    'August':8
    }

df['Month']=df['Month'].map(dic_Mounth)
df['Month'].value_counts()

Month
1    12500
2    12500
3    12500
4    12500
5    12500
6    12500
7    12500
8    12500
Name: count, dtype: int64

### Age

In [8]:
df['Age']= df['Age'].apply(lambda x: int(x.replace('_','')))

In [9]:
df['Age']

0         23
1         23
2       -500
3         23
4         23
        ... 
99995     25
99996     25
99997     25
99998     25
99999     25
Name: Age, Length: 100000, dtype: int64

In [10]:
df=df[df['Age'] >= 0] 

In [11]:
df['Age']

0        23
1        23
3        23
4        23
5        23
         ..
99995    25
99996    25
99997    25
99998    25
99999    25
Name: Age, Length: 99114, dtype: int64

### Occupation: we will use one hot encoding 

In [12]:
df['Occupation'].value_counts()

Occupation
_______          7001
Lawyer           6505
Architect        6303
Engineer         6295
Scientist        6238
Mechanic         6229
Accountant       6221
Media_Manager    6176
Developer        6169
Teacher          6153
Entrepreneur     6134
Journalist       6036
Doctor           6025
Manager          5932
Musician         5857
Writer           5840
Name: count, dtype: int64

### Annual_Income: we will convert it to float after cleaning the noise character             

In [13]:
df['Annual_Income'].value_counts()
#df["Annual_Income"]=df["Annual_Income"].str.split('_').str[0]

Annual_Income
20867.67      16
36585.12      16
17273.83      16
95596.35      15
33029.66      15
              ..
60381.32_      1
46152.92_      1
18105.32_      1
73475.02_      1
13037701.0     1
Name: count, Length: 18908, dtype: int64

In [14]:
df['Annual_Income']= df['Annual_Income'].apply(lambda x: float(x.replace('_','')))

In [15]:
#df["Annual_Income"]=df["Annual_Income"].astype(str).astype(float) 

In [16]:
df['Annual_Income'].dtype

dtype('float64')

### Num_Bank_Accounts

In [17]:
df['Num_Bank_Accounts']

0        3
1        3
3        3
4        3
5        3
        ..
99995    4
99996    4
99997    4
99998    4
99999    4
Name: Num_Bank_Accounts, Length: 99114, dtype: int64

In [18]:
df['Num_Bank_Accounts'].dtype

dtype('int64')

In [19]:
df=df[df['Num_Bank_Accounts'] >= 0] 

In [20]:
# from scipy import stats
# df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]

In [21]:
# # IQR
# Q1 = np.percentile(df['Num_Bank_Accounts'], 25, method='midpoint')
# Q3 = np.percentile(df['Num_Bank_Accounts'], 75, method='midpoint')
# IQR = Q3 - Q1
# print(IQR)
# # Above Upper bound
# upper = Q3+1.5*IQR
# upper_array = np.array(df['Num_Bank_Accounts'] >= upper)
# print("Upper Bound:", upper)
# print(upper_array.sum())
 
# # Below Lower bound
# lower = Q1-1.5*IQR
# lower_array = np.array(df['Num_Bank_Accounts'] <= lower)
# print("Lower Bound:", lower)
# print(lower_array.sum())

### Num_of_Loan 

In [22]:
df['Num_of_Loan'].nunique()

430

In [23]:
df['Num_of_Loan']= df['Num_of_Loan'].apply(lambda x: int(x.replace('_','')))
#df["Num_of_Loan"]=df["Num_of_Loan"].str.split('_').str[0]
#df["Num_of_Loan"]=df['Num_of_Loan'].astype(str).astype(int)

In [24]:
df['Num_of_Loan'].dtype

dtype('int64')

### Type of loan will be deleted due its high cardinality

In [25]:
df['Type_of_Loan'].nunique()

6260

In [26]:
df['Type_of_Loan'].value_counts()

Type_of_Loan
Not Specified                                                                                                                         1396
Credit-Builder Loan                                                                                                                   1271
Personal Loan                                                                                                                         1263
Debt Consolidation Loan                                                                                                               1246
Student Loan                                                                                                                          1232
                                                                                                                                      ... 
Debt Consolidation Loan, Student Loan, Home Equity Loan, Home Equity Loan, and Auto Loan                                                 6
Credit-Builder

In [27]:
df.drop('Type_of_Loan', axis=1, inplace=True)

### Delay from due date

In [28]:
df['Delay_from_due_date'].min()

-5

drop the negative numbers


In [29]:
df=df[df['Delay_from_due_date'] >= 0] 

In [30]:
df['Delay_from_due_date'].mean()

21.205010811414418

In [31]:
df['Delay_from_due_date'].dtype

dtype('int64')

### Number of delayed payment 

In [32]:
df['Num_of_Delayed_Payment']

0          7
3          4
4        NaN
5          4
6         8_
        ... 
99995      7
99996      7
99997      6
99998    NaN
99999      6
Name: Num_of_Delayed_Payment, Length: 98507, dtype: object

In [33]:
df['Num_of_Delayed_Payment']=df['Num_of_Delayed_Payment'].str.split('_').str[0]

In [34]:
df['Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].fillna(0)

In [35]:
df['Num_of_Delayed_Payment']

0        7
3        4
4        0
5        4
6        8
        ..
99995    7
99996    7
99997    6
99998    0
99999    6
Name: Num_of_Delayed_Payment, Length: 98507, dtype: object

In [36]:
#df['Num_of_Delayed_Payment']= df['Num_of_Delayed_Payment'].apply(lambda x: int(x.replace('_','')))
df['Num_of_Delayed_Payment']=df['Num_of_Delayed_Payment'].astype(int)

In [37]:
df['Num_of_Delayed_Payment'].dtype

dtype('int32')

### credit mix

In [38]:
df['Credit_Mix']

0           _
3        Good
4        Good
5        Good
6        Good
         ... 
99995       _
99996       _
99997    Good
99998    Good
99999    Good
Name: Credit_Mix, Length: 98507, dtype: object

In [39]:
df['Credit_Mix'].value_counts()

Credit_Mix
Standard    36176
Good        23621
_           19903
Bad         18807
Name: count, dtype: int64

In [40]:
dic_credit={'Bad':0,
    'Standard':1,
    'Good':2,
    '_':None
    }

df['Credit_Mix']=df['Credit_Mix'].map(dic_credit)
df['Credit_Mix'].value_counts()

Credit_Mix
1.0    36176
2.0    23621
0.0    18807
Name: count, dtype: int64

In [41]:
df['Credit_Mix']

0        NaN
3        2.0
4        2.0
5        2.0
6        2.0
        ... 
99995    NaN
99996    NaN
99997    2.0
99998    2.0
99999    2.0
Name: Credit_Mix, Length: 98507, dtype: float64

In [42]:
#drop all nans in the end 
#df['Credit_Mix'].dropna(axis=0,inplace=True)

In [43]:
df['Credit_Mix']

0        NaN
3        2.0
4        2.0
5        2.0
6        2.0
        ... 
99995    NaN
99996    NaN
99997    2.0
99998    2.0
99999    2.0
Name: Credit_Mix, Length: 98507, dtype: float64

### changed credit limit

In [44]:
df['Changed_Credit_Limit']

0        11.27
3         6.27
4        11.27
5         9.27
6        11.27
         ...  
99995     11.5
99996     11.5
99997     11.5
99998     11.5
99999     11.5
Name: Changed_Credit_Limit, Length: 98507, dtype: object

In [45]:
df['Changed_Credit_Limit'] = df['Changed_Credit_Limit'].replace('_', 0)

In [46]:
#df['Changed_Credit_Limit']= df['Changed_Credit_Limit'].apply(lambda x: float(x.replace('_','')))
df['Changed_Credit_Limit']=df['Changed_Credit_Limit'].astype(float)

In [47]:
df['Changed_Credit_Limit'].dtype

dtype('float64')

In [48]:
df['Changed_Credit_Limit']

0        11.27
3         6.27
4        11.27
5         9.27
6        11.27
         ...  
99995    11.50
99996    11.50
99997    11.50
99998    11.50
99999    11.50
Name: Changed_Credit_Limit, Length: 98507, dtype: float64

### outstanding dept

In [49]:
df['Outstanding_Debt']

0        809.98
3        809.98
4        809.98
5        809.98
6        809.98
          ...  
99995    502.38
99996    502.38
99997    502.38
99998    502.38
99999    502.38
Name: Outstanding_Debt, Length: 98507, dtype: object

In [50]:
df['Outstanding_Debt']=df['Outstanding_Debt'].str.split('_').str[0]

In [51]:
df['Outstanding_Debt']=df['Outstanding_Debt'].astype(float)

In [52]:
df['Outstanding_Debt'].dtype

dtype('float64')

### credit history age

In [53]:
df['Credit_History_Age']

0         22 Years and 1 Months
3         22 Years and 4 Months
4         22 Years and 5 Months
5         22 Years and 6 Months
6         22 Years and 7 Months
                  ...          
99995     31 Years and 6 Months
99996     31 Years and 7 Months
99997     31 Years and 8 Months
99998     31 Years and 9 Months
99999    31 Years and 10 Months
Name: Credit_History_Age, Length: 98507, dtype: object

In [54]:
df['Credit_History_Age_by_years']=df['Credit_History_Age'].str.split(' ').str[0]
df['Credit_History_Age_by_years']

0        22
3        22
4        22
5        22
6        22
         ..
99995    31
99996    31
99997    31
99998    31
99999    31
Name: Credit_History_Age_by_years, Length: 98507, dtype: object

In [55]:
df['Credit_History_Age_by_years']=df['Credit_History_Age_by_years'].astype(float)

In [56]:
df['Credit_History_Age_by_mounths']=df['Credit_History_Age'].str.split(' ').str[3]
df['Credit_History_Age_by_mounths']

0         1
3         4
4         5
5         6
6         7
         ..
99995     6
99996     7
99997     8
99998     9
99999    10
Name: Credit_History_Age_by_mounths, Length: 98507, dtype: object

In [57]:
df['Credit_History_Age_by_mounths']=df['Credit_History_Age_by_mounths'].astype(float)

### payment of min amount  we will use one hot encoding 

In [58]:
df['Payment_of_Min_Amount']

0        No
3        No
4        No
5        No
6        No
         ..
99995    No
99996    No
99997    No
99998    No
99999    No
Name: Payment_of_Min_Amount, Length: 98507, dtype: object

In [59]:
df['Payment_of_Min_Amount'].value_counts()

Payment_of_Min_Amount
Yes    51878
No     34801
NM     11828
Name: count, dtype: int64

In [60]:
# dic_pomA={'No':0,
#           'Yes':1

# }
# df['Payment_of_Min_Amount']=df['Payment_of_Min_Amount'].map(dic_pomA)
# df['Payment_of_Min_Amount']

### amount invested monthly 

In [61]:
df['Amount_invested_monthly']

0         80.41529543900253
3         199.4580743910713
4        41.420153086217326
5        62.430172331195294
6         178.3440674122349
                ...        
99995     60.97133255718485
99996     54.18595028760385
99997     24.02847744864441
99998    251.67258219721603
99999     167.1638651610451
Name: Amount_invested_monthly, Length: 98507, dtype: object

In [62]:
#df['Amount_invested_monthly']=df['Amount_invested_monthly'].str.split('_').str[0]

In [63]:
df["Amount_invested_monthly"] = df["Amount_invested_monthly"].str.replace('__10000__',"10000", regex=True)

In [64]:
df['Amount_invested_monthly']=df['Amount_invested_monthly'].astype(float)

In [65]:
df["Amount_invested_monthly"].dtype

dtype('float64')

In [66]:
# df['Amount_invested_monthly']= df['Amount_invested_monthly'].apply(lambda x: str (x.replace('__','')))

In [67]:
#df['Amount_invested_monthly'] = df['Amount_invested_monthly'].replace(pattern, '__', regex=True)

In [68]:
#df['Amount_invested_monthly'] = df['Amount_invested_monthly'].str.extract(r'(\d{5})')

### a new idea i hava found 'remove hyphen' function 

In [69]:
# def remove_hyphen(column):
#     return column.str.replace('__', '')

In [70]:
#df['Amount_invested_monthly'] = remove_hyphen(df['Amount_invested_monthly'])

In [71]:
#df['Amount_invested_monthly'].astype(float)

### Payment behavour 

In [72]:
df['Payment_Behaviour']

0         High_spent_Small_value_payments
3          Low_spent_Small_value_payments
4        High_spent_Medium_value_payments
5                                  !@9#%8
6          Low_spent_Small_value_payments
                       ...               
99995     High_spent_Large_value_payments
99996    High_spent_Medium_value_payments
99997     High_spent_Large_value_payments
99998      Low_spent_Large_value_payments
99999                              !@9#%8
Name: Payment_Behaviour, Length: 98507, dtype: object

In [73]:
df['Payment_Behaviour'].value_counts()

Payment_Behaviour
Low_spent_Small_value_payments      25169
High_spent_Medium_value_payments    17298
Low_spent_Medium_value_payments     13625
High_spent_Large_value_payments     13485
High_spent_Small_value_payments     11186
Low_spent_Large_value_payments      10262
!@9#%8                               7482
Name: count, dtype: int64

In [74]:
#df['Payment_Behaviour']=df['Payment_Behaviour'].replace(pattern, ' ', regex=True)

In [75]:
df['Payment_Behaviour'].replace('  9  8',np.nan)

0         High_spent_Small_value_payments
3          Low_spent_Small_value_payments
4        High_spent_Medium_value_payments
5                                  !@9#%8
6          Low_spent_Small_value_payments
                       ...               
99995     High_spent_Large_value_payments
99996    High_spent_Medium_value_payments
99997     High_spent_Large_value_payments
99998      Low_spent_Large_value_payments
99999                              !@9#%8
Name: Payment_Behaviour, Length: 98507, dtype: object

In [76]:
df['Payment_Behaviour_spent']=df['Payment_Behaviour'].str.split('_').str[0]
df['Payment_Behaviour_spent']

0          High
3           Low
4          High
5        !@9#%8
6           Low
          ...  
99995      High
99996      High
99997      High
99998       Low
99999    !@9#%8
Name: Payment_Behaviour_spent, Length: 98507, dtype: object

In [77]:
dic_Pbs={"High":1,
         "Low":0
         }
df['Payment_Behaviour_spent']=df['Payment_Behaviour_spent'].map(dic_Pbs)
df['Payment_Behaviour_spent']

0        1.0
3        0.0
4        1.0
5        NaN
6        0.0
        ... 
99995    1.0
99996    1.0
99997    1.0
99998    0.0
99999    NaN
Name: Payment_Behaviour_spent, Length: 98507, dtype: float64

In [78]:
df['Payment_Behaviour_value']=df['Payment_Behaviour'].str.split('_').str[2]
df['Payment_Behaviour_value']

0         Small
3         Small
4        Medium
5           NaN
6         Small
          ...  
99995     Large
99996    Medium
99997     Large
99998     Large
99999       NaN
Name: Payment_Behaviour_value, Length: 98507, dtype: object

In [79]:
dic_Pbs={"Small":0,
         "Medium":1,
         "Large":2
         }
df['Payment_Behaviour_value']=df['Payment_Behaviour_value'].map(dic_Pbs)
df['Payment_Behaviour_value']

0        0.0
3        0.0
4        1.0
5        NaN
6        0.0
        ... 
99995    2.0
99996    1.0
99997    2.0
99998    2.0
99999    NaN
Name: Payment_Behaviour_value, Length: 98507, dtype: float64

In [80]:
df['Payment_Behaviour_value']

0        0.0
3        0.0
4        1.0
5        NaN
6        0.0
        ... 
99995    2.0
99996    1.0
99997    2.0
99998    2.0
99999    NaN
Name: Payment_Behaviour_value, Length: 98507, dtype: float64

### mounthly balance 

In [81]:
df['Monthly_Balance']

0        312.49408867943663
3        223.45130972736786
4        341.48923103222177
5         340.4792117872438
6         244.5653167062043
                ...        
99995            479.866228
99996             496.65161
99997            516.809083
99998            319.164979
99999            393.673696
Name: Monthly_Balance, Length: 98507, dtype: object

In [82]:
#df['Monthly_Balance']=remove_hyphen(df['Monthly_Balance'])

In [83]:
#df['Monthly_Balance']= df['Monthly_Balance'].apply(lambda x: float(x.replace('__','')))
df["Monthly_Balance"] = [str(x).replace('__',' ') for x in df["Monthly_Balance"]]

In [84]:
df['Monthly_Balance']=df['Monthly_Balance'].astype(float)

In [85]:
df['Monthly_Balance']

0        312.494089
3        223.451310
4        341.489231
5        340.479212
6        244.565317
            ...    
99995    479.866228
99996    496.651610
99997    516.809083
99998    319.164979
99999    393.673696
Name: Monthly_Balance, Length: 98507, dtype: float64

### Credit score

In [86]:
df['Credit_Score'].value_counts()

Credit_Score
Standard    52500
Poor        28689
Good        17318
Name: count, dtype: int64

In [87]:
dic_credit_s={'Poor':0,
    'Standard':1,
    'Good':2
    }
df['Credit_Score']=df['Credit_Score'].map(dic_credit_s)
df['Credit_Score'].value_counts()

Credit_Score
1    52500
0    28689
2    17318
Name: count, dtype: int64

### here i have removed the outliers 

In [88]:
# q_low = df["Annual_Income"].quantile(0.01)
# q_hi  = df["Annual_Income"].quantile(0.99)

# df = df[(df["Annual_Income"] < q_hi) & (df["Annual_Income"] > q_low)]
# df.info()

### Onehot Encoding

In [89]:
df=pd.get_dummies(df, columns=['Occupation', 'Payment_of_Min_Amount'])

# Spliting the x and y

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98507 entries, 0 to 99999
Data columns (total 44 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Month                          98507 non-null  int64  
 1   Age                            98507 non-null  int64  
 2   Annual_Income                  98507 non-null  float64
 3   Monthly_Inhand_Salary          83736 non-null  float64
 4   Num_Bank_Accounts              98507 non-null  int64  
 5   Num_Credit_Card                98507 non-null  int64  
 6   Interest_Rate                  98507 non-null  int64  
 7   Num_of_Loan                    98507 non-null  int64  
 8   Delay_from_due_date            98507 non-null  int64  
 9   Num_of_Delayed_Payment         98507 non-null  int32  
 10  Changed_Credit_Limit           98507 non-null  float64
 11  Num_Credit_Inquiries           96580 non-null  float64
 12  Credit_Mix                     78604 non-null  floa

In [91]:
df.dropna(inplace=True)

In [92]:
x=df.drop(columns=['Credit_Score','Payment_Behaviour','Credit_History_Age'])


In [93]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52002 entries, 4 to 99998
Data columns (total 41 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Month                          52002 non-null  int64  
 1   Age                            52002 non-null  int64  
 2   Annual_Income                  52002 non-null  float64
 3   Monthly_Inhand_Salary          52002 non-null  float64
 4   Num_Bank_Accounts              52002 non-null  int64  
 5   Num_Credit_Card                52002 non-null  int64  
 6   Interest_Rate                  52002 non-null  int64  
 7   Num_of_Loan                    52002 non-null  int64  
 8   Delay_from_due_date            52002 non-null  int64  
 9   Num_of_Delayed_Payment         52002 non-null  int32  
 10  Changed_Credit_Limit           52002 non-null  float64
 11  Num_Credit_Inquiries           52002 non-null  float64
 12  Credit_Mix                     52002 non-null  floa

In [94]:
y=df['Credit_Score']

In [95]:
y.info()

<class 'pandas.core.series.Series'>
Index: 52002 entries, 4 to 99998
Series name: Credit_Score
Non-Null Count  Dtype
--------------  -----
52002 non-null  int64
dtypes: int64(1)
memory usage: 812.5 KB


In [96]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [97]:
# Create and train a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=4, random_state=42)
rf_classifier.fit(X_train, y_train)

In [100]:
# Predict on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
print(f1_score(y_test, y_pred, average='macro'))
# 0.26666666666666666
print(f1_score(y_test, y_pred, average='micro'))
# 0.33333333333333331
print(f1_score(y_test, y_pred, average='weighted')) 
# 0.26666666666666666

0.7032016152293049
0.6682354566069227
0.7032016152293049
0.7004011970219545
