In [490]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [491]:
df = pd.read_csv("train.csv")

  df = pd.read_csv("train.csv")


# Preprocessing

In [492]:
df.shape

(100000, 28)

In [493]:
df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [494]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

In [495]:
df = df.drop(columns=['ID', 'Month', 'Name', 'SSN'])

In [496]:
clos_to_covert = ['Annual_Income', 'Monthly_Inhand_Salary', 'Outstanding_Debt',
                  'Credit_Utilization_Ratio', 'Total_EMI_per_month', 'Amount_invested_monthly',
                  'Monthly_Balance', "Num_of_Loan", "Num_of_Delayed_Payment"]

for col in clos_to_covert:
        df[col] = df[col].astype(str).str.replace(r"[^0-9.]", "", regex=True)
        df[col] = pd.to_numeric(df[col], errors="coerce")

In [497]:
df["Age"] = df["Age"].astype(str).str.replace(r"[^0-9]", "", regex=True)
df["Age"] = pd.to_numeric(df["Age"], errors="coerce")

In [498]:
median_age = df.loc[(df["Age"] >= 18) & (df["Age"] <= 90), "Age"].median()
df.loc[(df["Age"] < 18) | (df["Age"] > 90), "Age"] = median_age

In [499]:
df["Age"].unique()

array([23, 34, 28, 54, 55, 21, 31, 33, 30, 24, 44, 45, 40, 41, 32, 35, 36,
       39, 37, 20, 46, 26, 42, 19, 48, 38, 43, 22, 18, 27, 25, 47, 53, 56,
       29, 49, 51, 50, 52])

In [500]:
df["Occupation"].unique()

array(['Scientist', '_______', 'Teacher', 'Engineer', 'Entrepreneur',
       'Developer', 'Lawyer', 'Media_Manager', 'Doctor', 'Journalist',
       'Manager', 'Accountant', 'Musician', 'Mechanic', 'Writer',
       'Architect'], dtype=object)

In [501]:
df["Occupation"] = df["Occupation"].replace(["_______"], np.nan)

In [502]:
df["Occupation"].isna().sum()

np.int64(7062)

In [503]:
df["Occupation"] = df["Occupation"].fillna(df["Occupation"].mode()[0])

In [504]:
df["Occupation"].isna().sum()

np.int64(0)

In [505]:
freq_map = df["Occupation"].value_counts().to_dict()
df["Occupation"] = df["Occupation"].map(freq_map)

In [506]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Customer_ID               100000 non-null  object 
 1   Age                       100000 non-null  int64  
 2   Occupation                100000 non-null  int64  
 3   Annual_Income             100000 non-null  float64
 4   Monthly_Inhand_Salary     84998 non-null   float64
 5   Num_Bank_Accounts         100000 non-null  int64  
 6   Num_Credit_Card           100000 non-null  int64  
 7   Interest_Rate             100000 non-null  int64  
 8   Num_of_Loan               100000 non-null  int64  
 9   Type_of_Loan              88592 non-null   object 
 10  Delay_from_due_date       100000 non-null  int64  
 11  Num_of_Delayed_Payment    92998 non-null   float64
 12  Changed_Credit_Limit      100000 non-null  object 
 13  Num_Credit_Inquiries      98035 non-null   fl

In [507]:
df["Annual_Income"].isna().sum()

np.int64(0)

In [508]:
df["Annual_Income"] = df.groupby("Customer_ID")["Annual_Income"].transform(lambda x: x.fillna(x.median()))
df["Annual_Income"] = df["Annual_Income"].fillna(df["Annual_Income"].median())

In [509]:
df["Annual_Income"].isna().sum()

np.int64(0)

In [510]:
df["Monthly_Inhand_Salary"].isna().sum()


np.int64(15002)

In [511]:
df["Monthly_Inhand_Salary"] = df.groupby("Customer_ID")["Monthly_Inhand_Salary"].transform(lambda x: x.fillna(x.median()))
df["Monthly_Inhand_Salary"] = df["Monthly_Inhand_Salary"].fillna(df["Monthly_Inhand_Salary"].median())

In [512]:
df["Monthly_Inhand_Salary"].isna().sum()

np.int64(0)

In [513]:
for col in ["Num_Bank_Accounts", "Num_Credit_Card"]:
    df[col] = df[col].astype(str).str.replace(r"[^0-9]", "", regex=True)
    df[col] = pd.to_numeric(df[col])

In [514]:
df["Num_Bank_Accounts"].isna().sum()

np.int64(0)

In [515]:
df["Num_Credit_Card"].isna().sum()


np.int64(0)

In [516]:
df["Interest_Rate"].isna().sum()

np.int64(0)

In [517]:
df["Num_of_Loan"].isna().sum()

np.int64(0)

In [518]:
df["Num_Bank_Accounts"].value_counts()

Unnamed: 0_level_0,count
Num_Bank_Accounts,Unnamed: 1_level_1
6,13001
7,12823
8,12765
4,12186
5,12118
...,...
1091,1
1123,1
1657,1
299,1


In [519]:
for col in ["Num_Bank_Accounts", "Num_Credit_Card", "Interest_Rate", "Num_of_Loan", "Num_Credit_Inquiries"]:
    df[col] = df.groupby("Customer_ID")[col].transform(lambda x: x.mode()[0] if not x.mode().empty else x.median())

In [520]:
df.loc[df["Num_Bank_Accounts"] == 0, "Num_Credit_Card"] = 0

In [521]:
df["Num_Bank_Accounts"].value_counts()

Unnamed: 0_level_0,count
Num_Bank_Accounts,Unnamed: 1_level_1
6,13184
7,12976
8,12936
4,12392
5,12272
3,12096
9,5512
10,5328
1,4568
0,4384


In [522]:
df["Num_Credit_Card"].value_counts()

Unnamed: 0_level_0,count
Num_Credit_Card,Unnamed: 1_level_1
5,17928
6,16488
7,16400
4,13488
3,12736
8,5088
10,4960
9,4736
0,4400
2,1880


In [523]:
df["Interest_Rate"].value_counts()

Unnamed: 0_level_0,count
Interest_Rate,Unnamed: 1_level_1
8,5104
5,5096
6,4832
12,4648
10,4616
7,4584
9,4576
11,4512
18,4192
15,4072


In [524]:
df["Num_of_Loan"].value_counts()

Unnamed: 0_level_0,count
Num_of_Loan,Unnamed: 1_level_1
3,15752
2,15712
4,15456
0,11408
1,11128
6,8144
7,7680
5,7528
9,3856
8,3336


In [525]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Customer_ID               100000 non-null  object 
 1   Age                       100000 non-null  int64  
 2   Occupation                100000 non-null  int64  
 3   Annual_Income             100000 non-null  float64
 4   Monthly_Inhand_Salary     100000 non-null  float64
 5   Num_Bank_Accounts         100000 non-null  int64  
 6   Num_Credit_Card           100000 non-null  int64  
 7   Interest_Rate             100000 non-null  int64  
 8   Num_of_Loan               100000 non-null  int64  
 9   Type_of_Loan              88592 non-null   object 
 10  Delay_from_due_date       100000 non-null  int64  
 11  Num_of_Delayed_Payment    92998 non-null   float64
 12  Changed_Credit_Limit      100000 non-null  object 
 13  Num_Credit_Inquiries      100000 non-null  fl

In [526]:
df["Type_of_Loan"].isna().sum()

np.int64(11408)

In [527]:
df["Type_of_Loan"] = df["Type_of_Loan"].fillna("Not Specified")

In [528]:
def clean_loans(x):
    x = str(x).replace("and ", "")
    x = x.strip().strip(",")
    loans = [loan.strip() for loan in x.split(",") if loan.strip()]
    loans = [loan for loan in loans if loan.lower() != "nan"]
    return loans

df["Type_of_Loan"] = df["Type_of_Loan"].apply(clean_loans)


In [529]:
df["Type_of_Loan"]

Unnamed: 0,Type_of_Loan
0,"[Auto Loan, Credit-Builder Loan, Personal Loan..."
1,"[Auto Loan, Credit-Builder Loan, Personal Loan..."
2,"[Auto Loan, Credit-Builder Loan, Personal Loan..."
3,"[Auto Loan, Credit-Builder Loan, Personal Loan..."
4,"[Auto Loan, Credit-Builder Loan, Personal Loan..."
...,...
99995,"[Auto Loan, Student Loan]"
99996,"[Auto Loan, Student Loan]"
99997,"[Auto Loan, Student Loan]"
99998,"[Auto Loan, Student Loan]"


In [530]:
mlb = MultiLabelBinarizer()
loan_dummies = pd.DataFrame(
    mlb.fit_transform(df["Type_of_Loan"]),
    columns=[f"Loan_{c}" for c in mlb.classes_],
    index=df.index
    )

df = df.join(loan_dummies)
df = df.drop(columns=["Type_of_Loan"])


In [531]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 32 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Customer_ID                   100000 non-null  object 
 1   Age                           100000 non-null  int64  
 2   Occupation                    100000 non-null  int64  
 3   Annual_Income                 100000 non-null  float64
 4   Monthly_Inhand_Salary         100000 non-null  float64
 5   Num_Bank_Accounts             100000 non-null  int64  
 6   Num_Credit_Card               100000 non-null  int64  
 7   Interest_Rate                 100000 non-null  int64  
 8   Num_of_Loan                   100000 non-null  int64  
 9   Delay_from_due_date           100000 non-null  int64  
 10  Num_of_Delayed_Payment        92998 non-null   float64
 11  Changed_Credit_Limit          100000 non-null  object 
 12  Num_Credit_Inquiries          100000 non-null

In [532]:
df["Delay_from_due_date"].isna().sum()

np.int64(0)

In [533]:
df["Delay_from_due_date"][1]

np.int64(-1)

In [534]:
df.loc[df["Delay_from_due_date"] < 0, "Delay_from_due_date"] = 0

In [535]:
df["Num_of_Delayed_Payment"].isna().sum()

np.int64(7002)

In [536]:
df["Num_of_Delayed_Payment"].fillna(df["Num_of_Delayed_Payment"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Num_of_Delayed_Payment"].fillna(df["Num_of_Delayed_Payment"].median(), inplace=True)


In [537]:
df["Num_of_Delayed_Payment"].isna().sum()

np.int64(0)

In [538]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 32 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Customer_ID                   100000 non-null  object 
 1   Age                           100000 non-null  int64  
 2   Occupation                    100000 non-null  int64  
 3   Annual_Income                 100000 non-null  float64
 4   Monthly_Inhand_Salary         100000 non-null  float64
 5   Num_Bank_Accounts             100000 non-null  int64  
 6   Num_Credit_Card               100000 non-null  int64  
 7   Interest_Rate                 100000 non-null  int64  
 8   Num_of_Loan                   100000 non-null  int64  
 9   Delay_from_due_date           100000 non-null  int64  
 10  Num_of_Delayed_Payment        100000 non-null  float64
 11  Changed_Credit_Limit          100000 non-null  object 
 12  Num_Credit_Inquiries          100000 non-null

In [539]:
df["Changed_Credit_Limit"] = df["Changed_Credit_Limit"].astype(str).str.replace(r"[^0-9.-]", "", regex=True)
df["Changed_Credit_Limit"] = pd.to_numeric(df["Changed_Credit_Limit"], errors="coerce")

In [540]:
df["Changed_Credit_Limit"] = df.groupby("Customer_ID")["Changed_Credit_Limit"].transform(lambda x: x.fillna(x.median()))
df["Changed_Credit_Limit"] = df["Changed_Credit_Limit"].fillna(df["Changed_Credit_Limit"].median())

In [541]:
df["Num_Credit_Inquiries"].value_counts()

Unnamed: 0_level_0,count
Num_Credit_Inquiries,Unnamed: 1_level_1
4.0,11936
3.0,9416
2.0,8568
7.0,8416
6.0,8264
8.0,8152
1.0,8104
0.0,7504
5.0,5728
9.0,5304


In [542]:
df["Credit_Mix"].value_counts()

Unnamed: 0_level_0,count
Credit_Mix,Unnamed: 1_level_1
Standard,36479
Good,24337
_,20195
Bad,18989


In [543]:
df["Credit_Mix"] = df["Credit_Mix"].replace(0, np.nan)

In [544]:
df["Credit_Mix"] = df["Credit_Mix"].replace("_", np.nan)

In [545]:
df["Credit_Mix"] = df.groupby("Customer_ID")["Credit_Mix"].transform(lambda x: x.mode()[0] if not x.mode().empty else "Standard")

In [546]:
df["Credit_Mix"].value_counts()

Unnamed: 0_level_0,count
Credit_Mix,Unnamed: 1_level_1
Standard,45848
Good,30384
Bad,23768


In [547]:
df["Credit_Mix"] = df["Credit_Mix"].fillna(df["Credit_Mix"].mode()[0])

# (Bad < Standard < Good)
credit_mix_order = [["Bad", "Standard", "Good"]]

encoder = OrdinalEncoder(categories=credit_mix_order)
df["Credit_Mix"] = encoder.fit_transform(df[["Credit_Mix"]])

In [548]:
df["Credit_Mix"].value_counts()

Unnamed: 0_level_0,count
Credit_Mix,Unnamed: 1_level_1
1.0,45848
2.0,30384
0.0,23768


In [549]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 32 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Customer_ID                   100000 non-null  object 
 1   Age                           100000 non-null  int64  
 2   Occupation                    100000 non-null  int64  
 3   Annual_Income                 100000 non-null  float64
 4   Monthly_Inhand_Salary         100000 non-null  float64
 5   Num_Bank_Accounts             100000 non-null  int64  
 6   Num_Credit_Card               100000 non-null  int64  
 7   Interest_Rate                 100000 non-null  int64  
 8   Num_of_Loan                   100000 non-null  int64  
 9   Delay_from_due_date           100000 non-null  int64  
 10  Num_of_Delayed_Payment        100000 non-null  float64
 11  Changed_Credit_Limit          100000 non-null  float64
 12  Num_Credit_Inquiries          100000 non-null

In [550]:
df["Outstanding_Debt"] = df.groupby("Customer_ID")["Outstanding_Debt"].transform(lambda x: x.fillna(x.median()))
df["Outstanding_Debt"] = df["Outstanding_Debt"].fillna(df["Outstanding_Debt"].median())

In [551]:
def convert_credit_history(x):
    try:
        parts = x.split()
        years = int(parts[0])
        months = int(parts[3])
        return years * 12 + months
    except:
        return np.nan

df["Credit_History_Age_Months"] = df["Credit_History_Age"].apply(convert_credit_history)
df.drop('Credit_History_Age', axis=1, inplace=True)

In [552]:
df['Credit_History_Age_Months'].isna().sum()

np.int64(9030)

In [553]:
df["Credit_History_Age_Months"] = df["Credit_History_Age_Months"].fillna(df["Credit_History_Age_Months"].median())
df["Credit_History_Age_Months"] = df["Credit_History_Age_Months"].astype(int)

In [554]:
df['Credit_History_Age_Months'].isna().sum()

np.int64(0)

In [555]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 32 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Customer_ID                   100000 non-null  object 
 1   Age                           100000 non-null  int64  
 2   Occupation                    100000 non-null  int64  
 3   Annual_Income                 100000 non-null  float64
 4   Monthly_Inhand_Salary         100000 non-null  float64
 5   Num_Bank_Accounts             100000 non-null  int64  
 6   Num_Credit_Card               100000 non-null  int64  
 7   Interest_Rate                 100000 non-null  int64  
 8   Num_of_Loan                   100000 non-null  int64  
 9   Delay_from_due_date           100000 non-null  int64  
 10  Num_of_Delayed_Payment        100000 non-null  float64
 11  Changed_Credit_Limit          100000 non-null  float64
 12  Num_Credit_Inquiries          100000 non-null

In [556]:
df["Payment_of_Min_Amount"].value_counts()

Unnamed: 0_level_0,count
Payment_of_Min_Amount,Unnamed: 1_level_1
Yes,52326
No,35667
NM,12007


In [557]:
df = pd.get_dummies(df, columns=["Payment_of_Min_Amount"], prefix="PMA")

In [558]:
df

Unnamed: 0,Customer_ID,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,...,Loan_Home Equity Loan,Loan_Mortgage Loan,Loan_Not Specified,Loan_Payday Loan,Loan_Personal Loan,Loan_Student Loan,Credit_History_Age_Months,PMA_NM,PMA_No,PMA_Yes
0,CUS_0xd40,23,6299,19114.12,1824.843333,3,4,3,4,3,...,1,0,0,0,1,0,265,False,True,False
1,CUS_0xd40,23,6299,19114.12,1824.843333,3,4,3,4,0,...,1,0,0,0,1,0,219,False,True,False
2,CUS_0xd40,34,6299,19114.12,1824.843333,3,4,3,4,3,...,1,0,0,0,1,0,267,False,True,False
3,CUS_0xd40,23,6299,19114.12,1824.843333,3,4,3,4,5,...,1,0,0,0,1,0,268,False,True,False
4,CUS_0xd40,23,6299,19114.12,1824.843333,3,4,3,4,6,...,1,0,0,0,1,0,269,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,CUS_0x942c,25,6291,39628.99,3359.415833,4,6,7,2,23,...,0,0,0,0,0,1,378,False,True,False
99996,CUS_0x942c,25,6291,39628.99,3359.415833,4,6,7,2,18,...,0,0,0,0,0,1,379,False,True,False
99997,CUS_0x942c,25,6291,39628.99,3359.415833,4,6,7,2,27,...,0,0,0,0,0,1,380,False,True,False
99998,CUS_0x942c,25,6291,39628.99,3359.415833,4,6,7,2,20,...,0,0,0,0,0,1,381,False,True,False


In [559]:
df["PMA_NM"] = df["PMA_NM"].astype(int)
df["PMA_No"] = df["PMA_No"].astype(int)
df["PMA_Yes"] = df["PMA_Yes"].astype(int)

In [560]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 34 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Customer_ID                   100000 non-null  object 
 1   Age                           100000 non-null  int64  
 2   Occupation                    100000 non-null  int64  
 3   Annual_Income                 100000 non-null  float64
 4   Monthly_Inhand_Salary         100000 non-null  float64
 5   Num_Bank_Accounts             100000 non-null  int64  
 6   Num_Credit_Card               100000 non-null  int64  
 7   Interest_Rate                 100000 non-null  int64  
 8   Num_of_Loan                   100000 non-null  int64  
 9   Delay_from_due_date           100000 non-null  int64  
 10  Num_of_Delayed_Payment        100000 non-null  float64
 11  Changed_Credit_Limit          100000 non-null  float64
 12  Num_Credit_Inquiries          100000 non-null

In [561]:
df["Amount_invested_monthly"].isna().sum()

np.int64(4479)

In [562]:
df["Amount_invested_monthly"] = df.groupby("Customer_ID")["Amount_invested_monthly"].transform(lambda x: x.fillna(x.median()))
df["Amount_invested_monthly"] = df["Amount_invested_monthly"].fillna(df["Amount_invested_monthly"].median())


In [563]:
df["Amount_invested_monthly"].isna().sum()

np.int64(0)

In [564]:
df["Payment_Behaviour"].value_counts()

Unnamed: 0_level_0,count
Payment_Behaviour,Unnamed: 1_level_1
Low_spent_Small_value_payments,25513
High_spent_Medium_value_payments,17540
Low_spent_Medium_value_payments,13861
High_spent_Large_value_payments,13721
High_spent_Small_value_payments,11340
Low_spent_Large_value_payments,10425
!@9#%8,7600


In [565]:
df["Payment_Behaviour"] = df["Payment_Behaviour"].replace("!@9#%8", "Not Mentioned")

In [566]:
df = pd.get_dummies(df, columns=["Payment_Behaviour"], drop_first=True)

In [567]:
rename_map = {
    "Payment_Behaviour_High_spent_Medium_value_payments": "PB_High_Med",
    "Payment_Behaviour_High_spent_Small_value_payments": "PB_High_Small",
    "Payment_Behaviour_Low_spent_Large_value_payments": "PB_Low_Large",
    "Payment_Behaviour_Low_spent_Medium_value_payments": "PB_Low_Med",
    "Payment_Behaviour_Low_spent_Small_value_payments": "PB_Low_Small",
    "Payment_Behaviour_Not Mentioned": "PB_NotMentioned"
}

df.rename(columns=rename_map, inplace=True)

In [568]:
pb_cols = [c for c in df.columns if c.startswith("PB")]

df[pb_cols] = df[pb_cols].astype(int)

In [569]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 39 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Customer_ID                   100000 non-null  object 
 1   Age                           100000 non-null  int64  
 2   Occupation                    100000 non-null  int64  
 3   Annual_Income                 100000 non-null  float64
 4   Monthly_Inhand_Salary         100000 non-null  float64
 5   Num_Bank_Accounts             100000 non-null  int64  
 6   Num_Credit_Card               100000 non-null  int64  
 7   Interest_Rate                 100000 non-null  int64  
 8   Num_of_Loan                   100000 non-null  int64  
 9   Delay_from_due_date           100000 non-null  int64  
 10  Num_of_Delayed_Payment        100000 non-null  float64
 11  Changed_Credit_Limit          100000 non-null  float64
 12  Num_Credit_Inquiries          100000 non-null

In [570]:
df["Monthly_Balance"].isna().sum()

np.int64(1200)

In [571]:
df["Monthly_Balance"] = df.groupby("Customer_ID")["Monthly_Balance"].transform(lambda x: x.fillna(x.median()))
df["Monthly_Balance"] = df["Monthly_Balance"].fillna(df["Monthly_Balance"].median())


In [572]:
df["Monthly_Balance"].isna().sum()

np.int64(0)

In [573]:
df["Credit_Score"].unique()

array(['Good', 'Standard', 'Poor'], dtype=object)

In [574]:
df["Credit_Score"] = df["Credit_Score"].astype(str).str.strip().str.capitalize()

In [575]:
le = LabelEncoder()
df["Credit_Score"] = le.fit_transform(df["Credit_Score"])

print(dict(zip(le.classes_, le.transform(le.classes_))))


{'Good': np.int64(0), 'Poor': np.int64(1), 'Standard': np.int64(2)}


In [576]:
print(df["Credit_Score"].value_counts(normalize=True))

Credit_Score
2    0.53174
1    0.28998
0    0.17828
Name: proportion, dtype: float64
