In [1]:
import pandas as pd
import numpy as np
from datasist.structdata import detect_outliers
from sklearn.impute import SimpleImputer


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

df = pd.read_csv("train.csv", low_memory=False)
df

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7,11.27,4.0,_,809.98,26.822620,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,,11.27,4.0,Good,809.98,31.944960,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7,_,4.0,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,4,6.27,4.0,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736786,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,,11.27,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0x25fe9,CUS_0x942c,April,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,6,7,2,"Auto Loan, and Student Loan",23,7,11.5,3.0,_,502.38,34.663572,31 Years and 6 Months,No,35.104023,60.97133255718485,High_spent_Large_value_payments,479.86622816574095,Poor
99996,0x25fea,CUS_0x942c,May,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,6,7,2,"Auto Loan, and Student Loan",18,7,11.5,3.0,_,502.38,40.565631,31 Years and 7 Months,No,35.104023,54.18595028760385,High_spent_Medium_value_payments,496.651610435322,Poor
99997,0x25feb,CUS_0x942c,June,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,6,5729,2,"Auto Loan, and Student Loan",27,6,11.5,3.0,Good,502.38,41.255522,31 Years and 8 Months,No,35.104023,24.02847744864441,High_spent_Large_value_payments,516.8090832742814,Poor
99998,0x25fec,CUS_0x942c,July,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,6,7,2,"Auto Loan, and Student Loan",20,,11.5,3.0,Good,502.38,33.638208,31 Years and 9 Months,No,35.104023,251.67258219721603,Low_spent_Large_value_payments,319.1649785257098,Standard


# Data Cleaning

## Identify Issues

1. **Not Useful Columns:**
   - `ID`, `Name`, `month` and `SSN` are not useful for the analysis. and Delete customers ID after being used

2. **Numeric Columns Incorrectly Typed as Categorical:**
   - `Age`, `Annual_Income`, `Num_of_Loan`, `Num_of_Delayed_Payment`, `Changed_Credit_Limit`, `Amount_invested_monthly`, `Outstanding_Debt`, `Credit_Mix`, `Monthly_Balance` are numerical but show as categorical. These need to be fixed.

3. **Values "__" in Columns:**
   - `Occupation` and `CreditMix` have values "__" that need to be addressed.

4. **Outliers:**
   - Means and Delete Age rows that higher than 100 and lower than 0
   
5. **Num_Credit_Card Zeros:**
   - `Num_Credit_Card` has zeros that need attention.

6. **Type_of_Loan Restructuring:**
   - `Type_of_Loan` needs to be rewritten as 8 columns.

7. **Negative Values in Num_Bank_Accounts:**
   - `Num_Bank_Accounts` contains negative values that need to be handled.

8. **Feature Engineering:**
   - `Credit_History_Age`, `Payment_of_Min_Amount`, `Payment_Behaviour`, `Credit_Mix` need feature engineering.

9. **Missing Data:**
    - There is a significant amount of missing data that needs to be addressed.

---

1. **Not Useful Columns:**

In [3]:
del df['ID'] # Identification 
del df['Name'] # Name of client 
del df['SSN'] # SSN (social security number of a person)
del df['Month']

2. **Numeric Columns Incorrectly Typed as Categorical:**


In [4]:
N_to_fix = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Amount_invested_monthly', 'Outstanding_Debt' , 'Monthly_Balance']

def fix_nums(num):
    try : 
        return float(num.replace("_",""))
    except :
        return np.nan
    
for col in N_to_fix :
    df[col] = df[col].apply(fix_nums)

In [5]:
df.shape

(100000, 24)

5. **Num_Credit_Card Zeros:**

In [6]:
df['Num_Credit_Card'].replace(0,1,inplace=True)

6. **Type_of_Loan Restructuring:**

In [7]:
## Rebuild Type of loans Columns 
for i in df['Type_of_Loan'].value_counts().head(9).index[1:] : 
    df[i] = df['Type_of_Loan'].str.contains(i)

del df['Type_of_Loan']

7. **Negative Values in Num_Bank_Accounts:**

In [8]:
df['Num_Bank_Accounts'] = df['Num_Bank_Accounts'].apply(lambda x :abs (x))

8. **Feature Engineering:**
   - `Credit_History_Age`, `Payment_of_Min_Amount`, `Payment_Behaviour`, `Credit_Mix` need feature engineering.


**Credit_History_Age**

In [9]:
print(df['Credit_History_Age'].sample(10))

48024    17 Years and 4 Months
21053    22 Years and 7 Months
38352    27 Years and 8 Months
11036    28 Years and 2 Months
20692                      NaN
26262     9 Years and 3 Months
18246    14 Years and 1 Months
85698    28 Years and 7 Months
4185                       NaN
99566    24 Years and 1 Months
Name: Credit_History_Age, dtype: object


In [10]:
import re

def History_age(age):
    try: 
        # Extract numbers using regular expression
        numbers = re.findall(r'\d+', age)
        
        # Convert the first number to years, and the second number to months
        years = int(numbers[0]) if numbers else 0
        months = int(numbers[1]) if len(numbers) > 1 else 0
        
        return years * 12 + months
    except:
        return np.nan

df['Credit_History_Age'] = df['Credit_History_Age'].apply(History_age)

**Payment_of_Min_Amount**

In [11]:
df['Payment_of_Min_Amount'].replace("NM","No",inplace=True)
df['Payment_of_Min_Amount'].value_counts()

Yes    52326
No     47674
Name: Payment_of_Min_Amount, dtype: int64

**Payment_Behaviour**

In [12]:
df['Payment_Behaviour']= df['Payment_Behaviour'].replace("!@9#%8",np.nan)
df['Payment_Behaviour'].value_counts()

Low_spent_Small_value_payments      25513
High_spent_Medium_value_payments    17540
Low_spent_Medium_value_payments     13861
High_spent_Large_value_payments     13721
High_spent_Small_value_payments     11340
Low_spent_Large_value_payments      10425
Name: Payment_Behaviour, dtype: int64

3. **Values "__" in Columns:**
   - `Occupation` and `CreditMix` have values "__" that need to be addressed.



In [13]:
m = {
    "Bad":0,
    "Standard":1,
    "Good":2,
    "_":np.nan
}
df['Credit_Mix'] = df['Credit_Mix'].map(m)
df['Credit_Mix'].value_counts()

1.0    36479
2.0    24337
0.0    18989
Name: Credit_Mix, dtype: int64

In [14]:
df['Occupation'].value_counts()

_______          7062
Lawyer           6575
Architect        6355
Engineer         6350
Scientist        6299
Mechanic         6291
Accountant       6271
Developer        6235
Media_Manager    6232
Teacher          6215
Entrepreneur     6174
Doctor           6087
Journalist       6085
Manager          5973
Musician         5911
Writer           5885
Name: Occupation, dtype: int64

In [15]:
# Identify customers with '_______' in 'Occupation'
customers_with_placeholder = df[df['Occupation'] == "_______"]['Customer_ID'].unique()

# Calculate mode 'Occupation' for each customer group
mode_occupation_by_customer = df.groupby('Customer_ID')['Occupation'].apply(lambda x: x.mode()[0]).reset_index()

# Replace '_______' with mode 'Occupation' using merge
df = df.merge(mode_occupation_by_customer, on='Customer_ID', how='left', suffixes=('', '_mode'))
df['Occupation'] = df.apply(lambda row: row['Occupation_mode'] if row['Occupation'] == "_______" else row['Occupation'], axis=1)

# Drop temporary columns used for calculation
df.drop(['Occupation_mode'], axis=1, inplace=True)


In [16]:
df['Occupation'].value_counts()

Lawyer           7096
Engineer         6864
Architect        6824
Mechanic         6776
Scientist        6744
Accountant       6744
Developer        6720
Media_Manager    6715
Teacher          6672
Entrepreneur     6648
Doctor           6568
Journalist       6536
Manager          6432
Musician         6352
Writer           6304
_______             5
Name: Occupation, dtype: int64

In [17]:
df['Occupation'] = df['Occupation'].replace("_______",df['Occupation'].mode()[0])
df['Occupation'].value_counts()


Lawyer           7101
Engineer         6864
Architect        6824
Mechanic         6776
Scientist        6744
Accountant       6744
Developer        6720
Media_Manager    6715
Teacher          6672
Entrepreneur     6648
Doctor           6568
Journalist       6536
Manager          6432
Musician         6352
Writer           6304
Name: Occupation, dtype: int64

4. **Outliers:**

In [18]:
df.dtypes

Customer_ID                  object
Age                         float64
Occupation                   object
Annual_Income               float64
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                 float64
Delay_from_due_date           int64
Num_of_Delayed_Payment      float64
Changed_Credit_Limit        float64
Num_Credit_Inquiries        float64
Credit_Mix                  float64
Outstanding_Debt            float64
Credit_Utilization_Ratio    float64
Credit_History_Age          float64
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly     float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                 object
Credit-Builder Loan          object
Personal Loan                object
Debt Consolidation Loan      object
Student Loan                 object
Payday Loan                 

In [19]:
Numericals = df.select_dtypes(exclude='object').columns[1:]

for col in Numericals:
    outliers_indices = detect_outliers(df, 0, [col])
    median = df[col].median()
    df.loc[outliers_indices, col] = median

In [20]:
df = df[(df['Age'] >= 0) & (df['Age'] <= 100)]

9. **Advanced Handling Missing Data**

In [21]:
# Edit Columns from bool to int 
for col in list(df.columns[-8:]):
    df[col] = df[col].astype(float)

In [22]:
df

Unnamed: 0,Customer_ID,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,Credit-Builder Loan,Personal Loan,Debt Consolidation Loan,Student Loan,Payday Loan,Mortgage Loan,Auto Loan,Home Equity Loan
0,CUS_0xd40,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,3,7.0,11.27,4.0,,809.98,26.822620,265.0,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1,CUS_0xd40,23.0,Scientist,19114.12,,3,4,3,4.0,-1,,11.27,4.0,2.0,809.98,31.944960,,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629162,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
3,CUS_0xd40,23.0,Scientist,19114.12,,3,4,3,4.0,5,4.0,6.27,4.0,2.0,809.98,31.377862,268.0,No,49.574949,199.458074,Low_spent_Small_value_payments,223.451310,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
4,CUS_0xd40,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,6,,11.27,4.0,2.0,809.98,24.797347,269.0,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
5,CUS_0xd40,23.0,Scientist,19114.12,,3,4,3,4.0,8,4.0,9.27,4.0,2.0,809.98,27.262259,270.0,No,49.574949,62.430172,,340.479212,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,CUS_0x942c,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,23,7.0,11.50,3.0,,502.38,34.663572,378.0,No,35.104023,60.971333,High_spent_Large_value_payments,479.866228,Poor,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
99996,CUS_0x942c,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,18,7.0,11.50,3.0,,502.38,40.565631,379.0,No,35.104023,54.185950,High_spent_Medium_value_payments,496.651610,Poor,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
99997,CUS_0x942c,25.0,Mechanic,39628.99,3359.415833,4,6,13,2.0,27,6.0,11.50,3.0,2.0,502.38,41.255522,380.0,No,35.104023,24.028477,High_spent_Large_value_payments,516.809083,Poor,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
99998,CUS_0x942c,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,20,,11.50,3.0,2.0,502.38,33.638208,381.0,No,35.104023,251.672582,Low_spent_Large_value_payments,319.164979,Standard,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [23]:
# IDs = 1 
# for ID in df['Customer_ID'].unique() :
#     df['Customer_ID'] = df['Customer_ID'].replace(ID,IDs)
#     IDs += 1 

df['Customer_ID'] = pd.factorize(df['Customer_ID'])[0] + 1



In [24]:
df

Unnamed: 0,Customer_ID,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,Credit-Builder Loan,Personal Loan,Debt Consolidation Loan,Student Loan,Payday Loan,Mortgage Loan,Auto Loan,Home Equity Loan
0,1,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,3,7.0,11.27,4.0,,809.98,26.822620,265.0,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1,23.0,Scientist,19114.12,,3,4,3,4.0,-1,,11.27,4.0,2.0,809.98,31.944960,,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629162,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
3,1,23.0,Scientist,19114.12,,3,4,3,4.0,5,4.0,6.27,4.0,2.0,809.98,31.377862,268.0,No,49.574949,199.458074,Low_spent_Small_value_payments,223.451310,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
4,1,23.0,Scientist,19114.12,1824.843333,3,4,3,4.0,6,,11.27,4.0,2.0,809.98,24.797347,269.0,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
5,1,23.0,Scientist,19114.12,,3,4,3,4.0,8,4.0,9.27,4.0,2.0,809.98,27.262259,270.0,No,49.574949,62.430172,,340.479212,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,12500,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,23,7.0,11.50,3.0,,502.38,34.663572,378.0,No,35.104023,60.971333,High_spent_Large_value_payments,479.866228,Poor,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
99996,12500,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,18,7.0,11.50,3.0,,502.38,40.565631,379.0,No,35.104023,54.185950,High_spent_Medium_value_payments,496.651610,Poor,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
99997,12500,25.0,Mechanic,39628.99,3359.415833,4,6,13,2.0,27,6.0,11.50,3.0,2.0,502.38,41.255522,380.0,No,35.104023,24.028477,High_spent_Large_value_payments,516.809083,Poor,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
99998,12500,25.0,Mechanic,39628.99,3359.415833,4,6,7,2.0,20,,11.50,3.0,2.0,502.38,33.638208,381.0,No,35.104023,251.672582,Low_spent_Large_value_payments,319.164979,Standard,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [25]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=1)

In [26]:
Numericals = df.select_dtypes(exclude='object').columns[1:]
Numericals

Index(['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Monthly_Balance', 'Credit-Builder Loan',
       'Personal Loan', 'Debt Consolidation Loan', 'Student Loan',
       'Payday Loan', 'Mortgage Loan', 'Auto Loan', 'Home Equity Loan'],
      dtype='object')

In [27]:
# for col in Numericals[1:]:
#     imputer.fit(df[['Customer_ID',col]])
#     df[['Customer_ID',col]] = imputer.transform(df[['Customer_ID',col]])

# Create a SimpleImputer
imputer = SimpleImputer(strategy='median')

# Fit and transform numerical columns
df[Numericals[1:]] = imputer.fit_transform(df[Numericals[1:]])

In [28]:
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(df[['Payment_Behaviour']])
df[['Payment_Behaviour']] = imputer.transform(df[['Payment_Behaviour']])

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97224 entries, 0 to 99999
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer_ID               97224 non-null  int64  
 1   Age                       97224 non-null  float64
 2   Occupation                97224 non-null  object 
 3   Annual_Income             97224 non-null  float64
 4   Monthly_Inhand_Salary     97224 non-null  float64
 5   Num_Bank_Accounts         97224 non-null  float64
 6   Num_Credit_Card           97224 non-null  float64
 7   Interest_Rate             97224 non-null  float64
 8   Num_of_Loan               97224 non-null  float64
 9   Delay_from_due_date       97224 non-null  float64
 10  Num_of_Delayed_Payment    97224 non-null  float64
 11  Changed_Credit_Limit      97224 non-null  float64
 12  Num_Credit_Inquiries      97224 non-null  float64
 13  Credit_Mix                97224 non-null  float64
 14  Outsta

In [30]:
m = {
    "Poor":0,
    "Standard":1,
    "Good":2
}
df['Credit_Score'] = df['Credit_Score'].map(m)
del df['Customer_ID']

In [31]:
df = pd.get_dummies(df,drop_first=False)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97224 entries, 0 to 99999
Data columns (total 50 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Age                                                 97224 non-null  float64
 1   Annual_Income                                       97224 non-null  float64
 2   Monthly_Inhand_Salary                               97224 non-null  float64
 3   Num_Bank_Accounts                                   97224 non-null  float64
 4   Num_Credit_Card                                     97224 non-null  float64
 5   Interest_Rate                                       97224 non-null  float64
 6   Num_of_Loan                                         97224 non-null  float64
 7   Delay_from_due_date                                 97224 non-null  float64
 8   Num_of_Delayed_Payment                              97224 non-null  float64


In [33]:
df

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance,Credit_Score,Credit-Builder Loan,Personal Loan,Debt Consolidation Loan,Student Loan,Payday Loan,Mortgage Loan,Auto Loan,Home Equity Loan,Occupation_Accountant,Occupation_Architect,Occupation_Developer,Occupation_Doctor,Occupation_Engineer,Occupation_Entrepreneur,Occupation_Journalist,Occupation_Lawyer,Occupation_Manager,Occupation_Mechanic,Occupation_Media_Manager,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_High_spent_Large_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments
0,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,11.27,4.0,1.0,809.98,26.822620,265.0,49.574949,80.415295,312.494089,2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0
1,23.0,19114.12,3095.978333,3.0,4.0,3.0,4.0,-1.0,14.0,11.27,4.0,2.0,809.98,31.944960,219.0,49.574949,118.280222,284.629162,2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0
3,23.0,19114.12,3095.978333,3.0,4.0,3.0,4.0,5.0,4.0,6.27,4.0,2.0,809.98,31.377862,268.0,49.574949,199.458074,223.451310,2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1
4,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,14.0,11.27,4.0,2.0,809.98,24.797347,269.0,49.574949,41.420153,341.489231,2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0
5,23.0,19114.12,3095.978333,3.0,4.0,3.0,4.0,8.0,4.0,9.27,4.0,2.0,809.98,27.262259,270.0,49.574949,62.430172,340.479212,2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,23.0,7.0,11.50,3.0,1.0,502.38,34.663572,378.0,35.104023,60.971333,479.866228,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0
99996,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,18.0,7.0,11.50,3.0,1.0,502.38,40.565631,379.0,35.104023,54.185950,496.651610,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0
99997,25.0,39628.99,3359.415833,4.0,6.0,13.0,2.0,27.0,6.0,11.50,3.0,2.0,502.38,41.255522,380.0,35.104023,24.028477,516.809083,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0
99998,25.0,39628.99,3359.415833,4.0,6.0,7.0,2.0,20.0,14.0,11.50,3.0,2.0,502.38,33.638208,381.0,35.104023,251.672582,319.164979,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [34]:
df.dtypes

Age                                                   float64
Annual_Income                                         float64
Monthly_Inhand_Salary                                 float64
Num_Bank_Accounts                                     float64
Num_Credit_Card                                       float64
Interest_Rate                                         float64
Num_of_Loan                                           float64
Delay_from_due_date                                   float64
Num_of_Delayed_Payment                                float64
Changed_Credit_Limit                                  float64
Num_Credit_Inquiries                                  float64
Credit_Mix                                            float64
Outstanding_Debt                                      float64
Credit_Utilization_Ratio                              float64
Credit_History_Age                                    float64
Total_EMI_per_month                                   float64
Amount_i

In [35]:
df.to_csv('cleaned_data.csv', index=False)