In [1]:
import pandas as pd
import numpy as np

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Read both CSV files with identical settings. low_memory=False makes pandas
# parse each file in one pass rather than in internal chunks.
train, test = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ("train.csv", "test.csv")
)

# Data Cleaning

## Identify Issues

1. **Not Useful Columns:**
   - `ID`, `Name`, and `SSN` are not useful for the analysis.

2. **Numeric Columns Incorrectly Typed as Categorical:**
   - `Age`, `Annual_Income`, `Num_of_Loan`, `Num_of_Delayed_Payment`, `Changed_Credit_Limit`, `Amount_invested_monthly`, `Outstanding_Debt`, and `Monthly_Balance` are numeric but are stored as text, so they need to be converted to numbers. (`Credit_Mix` is categorical, so it is handled under items 3 and 8 instead.)

3. **Values "__" in Columns:**
   - `Occupation` and `Credit_Mix` have values "__" that need to be addressed.

4. **Outliers:**
   - The data contains outliers that require investigation.

5. **Num_Credit_Card Zeros:**
   - `Num_Credit_Card` has zeros that need attention.

6. **Type_of_Loan Restructuring:**
   - `Type_of_Loan` needs to be rewritten as 8 columns.

7. **Negative Values in Num_Bank_Accounts:**
   - `Num_Bank_Accounts` contains negative values that need to be handled.

8. **Feature Engineering:**
   - `Credit_History_Age`, `Payment_of_Min_Amount`, `Payment_Behaviour`, `Credit_Mix` need feature engineering.

9. **Imbalanced Target Column:**
   - The target column is imbalanced and may require techniques for handling class imbalance.

10. **Missing Data:**
    - There is a significant amount of missing data that needs to be addressed.

---

*Note: The specific actions for each issue may involve further analysis, cleaning, or preprocessing steps.*


1. **Not Useful Columns:**

In [2]:
# Remove the columns judged not useful for the analysis:
#   ID   - identification
#   Name - name of client
#   SSN  - social security number of a person
train = train.drop(columns=['ID', 'Name', 'SSN'])

**Fix Numerical Columns**
- Strip the stray `_` characters from each value
- Convert the cleaned value to `float` (values that cannot be parsed become `NaN`)

In [3]:
# Columns that hold numbers but need cleaning and conversion to float.
N_to_fix = [
    'Age',
    'Annual_Income',
    'Num_of_Loan',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Amount_invested_monthly',
    'Outstanding_Debt',
    'Monthly_Balance',
]

In [4]:
def fix_nums(num):
    """Convert a raw cell value to a float, stripping stray underscores.

    Parameters
    ----------
    num : str or number or None
        Raw value. Strings have every "_" removed before conversion.
        Values that are already numeric are converted directly, so applying
        this function a second time to cleaned data leaves it unchanged
        instead of turning it into NaN.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the value cannot be
        interpreted as a number (e.g. ``None`` or non-numeric text).
    """
    # Only strings can carry the stray "_" characters; numbers pass through.
    if isinstance(num, str):
        num = num.replace("_", "")
    try:
        return float(num)
    except (TypeError, ValueError):
        # TypeError: unsupported type such as None; ValueError: unparsable text.
        return np.nan

In [5]:
# Clean each listed column in turn, overwriting it with its float version.
for numeric_column in N_to_fix:
    raw_values = train[numeric_column]
    train[numeric_column] = raw_values.apply(fix_nums)

**Type_of_Loan**

In [6]:
## Rebuild Type of loans Columns 
for i in train['Type_of_Loan'].value_counts().head(9).index[1:] : 
    train[i] = train['Type_of_Loan'].str.contains(i)

del train['Type_of_Loan']

**Num_Bank_Accounts**

In [7]:
# Replace negative account counts with their absolute value.
train['Num_Bank_Accounts'] = train['Num_Bank_Accounts'].abs()

**Num_Credit_Card**

In [8]:
# Treat a recorded count of 0 credit cards as 1.
# The result is assigned back to the column instead of using
# replace(..., inplace=True) on the selected column: the in-place form acts on
# a chained selection, which recent pandas warns about and which does not
# update `train` when Copy-on-Write is enabled.
train['Num_Credit_Card'] = train['Num_Credit_Card'].replace(0, 1)

**เหลือ 8-10**

ข้อมูลมีการ/น่าจะมีการกำหนดเฉลย (labeling) ด้วยวิธีใด เพราะเหตุใด มีปัญหาหรือไม่ อย่างไร แก้ไขได้อย่างไร


In [9]:
# Show the distinct values present in the Credit_Score column.
train['Credit_Score'].unique()

array(['Good', 'Standard', 'Poor'], dtype=object)

Index(['Customer_ID', 'Month', 'Age', 'Occupation', 'Annual_Income',
       'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance'],
      dtype='object')

In [16]:
# List the column names currently in the train frame.
train.columns

Index(['Customer_ID', 'Month', 'Age', 'Occupation', 'Annual_Income',
       'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score', 'Credit-Builder Loan', 'Personal Loan',
       'Debt Consolidation Loan', 'Student Loan', 'Payday Loan',
       'Mortgage Loan', 'Auto Loan', 'Home Equity Loan'],
      dtype='object')

In [17]:
# (rows, columns) of the test frame.
test.shape

(50000, 24)

In [18]:
# (rows, columns) of the train frame.
train.shape

(100000, 32)