In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

train_dataset = pd.read_csv("train.csv", low_memory=False)
# test = pd.read_csv("test.csv", low_memory=False)
train_dataset

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7,11.27,4.0,_,809.98,26.822620,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,,11.27,4.0,Good,809.98,31.944960,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7,_,4.0,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,4,6.27,4.0,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736786,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,,11.27,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0x25fe9,CUS_0x942c,April,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,6,7,2,"Auto Loan, and Student Loan",23,7,11.5,3.0,_,502.38,34.663572,31 Years and 6 Months,No,35.104023,60.97133255718485,High_spent_Large_value_payments,479.86622816574095,Poor
99996,0x25fea,CUS_0x942c,May,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,6,7,2,"Auto Loan, and Student Loan",18,7,11.5,3.0,_,502.38,40.565631,31 Years and 7 Months,No,35.104023,54.18595028760385,High_spent_Medium_value_payments,496.651610435322,Poor
99997,0x25feb,CUS_0x942c,June,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,6,5729,2,"Auto Loan, and Student Loan",27,6,11.5,3.0,Good,502.38,41.255522,31 Years and 8 Months,No,35.104023,24.02847744864441,High_spent_Large_value_payments,516.8090832742814,Poor
99998,0x25fec,CUS_0x942c,July,Nicks,25,078-73-5990,Mechanic,39628.99,3359.415833,4,6,7,2,"Auto Loan, and Student Loan",20,,11.5,3.0,Good,502.38,33.638208,31 Years and 9 Months,No,35.104023,251.67258219721603,Low_spent_Large_value_payments,319.1649785257098,Standard


# Data Cleaning

## Identify Issues

1. **Not Useful Columns:**
   - `ID`, `Name`, and `SSN` are not useful for the analysis.

2. **Numeric Columns Incorrectly Typed as Categorical:**
   - `Age`, `Annual_Income`, `Num_of_Loan`, `Num_of_Delayed_Payment`, `Changed_Credit_Limit`, `Amount_invested_monthly`, `Outstanding_Debt`, `Credit_Mix`, `Monthly_Balance` are numerical but show as categorical. These need to be fixed.

3. **Values "__" in Columns:**
   - `Occupation` and `CreditMix` have values "__" that need to be addressed.

4. **Outliers:**
   - The data contains outliers that require investigation.

5. **Num_Credit_Card Zeros:**
   - `Num_Credit_Card` has zeros that need attention.

6. **Type_of_Loan Restructuring:**
   - `Type_of_Loan` needs to be rewritten as 8 columns.

7. **Negative Values in Num_Bank_Accounts:**
   - `Num_Bank_Accounts` contains negative values that need to be handled.

8. **Feature Engineering:**
   - `Credit_History_Age`, `Payment_of_Min_Amount`, `Payment_Behaviour`, `Credit_Mix` need feature engineering.

9. **Imbalanced Target Column:**
   - The target column is imbalanced and may require techniques for handling class imbalance.

10. **Missing Data:**
    - There is a significant amount of missing data that needs to be addressed.

---

*Note: The specific actions for each issue may involve further analysis, cleaning, or preprocessing steps.*


1. **Not Useful Columns:**

In [2]:
del train_dataset['ID'] # Identification 
del train_dataset['Name'] # Name of client 
del train_dataset['SSN'] # SSN (social security number of a person)

**Fix Numerical Columns**
- replace _
- convert into float

In [3]:
N_to_fix = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment', 
            'Changed_Credit_Limit', 'Amount_invested_monthly', 'Outstanding_Debt' , 'Monthly_Balance']

In [4]:
def fix_nums(num):
    try : 
        return float(num.replace("_",""))
    except :
        return np.nan

In [5]:
for col in N_to_fix :
    train_dataset[col] = train_dataset[col].apply(fix_nums)

**Type_of_Loan**

In [6]:
## Rebuild Type of loans Columns 
for i in train_dataset['Type_of_Loan'].value_counts().head(9).index[1:] : 
    train_dataset[i] = train_dataset['Type_of_Loan'].str.contains(i)

del train_dataset['Type_of_Loan']

**Num_Bank_Accounts**

In [7]:
train_dataset['Num_Bank_Accounts'] = train_dataset['Num_Bank_Accounts'].apply(lambda x :abs (x))

**Num_Credit_Card**

In [8]:
train_dataset['Num_Credit_Card'].replace(0,1,inplace=True)

**เหลือ 8-10**

ข้อมูลมีการ/น่าจะมีการกำหนดเฉลย (labeling) ด้วยวิธีใด เพราะเหตุใด มีปัญหาหรือไม่ อย่างไร แก้ไขได้อย่างไร


In [9]:
train_dataset['Credit_Score'].unique()


array(['Good', 'Standard', 'Poor'], dtype=object)

In [10]:
# X = train_dataset.drop(columns=['Credit_Score'])  # Features excluding the target variable
# Y = train_dataset['Credit_Score']  # Target variable
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# label_encoder = LabelEncoder()
# Y_train_encoded = label_encoder.fit_transform(Y_train)
# Y_test_encoded = label_encoder.transform(Y_test)

In [11]:
# svc_model = SVC(kernel='linear', decision_function_shape='ovr', random_state=42)
# svc_model.fit(X_train, Y_train)
# # svc_predictions = svc_model.predict(X_test)
# # svc_accuracy = accuracy_score(y_test, svc_predictions)
# # print(f"Linear SVC Accuracy: {svc_accuracy}")

In [12]:
# sgd_model = SGDClassifier(loss='log', random_state=42)
# sgd_model.fit(X_train, Y_train)
# sgd_predictions = sgd_model.predict(X_test)
# sgd_accuracy = accuracy_score(Y_test, sgd_predictions)
# print(f"SGD Classifier Accuracy: {sgd_accuracy}")

In [19]:
numeric_columns = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
                    'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
                    'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt',
                    'Credit_Utilization_Ratio', 'Total_EMI_per_month', 'Amount_invested_monthly',
                    'Monthly_Balance']

categorical_columns = ['Occupation', 'Credit_Mix', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Payment_Behaviour']

target_variable = ['Credit_Score']

# Create a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),  # StandardScaler for numeric columns
        ('cat', OneHotEncoder(), categorical_columns),  # OneHotEncoder for categorical columns
        ('target', LabelEncoder(), target_variable)  # LabelEncoder for target variable
    ])

# train_dataset = preprocessor.fit_transform(train_dataset)
for col in numeric_columns:
    unique_values = train_dataset[col].unique()
    non_numeric_values = [val for val in unique_values if not pd.api.types.is_numeric_dtype(val)]
    if non_numeric_values:
        print(f"Column '{col}' has non-numeric values: {non_numeric_values}")


Column 'Age' has non-numeric values: ['23', '-500', '28_', '28', '34', '54', '55', '21', '31', '33', '34_', '7580', '30', '30_', '24', '24_', '44', '45', '40', '41', '32', '33_', '35', '35_', '36', '39', '37', '181', '20', '46', '26', '42', '19', '31_', '48', '995', '40_', '37_', '38', '54_', '5079', '43', '21_', '22', '6409', '16', '7080', '18', '849', '3885', '20_', '15', '27', '43_', '38_', '25', '3052', '14', '5342', '17', '18_', '4431', '2657', '2111_', '46_', '47', '1032', '16_', '19_', '47_', '456', '5717', '53_', '53', '56', '25_', '27_', '55_', '3169', '1191', '29', '42_', '7140', '48_', '49', '49_', '8233', '6955', '2534', '3115', '7657', '51', '50', '5112', '50_', '32_', '6452', '2744', '22_', '17_', '1439', '29_', '5795', '4872', '1772', '15_', '1383', '5657', '52', '1934', '51_', '8352', '3734', '26_', '2056', '39_', '7508', '2339', '14_', '8406', '1424', '36_', '6953', '5626', '4471', '548', '44_', '769', '5490', '525', '4202', '3665', '7670_', '4670', '3616', '6922', '66

Column 'Annual_Income' has non-numeric values: ['19114.12', '34847.84', '34847.84_', '143162.64', '30689.89', '30689.89_', '35547.71_', '35547.71', '73928.46', '131313.4', '10909427.0', '34081.38_', '34081.38', '114838.41', '114838.41_', '31370.8', '33751.27', '88640.24', '88640.24_', '54392.16', '54392.16_', '8701.545', '8701.545_', '25546.26', '25546.26_', '31993.78', '92047.08', '92047.08_', '32284.62', '97791.42', '97791.42_', '19300.34', '19514.88', '10183.015', '10183.015_', '106733.13', '106733.13_', '12600.445', '12600.445_', '57983.12', '57983.12_', '20787.69', '34290.12', '34290.12_', '43070.24', '43070.24_', '28572.39', '6515990.0_', '39641.54', '39641.54_', '20186.02', '586359.0', '18627.64', '12986.745', '58317.0', '42171.98', '71681.4', '29469.98', '29469.98_', '72559.36', '15566.02', '15566.02_', '66567.32', '12909.895', '30788.44', '20574.47', '20574.47_', '148699.32', '148699.32_', '85554.03', '55829.79', '18334118.0', '19717385.0', '14165.23', '14165.23_', '87215.68',

Column 'Outstanding_Debt' has non-numeric values: ['809.98', '605.03', '1303.01', '632.46', '943.86', '548.2', '352.16', '1704.18', '1377.74', '421.43', '1328.93', '1328.93_', '950.36', '179.22', '2602.69', '758.44', '818.22', '1296.64', '1283.37', '1283.37_', '107.41', '2430.21', '881.4', '1672.43', '76.23', '569.8', '3532.83', '614.6', '2686.81', '1233.1', '400.07', '2258.73', '413.32', '1793.54', '1489.49', '98.97', '556.91', '1293.02', '3421.66', '1173.7', '1693.95', '233.79', '602.5', '3470.08', '749.95', '949.38', '1095.73', '3422.49', '2797.17_', '2797.17', '680.77', '3053.16', '1079.73', '2253.95', '3119.6', '4071.62', '1154.46', '2431.64', '125.25', '997.28', '3042.36', '1746.9', '1936.79', '4834.59', '1219.39', '325.93', '191.09', '1037.45', '2497.34', '4795.13', '2425.38', '580.31', '2497.48', '1045.11', '523.56', '327.88', '4984.82', '596.78', '2174.16', '996.54', '137.15', '1093.87', '3818.57_', '3818.57', '730.9', '1374.56', '852.81', '668.14', '660.69', '3865.15', '1549.

Column 'Amount_invested_monthly' has non-numeric values: ['80.41529543900253', '118.28022162236736', '81.699521264648', '199.4580743910713', '41.420153086217326', '62.430172331195294', '178.3440674122349', '24.785216509052056', '104.291825168246', '40.39123782853101', '58.51597569589465', '99.30622796053305', '130.11542024292334', '43.477190144355745', '70.10177420755677', '218.90434353388733', '168.413702679309', '232.86038375993544', '__10000__', '825.2162699393922', '430.9475278803298', '257.80809942568976', '263.17416316163934', '81.22885871073616', '124.88181990234848', '83.40650880252501', '272.3340373956682', '84.95284817115969', '71.28367488286933', '125.61725053231268', '276.72539431736266', '74.44364104999623', '173.13865100158367', '96.78548508587444', '62.72327834435009', '37.64363788963997', '181.0119827315892', '181.33090096186916', '98.67440994166124', '172.93921446875606', '150.05973429800815', '618.2023912505837', '177.95183568608738', '235.79032503182026', '348.509399

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
train_dataset.head(500)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7,11.27,4.0,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,,11.27,4.0,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7,_,4.0,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,4,6.27,4.0,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,,11.27,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good
5,0x1607,CUS_0xd40,June,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",8,4,9.27,4.0,Good,809.98,27.262259,22 Years and 6 Months,No,49.574949,62.430172331195294,!@9#%8,340.4792117872438,Good
6,0x1608,CUS_0xd40,July,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,8_,11.27,4.0,Good,809.98,22.537593,22 Years and 7 Months,No,49.574949,178.3440674122349,Low_spent_Small_value_payments,244.5653167062043,Good
7,0x1609,CUS_0xd40,August,,23,#F%$D@*&8,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,6,11.27,4.0,Good,809.98,23.933795,,No,49.574949,24.785216509052056,High_spent_Medium_value_payments,358.12416760938714,Standard
8,0x160e,CUS_0x21b1,January,Rick Rothackerj,28_,004-07-5839,_______,34847.84,3037.986667,2,4,6,1,Credit-Builder Loan,3,4,5.42,2.0,Good,605.03,24.464031,26 Years and 7 Months,No,18.816215,104.291825168246,Low_spent_Small_value_payments,470.69062692529184,Standard
9,0x160f,CUS_0x21b1,February,Rick Rothackerj,28,004-07-5839,Teacher,34847.84,3037.986667,2,4,6,1,Credit-Builder Loan,7,1,7.42,2.0,Good,605.03,38.550848,26 Years and 8 Months,No,18.816215,40.39123782853101,High_spent_Large_value_payments,484.5912142650067,Good


In [14]:
train_dataset.dtypes

Customer_ID                  object
Month                        object
Age                         float64
Occupation                   object
Annual_Income               float64
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                 float64
Delay_from_due_date           int64
Num_of_Delayed_Payment      float64
Changed_Credit_Limit        float64
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt            float64
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly     float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                 object
Credit-Builder Loan          object
Personal Loan                object
Debt Consolidation Loan      object
Student Loan                

In [25]:
train_dataset.Home Equity Loan

SyntaxError: invalid syntax (1511370055.py, line 1)