In [3]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import warnings
warnings.filterwarnings("ignore")

from credit_score.ml_logic.data import clean_data

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
df_train = pd.read_csv("../raw_data/train.csv")

In [5]:
df_train.head(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good
5,0x1607,CUS_0xd40,June,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,27.262259,22 Years and 6 Months,No,49.574949,62.430172331195294,!@9#%8,340.4792117872438,Good
6,0x1608,CUS_0xd40,July,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,22.537593,22 Years and 7 Months,No,49.574949,178.3440674122349,Low_spent_Small_value_payments,244.5653167062043,Good
7,0x1609,CUS_0xd40,August,,23,#F%$D@*&8,Scientist,19114.12,1824.843333,3,...,Good,809.98,23.933795,,No,49.574949,24.785216509052056,High_spent_Medium_value_payments,358.12416760938714,Standard
8,0x160e,CUS_0x21b1,January,Rick Rothackerj,28_,004-07-5839,_______,34847.84,3037.986667,2,...,Good,605.03,24.464031,26 Years and 7 Months,No,18.816215,104.291825168246,Low_spent_Small_value_payments,470.69062692529184,Standard
9,0x160f,CUS_0x21b1,February,Rick Rothackerj,28,004-07-5839,Teacher,34847.84,3037.986667,2,...,Good,605.03,38.550848,26 Years and 8 Months,No,18.816215,40.39123782853101,High_spent_Large_value_payments,484.5912142650067,Good


In [6]:
print(f"Shape: {df_train.shape}")
df_train.info()

Shape: (100000, 28)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan             

In [7]:
object_col = df_train.select_dtypes(include="object").columns
object_col

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Num_of_Loan', 'Type_of_Loan',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix',
       'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')

In [8]:
for column in object_col:
    print(column)
    print(df_train[column].value_counts(dropna=False))
    print("\n")

ID
0x1602     1
0x19c88    1
0x19caa    1
0x19ca5    1
0x19ca4    1
          ..
0xd94d     1
0xd94c     1
0xd94b     1
0xd94a     1
0x25fed    1
Name: ID, Length: 100000, dtype: int64


Customer_ID
CUS_0xd40     8
CUS_0x9bf4    8
CUS_0x5ae3    8
CUS_0xbe9a    8
CUS_0x4874    8
             ..
CUS_0x2eb4    8
CUS_0x7863    8
CUS_0x9d89    8
CUS_0xc045    8
CUS_0x942c    8
Name: Customer_ID, Length: 12500, dtype: int64


Month
January     12500
February    12500
March       12500
April       12500
May         12500
June        12500
July        12500
August      12500
Name: Month, dtype: int64


Name
NaN               9985
Stevex              44
Langep              44
Jessicad            39
Vaughanl            39
                  ... 
Robin Pomeroyz       4
Matt Scuffhamk       4
Julieno              4
Bavierq              4
Timothyl             3
Name: Name, Length: 10140, dtype: int64


Age
38      2833
28      2829
31      2806
26      2792
32      2749
        ... 
471        1
152

In [9]:
df_train_copy = df_train.copy()
df_train_copy.shape

(100000, 28)

In [10]:
def remove_special_characters(data):
    if data is np.NaN or not isinstance(data, str):
        return data
    else:
        return str(data).strip('_ ,"')

In [11]:
df_train_copy = df_train_copy.applymap(remove_special_characters).replace(['', 'nan', '!@9#%8', '#F%$D@*&8'], np.NaN)
df_train_copy.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [12]:
def change_data_type(df):
    df['Age'] = df.Age.astype(int) 
    df['Annual_Income'] = df.Annual_Income.astype(float)
    df['Num_of_Loan'] = df.Num_of_Loan.astype(int) 
    df['Num_of_Delayed_Payment'] = df.Num_of_Delayed_Payment.astype(float)
    df['Changed_Credit_Limit'] = df.Changed_Credit_Limit.astype(float)
    df['Outstanding_Debt'] = df.Outstanding_Debt.astype(float)
    df['Amount_invested_monthly'] = df.Amount_invested_monthly.astype(float)
    df['Monthly_Balance'] = df.Monthly_Balance.astype(float)
    return df

In [13]:
df_train_copy.dtypes

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                          object
SSN                          object
Occupation                   object
Annual_Income                object
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                  object
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment       object
Changed_Credit_Limit         object
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt             object
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly      object
Payment_Behaviour            object
Monthly_Balance              object
Credit_Score                

In [14]:
df_train_copy = change_data_type(df_train_copy)
df_train_copy.dtypes

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                           int64
SSN                          object
Occupation                   object
Annual_Income               float64
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                   int64
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment      float64
Changed_Credit_Limit        float64
Num_Credit_Inquiries        float64
Credit_Mix                   object
Outstanding_Debt            float64
Credit_Utilization_Ratio    float64
Credit_History_Age           object
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly     float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                

In [15]:
def convert_to_months(x):
    if pd.notnull(x):
        num1 = int(x.split(' ')[0])
        num2 = int(x.split(' ')[3])
        return (num1 * 12) + num2
    else:
        return x

In [16]:
df_train_copy['Credit_History_Age'] = df_train_copy.Credit_History_Age.apply(lambda x: convert_to_months(x)).astype(float)
df_train_copy['Credit_History_Age']

0        265.0
1          NaN
2        267.0
3        268.0
4        269.0
         ...  
99995    378.0
99996    379.0
99997    380.0
99998    381.0
99999    382.0
Name: Credit_History_Age, Length: 100000, dtype: float64

In [17]:
df_train_copy['Type_of_Loan'] = df_train_copy['Type_of_Loan'].apply(
    lambda x: x.lower().replace('and ', '').replace(', ', ',').strip() if pd.notna(x) else x)
df_train_copy['Type_of_Loan'].replace([np.NaN], 'No Data', inplace=True)

In [18]:
def reassign_object_missing_with_mode(df, groupby, column, inplace=True):      
    # Assigning Wrong values Make Simple Function
    def make_NaN_and_fill_mode(df, groupby, column, inplace=True):
        # Assign None to np.NaN
        if df[column].isin([None]).sum():
            df[column][df[column].isin([None])] = np.NaN
            
        # fill with local mode
        result = df.groupby(groupby)[column].transform(lambda x: x.fillna(stats.mode(x)[0][0]))

        if inplace:
            df[column]=result
        else:
            return result
          
    if inplace:  
        make_NaN_and_fill_mode(df, groupby, column, inplace)
    else:   
        return make_NaN_and_fill_mode(df, groupby, column, inplace)

In [19]:
reassign_object_missing_with_mode(df_train_copy, 'Customer_ID', 'Occupation')
reassign_object_missing_with_mode(df_train_copy, 'Customer_ID', 'Credit_Mix')
reassign_object_missing_with_mode(df_train_copy, 'Customer_ID', 'Payment_Behaviour')

In [20]:
def reassign_numeric_missing_with_mode(df, groupby, column, inplace=True):      
    # Assigning Wrong values
    def make_group_NaN_and_fill_mode(df, groupby, column, inplace=True):
        df_dropped = df[df[column].notna()].groupby(groupby)[column].apply(list)
        x, y = df_dropped.apply(lambda x: stats.mode(x)).apply([min, max])
        mini, maxi = x[0][0], y[0][0]

        # assign Wrong Values to NaN
        col = df[column].apply(lambda x: np.NaN if ((x<mini)|(x>maxi)) else x)

        # fill with local mode
        mode_by_group = df.groupby(groupby)[column].transform(lambda x: x.mode()[0] if not x.mode().empty else np.NaN)
        result = col.fillna(mode_by_group)

        if inplace:
            df[column]=result
        else:
            return result
        
    if inplace:   
        make_group_NaN_and_fill_mode(df, groupby, column, inplace)
    else:   
        return make_group_NaN_and_fill_mode(df, groupby, column, inplace)

In [21]:
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Age')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Annual_Income')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Monthly_Inhand_Salary')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Num_Bank_Accounts')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Num_Credit_Card')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Interest_Rate')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Num_of_Loan')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Delay_from_due_date')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Num_of_Delayed_Payment')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Changed_Credit_Limit')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Num_Credit_Inquiries')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Outstanding_Debt')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Total_EMI_per_month')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Amount_invested_monthly')
reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', 'Monthly_Balance')

In [22]:
df_train_copy['Credit_History_Age'] = df_train_copy.groupby('Customer_ID')['Credit_History_Age'].apply(
    lambda x: x.interpolate().bfill().ffill()
    )

In [23]:
df_train_copy.isna().sum()

ID                             0
Customer_ID                    0
Month                          0
Name                        9985
Age                            0
SSN                         5572
Occupation                     0
Annual_Income                  0
Monthly_Inhand_Salary          0
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                   0
Delay_from_due_date            0
Num_of_Delayed_Payment         0
Changed_Credit_Limit           0
Num_Credit_Inquiries           0
Credit_Mix                     0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age             0
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly        0
Payment_Behaviour              0
Monthly_Balance                0
Credit_Score                   0
dtype: int64

In [24]:
df_train_copy.shape

(100000, 28)

In [25]:
link = 'https://celik-muhammed.medium.com/how-to-converting-pandas-column-of-comma-separated-strings-into-dummy-variables-762c02282a6c'

def process_type_of_loan(X):
    data_sep = ','
    col_sep = '_'
    
    object_cols = X.select_dtypes(include="object").columns
    dummy_cols   = [col for col in object_cols if X[col].str.contains(data_sep, regex=True).any()]
    dummy_prefix = [''.join(map(lambda x: x[0], col.split(col_sep))) if col_sep in col else col[:2] for col in dummy_cols]
    
    for col, pre in zip(dummy_cols, dummy_prefix):
        dummy_X = X.join(X[col].str.get_dummies(sep = data_sep).add_prefix(pre + col_sep))            
        
    dummy_X.drop(columns = dummy_cols, inplace=True)
    columns = dummy_X.columns
    
    for col, pre in zip(dummy_cols, dummy_prefix):
        X_transformed = X.join(X[col].str.get_dummies(sep = data_sep).add_prefix(pre + col_sep))   

    X_transformed = X_transformed.reindex(columns = columns, fill_value = 0)   
           
    return X_transformed

In [26]:
cleaned_df = df_train_copy.copy()

In [27]:
cleaned_df.head(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,26.82262,265.0,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.94496,266.0,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629162,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,28.609352,267.0,No,49.574949,81.699521,Low_spent_Medium_value_payments,331.209863,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.377862,268.0,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45131,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,24.797347,269.0,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,Good
5,0x1607,CUS_0xd40,June,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,27.262259,270.0,No,49.574949,62.430172,Low_spent_Small_value_payments,340.479212,Good
6,0x1608,CUS_0xd40,July,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,22.537593,271.0,No,49.574949,178.344067,Low_spent_Small_value_payments,244.565317,Good
7,0x1609,CUS_0xd40,August,,23.0,,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,23.933795,271.0,No,49.574949,24.785217,High_spent_Medium_value_payments,358.124168,Standard
8,0x160e,CUS_0x21b1,January,Rick Rothackerj,28.0,004-07-5839,Teacher,34847.84,3037.986667,2.0,...,Good,605.03,24.464031,319.0,No,18.816215,104.291825,Low_spent_Small_value_payments,470.690627,Standard
9,0x160f,CUS_0x21b1,February,Rick Rothackerj,28.0,004-07-5839,Teacher,34847.84,3037.986667,2.0,...,Good,605.03,38.550848,320.0,No,18.816215,40.391238,High_spent_Large_value_payments,484.591214,Good


In [28]:
cleaned_df.columns

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')

In [29]:
cleaned_df.drop(columns=['ID', 'Customer_ID', 'Month', 'Name', 'SSN'], inplace=True)

In [30]:
X = cleaned_df.drop(columns="Credit_Score")
y = cleaned_df['Credit_Score']

In [31]:
X_processed = process_type_of_loan(X)
X_processed

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,ToL_No Data,ToL_auto loan,ToL_credit-builder loan,ToL_debt consolidation loan,ToL_home equity loan,ToL_mortgage loan,ToL_not specified,ToL_payday loan,ToL_personal loan,ToL_student loan
0,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,...,0,1,1,0,1,0,0,0,1,0
1,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,-1.0,4.0,...,0,1,1,0,1,0,0,0,1,0
2,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,...,0,1,1,0,1,0,0,0,1,0
3,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,5.0,4.0,...,0,1,1,0,1,0,0,0,1,0
4,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,4.0,...,0,1,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,25.0,Mechanic,39628.99,3359.415833,4.0,6.0,7.0,2.0,23.0,7.0,...,0,1,0,0,0,0,0,0,0,1
99996,25.0,Mechanic,39628.99,3359.415833,4.0,6.0,7.0,2.0,18.0,7.0,...,0,1,0,0,0,0,0,0,0,1
99997,25.0,Mechanic,39628.99,3359.415833,4.0,6.0,7.0,2.0,27.0,6.0,...,0,1,0,0,0,0,0,0,0,1
99998,25.0,Mechanic,39628.99,3359.415833,4.0,6.0,7.0,2.0,20.0,6.0,...,0,1,0,0,0,0,0,0,0,1


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)

In [33]:
one_hot_encode_cols = ['Occupation', 'Payment_Behaviour']
ordinal_encode_cols = ['Payment_of_Min_Amount','Credit_Mix']
min_max_scale_cols = ['Age', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Num_Credit_Inquiries',
                      'Num_of_Delayed_Payment']
standard_scale_cols = ['Credit_Utilization_Ratio', 'Changed_Credit_Limit', 'Credit_History_Age']
robust_scale_cols = ['Annual_Income', 'Monthly_Inhand_Salary', 'Interest_Rate', 'Num_of_Loan',
                     'Delay_from_due_date', 'Outstanding_Debt', 'Total_EMI_per_month',
                     'Amount_invested_monthly', 'Monthly_Balance']

In [34]:
column_transformations = [('one_hot_encode', OneHotEncoder( sparse=False,handle_unknown='ignore'), one_hot_encode_cols),
                          ('ordinal_encode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_encode_cols)]

scaling_transformer = [('min_max_scale', MinMaxScaler(), min_max_scale_cols),
                       ('standard_scale', StandardScaler(), standard_scale_cols),
                       ('robust_scale', RobustScaler(), robust_scale_cols)]

preprocessor = ColumnTransformer(transformers = column_transformations + scaling_transformer)

pipeline = make_pipeline(preprocessor)

X_train_preprocessed = pipeline.fit_transform(X_train)
X_test_preprocessed = pipeline.transform(X_test)

In [35]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [36]:
X_train_preprocessed.shape, y_train_encoded.shape

((70000, 40), (70000,))

In [37]:
# from tpot import TPOTClassifier
# from sklearn.metrics import accuracy_score

# tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
# tpot.fit(X_train_preprocessed, y_train_encoded)

# # Print the best pipeline found by TPOT
# print(tpot.fitted_pipeline_)

# # Evaluate the best pipeline on the test data
# accuracy = tpot.score(X_test_preprocessed, y_test_encoded)
# print("Accuracy:", accuracy)

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_classifier = RandomForestClassifier(bootstrap=False, max_features=0.1, min_samples_leaf=2, 
                                       min_samples_split=5, n_estimators=100, criterion='gini')
rf_classifier.fit(X_train_preprocessed, y_train_encoded)

# Calculate accuracy on the training set
accuracy_train = rf_classifier.score(X_train_preprocessed, y_train_encoded)

# Calculate accuracy on the test set (assuming you have a separate X_test and y_test)
accuracy_test = rf_classifier.score(X_test_preprocessed, y_test_encoded)
print("Accuracy on training set:", accuracy_train)
print("Accuracy on test set:", accuracy_test)

Accuracy on training set: 0.9954714285714286
Accuracy on test set: 0.8097333333333333


In [39]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Assuming you have your X_train, y_train, and X_test data ready
# Define the base classifiers
base_classifiers = [
    ('random_forest', RandomForestClassifier(bootstrap=False, max_features=0.1, min_samples_leaf=2, min_samples_split=5, n_estimators=100, criterion='gini')),
    ('svm', SVC())
]

# Define the meta-classifier
meta_classifier = LogisticRegression()

# Create the stacking classifier
stacking_classifier = StackingClassifier(
    estimators=base_classifiers,
    final_estimator=meta_classifier
)
# Train the stacking classifier
stacking_classifier.fit(X_train_preprocessed, y_train_encoded)

# Make predictions on the test set
predictions = stacking_classifier.predict(X_test_preprocessed)

# Calculate accuracy on the test set
accuracy = stacking_classifier.score(X_test_preprocessed, y_test_encoded)
print("Accuracy:", accuracy)