In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import warnings
warnings.filterwarnings("ignore")

from credit_score.ml_logic.data import clean_data
from credit_score.ml_logic.preprocessor import preprocess_features, preprocess_target

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw training data; the path is relative to the notebook's working directory.
df_train = pd.read_csv("../raw_data/train.csv")

In [3]:
# Preview the first 10 raw rows. The captured output shows placeholder/garbage
# values that need cleaning (e.g. '_', '-500', '28_', '!@9#%8', '#F%$D@*&8').
# NOTE(review): the preview prints Name and SSN columns - confirm the data is synthetic before sharing outputs.
df_train.head(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good
5,0x1607,CUS_0xd40,June,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,27.262259,22 Years and 6 Months,No,49.574949,62.430172331195294,!@9#%8,340.4792117872438,Good
6,0x1608,CUS_0xd40,July,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,22.537593,22 Years and 7 Months,No,49.574949,178.3440674122349,Low_spent_Small_value_payments,244.5653167062043,Good
7,0x1609,CUS_0xd40,August,,23,#F%$D@*&8,Scientist,19114.12,1824.843333,3,...,Good,809.98,23.933795,,No,49.574949,24.785216509052056,High_spent_Medium_value_payments,358.12416760938714,Standard
8,0x160e,CUS_0x21b1,January,Rick Rothackerj,28_,004-07-5839,_______,34847.84,3037.986667,2,...,Good,605.03,24.464031,26 Years and 7 Months,No,18.816215,104.291825168246,Low_spent_Small_value_payments,470.69062692529184,Standard
9,0x160f,CUS_0x21b1,February,Rick Rothackerj,28,004-07-5839,Teacher,34847.84,3037.986667,2,...,Good,605.03,38.550848,26 Years and 8 Months,No,18.816215,40.39123782853101,High_spent_Large_value_payments,484.5912142650067,Good


In [13]:
# Run the packaged cleaning routine (credit_score.ml_logic.data.clean_data) on the raw frame
# and preview the first 10 cleaned rows.
df_cleaned = clean_data(df_train)
df_cleaned.head(10)

🧹 Cleaning data ...
✅ Data cleaned
(100000, 28)


Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.119141,1824.843384,3.0,...,Good,809.97998,26.82262,265.0,No,49.574947,80.415298,High_spent_Small_value_payments,312.49408,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.119141,1824.843384,3.0,...,Good,809.97998,31.94496,266.0,No,49.574947,118.28022,Low_spent_Large_value_payments,284.62915,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.119141,1824.843384,3.0,...,Good,809.97998,28.609352,267.0,No,49.574947,81.699524,Low_spent_Medium_value_payments,331.209869,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.119141,1824.843384,3.0,...,Good,809.97998,31.377861,268.0,No,49.574947,199.458069,Low_spent_Small_value_payments,223.451309,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.119141,1824.843384,3.0,...,Good,809.97998,24.797346,269.0,No,49.574947,41.420155,High_spent_Medium_value_payments,341.489227,Good
5,0x1607,CUS_0xd40,June,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.119141,1824.843384,3.0,...,Good,809.97998,27.262259,270.0,No,49.574947,62.430172,High_spent_Medium_value_payments,340.479218,Good
6,0x1608,CUS_0xd40,July,Aaron Maashoh,23.0,821-00-0265,Scientist,19114.119141,1824.843384,3.0,...,Good,809.97998,22.537594,271.0,No,49.574947,178.34407,Low_spent_Small_value_payments,244.565323,Good
7,0x1609,CUS_0xd40,August,,23.0,,Scientist,19114.119141,1824.843384,3.0,...,Good,809.97998,23.933794,271.0,No,49.574947,24.785217,High_spent_Medium_value_payments,358.124176,Standard
8,0x160e,CUS_0x21b1,January,Rick Rothackerj,28.0,004-07-5839,Teacher,34847.839844,3037.986572,2.0,...,Good,605.030029,24.464031,319.0,No,18.816216,104.291824,Low_spent_Small_value_payments,470.690613,Standard
9,0x160f,CUS_0x21b1,February,Rick Rothackerj,28.0,004-07-5839,Teacher,34847.839844,3037.986572,2.0,...,Good,605.030029,38.55085,320.0,No,18.816216,40.391239,High_spent_Large_value_payments,484.591217,Good


In [14]:
# Feature matrix: everything except the identifier/metadata columns and the label.
X = df_cleaned.drop(columns = ['ID', 'Customer_ID', 'Month', 'Name', 'SSN', 'Credit_Score'])
# Target: the Credit_Score label column.
y = df_cleaned['Credit_Score']

In [15]:
# Sanity check: (rows, columns) of the feature frame.
X.shape

(100000, 22)

In [16]:
# Apply the packaged feature preprocessing (credit_score.ml_logic.preprocessor.preprocess_features)
# and display the returned value.
X_processed = preprocess_features(X)
X_processed

array([[ 0.        ,  0.        ,  0.        , ..., -0.14295912,
        -0.26960206, -0.11209453],
       [ 0.        ,  0.        ,  0.        , ..., -0.14295912,
        -0.06607142, -0.25397605],
       [ 0.        ,  0.        ,  0.        , ..., -0.14295912,
        -0.2626991 , -0.01679824],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.26579085,
        -0.5726912 ,  0.92822826],
       [ 0.        ,  0.        ,  0.        , ..., -0.26579085,
         0.65093601, -0.07812792],
       [ 0.        ,  0.        ,  0.        , ..., -0.26579085,
         0.19668679,  0.30125257]])

In [17]:
# Sanity check: (rows, columns) of the preprocessed features.
X_processed.shape

(100000, 40)

In [None]:
# Print the raw frame's dimensions, then its per-column summary via DataFrame.info().
print(f"Shape: {df_train.shape}")
df_train.info()

In [None]:
# Names of all object-dtype columns in the raw frame; displayed as the cell output.
object_col = df_train.select_dtypes(include="object").columns
object_col

In [None]:
# For every object-dtype column, print its name, its value frequencies
# (missing values included) and a blank separator.
for column in object_col:
    print(column, df_train[column].value_counts(dropna=False), "\n", sep="\n")

In [None]:
# Work on a copy so the manual cleaning steps below do not modify df_train.
df_train_copy = df_train.copy()
df_train_copy.shape

In [None]:
def remove_special_characters(data):
    """Strip placeholder padding characters from a single cell value.

    Parameters
    ----------
    data : object
        A cell value. Only ``str`` values are modified.

    Returns
    -------
    object
        For a string: the value with leading/trailing underscores, spaces,
        commas and double quotes removed (e.g. ``'28_'`` -> ``'28'``,
        ``'_______'`` -> ``''``). Any non-string value (numbers, ``None``,
        NaN) is returned unchanged.
    """
    # NaN is a float, so the isinstance check alone covers missing values;
    # this also avoids the `np.NaN` alias, which NumPy 2.0 removed.
    if not isinstance(data, str):
        return data
    return data.strip('_ ,"')

In [None]:
# Strip padding characters from every string cell, then turn empty strings,
# literal 'nan' text and the two known garbage tokens into real missing values.
# `np.nan` replaces the `np.NaN` alias, which NumPy 2.0 removed.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; it is kept here so the cell still runs on older pandas.
df_train_copy = (
    df_train_copy
    .applymap(remove_special_characters)
    .replace(['', 'nan', '!@9#%8', '#F%$D@*&8'], np.nan)
)
df_train_copy.head()

In [None]:
def change_data_type(df):
    """Cast the numeric-looking text columns of ``df`` to numeric dtypes.

    The converted columns are assigned back onto ``df`` itself (the frame is
    modified in place) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in the table below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns converted.

    Raises
    ------
    ValueError / TypeError
        Propagated from ``Series.astype`` when a value cannot be converted
        (columns processed before the failing one stay converted).
    """
    # Column -> target dtype, processed in this order.
    target_dtypes = {
        'Age': int,
        'Annual_Income': float,
        'Num_of_Loan': int,
        'Num_of_Delayed_Payment': float,
        'Changed_Credit_Limit': float,
        'Outstanding_Debt': float,
        'Amount_invested_monthly': float,
        'Monthly_Balance': float,
    }
    for name, dtype in target_dtypes.items():
        df[name] = df[name].astype(dtype)
    return df

In [None]:
# Column dtypes before the explicit type conversion.
df_train_copy.dtypes

In [None]:
# Convert the numeric-looking columns (change_data_type assigns the converted columns
# on the frame it receives and returns that frame), then show the resulting dtypes.
df_train_copy = change_data_type(df_train_copy)
df_train_copy.dtypes

In [None]:
def convert_to_months(x):
    """Convert a credit-history string into a total number of months.

    Parameters
    ----------
    x : str or missing value
        Text whose space-separated tokens carry the year count at position 0
        and the month count at position 3, e.g. ``'22 Years and 1 Months'``.

    Returns
    -------
    int or the original missing value
        ``years * 12 + months`` for a string; a null input (``None``/NaN) is
        returned unchanged.
    """
    # Missing values pass straight through.
    if pd.isnull(x):
        return x
    tokens = x.split(' ')
    years, months = int(tokens[0]), int(tokens[3])
    return years * 12 + months

In [None]:
# Replace the textual credit history with its length in months (as floats so
# missing values can stay NaN), then display the converted column.
df_train_copy['Credit_History_Age'] = (
    df_train_copy['Credit_History_Age']
    .apply(convert_to_months)
    .astype(float)
)
df_train_copy['Credit_History_Age']

In [None]:
# Normalise the comma-separated loan lists: lower-case, drop the 'and ' joiner,
# remove the space after each comma and trim surrounding whitespace.
# The vectorised .str methods leave missing values as NaN.
loan_types = (
    df_train_copy['Type_of_Loan']
    .str.lower()
    .str.replace('and ', '', regex=False)
    .str.replace(', ', ',', regex=False)
    .str.strip()
)
# Assign the filled column back explicitly. The previous
# `df[col].replace(..., inplace=True)` operated on a column selection, which is
# chained assignment and does not update the frame under pandas Copy-on-Write.
df_train_copy['Type_of_Loan'] = loan_types.fillna('No Data')

In [None]:
def reassign_object_missing_with_mode(df, groupby, column, inplace=True):
    """Fill missing values of a categorical column with the per-group mode.

    For each group defined by ``groupby``, missing entries (``None``/NaN) of
    ``column`` are replaced by the most frequent non-missing value of that
    group. Groups without any non-missing value are left missing (as NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``groupby`` and ``column``.
    groupby : label or list of labels
        Column(s) used to form the groups (e.g. ``'Customer_ID'``).
    column : label
        Column whose missing values are filled.
    inplace : bool, default True
        If True, write the filled column back to ``df`` and return ``None``.
        If False, leave ``df`` untouched and return the filled Series.

    Returns
    -------
    pandas.Series or None
    """
    def _fill_with_group_mode(values):
        # Series.mode() ignores missing values and works on strings, unlike
        # scipy.stats.mode, which rejects non-numeric input on recent SciPy.
        modes = values.mode()
        if modes.empty:
            # Nothing to impute from: keep the group as it is.
            return values
        return values.fillna(modes.iloc[0])

    result = df.groupby(groupby)[column].transform(_fill_with_group_mode)
    # Represent any remaining missing marker (e.g. None) uniformly as NaN.
    result = result.where(result.notna(), np.nan)

    if inplace:
        df[column] = result
        return None
    return result

In [None]:
# Impute the categorical columns in place with each customer's most frequent value.
for object_column in ('Occupation', 'Credit_Mix', 'Payment_Behaviour'):
    reassign_object_missing_with_mode(df_train_copy, 'Customer_ID', object_column)

In [None]:
def reassign_numeric_missing_with_mode(df, groupby, column, inplace=True):
    """Replace missing and out-of-range numeric values with the per-group mode.

    The plausible range of ``column`` is taken to be
    ``[min of group modes, max of group modes]``. Values outside that range
    are treated as wrong; they and the missing values are replaced by the
    mode of their own group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``groupby`` and ``column``.
    groupby : label or list of labels
        Column(s) used to form the groups (e.g. ``'Customer_ID'``).
    column : label
        Numeric column to repair.
    inplace : bool, default True
        If True, write the repaired column back to ``df`` and return ``None``.
        If False, leave ``df`` untouched and return the repaired Series.

    Returns
    -------
    pandas.Series or None
    """
    def _first_mode(values):
        # Series.mode() ignores NaN and returns the modes sorted ascending, so
        # ties resolve to the smallest value. An all-NaN group has no mode.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    # Mode of each group, broadcast back to the rows of that group. Computed
    # with pandas only: indexing scipy.stats.mode results as arrays breaks on
    # recent SciPy, where the result fields are scalars for 1-D input.
    mode_by_group = df.groupby(groupby)[column].transform(_first_mode)
    lower, upper = mode_by_group.min(), mode_by_group.max()

    # Keep in-range values; out-of-range and missing values become NaN ...
    in_range = df[column].between(lower, upper)
    # ... and are then filled with the mode of the row's own group.
    result = df[column].where(in_range).fillna(mode_by_group)

    if inplace:
        df[column] = result
        return None
    return result

In [None]:
# Numeric columns repaired in place, in this order, using each customer's mode.
numeric_columns_to_repair = [
    'Age',
    'Annual_Income',
    'Monthly_Inhand_Salary',
    'Num_Bank_Accounts',
    'Num_Credit_Card',
    'Interest_Rate',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Num_Credit_Inquiries',
    'Outstanding_Debt',
    'Total_EMI_per_month',
    'Amount_invested_monthly',
    'Monthly_Balance',
]
for numeric_column in numeric_columns_to_repair:
    reassign_numeric_missing_with_mode(df_train_copy, 'Customer_ID', numeric_column)

In [None]:
# Fill gaps in each customer's credit-history length: interpolate inside the
# customer's own rows, then back-/forward-fill whatever is still missing.
# `transform` returns a result aligned to the original row index. `apply` adds
# the group key as an extra index level on pandas >= 2.0, which breaks the
# column assignment.
df_train_copy['Credit_History_Age'] = df_train_copy.groupby('Customer_ID')['Credit_History_Age'].transform(
    lambda ages: ages.interpolate().bfill().ffill()
    )

In [None]:
# Count of missing values remaining in each column.
df_train_copy.isna().sum()

In [None]:
# (rows, columns) of the working copy at this point.
df_train_copy.shape

In [None]:
link = 'https://celik-muhammed.medium.com/how-to-converting-pandas-column-of-comma-separated-strings-into-dummy-variables-762c02282a6c'

def process_type_of_loan(X):
    """Expand comma-separated string columns into 0/1 indicator columns.

    Every object-dtype column of ``X`` that contains at least one comma is
    treated as a multi-valued column: it is split on ``','`` and replaced by
    one indicator column per distinct value. Indicator names are
    ``<prefix>_<value>``, where the prefix is built from the first letter of
    each underscore-separated part of the column name
    (``'Type_of_Loan'`` -> ``'ToL'``), or the first two characters when the
    name has no underscore.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features. Not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining original columns first, followed by
        the indicator columns of each expanded column. When no column
        qualifies, an unchanged copy of ``X`` is returned.
    """
    data_sep = ','
    col_sep = '_'

    object_cols = X.select_dtypes(include="object").columns
    # Literal comma search; missing values never mark a column as multi-valued.
    dummy_cols = [col for col in object_cols
                  if X[col].str.contains(data_sep, regex=False, na=False).any()]

    # Start from the frame without the multi-valued columns and append the
    # indicators of each one, so that no column's dummies are lost.
    X_transformed = X.drop(columns=dummy_cols)
    for col in dummy_cols:
        if col_sep in col:
            # part[:1] tolerates empty parts (leading or doubled underscores).
            prefix = ''.join(part[:1] for part in col.split(col_sep))
        else:
            prefix = col[:2]
        dummies = X[col].str.get_dummies(sep=data_sep).add_prefix(prefix + col_sep)
        X_transformed = X_transformed.join(dummies)

    return X_transformed

In [None]:
# Snapshot the manually cleaned frame under its own name; later cells modify cleaned_df, not df_train_copy.
cleaned_df = df_train_copy.copy()

In [None]:
# Preview the first 10 rows of the manually cleaned frame.
cleaned_df.head(10)

In [None]:
# Column labels of the cleaned frame at this point.
cleaned_df.columns

In [None]:
# Drop the identifier/metadata columns that are not used as features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once the columns
# are already gone.
cleaned_df = cleaned_df.drop(columns=['ID', 'Customer_ID', 'Month', 'Name', 'SSN'], errors='ignore')

In [None]:
# Features and target taken from the manually cleaned frame.
# NOTE(review): this rebinds X and y, which an earlier cell already built from df_cleaned.
X = cleaned_df.drop(columns="Credit_Score")
y = cleaned_df['Credit_Score']

In [None]:
# Expand the comma-separated loan-type column(s) into indicator columns and display the result.
# NOTE(review): this rebinds X_processed, which an earlier cell set to the output of preprocess_features.
X_processed = process_type_of_loan(X)
X_processed

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle for reproducibility.
# No `stratify` argument is passed, so class proportions are not enforced across the two sets.
# NOTE(review): the data previews show several monthly rows per Customer_ID, so a row-level
# random split presumably puts the same customer in both sets - consider a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)

In [None]:
# Column groups, one list per transformer used in the ColumnTransformer built below.
# Categorical columns that get one indicator column per category.
one_hot_encode_cols = ['Occupation', 'Payment_Behaviour']
# Categorical columns mapped to a single integer code each.
ordinal_encode_cols = ['Payment_of_Min_Amount','Credit_Mix']
# Numeric columns passed to MinMaxScaler.
min_max_scale_cols = ['Age', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Num_Credit_Inquiries',
                      'Num_of_Delayed_Payment']
# Numeric columns passed to StandardScaler.
standard_scale_cols = ['Credit_Utilization_Ratio', 'Changed_Credit_Limit', 'Credit_History_Age']
# Numeric columns passed to RobustScaler.
robust_scale_cols = ['Annual_Income', 'Monthly_Inhand_Salary', 'Interest_Rate', 'Num_of_Loan',
                     'Delay_from_due_date', 'Outstanding_Debt', 'Total_EMI_per_month',
                     'Amount_invested_monthly', 'Monthly_Balance']

In [None]:
# Categorical encoders. Categories unseen during fit are encoded as all-zero
# indicators (one-hot) or as -1 (ordinal).
# The OneHotEncoder no longer receives `sparse=False`: that parameter was renamed
# to `sparse_output` in scikit-learn 1.2 and removed in 1.4. Dense output is
# instead forced on the ColumnTransformer (sparse_threshold=0), which works on
# old and new scikit-learn alike.
# NOTE(review): OrdinalEncoder without explicit `categories` orders the codes by
# sorted category value - confirm that order is meaningful for Credit_Mix.
column_transformations = [('one_hot_encode', OneHotEncoder(handle_unknown='ignore'), one_hot_encode_cols),
                          ('ordinal_encode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_encode_cols)]

# Scalers for the numeric column groups.
scaling_transformer = [('min_max_scale', MinMaxScaler(), min_max_scale_cols),
                       ('standard_scale', StandardScaler(), standard_scale_cols),
                       ('robust_scale', RobustScaler(), robust_scale_cols)]

# remainder='passthrough' keeps the columns not listed above (the loan-type
# indicator columns produced by process_type_of_loan); the default
# remainder='drop' silently discarded them.
preprocessor = ColumnTransformer(
    transformers = column_transformations + scaling_transformer,
    remainder = 'passthrough',
    sparse_threshold = 0,
)

pipeline = make_pipeline(preprocessor)

# Fit on the training rows only, then apply the fitted transformers to the test rows.
X_train_preprocessed = pipeline.fit_transform(X_train)
X_test_preprocessed = pipeline.transform(X_test)

In [None]:
# Encode the class labels as integers: the encoder is fitted on the training
# labels and the same mapping is then applied to the test labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [None]:
# Sanity check: the feature array and the encoded labels should have the same number of rows.
X_train_preprocessed.shape, y_train_encoded.shape

In [None]:
# from tpot import TPOTClassifier
# from sklearn.metrics import accuracy_score

# tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
# tpot.fit(X_train_preprocessed, y_train_encoded)

# # Print the best pipeline found by TPOT
# print(tpot.fitted_pipeline_)

# # Evaluate the best pipeline on the test data
# accuracy = tpot.score(X_test_preprocessed, y_test_encoded)
# print("Accuracy:", accuracy)

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score

# rf_classifier = RandomForestClassifier(bootstrap=False, max_features=0.1, min_samples_leaf=2, 
#                                        min_samples_split=5, n_estimators=100, criterion='gini')
# rf_classifier.fit(X_train_preprocessed, y_train_encoded)

# # Calculate accuracy on the training set
# accuracy_train = rf_classifier.score(X_train_preprocessed, y_train_encoded)

# # Calculate accuracy on the test set (assuming you have a separate X_test and y_test)
# accuracy_test = rf_classifier.score(X_test_preprocessed, y_test_encoded)
# print("Accuracy on training set:", accuracy_train)
# print("Accuracy on test set:", accuracy_test)

In [None]:
# from sklearn.ensemble import StackingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC

# # Assuming you have your X_train, y_train, and X_test data ready
# # Define the base classifiers
# base_classifiers = [
#     ('random_forest', RandomForestClassifier(bootstrap=False, max_features=0.1, min_samples_leaf=2, min_samples_split=5, n_estimators=100, criterion='gini')),
#     ('svm', SVC())
# ]

# # Define the meta-classifier
# meta_classifier = LogisticRegression()

# # Create the stacking classifier
# stacking_classifier = StackingClassifier(
#     estimators=base_classifiers,
#     final_estimator=meta_classifier
# )
# # Train the stacking classifier
# stacking_classifier.fit(X_train_preprocessed, y_train_encoded)

# # Make predictions on the test set
# predictions = stacking_classifier.predict(X_test_preprocessed)

# # Calculate accuracy on the test set
# accuracy = stacking_classifier.score(X_test_preprocessed, y_test_encoded)
# print("Accuracy:", accuracy)

In [None]:
# Column labels currently present in cleaned_df.
cleaned_df.columns