# Data Preprocessing Section

In [2]:
import pandas as pd
data = pd.read_csv('cleaned_train.csv')

In [3]:
data.columns

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')

In [4]:
# Display the data types of each column
print(data.dtypes)

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                           int64
SSN                          object
Occupation                   object
Annual_Income               float64
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                   int64
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment        int64
Changed_Credit_Limit        float64
Num_Credit_Inquiries          int64
Credit_Mix                   object
Outstanding_Debt            float64
Credit_Utilization_Ratio    float64
Credit_History_Age          float64
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly     float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [6]:
# Create OneHotEncoder Object
onehot = OneHotEncoder(drop = 'first')

# Occupation is the only column that should need one hot encoding
data_onehot = data[['Occupation']]

data_onehot = onehot.fit_transform(data_onehot.values)

# Make sure the new column names look to be correct
onehot.get_feature_names_out()


array(['x0_Architect', 'x0_Developer', 'x0_Doctor', 'x0_Engineer',
       'x0_Entrepreneur', 'x0_Journalist', 'x0_Lawyer', 'x0_Manager',
       'x0_Mechanic', 'x0_Media_Manager', 'x0_Musician', 'x0_Scientist',
       'x0_Teacher', 'x0_Writer'], dtype=object)

In [7]:
# Re-construct the result to a DataFrame
data_onehot = pd.DataFrame(data = data_onehot.toarray(), columns = onehot.get_feature_names_out())

# Update the orginal DataFrame data with the newly OneHot encoded data.
data[onehot.get_feature_names_out()] = data_onehot

# Drop the original column
data.drop(['Occupation'], axis = 1, inplace = True)

# Check the columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64740 entries, 0 to 64739
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        64740 non-null  object 
 1   Customer_ID               64740 non-null  object 
 2   Month                     64740 non-null  object 
 3   Name                      64740 non-null  object 
 4   Age                       64740 non-null  int64  
 5   SSN                       64740 non-null  object 
 6   Annual_Income             64740 non-null  float64
 7   Monthly_Inhand_Salary     64740 non-null  float64
 8   Num_Bank_Accounts         64740 non-null  int64  
 9   Num_Credit_Card           64740 non-null  int64  
 10  Interest_Rate             64740 non-null  int64  
 11  Num_of_Loan               64740 non-null  int64  
 12  Type_of_Loan              64740 non-null  object 
 13  Delay_from_due_date       64740 non-null  int64  
 14  Num_of

In [8]:
# Splitting features and target variable
X = data.drop(columns=['Credit_Score'])
y = data['Credit_Score']

In [9]:
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [10]:
# Check to make sure the dimensions are still the same
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(51792, 40)
(12948, 40)
(51792,)
(12948,)


In [11]:
# Define ordinal mappings for the variables that should be label encoded
credit_mix_mapping = {'Good': 2, 'Standard': 1, 'Bad': 0}
payment_of_min_amount_mapping = {'Yes': 2, 'NM': 1, 'No': 0}
payment_behavior_mapping = {
    'Low_spent_Large_value_payments': 5,
    'Low_spent_Medium_value_payments': 4,
    'Low_spent_Small_value_payments': 3,
    'High_spent_Large_value_payments': 2,
    'High_spent_Medium_value_payments': 1,
    'High_spent_Small_value_payments': 0
}

# Perform the mapping on each of the ordinal columns for both training and testing sets
X_train['Credit_Mix'] = X_train['Credit_Mix'].map(credit_mix_mapping)
X_test['Credit_Mix'] = X_test['Credit_Mix'].map(credit_mix_mapping)

X_train['Payment_of_Min_Amount'] = X_train['Payment_of_Min_Amount'].map(payment_of_min_amount_mapping)
X_test['Payment_of_Min_Amount'] = X_test['Payment_of_Min_Amount'].map(payment_of_min_amount_mapping)

X_train['Payment_Behaviour'] = X_train['Payment_Behaviour'].map(payment_behavior_mapping)
X_test['Payment_Behaviour'] = X_test['Payment_Behaviour'].map(payment_behavior_mapping)


In [12]:
# Check one of the columns to make sure it worked
X_train['Payment_Behaviour']

55277    4
63884    2
40962    3
48812    5
57189    4
        ..
5695     3
8006     4
17745    3
17931    2
45919    3
Name: Payment_Behaviour, Length: 51792, dtype: int64

In [13]:
# Check to make sure X_train and y_train have the same dimensions
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(51792, 40)
(12948, 40)
(51792,)
(12948,)


In [15]:
# Fill any null value swith 0.0 (might no longer need this part)
X_train = X_train.fillna(0.0)
X_test = X_test.fillna(0.0)

In [16]:
# Check to make sure X_train and y_train have the same dimensions, which they do not so that is a problem 
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(51792, 40)
(12948, 40)
(51792,)
(12948,)


In [17]:
# Import the necessary things
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from itertools import chain

# Define the numeric columns that will be in the models
numeric_columns = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
                   'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
                   'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio',
                   'Credit_History_Age', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance']

# Define the appropriate scalers for each column based on the columns characteristics
scalers = {
    'Age': MinMaxScaler(),
    'Annual_Income': RobustScaler(),
    'Monthly_Inhand_Salary': RobustScaler(),
    'Num_Bank_Accounts': MinMaxScaler(),
    'Num_Credit_Card': MinMaxScaler(),
    'Interest_Rate': MinMaxScaler(),
    'Num_of_Loan': MinMaxScaler(),
    'Delay_from_due_date': RobustScaler(),
    'Num_of_Delayed_Payment': RobustScaler(),
    'Changed_Credit_Limit': RobustScaler(),
    'Num_Credit_Inquiries': MinMaxScaler(),
    'Outstanding_Debt': RobustScaler(),
    'Credit_Utilization_Ratio': MinMaxScaler(),
    'Credit_History_Age': MinMaxScaler(),
    'Total_EMI_per_month': MinMaxScaler(),
    'Amount_invested_monthly': MinMaxScaler(),
    'Monthly_Balance': MinMaxScaler()
}

# Function to scale a single column
def scale_column(scaler, column, X_train, X_test):
    X_train[column] = scaler.fit_transform(X_train[[column]])
    X_test[column] = scaler.transform(X_test[[column]])

# Incremental scaling function so that we hopefully do not run out of memory anymore
def incremental_scaling(scalers, columns, X_train, X_test, batch_size):
    for i in range(0, len(columns), batch_size):
        batch_columns = columns[i:i + batch_size]
        for column in batch_columns:
            scaler = scalers[column]
            scale_column(scaler, column, X_train, X_test)

# Set batch size to something small to help with memory
batch_size = 5

# Perform incremental scaling
incremental_scaling(scalers, numeric_columns, X_train, X_test, batch_size)


In [18]:
# To show that the dataset has been preprocessed
X_train.head(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,...,x0_Entrepreneur,x0_Journalist,x0_Lawyer,x0_Manager,x0_Mechanic,x0_Media_Manager,x0_Musician,x0_Scientist,x0_Teacher,x0_Writer
55277,0x2012b,CUS_0xb367,February,Vlastelicaf,0.071429,301-31-7681,1.34536,1.303194,0.277778,0.4375,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
63884,0x24e4d,CUS_0x599e,August,Victoriap,0.095238,370-99-4111,1.631819,1.674992,0.0,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
40962,0x18129,CUS_0x8274,April,Norihikov,0.761905,968-42-8366,-0.30333,-0.329687,0.333333,0.375,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48812,0x1c7b4,CUS_0x1dab,March,Jonathan Stempeld,0.380952,481-04-0516,-0.501451,-0.545294,0.333333,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
57189,0x2126c,CUS_0x8ca,March,Marcyb,0.333333,450-99-9426,-0.00922,-0.012842,0.333333,0.4375,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
27848,0x10c97,CUS_0x61c2,June,Aradhana Aravindanw,0.214286,008-17-0608,1.053865,0.987536,0.277778,0.375,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38451,0x16acd,CUS_0x43a7,April,Suvashreea,0.404762,385-07-5752,-0.072871,-0.09671,0.388889,0.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5381,0x4613,CUS_0x4ab4,June,raq,0.571429,723-08-9989,-0.149234,-0.13599,0.555556,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
24384,0xedee,CUS_0xa843,May,Siva Govindasamyx,0.380952,744-38-4293,0.816028,0.70763,0.5,0.5625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
29707,0x11dbe,CUS_0x51e5,May,Osamun,0.404762,001-11-0655,-0.542998,-0.571916,0.388889,0.3125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
X_train.columns

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Annual_Income',
       'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'x0_Architect', 'x0_Developer', 'x0_Doctor', 'x0_Engineer',
       'x0_Entrepreneur', 'x0_Journalist', 'x0_Lawyer', 'x0_Manager',
       'x0_Mechanic', 'x0_Media_Manager', 'x0_Musician', 'x0_Scientist',
       'x0_Teacher', 'x0_Writer'],
      dtype='object')

In [20]:
X_test.head(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,...,x0_Entrepreneur,x0_Journalist,x0_Lawyer,x0_Manager,x0_Mechanic,x0_Media_Manager,x0_Musician,x0_Scientist,x0_Teacher,x0_Writer
15244,0x9ca8,CUS_0xa01,July,Ferreira-Marquesf,0.309524,980-65-8757,-0.00711,-0.019361,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
27791,0x10c1a,CUS_0x11ae,January,Xiaoyi Shaou,0.166667,032-97-5902,-0.423256,-0.384414,0.388889,0.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31948,0x131a1,CUS_0x81ee,August,Liana B.f,0.214286,992-84-5158,-0.560336,-0.626775,0.333333,0.375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54810,0x1fd70,CUS_0x4916,July,Jim Finkler,0.5,270-26-6300,-0.322671,-0.350558,0.055556,0.3125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43478,0x19782,CUS_0x627b,January,Wakako Satod,0.380952,217-43-5920,0.819942,0.856198,0.388889,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6765,0x528a,CUS_0x7e25,May,Ayesha Rascoey,0.119048,755-90-5835,-0.397708,-0.382987,0.277778,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39553,0x174c1,CUS_0xa2dd,August,Anjuli Daviesa,0.333333,603-21-2338,-0.236017,-0.293019,0.166667,0.4375,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7723,0x5acb,CUS_0x7b66,June,Shaod,0.095238,221-80-6215,0.435379,0.416556,0.388889,0.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
42076,0x18ae9,CUS_0x6bff,April,Davido,0.214286,545-42-9297,-0.541602,-0.545313,0.388889,0.375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3713,0x3741,CUS_0x5718,April,Sumantax,0.47619,888-84-4641,-0.400607,-0.371259,0.222222,0.4375,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [21]:
X_test.columns

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Annual_Income',
       'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'x0_Architect', 'x0_Developer', 'x0_Doctor', 'x0_Engineer',
       'x0_Entrepreneur', 'x0_Journalist', 'x0_Lawyer', 'x0_Manager',
       'x0_Mechanic', 'x0_Media_Manager', 'x0_Musician', 'x0_Scientist',
       'x0_Teacher', 'x0_Writer'],
      dtype='object')

In [22]:
y_train.head(10)

55277    Standard
63884        Good
40962    Standard
48812    Standard
57189    Standard
27848    Standard
38451        Poor
5381     Standard
24384    Standard
29707        Poor
Name: Credit_Score, dtype: object

In [23]:
y_test.head(10)

15244    Standard
27791    Standard
31948    Standard
54810    Standard
43478        Poor
6765     Standard
39553        Poor
7723     Standard
42076    Standard
3713     Standard
Name: Credit_Score, dtype: object

In [24]:
# Save X_train, X_test, y_train, and y_test to a pickle file so I can reference them in the model building part

# Do the importing 
import pickle

# Save the data
with open('train_test_data.pkl', 'wb') as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)