In [1]:
import pandas as pd
# Load the pre-cleaned training data; the path is relative to the notebook's
# working directory, so the CSV must sit next to the notebook.
data = pd.read_csv('cleaned_train.csv')

In [2]:
# List every column name so the feature/target choices below can be checked
data.columns

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')

In [3]:
# Display the data type of each column; `object` columns hold non-numeric
# values and need encoding (or exclusion) before they can be fed to a model.
print(data.dtypes)

ID                           object
Customer_ID                  object
Month                        object
Name                         object
Age                           int64
SSN                          object
Occupation                   object
Annual_Income               float64
Monthly_Inhand_Salary       float64
Num_Bank_Accounts             int64
Num_Credit_Card               int64
Interest_Rate                 int64
Num_of_Loan                   int64
Type_of_Loan                 object
Delay_from_due_date           int64
Num_of_Delayed_Payment        int64
Changed_Credit_Limit        float64
Num_Credit_Inquiries          int64
Credit_Mix                   object
Outstanding_Debt            float64
Credit_Utilization_Ratio    float64
Credit_History_Age          float64
Payment_of_Min_Amount        object
Total_EMI_per_month         float64
Amount_invested_monthly     float64
Payment_Behaviour            object
Monthly_Balance             float64
Credit_Score                

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [5]:
# Separate the prediction target from the predictor columns
y = data['Credit_Score']
X = data.drop('Credit_Score', axis=1)

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
# Integer codes for the categorical variables that are encoded ordinally.
credit_mix_mapping = {'Good': 2, 'Standard': 1, 'Bad': 0}
payment_of_min_amount_mapping = {'Yes': 2, 'NM': 1, 'No': 0}
payment_behavior_mapping = {
    'Low_spent_Large_value_payments': 5,
    'Low_spent_Medium_value_payments': 4,
    'Low_spent_Small_value_payments': 3,
    'High_spent_Large_value_payments': 2,
    'High_spent_Medium_value_payments': 1,
    'High_spent_Small_value_payments': 0
}

# NOTE: the ordinal encoding below is done with `Series.map`, not with this
# LabelEncoder. The instance is not used in this cell; it is only kept so the
# name stays defined for any later cell that may rely on it.
label_encoder = LabelEncoder()

ordinal_mappings = {
    'Credit_Mix': credit_mix_mapping,
    'Payment_of_Min_Amount': payment_of_min_amount_mapping,
    'Payment_Behaviour': payment_behavior_mapping,
}

# Work on explicit copies so the column assignments below cannot trigger
# pandas' SettingWithCopyWarning or write through to the frame they came from.
X_train = X_train.copy()
X_test = X_test.copy()

# Replace each category label with its integer code in both splits.
# Any value missing from a mapping becomes NaN.
# NOTE(review): this assumes the cleaned CSV only contains the labels listed
# above - confirm, otherwise NaNs will reach the model.
for column, mapping in ordinal_mappings.items():
    X_train[column] = X_train[column].map(mapping)
    X_test[column] = X_test[column].map(mapping)

# Combine the training and test sets for OneHotEncoder fitting, so that both
# splits end up with exactly the same dummy columns. The original row labels
# are kept on purpose: they are needed to re-attach the encoded columns.
# NOTE(review): fitting on train+test lets the encoder see the test categories;
# consider fitting on the training split only.
combined_data = pd.concat([X_train, X_test])

# Apply OneHotEncoder for remaining categorical columns that should be in the models
# NOTE(review): 'Name' looks like a per-customer identifier; one-hot encoding it
# creates one dense column per distinct name, which is slow and memory hungry.
# Consider excluding it from the features instead.
categorical_columns = ['Occupation', 'Name']

for column in categorical_columns:
    # Use the encoder's default sparse output and densify explicitly. The
    # `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
    # and removed in 1.4, whereas `.toarray()` works on every version.
    one_hot_encoder = OneHotEncoder(drop='first')
    combined_encoded = one_hot_encoder.fit_transform(combined_data[[column]]).toarray()

    # Give the encoded frame the SAME index as the rows it was computed from.
    # `train_test_split` shuffles rows but keeps their original labels, and
    # `pd.concat(axis=1)` aligns on those labels. With a fresh 0..n-1 index the
    # dummy columns would be attached to the wrong rows and NaN-filled rows
    # would be introduced.
    combined_encoded_df = pd.DataFrame(
        combined_encoded,
        columns=one_hot_encoder.get_feature_names_out([column]),
        index=combined_data.index,
    )

    # Split back by position: the first len(X_train) rows are the training rows
    X_train_encoded_df = combined_encoded_df.iloc[:len(X_train)]
    X_test_encoded_df = combined_encoded_df.iloc[len(X_train):]

    # Swap the raw categorical column for its dummy columns in each split
    X_train = pd.concat([X_train.drop(columns=[column]), X_train_encoded_df], axis=1)
    X_test = pd.concat([X_test.drop(columns=[column]), X_test_encoded_df], axis=1)



In [None]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from joblib import Parallel, delayed

# Define the numeric columns that will be in the models
numeric_columns = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
                   'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
                   'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio',
                   'Credit_History_Age', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance']

# One scaler instance per numeric column (RobustScaler or MinMaxScaler,
# chosen per column by the author).
scalers = {
    'Age': MinMaxScaler(),
    'Annual_Income': RobustScaler(),
    'Monthly_Inhand_Salary': RobustScaler(),
    'Num_Bank_Accounts': MinMaxScaler(),
    'Num_Credit_Card': MinMaxScaler(),
    'Interest_Rate': MinMaxScaler(),
    'Num_of_Loan': MinMaxScaler(),
    'Delay_from_due_date': RobustScaler(),
    'Num_of_Delayed_Payment': RobustScaler(),
    'Changed_Credit_Limit': RobustScaler(),
    'Num_Credit_Inquiries': MinMaxScaler(),
    'Outstanding_Debt': RobustScaler(),
    'Credit_Utilization_Ratio': MinMaxScaler(),
    'Credit_History_Age': MinMaxScaler(),
    'Total_EMI_per_month': MinMaxScaler(),
    'Amount_invested_monthly': MinMaxScaler(),
    'Monthly_Balance': MinMaxScaler()
}


def scale_column(column):
    """Scale one numeric column of the global X_train / X_test in place.

    The scaler registered for `column` in `scalers` is fitted on the training
    split only and then applied to both splits, so no test-set statistics
    leak into the fit.

    Parameters
    ----------
    column : str
        Name of a column present in `scalers`, `X_train` and `X_test`.

    Returns
    -------
    None
        The column is overwritten in both frames.

    Raises
    ------
    KeyError
        If `column` has no entry in `scalers` or is missing from a frame.
    """
    scaler = scalers[column]
    # The scalers return an (n_rows, 1) array; flatten it for column assignment
    X_train[column] = scaler.fit_transform(X_train[[column]]).ravel()
    X_test[column] = scaler.transform(X_test[[column]]).ravel()


# Scale sequentially in the notebook's own process. The previous
# `Parallel(n_jobs=-1)` call used joblib's default process-based backend:
# each worker mutated its own copy of X_train / X_test and the fitted scaler,
# so the frames in this kernel were never actually scaled.
for column in numeric_columns:
    scale_column(column)

In [None]:
# Preview the first rows of the training features after preprocessing
X_train.head(n=5)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Non-numeric columns that must not be passed to the model.
columns_to_exclude = ['ID', 'Customer_ID', 'Month', 'SSN', 'Type_of_Loan']  # List of non-numeric columns to exclude
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
X_train = X_train.drop(columns=columns_to_exclude, errors='ignore')
X_test = X_test.drop(columns=columns_to_exclude, errors='ignore')

# Train the logistic regression model (library default hyper-parameters)
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Predictions using logistic regression model
logistic_regression_preds = logistic_regression_model.predict(X_test)

# Evaluate the logistic regression model. Scoring the predictions computed
# above gives the same accuracy as `model.score(X_test, y_test)` without
# running a second prediction pass over the test set.
logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_preds)
print("Logistic Regression Accuracy:", logistic_regression_accuracy)