### 📊 Credit Scoring Model
This notebook predicts credit scores (Poor, Standard, Good) using machine learning models.

### 💼 Dataset
- `train.csv`: contains labeled data with credit score
- `test.csv`: contains unlabeled data for prediction

### 🧠 Models Used
- Logistic Regression
- Random Forest Classifier

### 📌 Key Steps
1. Data Loading and Preprocessing
2. Label Encoding and Missing Value Handling
3. Model Training and Evaluation
4. Final Predictions for Submission

# 📥 Upload and Extract Dataset

In [None]:

from google.colab import files
uploaded = files.upload()  # Upload your ZIP here


Saving Credit score classification.zip to Credit score classification.zip


In [None]:
import zipfile
import os

with zipfile.ZipFile("Credit score classification.zip", 'r') as zip_ref:
    zip_ref.extractall("credit_data")

os.listdir("credit_data")  # You should see train.csv and test.csv


['test.csv', 'train.csv']

# 📚 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


## 📂 Step 1: Load Dataset

In [None]:
# Load CSV files
train = pd.read_csv('credit_data/train.csv')
test = pd.read_csv('credit_data/test.csv')

# Check shapes
print("Train shape:", train.shape)
print("Test shape:", test.shape)


  train = pd.read_csv('credit_data/train.csv')


Train shape: (100000, 28)
Test shape: (50000, 27)


## 🧹 Step 2: Preprocessing - Dropping Unnecessary Columns

In [None]:
cols_to_drop = ['ID', 'Customer_ID', 'Name', 'SSN', 'Month']
train.drop(columns=cols_to_drop, inplace=True)
test.drop(columns=cols_to_drop, inplace=True)


## 🔄 Step 3: Convert Object Columns to Numeric

In [None]:
to_convert = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment',
              'Changed_Credit_Limit', 'Outstanding_Debt',
              'Amount_invested_monthly', 'Monthly_Balance']

for col in to_convert:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')


In [None]:
train.fillna(train.median(numeric_only=True), inplace=True)
test.fillna(test.median(numeric_only=True), inplace=True)


In [None]:
print("Train columns:\n", train.columns)
print("\nTest columns:\n", test.columns)


Train columns:
 Index(['Age', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
       'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix',
       'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')

Test columns:
 Index(['Age', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
       'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix',
       'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
    

## 🔖 Step 4: Encode Categorical Features & Target

In [None]:
score_map = {"Poor": 0, "Standard": 1, "Good": 2}
train["Credit_Score"] = train["Credit_Score"].map(score_map)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Get object columns from train (excluding target 'Credit_Score')
cat_cols = [col for col in train.select_dtypes(include='object').columns if col != 'Credit_Score']

# Encode both train and test consistently
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]], axis=0).astype(str)
    le.fit(combined)

    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))


In [None]:
X_train = train.drop("Credit_Score", axis=1)
y_train = train["Credit_Score"]


In [None]:
# Reload original train.csv just in case
train = pd.read_csv('credit_data/train.csv')

# Drop unused columns again
cols_to_drop = ['ID', 'Customer_ID', 'Name', 'SSN', 'Month']
train.drop(columns=cols_to_drop, inplace=True)

# Convert numeric-looking object columns
to_convert = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment',
              'Changed_Credit_Limit', 'Outstanding_Debt',
              'Amount_invested_monthly', 'Monthly_Balance']
for col in to_convert:
    train[col] = pd.to_numeric(train[col], errors='coerce')

# Drop rows where Credit_Score is missing or invalid
train = train[train['Credit_Score'].isin(['Poor', 'Standard', 'Good'])]

# Now map target labels
score_map = {"Poor": 0, "Standard": 1, "Good": 2}
train["Credit_Score"] = train["Credit_Score"].map(score_map)

# Fill numeric NaNs
train.fillna(train.median(numeric_only=True), inplace=True)

# Label encode categorical features
from sklearn.preprocessing import LabelEncoder
cat_cols = [col for col in train.select_dtypes(include='object').columns if col != 'Credit_Score']
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))

# Final X and y
X_train = train.drop("Credit_Score", axis=1)
y_train = train["Credit_Score"]


  train = pd.read_csv('credit_data/train.csv')


In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.metrics import classification_report

y_pred = lr.predict(X_train)

print("Logistic Regression Classification Report:\n")
print(classification_report(y_train, y_pred))


Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       0.55      0.32      0.41     28998
           1       0.55      0.84      0.67     53174
           2       0.35      0.03      0.05     17828

    accuracy                           0.55    100000
   macro avg       0.48      0.40      0.38    100000
weighted avg       0.51      0.55      0.48    100000



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict
y_rf_pred = rf.predict(X_train)

# Evaluate
print("Random Forest Classification Report:\n")
print(classification_report(y_train, y_rf_pred))


Random Forest Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28998
           1       1.00      1.00      1.00     53174
           2       1.00      1.00      1.00     17828

    accuracy                           1.00    100000
   macro avg       1.00      1.00      1.00    100000
weighted avg       1.00      1.00      1.00    100000



In [None]:
# Reload test (in case it was modified)
test = pd.read_csv('credit_data/test.csv')

# Drop unused columns
cols_to_drop = ['ID', 'Customer_ID', 'Name', 'SSN', 'Month']
test.drop(columns=cols_to_drop, inplace=True)

# Convert object to numeric
to_convert = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment',
              'Changed_Credit_Limit', 'Outstanding_Debt',
              'Amount_invested_monthly', 'Monthly_Balance']
for col in to_convert:
    test[col] = pd.to_numeric(test[col], errors='coerce')

# Fill NaNs
test.fillna(test.median(numeric_only=True), inplace=True)

# Label encode categorical columns
cat_cols = [col for col in test.select_dtypes(include='object').columns]
for col in cat_cols:
    le = LabelEncoder()
    test[col] = le.fit_transform(test[col].astype(str))

# Predict using Random Forest
test_preds = rf.predict(test)

# Map back to original labels
score_map_reverse = {0: "Poor", 1: "Standard", 2: "Good"}
test_preds_labels = [score_map_reverse[pred] for pred in test_preds]

# Save to CSV
submission = pd.DataFrame({
    "Credit_Score_Predicted": test_preds_labels
})
submission.to_csv("credit_score_predictions.csv", index=False)

print("✅ Predictions saved to credit_score_predictions.csv")


✅ Predictions saved to credit_score_predictions.csv
