In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv("D:\Coding\Internship\Developers Hub Internship\PDF 2\Completed Projects\Credit Risk Analysis AI\Dataset\cs-training.csv")

# Display basic info
print("Missing Values:\n", df.isnull().sum())

# Calculate median for imputation from the training data
train_median = df.median(numeric_only=True)

# Fill missing values
df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(), inplace=True)
df['NumberOfDependents'].fillna(df['NumberOfDependents'].median(), inplace=True)

# Confirm no missing values
print("\nMissing after cleaning:\n", df.isnull().sum())

  df = pd.read_csv("D:\Coding\Internship\Developers Hub Internship\PDF 2\Completed Projects\Credit Risk Analysis AI\Dataset\cs-training.csv")


Missing Values:
 Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

Missing after cleaning:
 Unnamed: 0                              0
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEsta

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['NumberOfDependents'].fillna(df['NumberOfDependents'].median(), inplace=True)


In [10]:
# Separate features and labels
X = df.drop(['SeriousDlqin2yrs', 'Unnamed: 0'], axis=1, errors= 'ignore')
y = df['SeriousDlqin2yrs']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Check class distribution after SMOTE
print("\nClass distribution after SMOTE (Training Data):")
print(pd.Series(y_resampled).value_counts())

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


Class distribution after SMOTE (Training Data):
SeriousDlqin2yrs
1    139974
0    139974
Name: count, dtype: int64


In [11]:
# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[26570  1355]
 [ 1818 26247]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.94     27925
           1       0.95      0.94      0.94     28065

    accuracy                           0.94     55990
   macro avg       0.94      0.94      0.94     55990
weighted avg       0.94      0.94      0.94     55990



In [13]:
def evaluate_thresholds(model, X, y_true, thresholds=[0.3, 0.4, 0.5, 0.6]):
    y_probs = model.predict_proba(X)[:, 1]  # Get probability of class 1
    
    for threshold in thresholds:
        print(f"\n🔎 Threshold = {threshold}")
        y_pred = (y_probs >= threshold).astype(int)
        
        # Print summary stats
        print("Confusion Matrix:")
        print(confusion_matrix(y_true, y_pred))
        print("Classification Report:")
        print(classification_report(y_true, y_pred, digits=4))
        
        # Show additional custom summary
        print(f"Precision: {precision_score(y_true, y_pred):.4f}")
        print(f"Recall:    {recall_score(y_true, y_pred):.4f}")
        print(f"F1 Score:  {f1_score(y_true, y_pred):.4f}")

evaluate_thresholds(model, X_test, y_test)


🔎 Threshold = 0.3
Confusion Matrix:
[[23480  4445]
 [  612 27453]]
Classification Report:
              precision    recall  f1-score   support

           0     0.9746    0.8408    0.9028     27925
           1     0.8606    0.9782    0.9157     28065

    accuracy                         0.9097     55990
   macro avg     0.9176    0.9095    0.9092     55990
weighted avg     0.9175    0.9097    0.9092     55990

Precision: 0.8606
Recall:    0.9782
F1 Score:  0.9157

🔎 Threshold = 0.4
Confusion Matrix:
[[25318  2607]
 [ 1066 26999]]
Classification Report:
              precision    recall  f1-score   support

           0     0.9596    0.9066    0.9324     27925
           1     0.9119    0.9620    0.9363     28065

    accuracy                         0.9344     55990
   macro avg     0.9358    0.9343    0.9343     55990
weighted avg     0.9357    0.9344    0.9343     55990

Precision: 0.9119
Recall:    0.9620
F1 Score:  0.9363

🔎 Threshold = 0.5
Confusion Matrix:
[[26483  1442]
 [ 1

In [None]:
import joblib

joblib.dump(scaler, 'scaler.pkl')
joblib.dump(model, 'random_forest_model.pkl')

# Load the test dataset
test_df = pd.read_csv("D:\Coding\Internship\Developers Hub Internship\PDF 2\Completed Projects\Credit Risk Analysis AI\Dataset\cs-test.csv")

# Save and remove 'Id' column for final submission
test_ids = test_df['Id'] if 'Id' in test_df.columns else pd.Series(range(len(test_df)))
test_df = test_df.drop(columns=['Id'], errors='ignore')

# Drop columns not used during training
test_df = test_df.drop(columns=['Unnamed: 0', 'SeriousDlqin2yrs'], errors='ignore')

# Handle missing values in the test set using the median from the training data
test_df['MonthlyIncome'].fillna(train_median['MonthlyIncome'], inplace=True)
test_df['NumberOfDependents'].fillna(train_median['NumberOfDependents'], inplace=True)

# Verify feature alignment
print("\nTraining Features (after dropping target):\n", X.columns.tolist())
print("\nTest Features (after dropping irrelevant columns):\n", test_df.columns.tolist())

# Load the previously saved scaler and model
scaler = joblib.load("scaler.pkl")            # Scaler fitted on training data
model = joblib.load("random_forest_model.pkl")  # Trained RandomForestClassifier

# Scale the test features using the same scaler
X_test_scaled = scaler.transform(test_df)

# Predict probabilities instead of classes
test_probs = model.predict_proba(X_test_scaled)[:, 1]  # Get probabilities for class 1

# Adjust the threshold (e.g., try 0.4, 0.5 etc)
threshold = 0.4
test_preds = (test_probs >= threshold).astype(int)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'Index': test_ids,
    'Predicted_SeriousDlqin2yrs': test_preds
})


# Save to CSV
submission_df.to_csv("submission.csv", index=False)
print("Submission saved as submission.csv")

print("\nSubmission Head:\n", submission_df.head(30))
# Check the number of 1's and 0's in the 'Predicted_SeriousDlqin2yrs' column
number_of_ones = (submission_df['Predicted_SeriousDlqin2yrs'] == 1).sum()
number_of_zeroes = (submission_df['Predicted_SeriousDlqin2yrs'] == 0).sum()
print(f"Number of 1's in Predicted_SeriousDlqin2yrs: {number_of_ones}")
print(f"Number of 0's in the Predicted_SeriousDlqin2yrs: {number_of_zeroes}")

print(submission_df['Predicted_SeriousDlqin2yrs'].value_counts().sum())


  test_df = pd.read_csv("D:\Coding\Internship\Developers Hub Internship\PDF 2\Completed Projects\Credit Risk Analysis AI\Dataset\cs-test.csv")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['MonthlyIncome'].fillna(train_median['MonthlyIncome'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Num


Training Features (after dropping target):
 ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

Test Features (after dropping irrelevant columns):
 ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']
Submission saved as submission.csv

Submission Head:
     Index  Predicted_SeriousDlqin2yrs
0       0                           0
1       1                           0
2       2                           0
3       3                           0
4       4                           0
5       5                           0
6       6                         

In [12]:
# Ensure test columns match training features
print("Train columns:", list(X.columns))
print("Test columns :", list(test_df.columns))


Train columns: ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']
Test columns : ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']


In [None]:
# # Test on Unseen data
# test_df = pd.read_csv("D:\Coding\Internship\Developers Hub Internship\PDF 2\Completed Projects\Credit Risk Analysis\Dataset\cs-test.csv")

# # Fill missing values
# test_df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(), inplace=True)
# test_df['NumberOfDependents'].fillna(df['NumberOfDependents'].median(), inplace=True)

# # Predict using the trained model
# # Drop ID if present
# if 'Unnamed: 0' in test_df.columns:
#     test_df = test_df.drop('Unnamed: 0', axis=1)

# # Predict
# predictions = model.predict(test_df)


# # Save predictions to a CSV file
# output = pd.DataFrame({
#     "Index": test_df.index,
#     "Predicted_SeriousDlqin2yrs": predictions
# })

# output.to_csv("credit_risk_predictions.csv", index=False)
# print(output.head(20))

  test_df = pd.read_csv("D:\Coding\Internship\Developers Hub Internship\PDF 2\Completed Projects\Credit Risk Analysis\Dataset\cs-test.csv")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['NumberO

    Index  Predicted_SeriousDlqin2yrs
0       0                           1
1       1                           1
2       2                           1
3       3                           1
4       4                           1
5       5                           1
6       6                           1
7       7                           1
8       8                           1
9       9                           1
10     10                           1
11     11                           1
12     12                           1
13     13                           1
14     14                           1
15     15                           1
16     16                           1
17     17                           1
18     18                           1
19     19                           1


In [19]:
test_df.head(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,,1.0,27,0,0.019917,3865.0,4,0,0,0,1.0
5,,0.509791,63,0,0.342429,4140.0,4,0,0,0,1.0
6,,0.587778,50,0,1048.0,0.0,5,0,0,0,3.0
7,,0.046149,79,1,0.36917,3301.0,8,0,1,0,1.0
8,,0.013527,68,0,2024.0,5400.0,4,0,1,0,0.0
9,,1.0,23,98,0.0,0.0,0,98,0,98,0.0
