# **Importing Libraries and Functions**

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

# `Importing Dataset`

In [2]:
dataset=pd.read_excel("/content/a_Dataset_CreditScoring.xlsx")

# Data Preparation

In [3]:
dataset.shape

(3000, 30)

In [4]:
dataset.head()

Unnamed: 0,TARGET,ID,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
0,1,582,3,3,0,4,0.0,5,117,27,...,3.0,0.9179,0.2083,2,3,7,0.2083,4,4,0.0
1,1,662,15,9,0,3,1.0,3,14,14,...,1.0,0.8,0.0,0,0,0,1.0,12,0,1.0
2,1,805,0,0,0,1,5.0,1,354,7,...,5.0,0.3552,0.6538,0,1,1,0.7308,1,1,0.5263
3,1,1175,8,5,0,6,1.0,10,16,4,...,3.0,0.9127,0.25,1,1,1,0.75,7,1,1.3333
4,1,1373,3,1,0,9,0.0,8,130,52,...,1.0,1.2511,0.0,0,1,4,0.1429,3,1,0.0


In [5]:
dataset=dataset.drop(columns="ID",axis=1)
dataset.shape

(3000, 29)

In [6]:
dataset.isna().sum()

Unnamed: 0,0
TARGET,0
DerogCnt,0
CollectCnt,0
BanruptcyInd,0
InqCnt06,0
InqTimeLast,188
InqFinanceCnt24,0
TLTimeFirst,0
TLTimeLast,0
TLCnt03,0


In [7]:
dataset=dataset.fillna(dataset.mean())

In [8]:
dataset.isna().sum()

Unnamed: 0,0
TARGET,0
DerogCnt,0
CollectCnt,0
BanruptcyInd,0
InqCnt06,0
InqTimeLast,0
InqFinanceCnt24,0
TLTimeFirst,0
TLTimeLast,0
TLCnt03,0


In [9]:
dataset['TARGET'].value_counts()

Unnamed: 0_level_0,count
TARGET,Unnamed: 1_level_1
0,2500
1,500


In [10]:
dataset.groupby('TARGET').mean()

Unnamed: 0_level_0,DerogCnt,CollectCnt,BanruptcyInd,InqCnt06,InqTimeLast,InqFinanceCnt24,TLTimeFirst,TLTimeLast,TLCnt03,TLCnt12,...,TL50UtilCnt,TLBalHCPct,TLSatPct,TLDel3060Cnt24,TLDel90Cnt24,TLDel60CntAll,TLOpenPct,TLBadDerogCnt,TLDel60Cnt24,TLOpen24Pct
TARGET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.3224,0.7664,0.1492,2.9424,3.174638,3.2896,173.002,11.65,0.2844,1.832,...,3.986711,0.628177,0.544963,0.6044,0.6624,2.2236,0.502376,1.18,0.8648,0.556867
1,1.968,1.31,0.174,3.938,2.775459,4.882,155.672,12.992,0.228,1.768,...,4.53387,0.748185,0.385173,1.334,1.576,4.014,0.465127,2.554,2.086,0.600978


# Train Test Split

In [11]:
y=dataset.iloc[:,0].values
X=dataset.iloc[:,1:28].values

In [12]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [13]:
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

# Risk Model building

In [14]:
classifier=LogisticRegression()
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)

# Model Performance

In [15]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[482  16]
 [ 87  15]]
0.8283333333333334


# Writing Output File

In [16]:
predictions=classifier.predict_proba(X_test)
predictions

array([[0.04200938, 0.95799062],
       [0.93712307, 0.06287693],
       [0.70429866, 0.29570134],
       ...,
       [0.97248551, 0.02751449],
       [0.44309325, 0.55690675],
       [0.8677644 , 0.1322356 ]])

In [17]:
# writing model output file

df_prediction_prob = pd.DataFrame(predictions, columns = ['prob_0', 'prob_1'])
df_prediction_target = pd.DataFrame(classifier.predict(X_test), columns = ['predicted_TARGET'])
df_test_dataset = pd.DataFrame(y_test,columns= ['Actual Outcome'])

dfx=pd.concat([df_test_dataset, df_prediction_prob, df_prediction_target], axis=1)

dfx.to_excel("/content//c1_Model_Prediction.xlsx",index=True,index_label="S No.")

dfx.head()

Unnamed: 0,Actual Outcome,prob_0,prob_1,predicted_TARGET
0,1,0.042009,0.957991,1
1,0,0.937123,0.062877,0
2,0,0.704299,0.295701,0
3,0,0.907845,0.092155,0
4,0,0.866252,0.133748,0


In [18]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.97      0.90       498
           1       0.48      0.15      0.23       102

    accuracy                           0.83       600
   macro avg       0.67      0.56      0.56       600
weighted avg       0.79      0.83      0.79       600



In [19]:
from sklearn.metrics import roc_auc_score

y_prob = classifier.predict_proba(X_test)[:, 1]  # Get probability scores for class 1
roc_auc = roc_auc_score(y_test, y_prob)

print(f"ROC-AUC Score: {roc_auc}")


ROC-AUC Score: 0.770690605559493


In [20]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy')

print(f"Cross-validation Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean()}")


Cross-validation Scores: [0.85       0.84166667 0.83958333 0.84166667 0.83958333]
Mean Accuracy: 0.8425


In [21]:
import pandas as pd

# Combine actual and predicted values into a DataFrame
df_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Probability': y_prob})

# Filter False Negatives (FN) → Actual = 1, Predicted = 0
false_negatives = df_results[(df_results['Actual'] == 1) & (df_results['Predicted'] == 0)]

# Display the False Negatives
print(false_negatives)


     Actual  Predicted  Probability
12        1          0     0.474985
13        1          0     0.442905
18        1          0     0.064270
25        1          0     0.434001
27        1          0     0.187837
..      ...        ...          ...
561       1          0     0.363388
573       1          0     0.173603
574       1          0     0.021857
590       1          0     0.486487
591       1          0     0.209721

[87 rows x 3 columns]
