In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [14]:
# Load data
# Read the midterm part-two CSV into the `data` DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact directory layout exists.
data = pd.read_csv(
    filepath_or_buffer="/MBAN- Schulich/MBAN- Sem 2/Predictive Modeling/Mid term Project/midterm_parttwo.csv",
)

In [5]:
# Summarize the loaded frame: column names, non-null counts, dtypes and memory use.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8081 entries, 0 to 8080
Data columns (total 7 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Years of Education after High School  8081 non-null   int64 
 1   Requested Credit Amount               8081 non-null   object
 2   Number of Dependents                  8081 non-null   object
 3   Monthly Income                        8081 non-null   object
 4   Monthly Expense                       8081 non-null   object
 5   Marital Status                        8081 non-null   object
 6   Credit Rating                         8081 non-null   object
dtypes: int64(1), object(6)
memory usage: 442.1+ KB


In [15]:
# Preprocess data
# One-hot encode the listed columns: each one is replaced by indicator columns,
# one per distinct value found in it.
# NOTE: `data` is rebound to the encoded frame, so this cell expects the
# freshly loaded frame and cannot be re-run without reloading first.
columns_to_encode = [
    'Requested Credit Amount',
    'Marital Status',
    'Number of Dependents',
    'Monthly Income',
    'Monthly Expense',
]
data = pd.get_dummies(data, columns=columns_to_encode)

In [7]:
# Convert 'Credit Rating' to binary: 1 for 'Positive', 0 for every other label.
# The dtype guard keeps this cell idempotent: once the column already holds
# integers, re-running must not compare them against 'Positive' again, because
# that would silently reset every label to 0.
if not pd.api.types.is_integer_dtype(data['Credit Rating']):
    # Vectorized comparison instead of a per-row Python lambda; missing values
    # compare unequal and therefore map to 0, exactly as before.
    data['Credit Rating'] = data['Credit Rating'].eq('Positive').astype('int64')

In [8]:
# Split data
# Target is the 'Credit Rating' column; features are every other column.
y = data['Credit Rating']
X = data.drop(columns=['Credit Rating'])
# Hold out half of the rows for testing; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [9]:
# Train logistic regression model
# Estimator is built with its default hyperparameters and fitted on the
# training half only.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [10]:
# Predict on test set
# Hard class labels for the held-out rows.
y_pred = model.predict(X_test)
# Per-class probabilities; column index 1 is kept as the score used for
# thresholding (with 0/1 labels this is the probability of class 1).
test_probabilities = model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

In [11]:
# Calculate and print metrics
# Score the model's default predictions against the held-out labels.
conf_matrix = confusion_matrix(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

print("Confusion Matrix:\n", conf_matrix)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

Confusion Matrix:
 [[   0  577]
 [   0 3464]]
Precision: 0.86, Recall: 1.00, F1 Score: 0.92


In [12]:
# Adjust threshold for 15% approval rate
# Target share of test rows to approve, named so the rate is set in one place
# instead of being implied by a bare percentile number.
APPROVAL_RATE = 0.15
# The (1 - APPROVAL_RATE) percentile of the predicted probabilities is the
# score that the top APPROVAL_RATE share of rows meets or exceeds
# (85th percentile for the default 15%).
threshold = np.percentile(y_pred_proba, 100 * (1 - APPROVAL_RATE))
# `>=` also approves every row tied with the threshold, so the realised
# approval share can come out slightly above the target.
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

In [13]:
# Calculate metrics with new threshold
# Score the threshold-adjusted predictions against the held-out labels.
conf_matrix_adj = confusion_matrix(y_test, y_pred_adjusted)
precision_adj, recall_adj, f1_adj = [
    metric(y_test, y_pred_adjusted)
    for metric in (precision_score, recall_score, f1_score)
]

print("Adjusted Confusion Matrix:\n", conf_matrix_adj)
print(f"Adjusted Precision: {precision_adj:.2f}, Adjusted Recall: {recall_adj:.2f}, Adjusted F1 Score: {f1_adj:.2f}")

Adjusted Confusion Matrix:
 [[ 495   82]
 [2937  527]]
Adjusted Precision: 0.87, Adjusted Recall: 0.15, Adjusted F1 Score: 0.26
