In [3]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [4]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("Resources/Housing.csv")

# Review the DataFrame
df_lending_data = pd.read_csv(file_path)

df_lending_data


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,INLAND
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,INLAND
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,INLAND
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,INLAND


In [5]:
#Clean the data 
# Handling missing values
# Replace missing values in numeric columns with the mean
numeric_columns = ['total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
for column in numeric_columns:
    df_lending_data[column].fillna(df_lending_data[column].mean(), inplace=True)

# Replace missing values in categorical columns with the mode
categorical_columns = ['ocean_proximity']
for column in categorical_columns:
    df_lending_data[column].fillna(df_lending_data[column].mode()[0], inplace=True)

# Handling outliers
# Identify and remove or cap outliers in numeric columns (e.g., using z-scores)
from scipy.stats import zscore
z_scores = zscore(df_lending_data[numeric_columns])
df_lending_data = df_lending_data[(z_scores < 3).all(axis=1)]

# Data transformations
# Apply transformations to features if needed (e.g., log transformation)
df_lending_data['median_income'] = np.log(df_lending_data['median_income'])

# Review the cleaned DataFrame
print(df_lending_data)


       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                  41          880           129.0   
1        -122.22     37.86                  21         7099          1106.0   
2        -122.24     37.85                  52         1467           190.0   
3        -122.25     37.85                  52         1274           235.0   
4        -122.25     37.85                  52         1627           280.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.48                  25         1665           374.0   
20636    -121.21     39.49                  18          697           150.0   
20637    -121.22     39.43                  17         2254           485.0   
20638    -121.32     39.43                  18         1860           409.0   
20639    -121.24     39.37                  16         2785           616.0   

       population  households  median_income  media

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lending_data['median_income'] = np.log(df_lending_data['median_income'])


In [7]:
from sklearn.model_selection import train_test_split

# Define your features and target variable
features = df_lending_data.drop(columns=['median_house_value'])  # Replace with your actual feature columns
target = df_lending_data['median_house_value']  # Replace with your actual target column

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Print the shapes of the split datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)



X_train shape: (15796, 9)
X_test shape: (3950, 9)
y_train shape: (15796,)
y_test shape: (3950,)


In [None]:
# Perform one-hot encoding on the categorical feature
features_encoded = pd.get_dummies(features, columns=['ocean_proximity'])

# Split the encoded features into training and testing sets
X_train_encoded, X_test_encoded, _, _ = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_encoded, y_train)



In [None]:
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# Use the trained model to make predictions on the testing data
y_pred = model.predict(X_test_encoded)

# Calculate balanced accuracy score
balanced_acc = balanced_accuracy_score(y_test, y_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Generate classification report
class_report = classification_report(y_test, y_pred)

# Print evaluation metrics
print("Balanced Accuracy:", balanced_acc)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


In [None]:
from sklearn.model_selection import GridSearchCV

# Define a grid of hyperparameters to search through
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['liblinear', 'saga']  # Optimization solver
}

# Initialize the Logistic Regression model
model = LogisticRegression()

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='balanced_accuracy')
grid_search.fit(X_train_encoded, y_train)

# Get the best hyperparameters from grid search
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_model = LogisticRegression(**best_params)
best_model.fit(X_train_encoded, y_train)

# Use the best model to make predictions on the testing data
y_pred_best = best_model.predict(X_test_encoded)

# Evaluate the best model's performance
best_balanced_acc = balanced_accuracy_score(y_test, y_pred_best)
best_conf_matrix = confusion_matrix(y_test, y_pred_best)
best_class_report = classification_report(y_test, y_pred_best)

# Print evaluation metrics for the best model
print("Best Balanced Accuracy:", best_balanced_acc)
print("Best Confusion Matrix:\n", best_conf_matrix)
print("Best Classification Report:\n", best_class_report)


In [None]:
# Print original model's performance
print("Original Model:")
print("Balanced Accuracy:", balanced_acc)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
print("\n")

# Print best model's performance
print("Best Model (After Optimization):")
print("Best Balanced Accuracy:", best_balanced_acc)
print("Best Confusion Matrix:\n", best_conf_matrix)
print("Best Classification Report:\n", best_class_report)
print("\n")

# Compare performance improvement
improvement_balanced_acc = best_balanced_acc - balanced_acc
print("Performance Improvement in Balanced Accuracy:", improvement_balanced_acc)


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = df_lending_data['loan_status']

# Separate the X variable, the features
x = df_lending_data.drop(columns=['loan_status'])

In [4]:
# Review the y variable Series
y

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64

In [5]:
# Review the X variable DataFrame
x

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.430740,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [6]:
# Check the balance of our target values
y.value_counts()


0    75036
1     2500
Name: loan_status, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [9]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [10]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!

# Fit the model using training data
# YOUR CODE HERE!



# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistics_regression_model = LogisticRegression(random_state=1)

# Fit the model using training data
lr_model = logistics_regression_model.fit(X_train, y_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [11]:
# Make a prediction using the testing data
training_predictions = lr_model.predict(X_train)
testing_predictions = logistics_regression_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [12]:
# Print the balanced_accuracy score of the model
from sklearn.metrics import balanced_accuracy_score

# Calculate and print the balanced_accuracy score of the model
balanced_acc_train = balanced_accuracy_score(y_train, training_predictions)
balanced_acc_test = balanced_accuracy_score(y_test, testing_predictions)

print("Balanced Accuracy Score (Training):", balanced_acc_train)
print("Balanced Accuracy Score (Testing):", balanced_acc_test)


Balanced Accuracy Score (Training): 0.9494259906569136
Balanced Accuracy Score (Testing): 0.9520479254722232


In [13]:
# Generate a confusion matrix for the model
from sklearn.metrics import confusion_matrix

training_matrix = confusion_matrix(y_train, training_predictions)

print(training_matrix)

[[55994   277]
 [  181  1700]]


In [14]:
# Print the classification report for the model
training_report = classification_report(y_train, training_predictions)

print(training_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56271
           1       0.86      0.90      0.88      1881

    accuracy                           0.99     58152
   macro avg       0.93      0.95      0.94     58152
weighted avg       0.99      0.99      0.99     58152



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** WRITE YOUR ANSWER HERE!

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [17]:
# Import the RandomOverSampler module form imbalanced-learn

from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
oversampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
x_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)




In [18]:
# Count the distinct values of the resampled labels data
distinct_label_counts = y_resampled.value_counts()

print(distinct_label_counts)


0    56271
1    56271
Name: loan_status, dtype: int64


### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [19]:
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_model_resampled = LogisticRegression(random_state=1)

# Fit the model using the resampled training data
lr_model_resampled.fit(x_resampled, y_resampled)

# Make a prediction using the testing data
testing_predictions_resampled = lr_model_resampled.predict(X_test)


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [20]:
# Print the balanced_accuracy score of the model 
from sklearn.metrics import balanced_accuracy_score

# Calculate the balanced accuracy score for the resampled model
balanced_acc_resampled = balanced_accuracy_score(y_test, testing_predictions_resampled)

print("Balanced Accuracy Score (Resampled Model):", balanced_acc_resampled)


Balanced Accuracy Score (Resampled Model): 0.9936781215845847


In [21]:
# Generate a confusion matrix for the model
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix for the resampled model
confusion_matrix_resampled = confusion_matrix(y_test, testing_predictions_resampled)

print("Confusion Matrix (Resampled Model):")
print(confusion_matrix_resampled)


Confusion Matrix (Resampled Model):
[[18649   116]
 [    4   615]]


In [22]:
# Print the classification report for the model
from sklearn.metrics import classification_report

# Generate the classification report for the resampled model
classification_report_resampled = classification_report(y_test, testing_predictions_resampled)

print("Classification Report (Resampled Model):")
print(classification_report_resampled)


Classification Report (Resampled Model):
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** YOUR ANSWER HERE!