In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report,confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.naive_bayes import GaussianNB

In [2]:
data=pd.read_csv('MSB-Mortgage-Backed-Securities-Pipeline-main-LoanExport.csv')

In [3]:
data.head()

Unnamed: 0,MSA,MIP,Units,Occupancy,OCLTV,DTI,OrigUPB,OrigInterestRate,Channel,PPM,...,LTV_Range,CreditRange,RePayRange,OrigInterestRate_Monthly,MonthlyInstallment,CurrentUPB,MonthlyIncome,Prepayment,Totalpayment,InterestAmount
0,16974,25,1,O,89,27,117000,6.75,T,N,...,High,Excellent,4-8yrs,0.0056,756,111758,2800,15456,272160,155160
1,19740,0,1,O,73,17,109000,6.5,R,N,...,High,Excellent,12-16yrs,0.0054,687,94830,4041,31992,247320,138320
2,29940,0,1,O,75,16,88000,6.875,T,N,...,High,Excellent,4-8yrs,0.0057,576,83015,3600,29376,207360,119360
3,31084,0,1,O,76,14,160000,6.875,R,N,...,High,Excellent,0-4yrs,0.0057,1047,155275,7478,64608,376920,216920
4,35644,0,1,O,78,18,109000,7.125,R,N,...,High,Excellent,4-8yrs,0.0059,731,104253,4061,31176,263160,154160


In [4]:
data.columns

Index(['MSA', 'MIP', 'Units', 'Occupancy', 'OCLTV', 'DTI', 'OrigUPB',
       'OrigInterestRate', 'Channel', 'PPM', 'PropertyState', 'PropertyType',
       'LoanPurpose', 'OrigLoanTerm', 'NumBorrowers', 'SellerName',
       'ServicerName', 'EverDelinquent', 'MonthsDelinquent',
       'MonthsInRepayment', 'FirstPaymentYear', 'FirstPaymentMonth',
       'MaturityYear', 'MaturityMonth', 'IsFirstTimeHomebuyer', 'LTV_Range',
       'CreditRange', 'RePayRange', 'OrigInterestRate_Monthly',
       'MonthlyInstallment', 'CurrentUPB', 'MonthlyIncome', 'Prepayment',
       'Totalpayment', 'InterestAmount'],
      dtype='object')

<div style="text-align: center; background-color: orange; padding: 10px;">
    <h2 style="font-weight: bold;">ENCODING</h2>
</div>

In [5]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
cat_col=['IsFirstTimeHomebuyer','PPM','PropertyState','ServicerName','PropertyType','Channel','SellerName','LTV_Range','CreditRange','RePayRange']
data[cat_col]=data[cat_col].apply(le.fit_transform)
data.head()

Unnamed: 0,MSA,MIP,Units,Occupancy,OCLTV,DTI,OrigUPB,OrigInterestRate,Channel,PPM,...,LTV_Range,CreditRange,RePayRange,OrigInterestRate_Monthly,MonthlyInstallment,CurrentUPB,MonthlyIncome,Prepayment,Totalpayment,InterestAmount
0,16974,25,1,O,89,27,117000,6.75,3,0,...,0,0,3,0.0056,756,111758,2800,15456,272160,155160
1,19740,0,1,O,73,17,109000,6.5,2,0,...,0,0,1,0.0054,687,94830,4041,31992,247320,138320
2,29940,0,1,O,75,16,88000,6.875,3,0,...,0,0,3,0.0057,576,83015,3600,29376,207360,119360
3,31084,0,1,O,76,14,160000,6.875,2,0,...,0,0,0,0.0057,1047,155275,7478,64608,376920,216920
4,35644,0,1,O,78,18,109000,7.125,2,0,...,0,0,3,0.0059,731,104253,4061,31176,263160,154160


In [6]:
one_col=['LoanPurpose','Occupancy']
data_one=pd.get_dummies(data[one_col], drop_first=True)
data_one.head()

Unnamed: 0,LoanPurpose_N,LoanPurpose_P,Occupancy_O,Occupancy_S
0,0,1,1,0
1,1,0,1,0
2,1,0,1,0
3,1,0,1,0
4,1,0,1,0


In [7]:
data=pd.concat([data,data_one], axis = 1)
data.drop(['LoanPurpose','Occupancy'],inplace=True,axis=1)
data.head()

Unnamed: 0,MSA,MIP,Units,OCLTV,DTI,OrigUPB,OrigInterestRate,Channel,PPM,PropertyState,...,MonthlyInstallment,CurrentUPB,MonthlyIncome,Prepayment,Totalpayment,InterestAmount,LoanPurpose_N,LoanPurpose_P,Occupancy_O,Occupancy_S
0,16974,25,1,89,27,117000,6.75,3,0,15,...,756,111758,2800,15456,272160,155160,0,1,1,0
1,19740,0,1,73,17,109000,6.5,2,0,5,...,687,94830,4041,31992,247320,138320,1,0,1,0
2,29940,0,1,75,16,88000,6.875,3,0,17,...,576,83015,3600,29376,207360,119360,1,0,1,0
3,31084,0,1,76,14,160000,6.875,2,0,4,...,1047,155275,7478,64608,376920,216920,1,0,1,0
4,35644,0,1,78,18,109000,7.125,2,0,32,...,731,104253,4061,31176,263160,154160,1,0,1,0


<div style="text-align: center; background-color: orange; padding: 10px;">
    <h2 style="font-weight: bold;">PREPAYMENT RISK</h2>
</div>

In [8]:
# Calculate statistics for prepayment
prepayment_data = data['Prepayment'].dropna()
min_prepayment = prepayment_data.min()
max_prepayment = prepayment_data.max()
print(min_prepayment)
print(max_prepayment)

-32592
2103864


In [9]:
# Calculate percentiles
threshold_75th_percentile = np.percentile(prepayment_data, 75)
threshold_90th_percentile = np.percentile(prepayment_data, 90)
print(threshold_75th_percentile)
print(threshold_90th_percentile)

21036.0
33984.0


In [10]:
# Calculate mean and standard deviation
mean_prepayment = prepayment_data.mean()
std_prepayment = prepayment_data.std()
threshold_mean_plus_sd = mean_prepayment + std_prepayment
print(f"Mean + Standard Deviation Threshold: {threshold_mean_plus_sd}")

Mean + Standard Deviation Threshold: 42125.63231103643


In [11]:
# Set the chosen threshold based on your decision
chosen_threshold =33984.0  # Example: 90th percentile

# Create the PrepaymentRisk binary indicator based on the chosen threshold
data['PrepaymentRisk'] = (data['Prepayment'] > chosen_threshold).astype(int)  # 1 if prepayment > threshold, 0 otherwise

# Display the distribution of the new PrepaymentRisk column
print(data['PrepaymentRisk'].value_counts())


0    262300
1     29103
Name: PrepaymentRisk, dtype: int64


<div style="text-align: center; background-color: orange; padding: 10px;">
    <h2 style="font-weight: bold;">MODEL BUILDING</h2>
</div>

In [12]:
data.drop(['EverDelinquent','Prepayment'],inplace=True,axis=1)

In [13]:
X= data.drop('PrepaymentRisk', axis=1)
y = data.PrepaymentRisk

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# importing library for mi score for classification 
from sklearn.feature_selection import mutual_info_classif
# Feature selection with Mutual Information
mi = mutual_info_classif(X_train_scaled, y_train)
mi_sorted_indices = np.argsort(mi)[::-1]
mi_sorted_indices

array([28,  4, 12, 32, 31,  7, 30, 26, 27,  5, 29,  1, 23, 22, 21, 13, 24,
       10, 20, 18, 33,  3, 14, 34,  2, 15,  9,  8,  0, 11,  6, 25, 19, 16,
       17], dtype=int64)

In [16]:
# Select top 10 features
top_features_indices = mi_sorted_indices[:10]
top_features_indices 

array([28,  4, 12, 32, 31,  7, 30, 26, 27,  5], dtype=int64)

In [17]:
# Transform X_train and X_test to include only the top features
X_train_selected = X_train_scaled[:, top_features_indices]
X_test_selected = X_test_scaled[:, top_features_indices]

In [18]:
# Get selected feature names
selected_feature_names = X.columns[top_features_indices]
print("Selected Features (Mutual Information):", selected_feature_names)

Selected Features (Mutual Information): Index(['MonthlyIncome', 'DTI', 'NumBorrowers', 'LoanPurpose_P',
       'LoanPurpose_N', 'Channel', 'InterestAmount', 'MonthlyInstallment',
       'CurrentUPB', 'OrigUPB'],
      dtype='object')


In [19]:
# Train a model with the selected features
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_selected, y_train)

# Make predictions
y_pred = model.predict(X_test_selected)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# Evaluate the model
print("Prepayment Risk Prediction Accuracy Score:", accuracy_score(y_test, y_pred))
print("Prepayment Risk Classification Report:\n", classification_report(y_test, y_pred))

Prepayment Risk Prediction Accuracy Score: 0.9974434206688286
Prepayment Risk Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     52376
           1       0.99      0.98      0.99      5905

    accuracy                           1.00     58281
   macro avg       1.00      0.99      0.99     58281
weighted avg       1.00      1.00      1.00     58281



In [21]:
model_nb = GaussianNB()
model_nb.fit(X_train_selected, y_train)

# Make predictions
y_pred = model_nb.predict(X_test_selected)

# Evaluate the model
print("Prepayment Risk Prediction Accuracy Score:", accuracy_score(y_test, y_pred))
print("Prepayment Risk Classification Report:\n", classification_report(y_test, y_pred))

Prepayment Risk Prediction Accuracy Score: 0.9423139616684683
Prepayment Risk Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97     52376
           1       0.70      0.76      0.73      5905

    accuracy                           0.94     58281
   macro avg       0.84      0.86      0.85     58281
weighted avg       0.94      0.94      0.94     58281



In [22]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_selected, y_train)

# Make predictions
y_pred= rf_model.predict(X_test_selected)

In [23]:
# Evaluate the model
print("Prepayment Risk Prediction Accuracy Score:", accuracy_score(y_test, y_pred))
print("Prepayment Risk Classification Report:\n", classification_report(y_test, y_pred))

Prepayment Risk Prediction Accuracy Score: 0.9999485252483656
Prepayment Risk Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     52376
           1       1.00      1.00      1.00      5905

    accuracy                           1.00     58281
   macro avg       1.00      1.00      1.00     58281
weighted avg       1.00      1.00      1.00     58281



In [24]:
# Cross-validation scores
cv_scores = cross_val_score(rf_model, X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

Cross-Validation Scores: [0.99883324 0.99924504 0.99912493 0.99943377 0.99926218]
Mean CV Score: 0.9991798311489184


<div style="text-align: center; background-color: orange; padding: 10px;">
    <h2 style="font-weight: bold;">PIPELINE BUILDING</h2>
</div>

In [25]:
data_pipe=pd.read_csv('MSB-Mortgage-Backed-Securities-Pipeline-main-LoanExport.csv')

In [26]:
data_pipe.head()

Unnamed: 0,MSA,MIP,Units,Occupancy,OCLTV,DTI,OrigUPB,OrigInterestRate,Channel,PPM,...,LTV_Range,CreditRange,RePayRange,OrigInterestRate_Monthly,MonthlyInstallment,CurrentUPB,MonthlyIncome,Prepayment,Totalpayment,InterestAmount
0,16974,25,1,O,89,27,117000,6.75,T,N,...,High,Excellent,4-8yrs,0.0056,756,111758,2800,15456,272160,155160
1,19740,0,1,O,73,17,109000,6.5,R,N,...,High,Excellent,12-16yrs,0.0054,687,94830,4041,31992,247320,138320
2,29940,0,1,O,75,16,88000,6.875,T,N,...,High,Excellent,4-8yrs,0.0057,576,83015,3600,29376,207360,119360
3,31084,0,1,O,76,14,160000,6.875,R,N,...,High,Excellent,0-4yrs,0.0057,1047,155275,7478,64608,376920,216920
4,35644,0,1,O,78,18,109000,7.125,R,N,...,High,Excellent,4-8yrs,0.0059,731,104253,4061,31176,263160,154160


In [27]:
# Set the chosen threshold based on your decision
chosen_threshold =33984.0  # Example: 90th percentile

# Create the PrepaymentRisk binary indicator based on the chosen threshold
data_pipe['PrepaymentRisk'] = (data_pipe['Prepayment'] > chosen_threshold).astype(int)  # 1 if prepayment > threshold, 0 otherwise

# Display the distribution of the new PrepaymentRisk column
print(data_pipe['PrepaymentRisk'].value_counts())


0    262300
1     29103
Name: PrepaymentRisk, dtype: int64


In [28]:
data_pipe.drop(['EverDelinquent','Prepayment','SellerName'],inplace=True,axis=1)

In [29]:
X= data_pipe.drop('PrepaymentRisk', axis=1)
y = data_pipe.PrepaymentRisk


In [30]:
# Feature selection (example top features, replace with actual top features)
top_features = ['MonthlyIncome', 'DTI', 'NumBorrowers', 'Channel', 'CurrentUPB', 'Totalpayment',
       'InterestAmount', 'MonthlyInstallment','LoanPurpose']


In [31]:
X=X[top_features]

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Define a custom transformer for label encoding
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.label_encoders = {}
    
    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            self.label_encoders[col] = le.fit(X[col])
        return self
    
    def transform(self, X):
        X = X.copy()
        for col, le in self.label_encoders.items():
            X[col] = le.transform(X[col])
        return X


In [33]:


# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'), ['LoanPurpose']),
        ('label', LabelEncoderTransformer(columns=['Channel']), ['Channel']),
        ('scaler', StandardScaler(), ['MonthlyIncome', 'DTI', 'NumBorrowers', 'CurrentUPB', 'Totalpayment', 'InterestAmount', 'MonthlyInstallment'])
    ],
    remainder='passthrough'
)

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier',GaussianNB())  # You can replace this with any other classifier
])


In [34]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = pipeline.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[50437  1939]
 [ 1412  4493]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97     52376
           1       0.70      0.76      0.73      5905

    accuracy                           0.94     58281
   macro avg       0.84      0.86      0.85     58281
weighted avg       0.94      0.94      0.94     58281



In [35]:

import joblib
# Save the pipeline
joblib.dump(pipeline, 'MBS_prepayrisk_pipeline.pkl')
pipeline = joblib.load('MBS_prepayrisk_pipeline.pkl')
