In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the loan export into a DataFrame.
# NOTE(review): this is a hard-coded absolute Windows path, so the cell only runs on a
# machine with this exact directory layout; consider a configurable data directory.
data = pd.read_csv(r'C:\Users\HP\Desktop\MBS\MSB-Mortgage-Backed-Securities-Pipeline-main-LoanExport-Revised.csv')

In [3]:
# Display the first rows of the loaded frame as a quick sanity check.
data.head()

Unnamed: 0,MSA,MIP,Units,Occupancy,OCLTV,DTI,OrigUPB,OrigInterestRate,Channel,PPM,...,MonthsDelinquent,MonthsInRepayment,FirstPaymentYear,FirstPaymentMonth,MaturityYear,MaturityMonth,IsFirstTimeHomebuyer,LTV_Range,CreditRange,RePayRange
0,16974,25,1,O,89,27,117000,6.75,T,N,...,0,52,1999,2,2029,1,0,High,Excellent,4-8yrs
1,19740,0,1,O,73,17,109000,6.5,R,N,...,0,144,1999,2,2029,1,0,High,Excellent,12-16yrs
2,29940,0,1,O,75,16,88000,6.875,T,N,...,0,67,1999,2,2029,1,0,High,Excellent,4-8yrs
3,31084,0,1,O,76,14,160000,6.875,R,N,...,0,35,1999,2,2029,1,0,High,Excellent,0-4yrs
4,35644,0,1,O,78,18,109000,7.125,R,N,...,0,54,1999,2,2029,1,0,High,Excellent,4-8yrs


In [4]:
# Monthly interest rate as a fraction: the annual percentage rate divided by
# 12 and then by 100, kept to 4 decimal places.
data['OrigInterestRate_Monthly'] = data['OrigInterestRate'].div(12).div(100).round(4)

# monthly installment
def calculateEmi(principal, monthly_interest_rate, loan_term_months):
    """Return the fixed monthly installment (EMI) of an amortizing loan.

    Uses the annuity formula ``EMI = P * r * (1 + r)**n / ((1 + r)**n - 1)``.

    Args:
        principal: Original loan amount.
        monthly_interest_rate: Periodic rate as a fraction (e.g. 0.0056),
            not a percentage.
        loan_term_months: Number of monthly payments.

    Returns:
        The installment truncated to an integer, as ``np.int64``.
    """
    if monthly_interest_rate == 0:
        # At r == 0 the annuity formula degenerates to 0/0; its limit is the
        # principal split evenly over the term.
        return np.int64(principal / loan_term_months)
    growth = (1 + monthly_interest_rate) ** loan_term_months
    emi = principal * monthly_interest_rate * (growth / (growth - 1))
    return np.int64(emi)

# Row-wise EMI from each loan's original balance, monthly rate and term.
data['MonthlyInstallment'] = data.apply(
    lambda row: calculateEmi(
        row['OrigUPB'],
        row['OrigInterestRate_Monthly'],
        row['OrigLoanTerm'],
    ),
    axis=1,
)

# current unpaid principal

def get_currentUPB(principal, monthly_interest_rate, monthly_installment,
                   payments_made):
    """Estimate the unpaid principal after ``payments_made`` installments.

    The principal repaid per installment is computed once, as the installment
    minus the interest charged on the original principal, and that same amount
    is applied for every payment made, so the balance falls linearly.

    Returns:
        The remaining balance truncated to an integer, as ``np.int32``.
    """
    principal_per_payment = monthly_installment - monthly_interest_rate * principal
    return np.int32(principal - principal_per_payment * payments_made)

# Row-wise remaining balance from the original balance, monthly rate,
# installment and the number of months already in repayment.
data['CurrentUPB'] = data.apply(
    lambda row: get_currentUPB(
        principal=row['OrigUPB'],
        monthly_interest_rate=row['OrigInterestRate_Monthly'],
        monthly_installment=row['MonthlyInstallment'],
        payments_made=row['MonthsInRepayment'],
    ),
    axis=1,
)

# monthly income
def calculate_monthly_income(dti, emi):
    """Estimate monthly income from a debt-to-income ratio and an installment.

    A ``dti`` below 1 is used as given; any other value is treated as a
    percentage and divided by 100.  When the resulting ratio is zero the
    installment itself is returned, since it cannot be divided by.

    Returns:
        ``emi / ratio`` (or ``emi``) truncated to an integer, as ``np.int64``.
    """
    if not dti < 1:
        dti = dti / 100
    if dti == 0:
        return np.int64(emi)
    return np.int64(emi / dti)

# Row-wise income estimate from each loan's DTI and monthly installment.
data['MonthlyIncome'] = data.apply(
    lambda row: calculate_monthly_income(row['DTI'], row['MonthlyInstallment']),
    axis=1,
)

# prepayment
def calculatePrepayment(dti, monthly_income):
    """Return the share of monthly income used as the prepayment amount.

    Half of the income is used when ``dti`` is below 40, three quarters
    otherwise.  The result is truncated to an integer, as ``np.int64``.
    """
    share = monthly_income / 2 if dti < 40 else monthly_income * 3 / 4
    return np.int64(share)

# Row-wise prepayment amount from each loan's DTI and estimated income.
data['Prepayment'] = data.apply(
    lambda row: calculatePrepayment(row['DTI'], row['MonthlyIncome']),
    axis=1,
)
# Scale to 24 months and subtract 24 months of installments.
data['Prepayment'] = data['Prepayment'].mul(24).sub(data['MonthlyInstallment'].mul(24))

# Total paid over the full term, and the part of it that exceeds the original balance.
data['Totalpayment'] = data['MonthlyInstallment'].mul(data['OrigLoanTerm'])
data['InterestAmount'] = data['Totalpayment'].sub(data['OrigUPB'])

In [5]:
# Write the current frame to a CSV in the working directory, omitting the index.
data.to_csv('MSB-Mortgage-Backed-Securities-Pipeline-main-LoanExport.csv',index=False)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Columns that are replaced by integer codes.
cat_col=['IsFirstTimeHomebuyer','PPM','PropertyState','ServicerName','PropertyType','Channel','SellerName','LTV_Range','CreditRange','RePayRange']

# Fit and keep one encoder per column. Re-fitting a single shared encoder for
# every column discards all but the last mapping, so the codes could not be
# inverted or re-applied consistently to new data.
label_encoders = {col: LabelEncoder().fit(data[col]) for col in cat_col}
data[cat_col] = data[cat_col].apply(
    lambda column: label_encoders[column.name].transform(column)
)

# Kept for backward compatibility: `le` is the encoder fitted on the last column,
# which is the state the shared encoder previously ended in.
le = label_encoders[cat_col[-1]]
data.head()

Unnamed: 0,MSA,MIP,Units,Occupancy,OCLTV,DTI,OrigUPB,OrigInterestRate,Channel,PPM,...,LTV_Range,CreditRange,RePayRange,OrigInterestRate_Monthly,MonthlyInstallment,CurrentUPB,MonthlyIncome,Prepayment,Totalpayment,InterestAmount
0,16974,25,1,O,89,27,117000,6.75,3,0,...,0,0,3,0.0056,756,111758,2800,15456,272160,155160
1,19740,0,1,O,73,17,109000,6.5,2,0,...,0,0,1,0.0054,687,94830,4041,31992,247320,138320
2,29940,0,1,O,75,16,88000,6.875,3,0,...,0,0,3,0.0057,576,83015,3600,29376,207360,119360
3,31084,0,1,O,76,14,160000,6.875,2,0,...,0,0,0,0.0057,1047,155275,7478,64608,376920,216920
4,35644,0,1,O,78,18,109000,7.125,2,0,...,0,0,3,0.0059,731,104253,4061,31176,263160,154160


In [7]:
# One-hot encode the two remaining categorical columns; drop_first=True omits
# the first level of each column.
one_col = ['LoanPurpose', 'Occupancy']
data_one = pd.get_dummies(data.loc[:, one_col], drop_first=True)
data_one.head()

Unnamed: 0,LoanPurpose_N,LoanPurpose_P,Occupancy_O,Occupancy_S
0,0,1,1,0
1,1,0,1,0
2,1,0,1,0
3,1,0,1,0
4,1,0,1,0


In [8]:
# Append the dummy columns and remove the source columns they were built from.
data = pd.concat([data, data_one], axis=1).drop(columns=['LoanPurpose', 'Occupancy'])
data.head()

Unnamed: 0,MSA,MIP,Units,OCLTV,DTI,OrigUPB,OrigInterestRate,Channel,PPM,PropertyState,...,MonthlyInstallment,CurrentUPB,MonthlyIncome,Prepayment,Totalpayment,InterestAmount,LoanPurpose_N,LoanPurpose_P,Occupancy_O,Occupancy_S
0,16974,25,1,89,27,117000,6.75,3,0,15,...,756,111758,2800,15456,272160,155160,0,1,1,0
1,19740,0,1,73,17,109000,6.5,2,0,5,...,687,94830,4041,31992,247320,138320,1,0,1,0
2,29940,0,1,75,16,88000,6.875,3,0,17,...,576,83015,3600,29376,207360,119360,1,0,1,0
3,31084,0,1,76,14,160000,6.875,2,0,4,...,1047,155275,7478,64608,376920,216920,1,0,1,0
4,35644,0,1,78,18,109000,7.125,2,0,32,...,731,104253,4061,31176,263160,154160,1,0,1,0


In [9]:
# List the column names of the frame at this point.
data.columns


Index(['MSA', 'MIP', 'Units', 'OCLTV', 'DTI', 'OrigUPB', 'OrigInterestRate',
       'Channel', 'PPM', 'PropertyState', 'PropertyType', 'OrigLoanTerm',
       'NumBorrowers', 'SellerName', 'ServicerName', 'EverDelinquent',
       'MonthsDelinquent', 'MonthsInRepayment', 'FirstPaymentYear',
       'FirstPaymentMonth', 'MaturityYear', 'MaturityMonth',
       'IsFirstTimeHomebuyer', 'LTV_Range', 'CreditRange', 'RePayRange',
       'OrigInterestRate_Monthly', 'MonthlyInstallment', 'CurrentUPB',
       'MonthlyIncome', 'Prepayment', 'Totalpayment', 'InterestAmount',
       'LoanPurpose_N', 'LoanPurpose_P', 'Occupancy_O', 'Occupancy_S'],
      dtype='object')

In [10]:
# Features are every column except the two targets.
# NOTE(review): X still contains MonthsDelinquent and the income/installment
# columns Prepayment is computed from -- these look like target leakage; confirm.
X = data.drop(columns=['EverDelinquent', 'Prepayment'])
y_class, y_reg = data['EverDelinquent'], data['Prepayment']

# A single 80/20 split over X and both targets keeps their rows aligned.
(X_train, X_test,
 y_class_train, y_class_test,
 y_reg_train, y_reg_test) = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)

In [11]:
# Mutual-information scorer for a classification target.
from sklearn.feature_selection import mutual_info_classif

# Mutual information between every training feature and the class label.
# The estimator is randomized, so the seed is pinned to make the scores
# reproducible across runs.
mutual_info = mutual_info_classif(X_train, y_class_train, random_state=42)
mutual_info

array([5.48447874e-03, 5.47002152e-03, 3.56100361e-02, 5.79241912e-03,
       3.16724962e-03, 3.40207859e-03, 5.37415075e-03, 1.88177106e-02,
       4.67218156e-04, 4.16881282e-03, 3.24743770e-02, 3.81442392e-02,
       2.66054501e-02, 8.25156407e-03, 1.37982640e-02, 4.98048658e-01,
       3.09801514e-02, 2.69683510e-02, 1.24129133e-02, 2.45914236e-02,
       1.19376152e-02, 3.71486578e-04, 1.22820165e-03, 4.11508679e-02,
       3.05954191e-02, 3.26276901e-03, 1.93755022e-03, 1.36224608e-02,
       2.95608052e-03, 3.43150879e-03, 4.87842454e-03, 5.15590099e-03,
       7.99697584e-03, 3.56882013e-02, 0.00000000e+00])

In [12]:
# Label each score with its feature name, then show them from highest to lowest.
mutual_info = pd.Series(np.asarray(mutual_info), index=X.columns)
mutual_info.sort_values(ascending=False)

MonthsDelinquent            0.498049
CreditRange                 0.041151
OrigLoanTerm                0.038144
Occupancy_O                 0.035688
Units                       0.035610
PropertyType                0.032474
MonthsInRepayment           0.030980
RePayRange                  0.030595
FirstPaymentYear            0.026968
NumBorrowers                0.026605
MaturityYear                0.024591
Channel                     0.018818
ServicerName                0.013798
CurrentUPB                  0.013622
FirstPaymentMonth           0.012413
MaturityMonth               0.011938
SellerName                  0.008252
LoanPurpose_P               0.007997
OCLTV                       0.005792
MSA                         0.005484
MIP                         0.005470
OrigInterestRate            0.005374
LoanPurpose_N               0.005156
InterestAmount              0.004878
PropertyState               0.004169
Totalpayment                0.003432
OrigUPB                     0.003402
O

In [13]:
# Select the best features for classification by mutual-information score.
from functools import partial
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Keep the 10 highest-scoring features. The scorer is randomized, so its seed
# is pinned; otherwise the selected set can change from run to run.
selector = SelectKBest(partial(mutual_info_classif, random_state=42), k=10)
x_train_selected = selector.fit_transform(X_train, y_class_train)
x_test_selected = selector.transform(X_test)

In [14]:
# Names of the training columns the fitted selector kept.
selected_features = X_train.columns[selector.get_support(indices=True)]
selected_features

Index(['Units', 'PropertyType', 'OrigLoanTerm', 'NumBorrowers',
       'MonthsDelinquent', 'MonthsInRepayment', 'FirstPaymentYear',
       'CreditRange', 'RePayRange', 'Occupancy_O'],
      dtype='object')

In [15]:
from sklearn.feature_selection import f_regression

# Score features against the regression target with f_regression and keep the top 10.
# This rebinds `selector`, replacing the selector fitted for classification.
selector = SelectKBest(f_regression, k=10)
x_train_sel = selector.fit(X_train, y_reg_train).transform(X_train)
x_test_sel = selector.transform(X_test)

In [16]:
# Per-feature scores and p-values exposed by the fitted selector.
p_values = selector.pvalues_
f_values = selector.scores_

# One row per feature, ordered from the largest F-value down.
anova_df = pd.DataFrame(
    {'Feature': X.columns, 'F-value': f_values, 'p-value': p_values}
).sort_values(by='F-value', ascending=False)

# The ten highest-scoring rows.
top_features = anova_df.head(10)

# Print just their names.
top_feature_names = list(top_features['Feature'])
print(top_feature_names)


['MonthlyIncome', 'InterestAmount', 'Totalpayment', 'MonthlyInstallment', 'OrigUPB', 'CurrentUPB', 'DTI', 'NumBorrowers', 'LoanPurpose_N', 'OCLTV']


In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomPipelineWithFeatureSelection(BaseEstimator, TransformerMixin):
    """Two-stage model: classify every row, then regress on rows classed as 1.

    Each stage reads its own subset of columns from the input DataFrame and
    standardizes them with its own ``StandardScaler`` before the estimator
    sees them.

    Args:
        clf: Classifier exposing ``fit`` and ``predict``.
        reg: Regressor exposing ``fit`` and ``predict``.
        clf_features: Column names fed to the classifier.
        reg_features: Column names fed to the regressor.
    """

    def __init__(self, clf, reg, clf_features, reg_features):
        self.clf = clf
        self.reg = reg
        self.clf_features = clf_features
        self.reg_features = reg_features
        self.scaler_clf = StandardScaler()
        self.scaler_reg = StandardScaler()

    @staticmethod
    def _require_frame(X):
        """Raise ``ValueError`` unless ``X`` is a pandas DataFrame."""
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X should be a pandas DataFrame")

    def fit(self, X, y_class, y_reg):
        """Fit the classifier on all rows and the regressor on rows with class 1.

        Args:
            X: DataFrame containing the classifier and regressor feature columns.
            y_class: Class labels, one per row of ``X``.
            y_reg: Regression targets, one per row of ``X``.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``X`` is not a DataFrame, or no label equals 1.
        """
        self._require_frame(X)

        # Stage 1: scale the classifier columns and fit the classifier.
        X_clf_scaled = self.scaler_clf.fit_transform(X[self.clf_features])
        self.clf.fit(X_clf_scaled, y_class)

        # Build the row filter positionally. A pandas boolean mask is matched
        # by index label, which fails or misaligns when the targets are
        # array-likes or carry a different index than X.
        positive = np.asarray(y_class) == 1
        if not positive.any():
            raise ValueError(
                "Cannot fit the regressor: y_class contains no rows equal to 1"
            )

        # Stage 2: scale the regressor columns of the positive rows and fit.
        X_reg = X.loc[positive, self.reg_features]
        y_reg_positive = np.asarray(y_reg)[positive]
        X_reg_scaled = self.scaler_reg.fit_transform(X_reg)
        self.reg.fit(X_reg_scaled, y_reg_positive)
        return self

    def predict(self, X):
        """Predict the class of every row and a regression value for class-1 rows.

        Args:
            X: DataFrame containing the classifier and regressor feature columns.

        Returns:
            ``(y_class_pred, y_reg_pred)``. ``y_reg_pred`` is a float array with
            one entry per row, NaN wherever the predicted class is not 1.

        Raises:
            ValueError: If ``X`` is not a DataFrame.
        """
        self._require_frame(X)

        X_clf_scaled = self.scaler_clf.transform(X[self.clf_features])
        y_class_pred = self.clf.predict(X_clf_scaled)

        # Start with NaN everywhere; only rows predicted as class 1 are filled in.
        y_reg_pred = np.full(X.shape[0], np.nan)
        positive = np.asarray(y_class_pred) == 1
        if positive.any():
            X_reg = X.loc[positive, self.reg_features]
            X_reg_scaled = self.scaler_reg.transform(X_reg)
            y_reg_pred[positive] = self.reg.predict(X_reg_scaled)
        return y_class_pred, y_reg_pred

# Columns read by the classification stage.
clf_features = [
    'Units',
    'PropertyType',
    'OrigLoanTerm',
    'MonthsDelinquent',
    'MonthsInRepayment',
    'FirstPaymentYear',
    'MaturityYear',
    'CreditRange',
    'RePayRange',
    'Occupancy_O',
]

# Columns read by the regression stage.
reg_features = [
    'MonthlyIncome',
    'InterestAmount',
    'Totalpayment',
    'MonthlyInstallment',
    'OrigUPB',
    'CurrentUPB',
    'DTI',
    'NumBorrowers',
    'LoanPurpose_N',
    'OCLTV',
]

# Gaussian Naive Bayes classifier feeding a Random Forest regressor.
pipeline = CustomPipelineWithFeatureSelection(
    clf=GaussianNB(),
    reg=RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=5,
        random_state=42,
    ),
    clf_features=clf_features,
    reg_features=reg_features,
)

# Train both stages on the training split.
pipeline.fit(X_train, y_class_train, y_reg_train)

# Class and regression predictions for the held-out split.
y_class_pred, y_reg_pred = pipeline.predict(X_test)




In [18]:
# Classification metrics on the test split; precision, recall and F1 use
# weighted averaging across classes.
print("Naive Bayes Classification Report:")
print("Accuracy:", accuracy_score(y_class_test, y_class_pred))
for label, metric in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_class_test, y_class_pred, average='weighted'))

Naive Bayes Classification Report:
Accuracy: 0.9986959729585971
Precision: 0.998704551341281
Recall: 0.9986959729585971
F1 Score: 0.9986975948339435


In [19]:
# Score the regressor only on rows that received a regression prediction
# (the non-NaN entries of y_reg_pred).
mask = np.logical_not(np.isnan(y_reg_pred))
print("RandomForest Regressor Performance Metrics:")
for label, metric in (
    ("Mean Absolute Error:", mean_absolute_error),
    ("Mean Squared Error:", mean_squared_error),
    ("R^2 Score:", r2_score),
):
    print(label, metric(y_reg_test[mask], y_reg_pred[mask]))


RandomForest Regressor Performance Metrics:
Mean Absolute Error: 338.09222354969756
Mean Squared Error: 13332968.177746935
R^2 Score: 0.9782528940194305


In [20]:
# Side-by-side table of true and predicted values for the test split.
# The "True" columns are the targets returned by the train/test split; the
# "Predicted" columns are the two outputs of pipeline.predict(X_test).
# NOTE(review): presumably the predictions are positional arrays that take on the
# index of the true-value Series -- confirm the row order matches X_test.
results_df = pd.DataFrame({
    'True Classification': y_class_test,
    'Predicted Classification': y_class_pred,
    'True Regression': y_reg_test,
    'Predicted Regression': y_reg_pred
})
results_df

Unnamed: 0,True Classification,Predicted Classification,True Regression,Predicted Regression
91521,0,0,9288,
158012,0,0,3288,
113839,0,0,5328,
158630,0,0,15264,
115807,0,0,22632,
...,...,...,...,...
233168,0,0,7344,
167583,0,0,21072,
10151,0,0,165840,
276893,0,0,32976,


In [21]:
import joblib

# Save the fitted pipeline to a file in the working directory.
joblib.dump(pipeline, 'MBS_combined_pipeline.pkl')
# Reload it straight away, rebinding `pipeline` to the deserialized copy.
# NOTE(review): joblib files are presumably pickle-based -- the custom pipeline class
# must be defined wherever this file is loaded, and only trusted files should be loaded.
pipeline = joblib.load('MBS_combined_pipeline.pkl')

In [22]:
# Show only the rows whose predicted class is 1.
results_df.loc[results_df['Predicted Classification'] == 1]

Unnamed: 0,True Classification,Predicted Classification,True Regression,Predicted Regression
56835,1,1,258048,258682.514899
29064,1,1,11952,11587.262781
17747,1,1,14880,14933.413807
19997,1,1,12336,11885.501683
69581,1,1,4824,4786.801992
...,...,...,...,...
16243,1,1,9696,9819.315975
84158,1,1,3696,4078.461222
171419,1,1,8016,8068.793911
29468,1,1,23328,22902.641750
