In [205]:
# Importing essential libraries
import pandas as pd # for data manipulation
import warnings # for ignoring some warnings throughout the program
from sklearn.impute import SimpleImputer # for filling null values with the chosen strategy
from sklearn.preprocessing import OneHotEncoder # for encoding nominal data
from sklearn.preprocessing import OrdinalEncoder # for encoding ordinal data
import matplotlib.pyplot as plt # for visualization
from sklearn.model_selection import train_test_split # for splitting the data into train and test datasets
from sklearn.preprocessing import MinMaxScaler # for scaling
from sklearn.ensemble import RandomForestClassifier # for finding important predictors
import seaborn as sns # for richer visualization

In [24]:
# Loading the data.
# The warning filter is global, not scoped to this cell: every warning raised
# from here on is suppressed.
warnings.filterwarnings("ignore")
# Read the loan export; the path is relative to the current working directory.
df = pd.read_csv("LoanExport.csv")
# Look at the first 5 rows of df.
df.head()

Unnamed: 0,CreditScore,FirstPaymentDate,FirstTimeHomebuyer,MaturityDate,MSA,MIP,Units,Occupancy,OCLTV,DTI,...,PostalCode,LoanSeqNum,LoanPurpose,OrigLoanTerm,NumBorrowers,SellerName,ServicerName,EverDelinquent,MonthsDelinquent,MonthsInRepayment
0,0,199902,N,202901,16974,25,1,O,89,27,...,60400,F199Q1268030,P,360,2,FL,WASHINGTONMUTUALBANK,0,0,52
1,0,199902,N,202901,19740,0,1,O,73,17,...,80200,F199Q1015092,N,360,1,FT,CHASEHOMEFINANCELLC,0,0,144
2,0,199902,N,202901,29940,0,1,O,75,16,...,66000,F199Q1266886,N,360,2,FL,WASHINGTONMUTUALBANK,0,0,67
3,0,199902,N,202901,31084,0,1,O,76,14,...,90700,F199Q1178167,N,360,2,GM,GMACMTGECORP,0,0,35
4,0,199902,N,202901,35644,0,1,O,78,18,...,7600,F199Q1178517,N,360,2,GM,GMACMTGECORP,0,0,54


In [25]:
# Shape of the dataset as (rows, columns).
df.shape

(291451, 28)

The dataset consists of 291,451 rows and 28 columns.

In [29]:
# List the column names.
df.columns

Index(['CreditScore', 'FirstPaymentDate', 'FirstTimeHomebuyer', 'MaturityDate',
       'MSA', 'MIP', 'Units', 'Occupancy', 'OCLTV', 'DTI', 'OrigUPB', 'LTV',
       'OrigInterestRate', 'Channel', 'PPM', 'ProductType', 'PropertyState',
       'PropertyType', 'PostalCode', 'LoanSeqNum', 'LoanPurpose',
       'OrigLoanTerm', 'NumBorrowers', 'SellerName', 'ServicerName',
       'EverDelinquent', 'MonthsDelinquent', 'MonthsInRepayment'],
      dtype='object')

# Data Preprocessing 

### a)Handling missing values

In [33]:
# Per-column non-null counts and dtypes, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291451 entries, 0 to 291450
Data columns (total 28 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   CreditScore         291451 non-null  int64  
 1   FirstPaymentDate    291451 non-null  int64  
 2   FirstTimeHomebuyer  291451 non-null  object 
 3   MaturityDate        291451 non-null  int64  
 4   MSA                 291451 non-null  object 
 5   MIP                 291451 non-null  int64  
 6   Units               291451 non-null  int64  
 7   Occupancy           291451 non-null  object 
 8   OCLTV               291451 non-null  int64  
 9   DTI                 291451 non-null  int64  
 10  OrigUPB             291451 non-null  int64  
 11  LTV                 291451 non-null  int64  
 12  OrigInterestRate    291451 non-null  float64
 13  Channel             291451 non-null  object 
 14  PPM                 291451 non-null  object 
 15  ProductType         291451 non-nul

Here we see that SellerName contains null values and has dtype object.

In [36]:
# Count the missing values in each column.
df.isna().sum()

CreditScore               0
FirstPaymentDate          0
FirstTimeHomebuyer        0
MaturityDate              0
MSA                       0
MIP                       0
Units                     0
Occupancy                 0
OCLTV                     0
DTI                       0
OrigUPB                   0
LTV                       0
OrigInterestRate          0
Channel                   0
PPM                       0
ProductType               0
PropertyState             0
PropertyType              0
PostalCode                0
LoanSeqNum                0
LoanPurpose               0
OrigLoanTerm              0
NumBorrowers              0
SellerName            24994
ServicerName              0
EverDelinquent            0
MonthsDelinquent          0
MonthsInRepayment         0
dtype: int64

SellerName has 24,994 null values; we now fill them by imputation.

In [45]:
# Fill the missing SellerName entries with the column's most frequent value.
imputer = SimpleImputer(strategy="most_frequent")
seller_name_filled = imputer.fit_transform(df[["SellerName"]])
# The imputer returns a 2-D array; flatten it back into a single column.
df["SellerName"] = seller_name_filled.ravel()

In [47]:
# Re-check the non-null counts now that SellerName has been imputed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291451 entries, 0 to 291450
Data columns (total 28 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   CreditScore         291451 non-null  int64  
 1   FirstPaymentDate    291451 non-null  int64  
 2   FirstTimeHomebuyer  291451 non-null  object 
 3   MaturityDate        291451 non-null  int64  
 4   MSA                 291451 non-null  object 
 5   MIP                 291451 non-null  int64  
 6   Units               291451 non-null  int64  
 7   Occupancy           291451 non-null  object 
 8   OCLTV               291451 non-null  int64  
 9   DTI                 291451 non-null  int64  
 10  OrigUPB             291451 non-null  int64  
 11  LTV                 291451 non-null  int64  
 12  OrigInterestRate    291451 non-null  float64
 13  Channel             291451 non-null  object 
 14  PPM                 291451 non-null  object 
 15  ProductType         291451 non-nul

Now none of the columns contain null values.

In [50]:
# Confirm that every column now reports zero missing values.
df.isna().sum()

CreditScore           0
FirstPaymentDate      0
FirstTimeHomebuyer    0
MaturityDate          0
MSA                   0
MIP                   0
Units                 0
Occupancy             0
OCLTV                 0
DTI                   0
OrigUPB               0
LTV                   0
OrigInterestRate      0
Channel               0
PPM                   0
ProductType           0
PropertyState         0
PropertyType          0
PostalCode            0
LoanSeqNum            0
LoanPurpose           0
OrigLoanTerm          0
NumBorrowers          0
SellerName            0
ServicerName          0
EverDelinquent        0
MonthsDelinquent      0
MonthsInRepayment     0
dtype: int64

### b)Encoding

Encoding is the process of converting categorical data into numerical data, using techniques such as one-hot encoding for nominal data or ordinal encoding for ordinal data.

In [64]:
# Distinct categories present in LoanPurpose.
df["LoanPurpose"].unique()

array(['P', 'N', 'C'], dtype=object)

In [72]:
# Distinct categories present in PPM.
df["PPM"].unique()

array(['N', 'X', 'Y'], dtype=object)

In [99]:
# Replace each categorical column with its ordinal codes (stored as floats).
# One encoder object is re-fitted for every column, so after the loop it only
# remembers the categories of the last column in the list.
ordinal_encoder = OrdinalEncoder()
categorical_columns = [
    "LoanPurpose",
    "PropertyType",
    "PropertyState",
    "PPM",
    "Channel",
    "FirstTimeHomebuyer",
    "ProductType",
    "Occupancy",
]
for column in categorical_columns:
    df[column] = ordinal_encoder.fit_transform(df[[column]])

In [78]:
# Inspect the column dtypes after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291451 entries, 0 to 291450
Data columns (total 28 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   CreditScore         291451 non-null  int64  
 1   FirstPaymentDate    291451 non-null  int64  
 2   FirstTimeHomebuyer  291451 non-null  float64
 3   MaturityDate        291451 non-null  int64  
 4   MSA                 291451 non-null  object 
 5   MIP                 291451 non-null  int64  
 6   Units               291451 non-null  int64  
 7   Occupancy           291451 non-null  object 
 8   OCLTV               291451 non-null  int64  
 9   DTI                 291451 non-null  int64  
 10  OrigUPB             291451 non-null  int64  
 11  LTV                 291451 non-null  int64  
 12  OrigInterestRate    291451 non-null  float64
 13  Channel             291451 non-null  float64
 14  PPM                 291451 non-null  float64
 15  ProductType         291451 non-nul

We observe that the NumBorrowers column holds numeric values but its dtype is shown as object, so we cast it to string and then encode it.

In [83]:
# Cast NumBorrowers to str first so the encoder sees uniform string
# categories, then replace the column with its ordinal codes.
num_borrowers_text = df[["NumBorrowers"]].astype(str)
df["NumBorrowers"] = ordinal_encoder.fit_transform(num_borrowers_text)

LoanSeqNum contains both digits and letters, so we do the same for it.

In [93]:
# Cast LoanSeqNum to str and replace it with ordinal codes.
# NOTE(review): LoanSeqNum looks like a per-loan identifier; if it is unique
# per row its codes carry no predictive meaning - confirm before modelling.
loan_seq_text = df[["LoanSeqNum"]].astype(str)
df["LoanSeqNum"] = ordinal_encoder.fit_transform(loan_seq_text)

In [107]:
# MSA holds integer codes but is stored with dtype object, so convert it to a
# numeric dtype. errors="coerce" does not fail on entries that cannot be
# parsed: it turns them into NaN. Count those entries and report them so the
# new missing values are visible and can be handled explicitly afterwards.
msa_numeric = pd.to_numeric(df["MSA"], errors="coerce")
# Only values that were present before the conversion and are NaN after it.
msa_coerced_count = int((msa_numeric.isna() & ~df["MSA"].isna()).sum())
df["MSA"] = msa_numeric
print(f"MSA: {msa_coerced_count} non-numeric values were coerced to NaN")

PostalCode, SellerName and ServicerName may not be useful for further analysis, so we simply drop those columns.

In [112]:
# Build the modelling frame without the three columns judged not useful;
# df itself is left unchanged.
new_df = df.drop(["PostalCode", "SellerName", "ServicerName"], axis=1)

In [115]:
# Summarise the frame that remains after dropping the three columns.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291451 entries, 0 to 291450
Data columns (total 25 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   CreditScore         291451 non-null  int64  
 1   FirstPaymentDate    291451 non-null  int64  
 2   FirstTimeHomebuyer  291451 non-null  float64
 3   MaturityDate        291451 non-null  int64  
 4   MSA                 252351 non-null  float64
 5   MIP                 291451 non-null  int64  
 6   Units               291451 non-null  int64  
 7   Occupancy           291451 non-null  float64
 8   OCLTV               291451 non-null  int64  
 9   DTI                 291451 non-null  int64  
 10  OrigUPB             291451 non-null  int64  
 11  LTV                 291451 non-null  int64  
 12  OrigInterestRate    291451 non-null  float64
 13  Channel             291451 non-null  float64
 14  PPM                 291451 non-null  float64
 15  ProductType         291451 non-nul

### c)Feature transformation

In feature transformation we perform scaling, which puts all column values on the same scale; this can improve model performance and give more accurate results.

In [121]:
# Optional checkpoint, currently disabled: uncomment to write the encoded
# frame to encoded.csv.
#new_df.to_csv("encoded.csv")

In [149]:
# Fill the NaN entries of MSA with the column mean.
# NOTE(review): the mean is computed over all rows, before the train/test
# split, so the rows that later form the test set contribute to it.
imp = SimpleImputer(strategy="mean")
msa_filled = imp.fit_transform(new_df[["MSA"]])
new_df["MSA"] = msa_filled

To avoid the data leakage problem, which prevents the model from generalizing well, we fit the scaler on the training data only and use it to transform the test data.

In [151]:
# Separate the label (EverDelinquent) from the predictor columns.
# NOTE(review): MonthsDelinquent stays among the predictors; if it is derived
# from the same delinquency history as EverDelinquent it leaks the label -
# confirm before trusting any model built on it.
target = new_df["EverDelinquent"]
features = new_df.drop("EverDelinquent", axis=1)

In [153]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# repeatable across runs.
feature_train, feature_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

Apply normalization (MinMaxScaler): fit_transform on feature_train, and transform only on feature_test.

In [155]:
# Learn each column's minimum and maximum from the training features only,
# then apply that same scaling to the training and the test features.
min_max = MinMaxScaler().fit(feature_train)
feature_train_scaled = min_max.transform(feature_train)
feature_test_scaled = min_max.transform(feature_test)

### Feature Importance

Find the features that most improve model performance. We use an ensemble method, the random forest classifier, whose `feature_importances_` attribute gives the importance of each column for predicting the target column.

In [163]:
# Fit a random forest on the scaled training features and rank the features
# by the forest's feature_importances_.
# random_state pins the forest's internal randomness so the ranking is
# reproducible; it uses the same seed as the train/test split.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(feature_train_scaled, target_train)
importances = rfc.feature_importances_
# Take the column names from feature_train instead of rebinding `features`.
# The original `features = features.columns` replaced the feature DataFrame
# with an Index, so re-running this cell (or the split cell) failed.
# feature_train has the same columns, in the same order, as the scaled array.
feature_names = feature_train.columns
im_df = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
).sort_values(by="importance", ascending=False)

In [167]:
# Display the importance table, highest importance first.
im_df

Unnamed: 0,feature,importance
22,MonthsDelinquent,0.904125
0,CreditScore,0.031689
23,MonthsInRepayment,0.028013
10,OrigUPB,0.004555
18,LoanSeqNum,0.004226
9,DTI,0.003601
12,OrigInterestRate,0.003188
4,MSA,0.003142
16,PropertyState,0.002874
11,LTV,0.002753


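Here we can observe that MonthsDelinquent, CreditScore, MonthsInRepayment and OrigUPB are the most important features for predicting EverDelinquent, so we conclude that these features are good predictors. Note, however, that MonthsDelinquent alone accounts for about 0.90 of the total importance; if it is recorded as a consequence of the loan becoming delinquent, it leaks the target and should be excluded before modelling.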