## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import eli5
import pickle

sns.set()
warnings.filterwarnings('ignore')

In [3]:
from sklearn.feature_selection import mutual_info_regression, SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier, RandomForestRegressor, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error,r2_score

from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from imblearn.over_sampling import RandomOverSampler, SMOTE
from eli5.sklearn import PermutationImportance
from pprint import pprint

from xgboost import XGBRegressor, XGBClassifier

In [4]:
df = pd.read_csv('kf_data.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106159 entries, 0 to 106158
Data columns (total 48 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   ProsperRating (numeric)             106159 non-null  float64
 1   ProsperScore                        106159 non-null  float64
 2   Term                                106159 non-null  int64  
 3   BorrowerRate                        106159 non-null  float64
 4   ProsperRating (Alpha)               106159 non-null  object 
 5   ListingCategory (numeric)           106159 non-null  int64  
 6   IsBorrowerHomeowner                 106159 non-null  int64  
 7   CurrentlyInGroup                    106159 non-null  int64  
 8   OpenRevolvingAccounts               106159 non-null  int64  
 9   OpenRevolvingMonthlyPayment         106159 non-null  float64
 10  IncomeRange                         106159 non-null  object 
 11  IncomeVerifiable          

In [6]:
df.drop('ProsperRating (numeric)',axis=1,inplace=True)

In [7]:
y = df[['EMI','ELA','PROI','Riskk']]
df.drop(['EMI','ELA','PROI','Riskk'],axis=1,inplace=True)
x = df

## 2. Train-Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(x, y, shuffle = True,test_size=0.33,random_state=100)

In [10]:
yEMI_train = y_train.iloc[:,0]
yELA_train = y_train.iloc[:,1]
yPROI_train = y_train.iloc[:,2]
yRisk_train = y_train.iloc[:,3]
yEMI_test = y_test.iloc[:,0]
yELA_test = y_test.iloc[:,1]
yPROI_test = y_test.iloc[:,2]
yRisk_test = y_test.iloc[:,3]

In [11]:
trf1 = ColumnTransformer([
    ('ohe',OneHotEncoder(sparse=False,handle_unknown='ignore'),[23,24,25,42]),
    ('ord',OrdinalEncoder(categories=[['Missing','HR','E','D','C','B','A','AA'],['Not displayed','$0','Not employed','$1-24,999','$75,000-99,999','$100,000+','$50,000-74,999','$25,000-49,999']]),[3,9])
],remainder = 'passthrough')

## 3. Creating Pipelines

**Pipeline to predict EMI**

In [12]:
pipe_EMI = Pipeline([
    ('trf1',trf1),
    ('scale', StandardScaler()),
    ('reg',XGBRegressor(random_state = 10))
])

In [13]:
pipe_EMI.fit(X_train,yEMI_train)

In [14]:
yEMI2 = pipe_EMI.predict(X_test)

In [15]:
EMI_accuracy = r2_score(yEMI_test, yEMI2)

In [16]:
EMI_accuracy

0.9986296148265381

**Accuracy for EMI (Equated Monthly Income) prediction is about 98.07%**

**Pipeline to predict EMI**

In [17]:
pipe_ELA = Pipeline([
    ('trf1',trf1),
    ('scale', StandardScaler()),
    ('reg',XGBRegressor(random_state = 10))
])

In [18]:
pipe_ELA.fit(X_train,yELA_train)

In [19]:
yELA2 = pipe_ELA.predict(X_test)

In [20]:
ELA_accuracy = r2_score(yELA_test, yELA2)

In [21]:
ELA_accuracy

0.9318443066923667

**Accuracy for ELA (Eligible Loan Amount) prediction is about 93.18%**

**Pipeline to predict PROI**

In [22]:
pipe_PROI = Pipeline([
    ('trf1',trf1),
    ('scale', StandardScaler()),
    ('reg',XGBRegressor(random_state = 10))
])

In [23]:
pipe_PROI.fit(X_train,yPROI_train)

In [24]:
yPROI2 = pipe_PROI.predict(X_test)

In [25]:
PROI_accuracy = r2_score(yPROI_test, yPROI2)

In [26]:
PROI_accuracy

0.9999126732352437

**Accuracy for PROI (Preferred Return on Investment) prediction is about 98.07%**

**Pipeline to predict Credit Risk Status**

In [42]:
pipex = Pipeline([
    ('trf1',trf1),
    ('scale', StandardScaler()),
    ('clas',GradientBoostingClassifier(max_depth=4))
])

In [43]:
pipex.fit(X_train,yRisk_train)

In [44]:
yRisk2 = pipex.predict(X_test)

In [46]:
Risk_accuracy = accuracy_score(yRisk_test, yRisk2)

In [47]:
Risk_accuracy

0.9807895412896412

**Accuracy for risk prediction is about 98.07%**

## 4. Export the pipelines

In [27]:
pickle.dump(pipe_EMI,open('pipeEMI.pkl','wb'))
pickle.dump(pipe_ELA,open('pipeELA.pkl','wb'))
pickle.dump(pipe_PROI,open('pipePROI.pkl','wb'))

In [45]:
pickle.dump(pipex,open('pipenew2.pkl','wb'))