### Notebook Preparation:

In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set()
# Silence every Python warning for the rest of the session; note that this also hides
# pandas' chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, r2_score
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

---

In [None]:
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

---

### Loading Data, Creating Target variable, and Preprocessing:

**Loading data**

In [None]:
# Colab-only step: mount Google Drive so the CSV files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the main dataset from the mounted Drive and preview its first rows.
df= pd.read_csv("/content/drive/MyDrive/final_kk_data.csv")
df.head()

In [None]:

# Total number of missing cells across the whole frame, followed by the column names.
print(df.isnull().sum().sum())
df.columns

#### **Creating Target Variables**

**Calculating LoanTenure**

In [None]:
# Load the raw export that still contains the loan and maturity dates
raw_data = pd.read_csv('/content/drive/MyDrive/Bondora_raw.csv')

# Keep only the loans whose Status is 'Repaid' or 'Late'
raw_data = raw_data[raw_data['Status'].isin(['Repaid', 'Late'])]

# Keep only the two date columns needed to calculate loan tenure. The explicit copy makes
# raw_data an independent frame, so columns assigned to it later are not written to a
# slice of the filtered data (which pandas may silently discard).
raw_data = raw_data[['MaturityDate_Original', 'LoanDate']].copy()

In [5]:
# Convert only the two date columns into datetime objects. assign() returns a new frame,
# and restricting the conversion to these columns keeps the cell re-runnable (a second run
# must not try to parse the numeric LoanTenure column as a date).
date_columns = ['MaturityDate_Original', 'LoanDate']
raw_data = raw_data.assign(**{col: pd.to_datetime(raw_data[col]) for col in date_columns})

# Calculate LoanTenure as the number of calendar months between LoanDate and MaturityDate_Original
maturity = raw_data['MaturityDate_Original']
issued = raw_data['LoanDate']
raw_data['LoanTenure'] = (maturity.dt.year - issued.dt.year) * 12 + (maturity.dt.month - issued.dt.month)

# Set that column in df. The values are copied positionally, so df and the filtered
# raw data must contain the same loans in the same order.
df['LoanTenure'] = raw_data['LoanTenure'].values

# Drop LoanDuration as it has errors; errors='ignore' lets the cell be run again after
# the column is already gone.
df = df.drop(columns='LoanDuration', errors='ignore')

NameError: name 'raw_data' is not defined

I. Equated Monthly Installments (EMI)

In [None]:
# Scratch frame holding the EMI inputs. The explicit copy matters: new columns are
# assigned to this frame in the following cells, and assigning into a plain slice of df
# is a chained assignment that pandas may not apply.
loan_data_temp1 = df[['Amount', 'Interest', 'LoanTenure']].copy()
loan_data_temp1.info()

In [None]:
def emi_cal(p, r, n):
  """Return the equated monthly instalment (EMI), rounded to a whole number.

  Parameters
  ----------
  p : principal amount (scalar, array or Series)
  r : annual interest rate in percent (scalar, array or Series)
  n : number of monthly instalments (scalar, array or Series)

  Uses the amortisation formula ``p * i * (1 + i)**n / ((1 + i)**n - 1)`` with the
  monthly rate ``i = r / 12 / 100``.  For an interest rate of exactly 0 that formula
  is 0/0, so the instalment is computed as ``p / n`` instead.

  Array-like arguments are expected to share the same shape (and, for Series,
  the same index).  A Series result is returned whenever the formula itself
  produces a Series; scalars in give a scalar out.
  """
  monthly_rate = (r / 12) / 100

  # Pure scalar call: plain arithmetic with an explicit interest-free branch
  # (Python floats would otherwise raise ZeroDivisionError on 0/0).
  if np.ndim(p) == 0 and np.ndim(monthly_rate) == 0 and np.ndim(n) == 0:
    if monthly_rate == 0:
      return np.round(p / n)
    growth = pow(1 + monthly_rate, n)
    return np.round((p * monthly_rate * growth) / (growth - 1))

  growth = (1 + monthly_rate) ** n
  # The 0/0 produced for zero-rate rows is expected here and replaced just below.
  with np.errstate(divide='ignore', invalid='ignore'):
    amortised = (p * monthly_rate * growth) / (growth - 1)
    interest_free = p / n
  emi = np.round(np.where(monthly_rate == 0, interest_free, amortised))

  # np.where returns a bare array; restore the Series wrapper callers got before.
  if isinstance(amortised, pd.Series):
    emi = pd.Series(emi, index=amortised.index)
  return emi

In [None]:
# Monthly instalment for every loan, then the total paid back over the whole tenure.
emi_tenure = loan_data_temp1['LoanTenure']
loan_data_temp1['EMI'] = emi_cal(
    loan_data_temp1['Amount'],
    loan_data_temp1['Interest'],
    emi_tenure,
)
loan_data_temp1['tot_pay_back__amt'] = loan_data_temp1['EMI'] * emi_tenure

In [None]:
# Copy the computed EMI target back onto the main frame (aligned on the row index).
df['EMI'] = loan_data_temp1['EMI']

In [None]:
# Spot-check the EMI and total pay-back amount for the first 20 loans.
loan_data_temp1.head(20)

II. Eligible Loan Amount (ELA)

In [None]:
# Scratch frame holding the ELA inputs. The explicit copy matters: new columns are
# assigned to this frame in the following cells, and assigning into a plain slice of df
# is a chained assignment that pandas may not apply.
loan_data_temp2 = df[['AppliedAmount', 'Interest', 'IncomeTotal', 'LiabilitiesTotal', 'LoanTenure']].copy()
loan_data_temp2.info()

In [None]:
def avlb_incm(inc,lia):
  """Return 30% of the income left after liabilities, rounded to a whole number."""
  surplus = inc - lia
  return np.round(surplus * 0.3)

def tot_amt_pay(app_amt, r,n):
  """Return the rounded monthly repayment for `app_amt` under simple interest.

  `r` is the annual rate in percent and `n` the tenure in months: interest is
  charged on the full amount for n/12 years and the total is spread over n months.
  """
  simple_interest = app_amt * (r/100) * (n/12)
  total_due = app_amt + simple_interest
  return np.round(total_due / n)


In [None]:
# Step 1: monthly income available for repayment, and the monthly repayment the
# applied amount would require.
ela_income = loan_data_temp2['IncomeTotal']
ela_liabilities = loan_data_temp2['LiabilitiesTotal']
loan_data_temp2['Avlb_Incm_Monthly'] = avlb_incm(ela_income, ela_liabilities)
loan_data_temp2['Amt_pay_Monthly'] = tot_amt_pay(
    loan_data_temp2['AppliedAmount'],
    loan_data_temp2['Interest'],
    loan_data_temp2['LoanTenure'],
)
loan_data_temp2.head()

In [None]:
def ela(df):
  """Return the eligible loan amount (ELA) for every row as a float array.

  Reads the columns 'Avlb_Incm_Monthly', 'Amt_pay_Monthly' and 'LoanTenure'.
  When the required monthly payment fits into the available monthly income the
  ELA is payment * tenure; otherwise it is available income * tenure.

  The rule is evaluated for all rows at once with np.where instead of growing
  an array element by element, which copied the whole array on every append.
  """
  available = df['Avlb_Incm_Monthly'].values
  required = df['Amt_pay_Monthly'].values
  tenure = df['LoanTenure'].values

  affordable = required <= available
  # astype(float) keeps the float64 result dtype of the loop-based version.
  return np.where(affordable, required * tenure, available * tenure).astype(float)

In [None]:
# Step 2: derive the eligible loan amount for every loan and preview the result.
loan_data_temp2['ELA'] = ela(loan_data_temp2)
loan_data_temp2.head(10)

In [None]:
# Copy the ELA target back onto the main frame (aligned on the row index) and list the columns.
df['ELA'] = loan_data_temp2['ELA']
df.columns

III. Preferred ROI (PROI) ---> needs redefining

In [None]:
# Scratch frame holding the ROI / PROI inputs. The explicit copy matters: the following
# cells add columns to this frame (and proi() writes a 'PROI' column into it), which is a
# chained assignment when the frame is only a slice of df.
loan_data_temp3 = df[['Amount', 'AppliedAmount', 'Interest', 'LoanTenure', 'IncomeTotal', 'DebtToIncome']].copy()
loan_data_temp3.head()

In [None]:
# Simple interest over the loan tenure, the resulting total amount, and the interest
# expressed as a percentage of the principal (ROI).
roi_amount = loan_data_temp3['Amount']
roi_rate = loan_data_temp3['Interest'] / 100
roi_years = loan_data_temp3['LoanTenure'] / 12
loan_data_temp3['InterestAmount'] = roi_amount * roi_rate * roi_years
loan_data_temp3['TotalAmount'] = loan_data_temp3['InterestAmount'] + loan_data_temp3['Amount']
loan_data_temp3['ROI'] = (loan_data_temp3['InterestAmount'] / loan_data_temp3['Amount']) * 100
df['ROI'] = loan_data_temp3['ROI']

In [None]:

def proi(df):
    """Compute the preferred ROI (PROI) per row, store it in df['PROI'] and return it.

    Every row starts at the median of df['ROI'] and is then shifted in steps of 5:

    * LoanTenure     <= 19 -> -5;  > 25 -> +5
    * AppliedAmount  in [850, 1175] -> -5;  > 2000 -> +5
    * IncomeTotal    <= 1000 -> -5
    * DebtToIncome   == 0 -> -5;  anything else -> +5

    The rules are applied column-wise, so the function works for any row index
    (it no longer requires labels 0..n-1) and does not rely on chained
    ``df['PROI'].loc[i] = ...`` writes, which pandas may not apply to `df`.

    Side effect: adds or overwrites the 'PROI' column of `df`.
    """
    tenure = df['LoanTenure']
    applied = df['AppliedAmount']
    income = df['IncomeTotal']
    debt_ratio = df['DebtToIncome']

    # Start every row at the median ROI.
    preferred = pd.Series(df['ROI'].median(), index=df.index, dtype=float)

    # Adjustments are added one rule at a time, in the same order as the rules above.
    preferred = preferred + np.where(tenure <= 19, -5, np.where(tenure > 25, 5, 0))
    preferred = preferred + np.where((applied >= 850) & (applied <= 1175), -5,
                                     np.where(applied > 2000, 5, 0))
    preferred = preferred + np.where(income <= 1000, -5, 0)
    preferred = preferred + np.where(debt_ratio == 0, -5, 5)

    # Positional assignment: the values are already in df's row order.
    df['PROI'] = preferred.to_numpy()
    return df['PROI']

In [None]:
# Compute the PROI target (proi also writes the column into the frame it receives)
# and copy it back onto the main frame.
loan_data_temp3['PROI'] = proi(loan_data_temp3)
df['PROI'] = loan_data_temp3['PROI']

IV. LoanStatus

In [None]:
# Encode the classification target: 0 = 'NotDefault', 1 = any other status.
# The dtype guard keeps the cell idempotent: without it, a second run would compare the
# already-encoded integers with 'NotDefault' and turn every row into 1.
if not pd.api.types.is_numeric_dtype(df['LoanStatus']):
    df['LoanStatus'] = np.where(df['LoanStatus']=='NotDefault', 0, 1)

In [None]:
# List the columns now that all four targets (LoanStatus, EMI, ELA, PROI) are present.
df.columns

In [None]:
# Other Unwanted Columns

unwanted_columns = [
    'Unnamed: 0',
    'PreviousEarlyRepaymentsCountBeforeLoan',
    'LoanTenure',
    'LiabilitiesTotal',
    'PrincipalBalance',
    'InterestAndPenaltyBalance',
]
df.drop(columns=unwanted_columns, inplace=True)

In [None]:
# Sanity check: frame dimensions after dropping the unwanted columns.
df.shape

**1. Imputing both the Categorical and Numerical Features having Missing Values.**

In [None]:
#Categorical Features in Dataset (columns stored with the object dtype)
catg_features = [col_name for col_name in df.columns if df[col_name].dtype == 'O']
# The values are now substituted into the "{}" placeholder instead of the placeholder being printed literally
print("Number of Categorical Features: {}".format(len(catg_features)))
print("Categorical Features: {}".format(catg_features))

In [None]:
#Imputing the missing values in categorical features using the most frequent value which is mode
catg_features_with_null = [feature for feature in catg_features if df[feature].isnull().any()]
for each_feature in catg_features_with_null:
  mode_val = df[each_feature].mode()[0]
  # Assign the filled column back: fillna(inplace=True) on df[each_feature] acts on a
  # temporary selection and is not guaranteed to modify df itself.
  df[each_feature] = df[each_feature].fillna(mode_val)

In [None]:
#Numerical Features in Dataset (every column that is not stored with the object dtype)
num_features = [col_name for col_name in df.columns if df[col_name].dtype != 'O']
# The values are now substituted into the "{}" placeholder instead of the placeholder being printed literally
print("Number of Numerical Features: {}".format(len(num_features)))
print("Numerical Features: {}".format(num_features))

In [None]:
#Impute missing values in numerical features using mean
num_features_with_null = [feature for feature in num_features if df[feature].isnull().any()]
for feature in num_features_with_null:
   mean_value = df[feature].mean()
   # Assign the filled column back: fillna(inplace=True) on df[feature] acts on a
   # temporary selection and is not guaranteed to modify df itself.
   df[feature] = df[feature].fillna(mean_value)

#### **2. Handling Outliers:**

In [None]:
# Inter-quartile range of every float/int column, and the 1.5 * IQR fences derived from it.
numeric_frame = df.select_dtypes([float, int])
lower_quartile = numeric_frame.quantile(.25)
upper_quartile = numeric_frame.quantile(.75)

df_IQR = upper_quartile - lower_quartile

# Upper and lower limits beyond which a value counts as an outlier
df_Max = upper_quartile + (1.5*df_IQR)
df_Min = lower_quartile - (1.5*df_IQR)

In [None]:
#Loop for replacing outliers above upper bound with the upper bound value:
for column in df.select_dtypes([float, int]).columns :
  # The binary classification target must not be capped: when one class covers more
  # than 75% of the rows the IQR is 0 and capping would erase the other class.
  if column == 'LoanStatus':
    continue
  col_IQR = df[column].quantile(.75) - df[column].quantile(.25)
  col_Max =  df[column].quantile(.75) + (1.5*col_IQR)
  # clip() and assign back; the chained form df[column][mask] = value writes to a
  # temporary selection and is not guaranteed to modify df.
  df[column] = df[column].clip(upper=col_Max)

In [None]:
#Loop for replacing outliers under lower bound with the lower bound value:
for column in df.select_dtypes([float, int]).columns :
    # The binary classification target must not be capped: when one class covers more
    # than 75% of the rows the IQR is 0 and capping would erase the other class.
    if column == 'LoanStatus':
        continue
    col_IQR = df[column].quantile(.75) - df[column].quantile(.25)
    col_Min =  df[column].quantile(.25) - (1.5*col_IQR)
    # clip() and assign back; the chained form df[column][mask] = value writes to a
    # temporary selection and is not guaranteed to modify df.
    df[column] = df[column].clip(lower=col_Min)

Making Certain adjustments for convenience :

Rounding off to 2 decimal places ('PreviousRepaymentsBeforeLoan').

Converting features with boolean values ('NewCreditCustomer','Restructured') to categorical values.

In [None]:
# Keep two decimal places for the previous-repayments amount.
df['PreviousRepaymentsBeforeLoan']=df['PreviousRepaymentsBeforeLoan'].round(decimals = 2)

In [None]:
# Turn the two boolean flags into the strings 'True' / 'False' so they are treated as categories.
bool_as_text = {True: 'True', False: 'False'}
for flag_column in ('NewCreditCustomer', 'Restructured'):
    df[flag_column] = df[flag_column].replace(bool_as_text)

In [None]:
# Confirm the column dtypes after the adjustments above.
df.dtypes

#### 3. **X, y split**

In [None]:
# The four targets: LoanStatus for the classifier; EMI, ELA and PROI for the regressor.
target_columns = ['LoanStatus', 'EMI', 'ELA', 'PROI']

# Independent variables: everything except the targets
X = df.drop(columns=target_columns)

# Target variables for both models, in the order listed above
y = df[target_columns]

#### **4. Feature Selection**

In [None]:
# A function to select highly correlated features.
def Correlation(dataset, threshold):
    """Return the set of columns that are highly correlated with an earlier column.

    A column is reported when the absolute pairwise correlation (as computed by
    DataFrame.corr) between it and any column to its left exceeds `threshold`.
    Only numeric and boolean columns are considered: recent pandas versions
    raise when corr() is called on a frame that still contains text columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
    threshold : float, compared against the absolute correlation

    Returns
    -------
    set of column labels
    """
    numeric_part = dataset.select_dtypes(include=['number', 'bool'])
    correlation_matrix = numeric_part.corr()

    # Strictly-lower triangle = every (column, earlier column) pair exactly once.
    strong = np.abs(correlation_matrix.to_numpy()) > threshold
    below_diagonal = np.tril(strong, k=-1)
    return set(correlation_matrix.columns[below_diagonal.any(axis=1)])

In [None]:
# List the features whose absolute correlation with an earlier column exceeds 0.8
Correlation(X, 0.8)

In [None]:
# Now we can drop these features from our dataset.
# NOTE(review): the list is hard-coded, presumably from the Correlation(X, 0.8) output
# above; confirm it still matches whenever the data changes.
X.drop(columns= [ 'ROI', 'Amount', 'AmountOfPreviousLoansBeforeLoan', 'NoOfPreviousLoansBeforeLoan'], inplace = True )

In [None]:
# Feature-matrix dimensions and remaining column names after feature selection.
print(X.shape)
X.columns

#### **5.Feature Encoding**

In [None]:
# X = pd.get_dummies(X, drop_first=True)

---

In [None]:
# Feature-matrix dimensions going into the train/test split.
X.shape

#### **6. train, test split**

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

# First target column -> classification label, remaining columns -> regression targets
y_class_train, y_reg_train = y_train.iloc[:, 0], y_train.iloc[:, 1:]
y_class_test, y_reg_test = y_test.iloc[:, 0], y_test.iloc[:, 1:]

In [None]:
#df.to_csv('df_kk.csv')

In [None]:
# Take the first test row as a raw value array for a single-sample prediction.
# NOTE: the name `input` shadows Python's built-in input() for the rest of the notebook.
input=X_test.iloc[0].values

In [None]:
# Reshape to (1, n_features): one sample in the 2-D layout expected by predict().
input=input.reshape(1,-1)

In [None]:
# Display the single-sample array.
input

In [None]:
# from sklearn.impute import SimpleImputer

# Numerical processing pipeline: scaling only; with_mean=False rescales without centring.
numeric_steps = [('stdscaler', StandardScaler(with_mean=False))]
numeric_processor = Pipeline(steps=numeric_steps)

numeric_processor

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Categorical processing pipeline: integer-encode the categories (categories unseen
# during fit are encoded as -1), then scale without centring.
ordinal_step = ("ord_enc", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
scaling_step = ('stdscaler', StandardScaler(with_mean=False))
categorical_processor = Pipeline(steps=[ordinal_step, scaling_step])

categorical_processor

In [None]:
# Combine both processing techniques into one transformer.
# Columns are selected by integer position in X (the position-to-name mapping is listed
# right below this cell), so the transformer also accepts plain arrays; the positions
# must be updated whenever the column order of X changes.
from sklearn.compose import ColumnTransformer

preprocessor=ColumnTransformer(
    [("categorical",categorical_processor,[3,4, 5, 7, 8, 12, 13, 14,15,16,17,18,24,25,26]),
    ("numerical",numeric_processor,[0,1,2,6,9,10,11,19,20,21,22,23,27,28,29])]
)

BidsPortfolioManager 0
BidsApi 1
BidsManual 2
NewCreditCustomer 3
VerificationType 4
LanguageCode 5
Age 6
Gender 7
Country 8
AppliedAmount 9
Interest 10
MonthlyPayment 11
UseOfLoan 12
Education 13
MaritalStatus 14
EmploymentStatus 15
EmploymentDurationCurrentEmployer 16
OccupationArea 17
HomeOwnershipType 18
IncomeTotal 19
ExistingLiabilities 20
RefinanceLiabilities 21
DebtToIncome 22
FreeCash 23
Rating 24
Restructured 25
CreditScoreEsMicroL 26
PrincipalPaymentsMade 27
InterestAndPenaltyPaymentsMade 28
PreviousRepaymentsBeforeLoan 29

In [None]:
# Check each feature's dtype against the categorical / numerical position lists above.
X.dtypes

In [None]:
# Column order of X, which the positional indices in the ColumnTransformer refer to.
X.columns

In [None]:
# combine processing technqiues
# from sklearn.compose import ColumnTransformer

# preprocessor=ColumnTransformer(
#     [("categorical",categorical_processor,["VerificationType", "LanguageCode", "Gender", "Country", "UseOfLoan", "Education", "MaritalStatus",
#                                            "EmploymentStatus", "EmploymentDurationCurrentEmployer","OccupationArea", "HomeOwnershipType",
#                                            "Rating", "CreditScoreEsMicroL"]),
#     ("numerical",numeric_processor,["BidsPortfolioManager", "BidsApi", "BidsManual", "NewCreditCustomer", "Age", "AppliedAmount",
#                                     "Interest", "MonthlyPayment", "IncomeTotal", "ExistingLiabilities", "RefinanceLiabilities",
#                                     "DebtToIncome", "FreeCash", "Restructured", "PrincipalPaymentsMade", "InterestAndPenaltyPaymentsMade",
#                                     "PreviousRepaymentsBeforeLoan"])]
# )

In [None]:
# Categorical (object-dtype) features remaining in X
catg_features = [col_name for col_name in X.columns if X[col_name].dtype == 'O']
# The values are now substituted into the "{}" placeholder instead of the placeholder being printed literally
print("Number of Categorical Features: {}".format(len(catg_features)))
print("Categorical Features: {}".format(catg_features))

In [None]:
# Numerical (non-object-dtype) features remaining in X
num_features = [col_name for col_name in X.columns if X[col_name].dtype != 'O']
# The values are now substituted into the "{}" placeholder instead of the placeholder being printed literally
print("Number of Numerical Features: {}".format(len(num_features)))
print("Numerical Features: {}".format(num_features))

---

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.linear_model import Ridge

In [None]:
# Shared building blocks for the two pipelines below.
# NOTE(review): the preprocessor selects 15 + 15 = 30 columns, so 30 principal components
# would keep every dimension rather than reduce it; confirm this is intended.
pca = PCA(n_components=30)
svc_classifier = SVC(kernel='rbf')
regressor = Ridge(random_state=0)


## Classification Pipeline

In [None]:
# Imported here so the cell is self-contained
from sklearn.base import clone

# Create Pipeline: preprocessing -> PCA -> SVC.
# The pipeline gets its own unfitted copies of the preprocessing steps; reusing the same
# preprocessor/pca objects in a second pipeline would re-fit them in place and change the
# transformers this pipeline relies on.
pipeline_class = make_pipeline(clone(preprocessor), clone(pca), svc_classifier)

# fit the whole pipeline on the training split
pipeline_class.fit(X_train, y_class_train)

# predict using the pipeline
pred_class = pipeline_class.predict(X_test)

In [None]:
print("Support Vector Classifier:")

print("\nAccuracy score:\n", round(accuracy_score(y_class_test, pred_class)*100,2), '%')
print('*'*40)
print("\nConfusion Matrix:\n", confusion_matrix(y_class_test, pred_class))
print('*'*40)
# target_names are matched to the class labels in sorted order. LoanStatus was encoded as
# 0 = 'NotDefault' and 1 = default, so 'NotDefault' has to come first.
print("\nClassification Report:\n", classification_report(y_class_test, pred_class,
                                        target_names=['NotDefault', 'Default']))

## Regression Pipeline

In [None]:
# Imported here so the cell is self-contained
from sklearn.base import clone

# Regression pipeline: preprocessing -> PCA -> Ridge.
# It gets its own unfitted copies of the preprocessing steps; passing the same
# preprocessor/pca objects that the classification pipeline uses would re-fit those
# shared objects here and silently change the classifier's transformers.
pipeline_reg = make_pipeline(clone(preprocessor), clone(pca), regressor)

# fit the whole pipeline on the training split (three regression targets at once)
pipeline_reg.fit(X_train, y_reg_train)

# predict using the pipeline
pred_reg = pipeline_reg.predict(X_test)

In [None]:
# Score on the test split: R² as a percentage; with several target columns r2_score
# returns the uniform average over the targets by default.
print('R2_score : ', round(r2_score(y_reg_test, pred_reg)*100,2), '%')

---

## saving model

In [None]:
import pickle as pkl 

In [None]:
# Persist the fitted classification pipeline; the with-block closes the file even if
# pickling fails.
with open("pipeline_class2.pkl", "wb") as pickle_out1:
    pkl.dump(pipeline_class, pickle_out1)

In [None]:
# Persist the fitted regression pipeline; the with-block closes the file even if
# pickling fails.
with open("pipeline_reg2.pkl", "wb") as pickle_out2:
    pkl.dump(pipeline_reg, pickle_out2)

In [None]:
# import pickle
# pickle.dump(pipeline_class, open('pipeline_class2.pkl', 'wb'))
# pickle.dump(pipeline_reg, open('pipeline_reg2.pkl', 'wb'))

In [None]:
# Take the first test row as a raw value array to smoke-test the saved model.
# NOTE: the name `input` shadows Python's built-in input().
input=X_test.iloc[0].values

In [None]:
# Reshape to (1, n_features): one sample in the 2-D layout expected by predict().
input=input.reshape(1,-1)

In [None]:
# Display the single-sample array.
input

In [None]:
# Reload the pickled classification pipeline; the with-block closes the file handle.
# NOTE: only unpickle files you created yourself, because pickle.load can execute arbitrary code.
with open('pipeline_class2.pkl', 'rb') as model_file:
    model1 = pickle.load(model_file)

In [None]:
# predict() returns a one-element array for the single sample; take that element
# explicitly, since calling int() on a whole array is deprecated in NumPy.
int(model1.predict(input)[0])

In [None]:
# import json
# columns = {
#     'data_columns' : [col.lower() for col in X.columns]
# }
# with open("columns.json","w") as f:
#     f.write(json.dumps(columns))

---