### Notebook Preparation:

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Apply seaborn's default plotting theme to all subsequent matplotlib figures.
sns.set()
# NOTE(review): this silences every warning for the whole session, including
# pandas' SettingWithCopyWarning and scikit-learn convergence warnings.
# Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.pipeline import Pipeline

---

In [3]:
# Show every column when a DataFrame is displayed (no truncation with '...').
pd.options.display.max_columns = None

---

### Loading Data, Creating Target variable, and Preprocessing:

**Loading data**

In [4]:
loan_data = pd.read_csv('Bondora_preprocessed.csv')

# Total count of missing cells across the whole frame, printed as a quick
# completeness check before the column listing is displayed.
total_missing = loan_data.isna().sum().sum()
print(total_missing)
loan_data.columns

0


Index(['BidsPortfolioManager', 'BidsApi', 'BidsManual', 'NewCreditCustomer',
       'VerificationType', 'LanguageCode', 'Age', 'Gender', 'Country',
       'AppliedAmount', 'Amount', 'Interest', 'LoanDuration', 'MonthlyPayment',
       'UseOfLoan', 'Education', 'MaritalStatus', 'EmploymentStatus',
       'EmploymentDurationCurrentEmployer', 'OccupationArea',
       'HomeOwnershipType', 'IncomeTotal', 'ExistingLiabilities',
       'LiabilitiesTotal', 'RefinanceLiabilities', 'DebtToIncome', 'FreeCash',
       'Rating', 'Restructured', 'CreditScoreEsMicroL',
       'PrincipalPaymentsMade', 'InterestAndPenaltyPaymentsMade',
       'PrincipalBalance', 'InterestAndPenaltyBalance',
       'NoOfPreviousLoansBeforeLoan', 'AmountOfPreviousLoansBeforeLoan',
       'PreviousRepaymentsBeforeLoan',
       'PreviousEarlyRepaymentsCountBeforeLoan', 'LoanStatus'],
      dtype='object')

#### **Creating Target Variables**

**Calculating LoanTenure**

In [5]:
# Load raw data
row_data = pd.read_csv('Bondora_raw.csv')

# Keep only the loans whose Status is 'Repaid' or 'Late'
status_mask = row_data['Status'].isin(['Repaid', 'Late'])

# Keep only the two date columns needed to calculate the loan tenure.
# .copy() makes the result an independent frame, so the column assignments
# performed on it afterwards never target a view/slice of the raw data
# (which would raise SettingWithCopyWarning).
row_data = row_data.loc[status_mask, ['MaturityDate_Original', 'LoanDate']].copy()

In [6]:
# Parse every column of row_data (both are date columns) into datetime objects
row_data = row_data.apply(pd.to_datetime)

# LoanTenure = whole calendar months between the loan date and the original
# maturity date (day-of-month is ignored)
maturity = row_data['MaturityDate_Original'].dt
start = row_data['LoanDate'].dt
row_data['LoanTenure'] = (maturity.year - start.year) * 12 + (maturity.month - start.month)

# Copy the tenure into loan_data positionally (.values bypasses index alignment).
# NOTE(review): this assumes loan_data rows are in the same order as the
# filtered raw rows -- confirm against the preprocessing step.
loan_data['LoanTenure'] = row_data['LoanTenure'].values

# Drop LoanDuration as it has errors (per the original author's note)
loan_data = loan_data.drop(columns='LoanDuration')

I. Equated Monthly Installments (EMI)

In [7]:
def cal_EMI(P, r, n):
    """Compute the Equated Monthly Installment (EMI) for every loan at once.

    Uses the standard annuity formula

        EMI = P * r * (1 + r)^n / ((1 + r)^n - 1)

    Parameters
    ----------
    P : array-like (e.g. pandas Series)
        Principal amount of each loan.
    r : array-like
        Interest rate per period, as a fraction (0.01 == 1 %).
    n : array-like
        Number of repayment periods.

    Returns
    -------
    numpy.ndarray
        Float array with one EMI value per input element.

    Notes
    -----
    For a zero interest rate the formula degenerates to 0/0; in that case the
    limit P / n (principal spread evenly over the periods) is returned instead
    of NaN.
    """
    principal = np.asarray(P, dtype=float)
    rate = np.asarray(r, dtype=float)
    periods = np.asarray(n, dtype=float)

    # (1 + r)^n is shared by the numerator and the denominator
    growth = np.power(1 + rate, periods)
    annuity = principal * rate * growth / (growth - 1)

    # Interest-free loans: the payment is simply the principal split over n periods
    interest_free = principal / periods

    return np.where(rate == 0, interest_free, annuity)

In [8]:
# NOTE(review): Interest/100 is passed as the per-period rate while LoanTenure
# is computed in months. If 'Interest' is an annual percentage, the monthly
# rate would be Interest/1200 -- confirm the intended definition.
loan_data['EMI'] = cal_EMI(loan_data['Amount'], loan_data['Interest']/100, loan_data['LoanTenure'])

II. Eligible Loan Amount (ELA)

In [10]:
def eligible_loan_amnt(df):
    """Compute the Eligible Loan Amount (ELA) for every row of ``df``.

    Side effects
    ------------
    Adds two columns to ``df`` in place:

    * ``Ava_Inc``         -- 30 % of (IncomeTotal - LiabilitiesTotal), times LoanTenure.
    * ``Total_Loan_Amnt`` -- AppliedAmount plus Interest % of it, times LoanTenure, rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'IncomeTotal', 'LiabilitiesTotal', 'LoanTenure',
        'AppliedAmount' and 'Interest'.

    Returns
    -------
    numpy.ndarray
        Float array, one value per row:
        0 when Ava_Inc <= 0, otherwise Total_Loan_Amnt when it does not exceed
        Ava_Inc, otherwise Ava_Inc.
    """
    df['Ava_Inc'] = ((df['IncomeTotal'] - df['LiabilitiesTotal']) * 0.3) * df['LoanTenure']
    df['Total_Loan_Amnt'] = np.round(
        (df['AppliedAmount'] + (df['AppliedAmount'] * df['Interest']) / 100) * df['LoanTenure']
    )

    available = np.asarray(df['Ava_Inc'], dtype=float)
    requested = np.asarray(df['Total_Loan_Amnt'], dtype=float)

    # Grant the requested total when it fits into the available income,
    # otherwise cap it at the available income ...
    capped = np.where(requested <= available, requested, available)
    # ... and grant nothing at all when there is no available income.
    return np.where(available <= 0, 0.0, capped)

In [11]:
# Attach the ELA target; the call also adds the helper columns 'Ava_Inc' and
# 'Total_Loan_Amnt' to loan_data as a side effect.
loan_data['ELA'] = eligible_loan_amnt(loan_data)

In [12]:
# Summary statistics of ELA as a quick sanity check of its range.
loan_data['ELA'].describe()

count    7.739400e+04
mean     1.427982e+04
std      2.104740e+04
min      0.000000e+00
25%      4.007115e+03
50%      1.019737e+04
75%      1.925187e+04
max      1.018758e+06
Name: ELA, dtype: float64

III. Preferred ROI (PROI) ---> needs redefining

In [13]:

def PROI(df):
    """Compute the Preferred Return On Investment (PROI) for every row of ``df``.

    The PROI starts from the median ROI of the whole frame and is then shifted
    in steps of 5 by four independent rules:

    * LoanTenure     : <= 19 -> -5,  > 25 -> +5
    * AppliedAmount  : within [850, 1175] -> -5,  > 2000 -> +5
    * IncomeTotal    : <= 1000 -> -5
    * DebtToIncome   : == 0 -> -5, anything else -> +5

    Side effects
    ------------
    Adds the columns 'InterestAmount', 'TotalAmount', 'ROI' and 'PROI' to
    ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Amount', 'Interest', 'LoanTenure', 'AppliedAmount',
        'IncomeTotal' and 'DebtToIncome'. Any index is accepted.

    Returns
    -------
    pandas.Series
        The 'PROI' column of ``df``.
    """
    # Calculate ROI
    df['InterestAmount'] = df['Amount'] * (df['Interest'] / 100)
    df['TotalAmount'] = df['InterestAmount'] + df['Amount']
    df['ROI'] = (df['InterestAmount'] / df['TotalAmount']) * 100

    # Plain arrays keep the rules positional, independent of the frame's index
    tenure = df['LoanTenure'].values
    applied = df['AppliedAmount'].values
    income = df['IncomeTotal'].values
    debt_to_income = df['DebtToIncome'].values

    # One adjustment array per rule; 0 means "rule does not apply"
    tenure_adj = np.where(tenure <= 19, -5, np.where(tenure > 25, 5, 0))
    amount_adj = np.where((applied >= 850) & (applied <= 1175), -5,
                          np.where(applied > 2000, 5, 0))
    income_adj = np.where(income <= 1000, -5, 0)
    debt_adj = np.where(debt_to_income == 0, -5, 5)

    # Baseline (median ROI) plus all adjustments, assigned in one step instead
    # of chained per-row writes, which are unreliable under pandas Copy-on-Write
    df['PROI'] = df['ROI'].median() + tenure_adj + amount_adj + income_adj + debt_adj

    return df['PROI']

In [14]:
# Attach the PROI target; PROI() also writes 'InterestAmount', 'TotalAmount'
# and 'ROI' onto loan_data as a side effect.
loan_data['PROI'] = PROI(loan_data)
loan_data['PROI'].describe()

count    77394.000000
mean        27.574537
std          6.950948
min          4.216589
25%         24.216589
50%         29.216589
75%         29.216589
max         39.216589
Name: PROI, dtype: float64

IV. LoanStatus

In [15]:
# Encode the classification target as binary: 1 for 'NoDefault', 0 for everything else.
loan_data['LoanStatus'] = (loan_data['LoanStatus'] == 'NoDefault').astype(int)

#### **2. Handling Outliers:**

In [16]:
# Quartiles of every numerical feature, computed once and reused below
numeric_frame = loan_data.select_dtypes([float, int])
first_quartile = numeric_frame.quantile(.25)
third_quartile = numeric_frame.quantile(.75)

# Interquartile range per feature
df_IQR = third_quartile - first_quartile

# Upper and lower limits: 1.5 * IQR beyond the quartiles
df_Max = third_quartile + 1.5 * df_IQR
df_Min = first_quartile - 1.5 * df_IQR

In [17]:
# Replace outliers above the upper bound (Q3 + 1.5 * IQR) with the bound itself.
# Series.clip returns a new column that is assigned back in one step; the former
# chained form `loan_data[column][mask] = value` writes to a temporary object and
# is not guaranteed to modify loan_data (it is a no-op under pandas Copy-on-Write).
for column in loan_data.select_dtypes([float, int]).columns:
    col_IQR = loan_data[column].quantile(.75) - loan_data[column].quantile(.25)
    col_Max = loan_data[column].quantile(.75) + (1.5 * col_IQR)
    loan_data[column] = loan_data[column].clip(upper=col_Max)

In [18]:
# Replace outliers under the lower bound (Q1 - 1.5 * IQR) with the bound itself.
# Series.clip returns a new column that is assigned back in one step; the former
# chained form `loan_data[column][mask] = value` writes to a temporary object and
# is not guaranteed to modify loan_data (it is a no-op under pandas Copy-on-Write).
for column in loan_data.select_dtypes([float, int]).columns:
    col_IQR = loan_data[column].quantile(.75) - loan_data[column].quantile(.25)
    col_Min = loan_data[column].quantile(.25) - (1.5 * col_IQR)
    loan_data[column] = loan_data[column].clip(lower=col_Min)

---

#### 3. **X, y split**

In [19]:
# Target columns for both models. 'LoanStatus' is listed first because the
# train/test cell later separates the classification target by position.
target_columns = ['LoanStatus', 'EMI', 'ELA', 'PROI']

# Independent variables: everything except the targets
X = loan_data.drop(columns=target_columns)

# Dependent variables for the classification and regression models
y = loan_data[target_columns]

#### 4. Feature Selection

In [20]:
def Correlation(dataset, threshold):
    """Return the names of features that are highly correlated with an earlier one.

    For every pair of numeric columns, if the absolute Pearson correlation
    exceeds ``threshold``, the column that appears *later* in the frame is
    flagged, so that dropping the flagged set keeps one representative of each
    correlated group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input features. Non-numeric columns are ignored: they have no Pearson
        correlation, and passing them to ``DataFrame.corr`` raises on
        pandas >= 2.0.
    threshold : float
        Absolute correlation above which a feature is flagged.

    Returns
    -------
    set of str
        Names of the flagged columns.
    """
    numeric_only = dataset.select_dtypes(include=['number', 'bool'])
    abs_corr = numeric_only.corr().abs()
    column_names = abs_corr.columns

    correlated_features = set()
    for i in range(len(column_names)):
        # Look only at the columns to the left of i (strict lower triangle),
        # so each pair is examined once and the diagonal is skipped
        if (abs_corr.iloc[i, :i] > threshold).any():
            correlated_features.add(column_names[i])
    return correlated_features

In [21]:
# Flag the features whose absolute correlation with an earlier feature exceeds 0.8
Correlation(X, 0.8)

{'Amount',
 'AmountOfPreviousLoansBeforeLoan',
 'NoOfPreviousLoansBeforeLoan',
 'ROI',
 'TotalAmount',
 'Total_Loan_Amnt'}

In [22]:
# Remove the highly correlated features from the feature matrix.
# NOTE(review): 'LoanTenure' is dropped here as well although it is not in the
# set returned by Correlation(X, 0.8) -- confirm this is intended.
redundant_features = [
    'LoanTenure', 'ROI', 'Amount', 'TotalAmount', 'Total_Loan_Amnt',
    'AmountOfPreviousLoansBeforeLoan', 'NoOfPreviousLoansBeforeLoan',
]
X = X.drop(columns=redundant_features)

#### **5.Feature Encoding**

In [23]:
# One-hot encode the non-numeric columns; drop_first=True omits the first level
# of each encoded column so the dummies are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

---

#### **6. train, test split**

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Split the targets by problem type: 'LoanStatus' (the first column of y) feeds
# the classifier, the remaining columns feed the regressor.
classification_target = 'LoanStatus'

y_class_train = y_train[classification_target]
y_class_test = y_test[classification_target]

y_reg_train = y_train.drop(columns=classification_target)
y_reg_test = y_test.drop(columns=classification_target)

---

## Classification Pipeline

In [25]:
# Classification pipeline: standardise -> keep 110 principal components -> logistic regression
classification_steps = [
    ('stdscaler', StandardScaler()),
    ('pca', PCA(n_components=110)),
    ('Classifier', LogisticRegression(random_state=0)),
]
pipeline_class = Pipeline(steps=classification_steps)

# Fit every step on the training split only
pipeline_class.fit(X_train, y_class_train)

# Predict the loan status for the held-out split
pred_class = pipeline_class.predict(X_test)

In [26]:
# Compute all metrics first, then print them in a fixed layout
accuracy_pct = round(accuracy_score(y_class_test, pred_class) * 100, 2)
conf_matrix = confusion_matrix(y_class_test, pred_class)
class_report = classification_report(
    y_class_test, pred_class, target_names=['Default', 'Non-deafault']
)
separator = '*' * 40

print("Logistic Regression:")

print("\nAccuracy score:\n", accuracy_pct, '%')
print(separator)
print("\nConfusion Matrix:\n", conf_matrix)
print(separator)
print("\nClassification Report:\n", class_report)

Logistic Regression:

Accuracy score:
 89.35 %
****************************************

Confusion Matrix:
 [[9031 1648]
 [ 412 8258]]
****************************************

Classification Report:
               precision    recall  f1-score   support

     Default       0.96      0.85      0.90     10679
Non-deafault       0.83      0.95      0.89      8670

    accuracy                           0.89     19349
   macro avg       0.90      0.90      0.89     19349
weighted avg       0.90      0.89      0.89     19349



## Regression Pipeline

In [27]:
# Regression pipeline: standardise -> keep 110 principal components -> ridge regression
regression_steps = [
    ('stdscaler', StandardScaler()),
    ('pca', PCA(n_components=110)),
    ('Regressor', Ridge(random_state=0)),
]
pipeline_reg = Pipeline(steps=regression_steps)

# Fit every step on the training split only; all regression targets are fitted jointly
pipeline_reg.fit(X_train, y_reg_train)

# Predict the regression targets for the held-out split
pred_reg = pipeline_reg.predict(X_test)

In [28]:
# Score and test results
# r2_score receives all regression targets at once (EMI, ELA, PROI); with its
# default settings scikit-learn reports the uniform average of the per-target R2.
print('R2_score : ', round(r2_score(y_reg_test, pred_reg)*100,2), '%')

R2_score :  89.62 %


---

## saving model

In [29]:
# Persist both fitted pipelines. The context managers guarantee that each file
# is flushed and closed even if pickling fails part-way.
# NOTE: only unpickle these files from a trusted location -- pickle.load can
# execute arbitrary code.
with open('pipeline_class.pkl', 'wb') as class_file:
    pickle.dump(pipeline_class, class_file)

with open('pipeline_reg.pkl', 'wb') as reg_file:
    pickle.dump(pipeline_reg, reg_file)

---