# ASSESSMENT PROJECT 


# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


# Load Data

In [2]:
columns_to_use = ['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
                  'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
                  'loan_status', 'purpose','repay_fail']

loan = pd.read_excel('loan-data.xlsX', usecols=columns_to_use)

#      Understanding Data

In [3]:
loan.columns

Index(['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'loan_status', 'purpose', 'repay_fail'],
      dtype='object')

In [4]:
# loan.head()

# Checking missing values

In [5]:
loan.isnull().sum()

loan_amnt                1
funded_amnt              1
term                     0
int_rate                 0
installment              1
emp_length             993
home_ownership           0
annual_inc               2
verification_status      0
loan_status              0
purpose                  0
repay_fail               0
dtype: int64

In [6]:
loan=loan.dropna()

In [7]:
# loan.info()

# Clean Unnecessary Columns

In [8]:
loan_dropped=loan

In [9]:
loan_dropped['term'].unique()

array(['36 months', '60 months'], dtype=object)

In [10]:
loan_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37485 entries, 0 to 38479
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   loan_amnt            37485 non-null  float64
 1   funded_amnt          37485 non-null  float64
 2   term                 37485 non-null  object 
 3   int_rate             37485 non-null  float64
 4   installment          37485 non-null  float64
 5   emp_length           37485 non-null  object 
 6   home_ownership       37485 non-null  object 
 7   annual_inc           37485 non-null  float64
 8   verification_status  37485 non-null  object 
 9   loan_status          37485 non-null  object 
 10  purpose              37485 non-null  object 
 11  repay_fail           37485 non-null  int64  
dtypes: float64(5), int64(1), object(6)
memory usage: 3.7+ MB


In [11]:
loan_dropped['term']=loan_dropped['term'].str.split().str.get(0)

In [12]:
loan_dropped['term']=loan_dropped['term'].astype(int)

In [13]:
loan_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37485 entries, 0 to 38479
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   loan_amnt            37485 non-null  float64
 1   funded_amnt          37485 non-null  float64
 2   term                 37485 non-null  int32  
 3   int_rate             37485 non-null  float64
 4   installment          37485 non-null  float64
 5   emp_length           37485 non-null  object 
 6   home_ownership       37485 non-null  object 
 7   annual_inc           37485 non-null  float64
 8   verification_status  37485 non-null  object 
 9   loan_status          37485 non-null  object 
 10  purpose              37485 non-null  object 
 11  repay_fail           37485 non-null  int64  
dtypes: float64(5), int32(1), int64(1), object(5)
memory usage: 3.6+ MB


In [14]:
loan_dropped['emp_length'].unique()

array(['< 1 year', '4 years', '10+ years', '3 years', '5 years',
       '7 years', '2 years', '1 year', '6 years', '9 years', '8 years'],
      dtype=object)

In [15]:
emp_length_mapping = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10
}

# Use the mapping to replace the strings with integers
loan_dropped['emp_length'] = loan_dropped['emp_length'].replace(emp_length_mapping)

In [16]:
loan_dropped['emp_length'].value_counts()

10    8464
0     4565
2     4292
3     3939
4     3314
1     3254
5     3170
6     2144
7     1702
8     1445
9     1196
Name: emp_length, dtype: int64

In [17]:
# loan_dropped.info()

In [18]:
loan_dropped['emp_length']=loan_dropped['emp_length'].astype(float)

In [19]:
# loan_dropped.info()

In [20]:
loan_dropped['verification_status'].unique()

array(['Not Verified', 'Verified', 'Source Verified'], dtype=object)

In [21]:
verification_status_mapping = {
    'Not Verified': 0,
    'Verified': 1,
    'Source Verified': 1
}

loan_dropped['verification_status'] = loan_dropped['verification_status'].replace(verification_status_mapping)

In [22]:
# loan_dropped.info()

In [23]:
loan_dropped['home_ownership'].value_counts()

RENT        17868
MORTGAGE    16723
OWN          2765
OTHER         125
NONE            4
Name: home_ownership, dtype: int64

In [24]:
home_ownership_mapping = {
    'NONE': "OTHER",
}

loan_dropped['home_ownership'] = loan_dropped['home_ownership'].replace(home_ownership_mapping)

In [25]:
# loan_dropped.info()

In [26]:
loan_dropped['loan_status'].value_counts()

Fully Paid                                             29266
Charged Off                                             4908
Does not meet the credit policy. Status:Fully Paid      1764
Current                                                  832
Does not meet the credit policy. Status:Charged Off      674
Late (31-120 days)                                        19
In Grace Period                                           14
Late (16-30 days)                                          7
Default                                                    1
Name: loan_status, dtype: int64

In [27]:
loan_status_mapping = {
   "Does not meet the credit policy. Status:Fully Paid" : "Fully Paid",
    "Does not meet the credit policy. Status:Charged Off": "Charged Off" ,
    "In Grace Period":"Late",
    "Default":"Late",
    
    "Late (31-120 days)":"Late",
   "Late (16-30 days)" :"Late"
}

loan_dropped['loan_status'] = loan_dropped['loan_status'].replace(loan_status_mapping)

In [28]:
loan_dropped['loan_status'].value_counts()

Fully Paid     31030
Charged Off     5582
Current          832
Late              41
Name: loan_status, dtype: int64

In [29]:
loan_dropped['purpose'].value_counts()

debt_consolidation    17545
credit_card            4849
other                  3804
home_improvement       2811
major_purchase         2014
small_business         1769
car                    1433
wedding                 900
medical                 652
moving                  542
educational             377
house                   373
vacation                332
renewable_energy         84
Name: purpose, dtype: int64

In [30]:
# loan_dropped.info()

In [31]:
loan_dropped.columns

Index(['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'loan_status', 'purpose', 'repay_fail'],
      dtype='object')

In [33]:
loan_dropped.columns

Index(['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'loan_status', 'purpose', 'repay_fail'],
      dtype='object')

In [34]:
# loan_dropped

# Split Data

In [35]:

X = loan_dropped[['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'loan_status', 'purpose']]
y = loan_dropped['repay_fail']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# Create Dummy Variables

In [36]:

X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Create a logistic regression model

In [37]:

logistic_regression_model = LogisticRegression()

# Fit the model on the training data
logistic_regression_model.fit(X_train_encoded, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
# Predict and Evaluate
y_pred = logistic_regression_model.predict(X_test_encoded)

# Classification Report

In [39]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.99      0.92      6366
           1       0.49      0.03      0.06      1131

    accuracy                           0.85      7497
   macro avg       0.67      0.51      0.49      7497
weighted avg       0.80      0.85      0.79      7497



In [40]:
loan_dropped['repay_fail'].value_counts()


0    31876
1     5609
Name: repay_fail, dtype: int64

# Balance Data

In [44]:
# Filter samples with 'repay_fail' column value 0
repay_not_fail_samples = loan_dropped[loan_dropped['repay_fail'] == 0]

# Sample exactly 6000 rows from 'repay_fail' column value 0 samples
repay_not_fail_sampled = repay_not_fail_samples.sample(6000, random_state=0)

# Filter samples with 'repay_fail' column value 1
repay_fail_samples = loan_dropped[loan_dropped['repay_fail'] == 1]

# Combine 'repay_fail' column value 0 sampled and 'repay_fail' column value 1 samples
balanced_dataset = pd.concat([repay_not_fail_sampled, repay_fail_samples])

X_balanced = balanced_dataset[['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
                               'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
                               'purpose']]
y_balanced = balanced_dataset['repay_fail']

# Split the balanced dataset into training and testing sets
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=0)

In [47]:
balanced_dataset['repay_fail'].value_counts()

0    6000
1    5609
Name: repay_fail, dtype: int64

In [52]:
balanced_dataset.columns

Index(['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'loan_status', 'purpose', 'repay_fail'],
      dtype='object')

In [54]:
X = balanced_dataset[['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'loan_status', 'purpose']]
y = balanced_dataset['repay_fail']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [55]:

X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

In [56]:
logistic_regression_model = LogisticRegression()

# Fit the model on the training data
logistic_regression_model.fit(X_train_encoded, y_train)

In [57]:
# Predict and Evaluate
y_pred = logistic_regression_model.predict(X_test_encoded)

# Classification Report on Balanced Data

In [58]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.67      0.66      1150
           1       0.66      0.65      0.66      1172

    accuracy                           0.66      2322
   macro avg       0.66      0.66      0.66      2322
weighted avg       0.66      0.66      0.66      2322

