In [1]:
# dataframe and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
#from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the loans.csv file into a pandas dataframe
loans_data = pd.read_csv('loans.csv')

In [3]:
# Report the (rows, columns) size of the raw table
print('loan data shape: ', loans_data.shape)

loan data shape:  (6019, 11)


In [4]:
# Preview the first rows of the raw table
loans_data.head()

Unnamed: 0,id_number,loan_amount,lender_count,status,funded_date,funded_amount,repayment_term,location_country_code,sector,description,use
0,736066,4825,60,funded,2014-08-03T17:51:50Z,4825,8,BJ,Retail,,
1,743090,975,34,funded,2014-08-18T09:10:54Z,975,12,BJ,Food,,
2,743120,950,25,funded,2014-08-09T17:46:35Z,950,14,BJ,Services,,
3,743121,825,28,funded,2014-08-24T17:00:38Z,825,14,BJ,Retail,,
4,743124,725,21,funded,2014-08-25T03:24:54Z,725,13,BJ,Retail,,


In [5]:
# Count missing values per column
loans_data.isnull().sum()

id_number                  0
loan_amount                0
lender_count               0
status                     0
funded_date              937
funded_amount              0
repayment_term             0
location_country_code     17
sector                     0
description              342
use                      342
dtype: int64

In [6]:
# Forward-fill missing country codes and assign the result back to the frame.
# Calling fillna(..., inplace=True) on a selected column can operate on a
# temporary copy (pandas Copy-on-Write), leaving loans_data unchanged, and the
# `method=` argument of fillna is deprecated in favour of ffill().
# A column left with NaNs would be removed by the later dropna(axis=1).
loans_data['location_country_code'] = loans_data['location_country_code'].ffill()

In [7]:
# Remove every column that still contains at least one missing value
loans_data = loans_data.dropna(axis='columns')

In [8]:
# Re-check the per-column missing-value counts after the cleanup
loans_data.isnull().sum()

id_number                0
loan_amount              0
lender_count             0
status                   0
funded_amount            0
repayment_term           0
location_country_code    0
sector                   0
dtype: int64

In [9]:
# Target is the repayment term; the features are whatever remains once the
# target and the id, funded-amount and country-code columns are removed
non_feature_columns = ['id_number', 'funded_amount', 'location_country_code', 'repayment_term']
y = loans_data['repayment_term']
x = loans_data.drop(columns=non_feature_columns)

In [10]:
# List the feature columns that remain
x.columns

Index(['loan_amount', 'lender_count', 'status', 'sector'], dtype='object')

In [11]:
# Preview the first rows of the feature frame
x.head()

Unnamed: 0,loan_amount,lender_count,status,sector
0,4825,60,funded,Retail
1,975,34,funded,Food
2,950,25,funded,Services
3,825,28,funded,Retail
4,725,21,funded,Retail


In [12]:
# Distinct labels found in the categorical `sector` column
x['sector'].unique()

array(['Retail', 'Food', 'Services', 'Clothing', 'Arts', 'Agriculture',
       'Manufacturing', 'Construction', 'Wholesale', 'Health',
       'Transportation', 'Education', 'Housing', 'Personal Use'],
      dtype=object)

In [13]:
# Show the target series (repayment_term)
print(y)

0        8
1       12
2       14
3       14
4       13
        ..
6014    14
6015    14
6016    14
6017    14
6018    14
Name: repayment_term, Length: 6019, dtype: int64


In [14]:
from sklearn.preprocessing import MinMaxScaler
# function to preprocess our data for training models
def preprocessing_data(data):
    """One-hot encode the categorical columns and min-max scale all features.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing the categorical columns ``status`` and
        ``sector``. Every other column must be numeric, because the whole
        encoded frame is handed to ``MinMaxScaler``.

    Returns
    -------
    array
        The result of ``MinMaxScaler.fit_transform`` on the encoded frame:
        one row per input row, with the non-categorical columns first
        (original order) followed by the dummy columns for ``status`` and
        ``sector``, each rescaled to the range [0, 1].

    Notes
    -----
    A new scaler is fitted on exactly the rows passed in and is not
    returned, so the same scaling cannot be re-applied to other data, and
    the dummy columns depend on the categories present in ``data``.
    """
    # categorical features to be converted to One Hot Encoding
    categorical_columns = ['status', 'sector']

    # One Hot Encoding conversion; kept under a new name so the caller's
    # frame and the encoded frame are not confused
    encoded = pd.get_dummies(data, columns=categorical_columns)

    # scale every column into the range 0 to 1 (the scaler casts to float,
    # so no separate integer-to-float conversion is required)
    scaler = MinMaxScaler(feature_range=(0, 1))
    return scaler.fit_transform(encoded)

In [15]:
# Encode and scale the feature frame into the model input matrix.
# NOTE(review): this runs on the full dataset before the train/test split
# below, so the scaler's min/max are computed from rows that later end up in
# the validation and test sets.
X = preprocessing_data(x)


In [16]:
# (rows, encoded feature columns) of the model input matrix
X.shape

(6019, 19)

In [18]:
# Hold out 20% of the rows as the test set (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [19]:
# Carve 10% of the remaining training rows off as a validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.1,
)

In [20]:
# Print the shape of each feature split
for label, split in (
    ("X_train shape:", X_train),
    ("X_val shape:", X_val),
    ("X_test shape:", X_test),
):
    print(label, split.shape)

X_train shape: (4333, 19)
X_val shape: (482, 19)
X_test shape: (1204, 19)


In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [22]:
# Fit a linear regression baseline on the training split
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

In [23]:
# Predict repayment_term for the validation rows with the fitted linear regression
y_val_pred = reg_model.predict(X_val)

In [24]:
# Score the linear regression predictions against the validation targets
MSE = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
R2 = r2_score(y_true=y_val, y_pred=y_val_pred)

print("Mean Squared Error: ", MSE)
print("R-squared: ", R2)

Mean Squared Error:  49.28866741766079
R-squared:  0.274773331815404


In [25]:
# Fit a 100-tree random forest regressor on the training split
# (random_state fixed so the fitted forest is repeatable)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [26]:
# Predict repayment_term for the validation rows with the fitted random forest
y_rf_pred = rf_model.predict(X_val)

In [27]:
# Score the random forest predictions against the validation targets
MSE = mean_squared_error(y_true=y_val, y_pred=y_rf_pred)
R2 = r2_score(y_true=y_val, y_pred=y_rf_pred)

print("Mean Squared Error: ", MSE)
print("R-squared: ", R2)

Mean Squared Error:  12.034858578107672
R-squared:  0.822920747385707


In [36]:
# Predict repayment_term for the held-out test rows with the fitted random forest
y_test_rf_prediction = rf_model.predict(X_test)

In [37]:
# Score the random forest predictions against the held-out test targets
MSE = mean_squared_error(y_true=y_test, y_pred=y_test_rf_prediction)
R2 = r2_score(y_true=y_test, y_pred=y_test_rf_prediction)

print("Mean Squared Error: ", MSE)
print("R-squared: ", R2)

Mean Squared Error:  17.308459928295633
R-squared:  0.8610010897853573


In [38]:
import pickle
# Persist the fitted random forest to disk.
# NOTE(review): only the estimator is saved; the scaler and one-hot column
# layout created inside preprocessing_data are not persisted, so new data
# cannot be transformed identically from this file alone.
# Only unpickle this file from a trusted source.
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)