In [2]:
# dataframe and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
#from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Load  the loan.csv file into a pandas dataframe
loans_data = pd.read_csv('loans.csv')

In [4]:
print('loan data shape: ', loans_data.shape)

loan data shape:  (6019, 11)


In [5]:
loans_data.head()

Unnamed: 0,id_number,loan_amount,lender_count,status,funded_date,funded_amount,repayment_term,location_country_code,sector,description,use
0,736066,4825,60,funded,2014-08-03T17:51:50Z,4825,8,BJ,Retail,,
1,743090,975,34,funded,2014-08-18T09:10:54Z,975,12,BJ,Food,,
2,743120,950,25,funded,2014-08-09T17:46:35Z,950,14,BJ,Services,,
3,743121,825,28,funded,2014-08-24T17:00:38Z,825,14,BJ,Retail,,
4,743124,725,21,funded,2014-08-25T03:24:54Z,725,13,BJ,Retail,,


In [6]:
loans_data.isnull().sum()

id_number                  0
loan_amount                0
lender_count               0
status                     0
funded_date              937
funded_amount              0
repayment_term             0
location_country_code     17
sector                     0
description              342
use                      342
dtype: int64

In [7]:
# Forward fill the 'location_country_code' column
loans_data['location_country_code'].fillna(method='ffill', inplace=True)

In [8]:
# Drop columns with missing values
loans_data.drop(['funded_date', 'description', 'use'], axis=1, inplace=True)

In [9]:
# Check for any remaining missing values
print(loans_data.isnull().sum())

id_number                0
loan_amount              0
lender_count             0
status                   0
funded_amount            0
repayment_term           0
location_country_code    0
sector                   0
dtype: int64


In [10]:
loans_data.dtypes

id_number                 int64
loan_amount               int64
lender_count              int64
status                   object
funded_amount             int64
repayment_term            int64
location_country_code    object
sector                   object
dtype: object

In [11]:
loans_data['status'].unique()

array(['funded', 'fundraising', 'expired'], dtype=object)

In [13]:
# Separate features and target variable
X = loans_data[['loan_amount','lender_count', 'repayment_term']]
y = loans_data['status']

In [12]:
X.dtypes

loan_amount       int64
lender_count      int64
repayment_term    int64
dtype: object

In [13]:
X.head()

Unnamed: 0,loan_amount,lender_count,repayment_term
0,4825,60,8
1,975,34,12
2,950,25,14
3,825,28,14
4,725,21,13


In [14]:
# One-hot encode the 'status' feature
y = pd.get_dummies(y, columns=['status'], prefix='status')

In [15]:
# Check for any remaining missing values
print(y.columns)

Index(['status_expired', 'status_funded', 'status_fundraising'], dtype='object')


In [16]:
from sklearn.preprocessing import MinMaxScaler
# function to preprocess our data from train models
def preprocessing_data(data):
    numerical_data = ['loan_amount', 'lender_count', 'repayment_term']
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    data[numerical_data] = scaler.fit_transform(data[numerical_data])
    return data

In [17]:
processed_data = preprocessing_data(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[numerical_data] = scaler.fit_transform(data[numerical_data])


In [18]:
processed_data.shape

(6019, 3)

In [19]:
processed_data.head()

Unnamed: 0,loan_amount,lender_count,repayment_term
0,0.059725,0.022514,0.038462
1,0.01157,0.012758,0.069231
2,0.011257,0.009381,0.084615
3,0.009694,0.010507,0.084615
4,0.008443,0.00788,0.076923


In [20]:
x = processed_data

In [21]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [22]:
# Further split training data into actual training and validation data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [23]:
# Display the shapes of the datasets
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)

X_train shape: (4333, 3)
X_val shape: (482, 3)
X_test shape: (1204, 3)


In [24]:
X_train.head()

Unnamed: 0,loan_amount,lender_count,repayment_term
3048,0.121951,0.116698,0.053846
1854,0.003127,0.0,0.084615
2857,0.023765,0.028518,0.053846
2156,0.007817,0.009006,0.061538
3192,0.002502,0.003752,0.115385


In [25]:
from xgboost import XGBClassifier
# Create XGBoost classifier
xg_model = XGBClassifier()

# Train the model on the training data
xg_model.fit(X_train, y_train)

In [26]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Make predictions on the validation set
y_val_pred = xg_model.predict(X_val)

# Get error rate
print("Error rate of XGB classifier: ", 1 - accuracy_score(y_val, y_val_pred))

Error rate of XGB classifier:  0.06846473029045641


In [27]:
from sklearn.metrics import confusion_matrix, classification_report
#Get classification report
print(classification_report(y_val,y_val_pred, target_names=['funded','fundraising','expired']))

              precision    recall  f1-score   support

      funded       1.00      0.33      0.50         6
 fundraising       0.96      0.98      0.97       402
     expired       0.86      0.74      0.80        74

   micro avg       0.94      0.93      0.94       482
   macro avg       0.94      0.68      0.75       482
weighted avg       0.94      0.93      0.93       482
 samples avg       0.93      0.93      0.93       482



  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Make predictions on the test data
y_test_pred = xg_model.predict(X_test)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9260797342192691


In [32]:
y_test_pred.shape

(1204, 3)

In [33]:
predicted_values=np.argmax(y_test_pred, axis=1)

In [34]:
predicted_values

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [39]:
# Manually create a dictionary to map class indices to status names
class_index_to_status = {1: 'funded', 2: 'fundraising', 0: 'expired'}  # Add more classes if needed

# Map the predicted class indices back to their actual status names
predicted_statuses = [class_index_to_status.get(prediction, 'unknown') for prediction in predicted_values]


In [40]:
submission_df = pd.DataFrame({'predicted_status': predicted_statuses})

In [41]:
submission_df['predicted_status'].unique()

array(['funded', 'fundraising', 'expired'], dtype=object)

In [42]:
submission_df.head()

Unnamed: 0,predicted_status
0,funded
1,funded
2,funded
3,funded
4,funded


In [43]:
submission_df['predicted_status'].value_counts()

predicted_status
funded         1035
fundraising     135
expired          34
Name: count, dtype: int64

In [66]:
submission_df.to_csv('final_submission.csv', index=False)

In [67]:
import pickle

In [68]:
with open('xg_model.pkl', 'wb') as file:
    pickle.dump(xg_model, file)