# Springboard - Data Science Career Track

# Capstone Project I: Kiva.org loan delays

In [1]:
# Source of the dataset: a Kaggle page; the URL fragment names the file all_kiva_loans.csv.
# NOTE(review): this variable is not read by any cell visible in this notebook — confirm whether it is only informational.
kiva_url = "https://www.kaggle.com/lucian18/mpi-on-regions#all_kiva_loans.csv"

### Goal of the project:
    
**Understand what may cause longer delays between the following loan milestones:**

-  the "posted time" (the time at which the loan is posted on Kiva by the field agent)
-  the "funded time" (the time at which the loan posted to Kiva gets 100% funded by lenders)
- the "disbursed time" (the time at which the loan is disbursed by the field agent to the borrower)

#### **The journey of a Kiva loan**

- A borrower applies for a loan.
- The loan goes through the underwriting and approval process.
- If it is approved, the loan disbursal period starts
    - The loan is posted to Kiva for lenders to support
    - Fundraising period
    - Fundraising complete (or not)
- Borrower repays the loan
- Lenders use repayments to fund new loans, donate or withdraw the money.

Partner loans are administered by Kiva’s Field Partners and are available to borrowers in more than 80 countries. Direct loans are made using the digital payment system and don’t involve a Field Partner.

**Note**: *Disbursal* refers to when the borrower can access the money, but the timing of it can vary:    
  - For most Field Partner loans, the money is *pre-disbursed* (when partners give the funds out before the loan is posted), so the borrower can access the funds right away. 
  - For direct loans, the money is disbursed only after the loan has been fully crowdfunded on the Kiva website.

### Libraries

In [2]:
%matplotlib inline
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import zscore, normaltest, shapiro, norm, t
from collections import Counter
sns.set()
from scipy import stats
import scipy
import statsmodels.api as sm

## 1) Data inspection and Cleaning

https://github.com/MigBap/Springboard-Capstone-Project-I/blob/master/Capstone%20Project%20I%20--%201)%20Data%20inspection%20and%20Cleaning.ipynb

## 2) Exploratory Data Analysis

https://github.com/MigBap/Springboard-Capstone-Project-I/blob/master/Capstone%20Project%20I%20--%20%202)%20Exploratory%20Data%20Analysis.ipynb

In [3]:
%%capture
# Execute the Exploratory Data Analysis notebook from this kernel; %%capture keeps its output out of this notebook.
# NOTE(review): `final_df`, used in the Modeling section, is not defined anywhere in this notebook,
# so it presumably comes from this run — confirm against the EDA notebook.
%run "Capstone Project I --  2) Exploratory Data Analysis.ipynb"

## 3) Modeling

In [6]:
# Preview the first rows of the modeling table; `real_time` is the column used as the regression target below.
final_df.head()

Unnamed: 0,real_time,loan_amount,lender_term,num_lenders_total,sector_name_Agriculture,sector_name_Arts,sector_name_Clothing,sector_name_Construction,sector_name_Education,sector_name_Entertainment,...,partner_id_551.0,partner_id_552.0,partner_id_553.0,partner_id_555.0,partner_id_556.0,partner_id_557.0,repayment_interval_bullet,repayment_interval_irregular,repayment_interval_monthly,repayment_interval_weekly
0,3,700.0,10.0,15,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,4,1000.0,12.0,15,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,4,1600.0,18.0,18,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,600.0,15.0,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,6,450.0,9.0,10,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Separate the predictors from the response; both are extracted as NumPy arrays for scikit-learn.
X = final_df.drop(columns=['real_time']).values  # every column except the target
y = final_df['real_time'].values                 # regression target

In [None]:




# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

# Linear regression baseline: fit on the training split, then predict the held-out rows.
reg_all = LinearRegression()
reg_all.fit(X_train, y_train)
y_pred = reg_all.predict(X_test)

# For classifiers, .score(X_test, y_test) returns the fraction of correct predictions.
# For linear regression, it returns R^2, the coefficient of determination
# (the share of the target's variance explained by the features).
print("R^2:", reg_all.score(X_test, y_test))

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [None]:
%%time

# Rebuild the target vector and feature matrix so this cell does not rely on arrays left over from earlier cells
y = final_df['real_time'].values
X = final_df.drop('real_time', axis=1).values

# Split data into 75% train and 25% test (test_size=0.25); fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

# Instantiate a DecisionTreeRegressor 'dt'
# min_samples_leaf=0.1 ---> each leaf has to contain at least 10% of the training samples
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.1, random_state=1)

# Fit 'dt' to the training set
dt.fit(X_train, y_train)

# Predict the target for the test set
y_pred = dt.predict(X_test)

# Compute test-set RMSE: the square root of the MSE, expressed in the same units as the target
rmse_dt = MSE(y_test, y_pred) ** (1/2)

# The reported value is the RMSE (not the MSE), so label it accordingly
print("Root mean squared error (RMSE):", rmse_dt)