Backstory: Kiva has been collecting this data for a month and wants to understand what has influenced borrowers' success in securing funding so far. Look at: Which borrower countries have been most successful? Borrower demographics such as gender and sector? Currency? Date and time? Term in months? Lender count?

## Factors that influence the success of a loan being fully funded within top 10 countries
## An inference model that explains the relationship between the features and the funded percentage.

In [1]:
import pandas as pd
from datetime import date
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from feature_engine.categorical_encoders import MeanCategoricalEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectFromModel

# for feature engineering
from feature_engine import missing_data_imputers as mdi
from feature_engine import discretisers as dsc
from feature_engine import categorical_encoders as ce

## DATA COLLECTION

In [9]:
# Load the raw Kiva loans export. With keep_default_na=False pandas does not
# turn blank fields or strings such as "NA" into NaN, so blanks stay as empty
# strings and the isnull() check that follows reports zero for every column.
kiva = pd.read_csv('kiva_loans.csv', sep=',', keep_default_na=False)
kiva.head(1)

Unnamed: 0,id,funded_amount,loan_amount,activity,sector,use,country_code,country,region,currency,partner_id,posted_time,disbursed_time,funded_time,term_in_months,lender_count,tags,borrower_genders,repayment_interval,date
0,653051,300.0,300.0,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",PK,Pakistan,Lahore,PKR,247.0,2014-01-01 06:12:39+00:00,2013-12-17 08:00:00+00:00,2014-01-02 10:06:32+00:00,12.0,12,,female,irregular,2014-01-01


In [10]:
kiva.drop(columns='id', inplace=True)

In [11]:
kiva.isnull().sum()

funded_amount         0
loan_amount           0
activity              0
sector                0
use                   0
country_code          0
country               0
region                0
currency              0
partner_id            0
posted_time           0
disbursed_time        0
funded_time           0
term_in_months        0
lender_count          0
tags                  0
borrower_genders      0
repayment_interval    0
date                  0
dtype: int64

## Binning (creating interval of features)

In [12]:
kiva['borrower_genders'].value_counts()

female                                                                                                                                                                                                                                                                                                                                                                                                            426502
male                                                                                                                                                                                                                                                                                                                                                                                                              134710
female, female                                                                                                                                                                        

In [5]:
# 1 when borrower_genders is exactly 'female' (a single female borrower), else 0.
# Group loans such as 'female, female' are therefore counted as 0 here.
kiva['general_female_check'] = kiva['borrower_genders'].eq('female').astype(int)

kiva['general_female_check'].value_counts()

1    426502
0    244703
Name: general_female_check, dtype: int64

In [7]:
# 1 when borrower_genders is exactly 'male' (a single male borrower), else 0.
# The count of 1s here does not equal the count of 0s in general_female_check
# because rows that are neither exactly 'female' nor exactly 'male' (for
# example 'female, female') are 0 in both columns.
kiva['general_male_check'] = kiva['borrower_genders'].eq('male').astype(int)

kiva['general_male_check'].value_counts()

0    536495
1    134710
Name: general_male_check, dtype: int64

In [None]:
# Flag loans that list more than one female borrower.  borrower_genders is a
# comma-separated string such as 'female, female', so the females are counted
# per row with a word-boundary regex.  The previous expression joined two
# Series with `and`, which raises ValueError, and referenced an undefined
# `borrower_genders` name.
# NOTE(review): the column name says "gender" but the original test was on the
# female count; confirm which was intended.
kiva['more_than_1_gender_check'] = (
    kiva['borrower_genders'].str.count(r'\bfemale\b') > 1
).astype(int)

# just 1 female

In [None]:
# 1 only for loans whose borrower_genders value is exactly 'female'.
kiva['general_female_check'] = kiva['borrower_genders'].eq('female').astype(int)

# count of number of females, # of males in group

In [None]:
# Flag loans that list more than one female borrower.  Indexing the column
# with ['female'] is a label lookup on the row index and raises KeyError, so
# the occurrences of the word 'female' are counted inside each row's string
# instead (the word boundary keeps 'male' and 'female' apart).
# NOTE(review): the column name says "gender" but the original test was on the
# female count; confirm which was intended.
kiva['more_than_1_gender_check'] = (
    kiva['borrower_genders'].str.count(r'\bfemale\b') > 1
).astype(int)

# number of borrowers per loan

In [None]:
kiva.head()

Target: funded amount, aka y
Hypotheses about the dataset so far:
- men are more successful at securing funding than women
- women are more successful at securing funding in family-oriented sectors (then compare by sector)
- 

Notes about data
- High cardinality - easy to overfit aka too much noise

In [None]:
kiva.shape

In [None]:
kiva.dtypes

In [None]:
# Columns stored with the generic object dtype are treated as categorical.
categorical = [col for col, dtype in kiva.dtypes.items() if dtype == 'O']
print(f'There are {len(categorical)} categorical variables')

In [None]:
categorical

In [None]:
# Drop the columns that are not used in the analysis.  'id' was already removed
# in an earlier cell, so errors='ignore' keeps this cell from raising KeyError
# on it and makes the cell safe to re-run.
kiva.drop(
    columns=['tags', 'country_code', 'repayment_interval', 'currency',
             'region', 'use', 'disbursed_time', 'id'],
    inplace=True,
    errors='ignore',
)

In [None]:
# list of the numerical variables (every column that is not object dtype)
numerical = [col for col, dtype in kiva.dtypes.items() if dtype != 'O']
numerical

In [None]:
# list of variables that contain time and date information.
# Every column is searched rather than only `numerical`: the timestamp columns
# are still plain strings (object dtype) at this point -- pd.to_datetime is
# applied in a later cell -- so they never appear in `numerical` and the list
# would always come back empty.
kiva_temporal = [each for each in kiva.columns if '_time' in each or 'date' in each]
kiva_temporal

In [None]:
# Treat numerical variables with fewer than 20 distinct values as discrete
# and show those values.
discrete = []

for col in numerical:
    distinct = kiva[col].unique()
    if len(distinct) < 20:
        print(col, ' values: ', distinct)
        discrete.append(col)
print()
print('There are {} discrete variables'.format(len(discrete)))

In [None]:
# find continuous variables: the numerical columns that were not classed as
# discrete in the previous cell
continuous = [each for each in numerical if each not in discrete]

# report the size of `continuous` (the count printed before was len(numerical))
print('There are {} continuous variables'.format(len(continuous)))

In [None]:
# Print the fraction of missing values for every column that has any.
# The original loop body referenced an undefined name `var`; the share is now
# computed once per column and only the non-zero entries are printed.
missing_share = kiva.isnull().mean()
for each, share in missing_share[missing_share > 0].items():
    print(each, share)

In [None]:
kiva.info()

### FEATURE ENGINEERING

In [None]:
kiva[['posted_time', 'funded_time','date']].head()

In [None]:
# Parse the three timestamp columns into pandas datetimes.
for time_col in ('posted_time', 'funded_time', 'date'):
    kiva[time_col] = pd.to_datetime(kiva[time_col])

kiva[['posted_time','funded_time','date']].head()

In [None]:
kiva['funded_percentage'] = kiva['funded_amount'] / kiva['loan_amount']

In [None]:
kiva.info()

In [None]:
# Box plot (left panel) and 20-bin histogram (right panel) for each numerical
# variable, one figure per variable.
for col in numerical:
    plt.figure(figsize=(6, 4))

    plt.subplot(1, 2, 1)
    box_ax = kiva.boxplot(column=col)
    box_ax.set_title('')
    box_ax.set_ylabel(col)

    plt.subplot(1, 2, 2)
    hist_ax = kiva[col].hist(bins=20)
    hist_ax.set_xlabel(col)

    plt.show()
    
    
    # higher end showing for all features below

In [None]:
# pd.get_dummies(kiva, drop_first=True)

In [None]:
kiva.head()

In [None]:
for each in kiva.columns:
    print(each, 'has', len(kiva[each].unique()), 'subfeatures')

In [None]:
kiva.describe()

In [None]:
#kiva.columns.nunique().plot.bar(figsize=(10,6))
#plt.title('CARDINALITY: Number of categories in categorical variables')
#plt.xlabel('Categorical variables')
#plt.ylabel('Number of different categories');

In [None]:
basic_model = kiva[['country','activity']]

In [None]:
basic_model.head()

In [None]:
basic_model_2 = kiva[['country', 'funded_percentage']]

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the categorical `country` column only.  funded_percentage is
# continuous, so passing it to OneHotEncoder would create one indicator column
# per distinct value; it remains available, unencoded, in basic_model_2.
# The result columns are labelled with the country names and keep the
# original row index.
enc = OneHotEncoder()
country_onehot = enc.fit_transform(basic_model_2[['country']])
enc_fund = pd.DataFrame(
    country_onehot.toarray(),
    index=basic_model_2.index,
    columns=enc.categories_[0],
)

In [None]:
# enc_bm.head()
enc_fund.head()

In [None]:
# Predict funded_percentage (numeric target) from the borrower's country.
# The original assignment had the roles reversed: a 1-D numeric Series as X and
# the country strings as y, which LinearRegression cannot fit.  The country is
# one-hot encoded here so that X is a 2-D numeric frame.
X_train = pd.get_dummies(basic_model_2['country'])
y_train = basic_model_2['funded_percentage']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.30, random_state = 42)

In [None]:
model = LinearRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
# pd.get_dummies(basic_model, drop_first=True)

In [None]:
# `enc_bm` is not defined in any visible cell, so the predictors are built
# here: one-hot encoded `activity`, used to classify the loan's `country`.
# NOTE(review): country is left out of the predictors because it is the
# target; confirm this matches what enc_bm was meant to hold.
enc_bm = pd.get_dummies(basic_model[['activity']])

X_train, X_test, y_train, y_test = train_test_split(
    enc_bm,  # predictors
    basic_model['country'],  # target
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

In [None]:
model = LogisticRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_train, y_train), model.score(X_test, y_test)

In [None]:
basic_model['country'].value_counts(normalize=True)

## Feature Selection:  Basic methods + Lasso pipeline

## Feature Selection:  Backwards Method

## Feature Selection:  Forward Method

In [None]:
# select top common features between both backward and forward method

In [None]:
# funded_percentage is computed as funded_amount / loan_amount, so leaving
# funded_amount among the predictors would hand the model its own target.
# It is dropped together with the target column itself.
X_train, X_test, y_train, y_test = train_test_split(
    kiva.drop(columns=['funded_percentage', 'funded_amount']),  # predictors
    kiva['funded_percentage'],  # target
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

In [None]:
mean_encoder = MeanCategoricalEncoder()

In [None]:
mean_encoder.fit(X_train, y_train)

In [None]:
X_train = mean_encoder.transform(X_train)
X_test = mean_encoder.transform(X_test)

In [None]:
X_train.isnull().sum()

In [None]:
X_test.isnull().sum()

In [None]:
X_train.drop(columns=['funded_time','borrower_genders','partner_id'], inplace=True)
X_test.drop(columns=['funded_time','borrower_genders','partner_id'], inplace=True)
                

In [None]:
X_train.isnull().sum()

In [None]:
X_test.isnull().sum()

In [None]:
X_train.shape, X_test.shape

In [None]:
X_train_original = X_train.copy()
X_test_original = X_test.copy()

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is reported when the absolute value of its correlation with at
    least one column positioned before it in ``dataset.corr()`` is strictly
    greater than ``threshold``.  The first column of each correlated pair is
    never reported on account of that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are inspected.
    threshold : float
        Absolute correlation above which a column is flagged.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    abs_corr = dataset.corr().abs()
    flagged = set()
    for position, name in enumerate(abs_corr.columns):
        # Only the lower triangle is inspected: columns placed before this one.
        earlier = abs_corr.iloc[position, :position]
        if (earlier > threshold).any():
            flagged.add(name)
    return flagged


corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)))

In [None]:
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

In [None]:
X_train_corr = X_train.copy()
X_test_corr = X_test.copy()

In [None]:
# scaler = StandardScaler()
# scaler.fit(X_train)

In [None]:
# Lasso-based feature selection.  The original cell did not parse
# (`LinearRegression(,`), passed a random_state that LinearRegression does not
# accept, and used a `scaler` whose definition is commented out in the cell
# above, so the scaler is created and fitted here.
scaler = StandardScaler()
scaler.fit(X_train)

# NOTE(review): alpha=0.001 is a starting value, not a tuned one -- it controls
# how many coefficients are shrunk to zero and should be chosen by validation.
sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=10))
sel_.fit(scaler.transform(X_train), y_train)

# remove features with zero coefficient from dataset
# and parse again as dataframe

X_train_lasso = pd.DataFrame(sel_.transform(X_train))
X_test_lasso = pd.DataFrame(sel_.transform(X_test))

# add the columns name
X_train_lasso.columns = X_train.columns[(sel_.get_support())]
X_test_lasso.columns = X_train.columns[(sel_.get_support())]

In [None]:
mean_encoder.encoder_dict_

In [None]:
X_train.head()

In [None]:
kiva.describe()

In [None]:
X = kiva.drop(columns=['funded_percentage'])
y = kiva['funded_percentage']

In [None]:
lr = LinearRegression()

In [None]:
# cross_val_score(lr, X, y, cv=5).mean()
# for baseline R2 score

In [None]:
# kiva['activity'].value_counts(ascending=True).plot(kind='bar');
# kiva['sector'].value_counts(ascending=True).plot(kind='bar');
# kiva['country'].value_counts(ascending=True).plot(kind='bar');
# kiva['currency'].value_counts(ascending=True).plot(kind='bar');
# kiva['partner_id'].value_counts(ascending=True).plot(kind='bar');
# kiva['term_in_months'].value_counts(ascending=True).plot(kind='bar');

In [None]:
# kiva_10 = [each for each in kiva['country'].value_counts().sort_values(ascending=False).head(10).index]
# kiva_10

In [None]:
kiva['funded_percentage'].value_counts(ascending=True).plot(kind='bar');

In [None]:
for each in kiva.columns:
    print(each, 'has', len(kiva[each].unique()), 'subfeatures')

In [None]:
kiva['funded_percentage'].value_counts().sort_values()

In [None]:
from feature_engine.discretisers import EqualWidthDiscretiser

In [None]:
funded_range = EqualWidthDiscretiser(bins=10, variables='funded_percentage')
funded_range.fit(kiva)

In [None]:
kiva.head()

In [None]:
kiva['lender_count'].value_counts(ascending=True).plot(kind='bar');

In [None]:
kiva['borrower_genders'].value_counts(ascending=True).plot(kind='bar');

In [None]:
kiva['borrower_genders'].value_counts(normalize=True)

In [None]:
sns.histplot(x='funded_amount',data=kiva,bins=80);

# create column of difference between loan amount and funded

In [None]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
kiva.country=le.fit_transform(kiva.country)
kiva.head()