In [1]:
import pandas as pd

# Base names of the Kiva CSV files; each is read from kiva-data/<name>.csv.
kiva_datasets = ['kiva_loans', 'kiva_mpi_region_locations', 'loan_theme_ids', 'loan_themes_by_region']

# Load every listed dataset into a dict keyed by its base name. Later cells
# index kiva_data[...] directly, so it must exist after a fresh kernel start.
kiva_data = {name: pd.read_csv('kiva-data/' + name + '.csv') for name in kiva_datasets}

# Reference copy of the loans table as read from disk; the cleaning cells read
# from it and write into kiva_loans, which keeps them safe to re-run.
kiva_loans_orig = kiva_data[kiva_datasets[0]]
kiva_loans = kiva_loans_orig.copy()
kiva_loans_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 20 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666973 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null object
disbursed_time        668809 non-null object
funded_time           622874 non-null object
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  499789 non-null object
borrower_genders      666984 non-null object
repayment_interval    671205 non-null object
date                  671205 non

- id: not too useful
- funded_amount: could be very useful, especially if this can be treated as a label
    * LABEL?
- loan_amount: useful to know how much they are asking for
- activity: could be useful, but there are a lot of categories.
- sector: only 15 categories, could be very useful
- use: terrible without any processing, could be used for NLP or parsing out keywords
    * OPEN ENDED
- country_code: useful to find country based trends
- country: basically the same as country_code, can drop 1
- region: very important for tracking the poverty down to the lowest level possible
- currency: could be interesting to see correlations between funding time and currency
- partner_id: not useful on its own, could be helpful to merge with other datasets that use the same id
    * USE FOR MERGE
- posted_time: time based features are useful
    * CONVERT
    * LABEL
- disbursed_time: would be useful to see if people's funding interests align with disbursal time; however, occasionally the loan is disbursed before funding, so this might not be too useful
    * CONVERT
    * LABEL?
- funded_time: useful to determine amount of time between posting and funding
    * CONVERT
    * LABEL
- term_in_months: useful to determine how quickly someone will be able to pay back, which might indicate their poverty level (i.e. 200\$ loan over 12 months indicates higher poverty than 200\$ loan over 6 months)
- lender_count: more people in the loan might indicate lower or higher welfare need, depending on other features
- tags: not useful in current state, could use tags as a label in order to predict popular attraction to the loan (separated by ', ')
    * CONVERT
    * LABEL?
- borrower_genders: not useful in current state, could convert to something like num_males, num_females (separated by ', ')
    * CONVERT
- repayment_interval: could hold some trends of welfare need
- date: might be similar/equal to posted_time, might remove
    * CONVERT

In [2]:
import time as t

# Columns of the loans table that are parsed into datetimes below.
time_cols = ['posted_time', 'disbursed_time', 'funded_time', 'date']

# Always parse from the original frame so that re-running this cell gives
# the same result regardless of the current state of kiva_loans.
for col in time_cols:
    kiva_loans[col] = kiva_loans_orig[col].pipe(pd.to_datetime)

In [3]:
# Columns whose cells pack several values into one ', '-separated string.
ary_cols = ['tags', 'borrower_genders']

def to_ar(strg, sep = ', '):
    """Split a delimited string into a list of its parts.

    Missing values produce an empty list. A value counts as missing when it
    is ``None``, a float NaN, or the literal string ``'nan'`` (what a NaN
    becomes after ``Series.astype(str)``).

    Parameters
    ----------
    strg : str, None or float
        The delimited text, or a missing-value marker.
    sep : str, default ', '
        Delimiter placed between the individual values.

    Returns
    -------
    list of str
        The separated values; ``[]`` for a missing input.
    """
    # Handle real missing values too, so the function no longer depends on
    # the caller having stringified the column first. NaN is the only float
    # that is not equal to itself.
    if strg is None or (isinstance(strg, float) and strg != strg):
        return []
    if strg == 'nan':
        return []
    return strg.split(sep=sep)

# Stringify first so that missing cells become the literal 'nan', which to_ar
# turns into an empty list; source is the original frame, so re-runs are safe.
for col in ary_cols:
    kiva_loans[col] = kiva_loans_orig[col].astype(str).map(to_ar)

In [4]:
# Re-check dtypes and non-null counts after the datetime and list conversions.
kiva_loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 20 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666973 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null datetime64[ns]
disbursed_time        668809 non-null datetime64[ns]
funded_time           622874 non-null datetime64[ns]
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  671205 non-null object
borrower_genders      671205 non-null object
repayment_interval    671205 non-null object
date    

In [53]:
# Elapsed time between posting and funding. Subtracting two datetime64 columns
# already yields timedelta64[ns] (NaT where funded_time is missing), so no
# extra cast is needed.
kiva_loans['time_till_funded'] = kiva_loans['funded_time'] - kiva_loans['posted_time']
# Whole hours, floored, as float64 with NaN for NaT. Floor division by a
# Timedelta replaces astype('timedelta64[h]'), which pandas >= 2.0 rejects
# because 'h' is not a supported timedelta resolution.
kiva_loans['hours_till_funded'] = (kiva_loans['time_till_funded'] // pd.Timedelta(hours=1)).astype('float64')

In [54]:
# Spot-check the value computed for the row labelled 0.
kiva_loans['hours_till_funded'][0]

27.0

In [55]:
# Summary statistics of the hours between posting and funding.
kiva_loans['hours_till_funded'].describe()

count    622874.000000
mean        350.896796
std         345.578870
min        -421.000000
25%         124.000000
50%         230.000000
75%         540.000000
max       10093.000000
Name: hours_till_funded, dtype: float64

In [56]:
# Count loans with a negative duration, i.e. funded_time earlier than posted_time.
(kiva_loans['hours_till_funded'] < 0).sum()

1

For some reason, one of the rows has the funded time before the posted time.

In [76]:
def ior(i):
    """Combine every item of an iterable with the ``|`` operator.

    Items are folded left to right, starting from the first item. Returns
    ``None`` when the iterable yields nothing.
    """
    # Private sentinel: distinguishes "nothing seen yet" from any real item,
    # including None.
    unset = object()
    combined = unset
    for item in i:
        combined = item if combined is unset else combined | item
    return None if combined is unset else combined

# Union of the per-loan tag lists: every distinct tag used anywhere in the data.
all_tags = ior(map(set, kiva_loans['tags']))
all_tags

{'#Animals',
 '#Biz Durable Asset',
 '#Eco-friendly',
 '#Elderly',
 '#Fabrics',
 '#Female Education',
 '#First Loan',
 '#Health and Sanitation',
 '#Hidden Gem',
 '#Inspiring Story',
 '#Interesting Photo',
 '#Job Creator',
 '#Low-profit FP',
 '#Orphan',
 '#Parent',
 '#Post-disbursed',
 '#Refugee',
 '#Repair Renew Replace',
 '#Repeat Borrower',
 '#Schooling',
 '#Single',
 '#Single Parent',
 '#Supporting Family',
 '#Sustainable Ag',
 '#Technology',
 '#Tourism',
 '#Trees',
 '#Unique',
 '#Vegan',
 '#Widowed',
 '#Woman Owned Biz',
 'user_favorite',
 'user_like',
 'volunteer_like',
 'volunteer_pick'}

Not too many possible tags, could just do a one hot encoding without too many problems.

# STOP

In [107]:
# Left-join the MPI region table onto the loans by (region, country): every
# loan row is kept, and loans without a matching MPI row get NaN in its columns.
# NOTE(review): duplicate (region, country) keys in the MPI table would
# multiply loan rows — confirm key uniqueness before relying on row counts.
kiva_loan_data_region = kiva_data['kiva_loans'].merge(kiva_data['kiva_mpi_region_locations'], on=['region', 'country'], how='left')

In [114]:
# Distinct region names as they are spelled in the loans table.
kiva_data['kiva_loans']['region'].unique()

array(['Lahore', 'Maynaguri', 'Abdul Hakeem', ..., 'Gbenikoro Village',
       'Morimaraia', 'alejandria'], dtype=object)

In [126]:
import numpy as np
# Element-wise lower-casing helper for arrays of strings.
vec_lower = np.vectorize(str.lower)
# The previous subscript `'country' and 'region'` was a boolean expression that
# evaluates to just 'region', so only the region column was ever selected.
# Spell that out explicitly: distinct region names in the MPI table.
kiva_data['kiva_mpi_region_locations']['region'].unique()

array(['Badakhshan', 'Badghis', 'Baghlan', 'Balkh', 'Bamyan', 'Daykundi',
       'Farah', 'Faryab', 'Ghazni', 'Ghor', 'Helmand', 'Herat', 'Jawzjan',
       'Kabul', 'Kandahar', 'Kapisa', 'Khost', 'Kunarha', 'Kunduz',
       'Laghman', 'Logar', 'Nangarhar', 'Nimroz', 'Nooristan', 'Paktika',
       'Paktya', 'Panjsher', 'Parwan', 'Samangan', 'Sar-E-Pul', 'Takhar',
       'Urozgan', 'Wardak', 'Zabul', nan, 'Bujumbura Mairie', 'Nord',
       'Centre-Est', 'Ouest', 'Sud', 'Alibori', 'Atacora', 'Atlantique',
       'Borgou', 'Collines', 'Couffo', 'Donga', 'Littoral', 'Mono',
       'Ouðmð', 'Plateau', 'Zou', 'Boucle de mouhoun', 'Cascades',
       'Centre', 'Centre-est', 'Centre-nord', 'Centre-ouest',
       'Centre-sud', 'Est', 'Hauts basins', 'Plateau central', 'Sahel',
       'Sud-ouest', 'Barisal', 'Chittagong', 'Dhaka', 'Khulna',
       'Rajshahi', 'Rangpur', 'Sylhet', 'Corozal', 'Orange Walk',
       'Belize (excluding Belize City South Side)', 'Stann Creek',
       'Toledo', 'Cayo', '

In [43]:
import time as t

# Columns of the merged frame that are parsed into datetimes below.
time_cols = ['posted_time', 'disbursed_time', 'funded_time', 'date']

# Convert each column and echo the value at row label 0 as a quick sanity check.
for col in time_cols:
    kiva_loan_data_region[col] = kiva_loan_data_region[col].pipe(pd.to_datetime)
    print(kiva_loan_data_region.loc[0, col])

2014-01-02 14:25:08
2013-12-17 08:00:00
2014-01-08 22:07:48
2014-01-02 00:00:00


In [45]:
# Inspect columns, dtypes and non-null counts of the merged loans + MPI frame.
kiva_loan_data_region.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52197 entries, 0 to 52196
Data columns (total 27 columns):
id                    52197 non-null int64
funded_amount         52197 non-null float64
loan_amount           52197 non-null float64
activity              52197 non-null object
sector                52197 non-null object
use                   52004 non-null object
country_code          52197 non-null object
country               52197 non-null object
region                50955 non-null object
currency              52197 non-null object
partner_id            52197 non-null float64
posted_time           52197 non-null datetime64[ns]
disbursed_time        52197 non-null datetime64[ns]
funded_time           48681 non-null datetime64[ns]
term_in_months        52197 non-null float64
lender_count          52197 non-null int64
tags                  39273 non-null object
borrower_genders      52005 non-null object
repayment_interval    52197 non-null object
date                  52197 n

In [86]:
%matplotlib inline
# Summarise the 'geo' column: count, number of distinct values, most frequent value.
kiva_loan_data_region['geo'].describe()#.hist(xrot=90, figsize=(20, 7))

count                       52197
unique                        109
top       (10.5104642, 7.4165053)
freq                        10000
Name: geo, dtype: object

- id: not too useful
- funded_amount: could be very useful, especially if this can be treated as a label
    * LABEL?
- loan_amount: useful to know how much they are asking for
- activity: could be useful, but there are a lot of categories.
- sector: only 15 categories, could be very useful
- use: terrible without any processing, could be used for NLP or parsing out keywords
    * OPEN ENDED
- country_code: useful to find country based trends
- country: basically the same as country_code, can drop 1
- region: very important for tracking the poverty down to the lowest level possible
- currency: could be interesting to see correlations between funding time and currency
- partner_id: not useful on its own, could be helpful to merge with other datasets that use the same id
    * USE FOR MERGE
- posted_time: time based features are useful
    * LABEL
- disbursed_time: would be useful to see if people's funding interests align with disbursal time; however, occasionally the loan is disbursed before funding, so this might not be too useful
    * LABEL?
- funded_time: useful to determine amount of time between posting and funding
    * LABEL
- term_in_months: useful to determine how quickly someone will be able to pay back, which might indicate their poverty level (i.e. 200\$ loan over 12 months indicates higher poverty than 200\$ loan over 6 months)
- lender_count: more people in the loan might indicate lower or higher welfare need, depending on other features
- tags: not useful in current state, could use tags as a label in order to predict popular attraction to the loan (separated by ', ')
    * CONVERT
    * LABEL?
- borrower_genders: not useful in current state, could convert to something like num_males, num_females (separated by ', ')
    * CONVERT
- repayment_interval: could hold some trends of welfare need
- date: might be similar/equal to posted_time, might remove
- LocationName: Not very useful with country and region features
- ISO: Similar to country_code, not very useful
- world_region: more broad than country, 6 values, could be useful in identifying global trends
- MPI: very useful for welfare, provides metric to the region for general poverty
- geo: basically useless with lat and lon features
- lat: might be useful to find linear trends with lat & lon, but not a very good metric for region based things
- lon: same as lat

In [90]:
# Inspect columns, dtypes and non-null counts of the loan-themes-by-region table.
kiva_data['loan_themes_by_region'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15736 entries, 0 to 15735
Data columns (total 21 columns):
Partner ID            15736 non-null int64
Field Partner Name    15736 non-null object
sector                15736 non-null object
Loan Theme ID         15736 non-null object
Loan Theme Type       15736 non-null object
country               15736 non-null object
forkiva               15736 non-null object
region                15736 non-null object
geocode_old           1200 non-null object
ISO                   15722 non-null object
number                15736 non-null int64
amount                15736 non-null int64
LocationName          15736 non-null object
geocode               13662 non-null object
names                 13661 non-null object
geo                   15736 non-null object
lat                   13662 non-null float64
lon                   13662 non-null float64
mpi_region            15722 non-null object
mpi_geo               9671 non-null object
rural_pct     

In [91]:
# Inspect columns, dtypes and non-null counts of the loan-theme-id table.
kiva_data['loan_theme_ids'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779092 entries, 0 to 779091
Data columns (total 4 columns):
id                 779092 non-null int64
Loan Theme ID      764279 non-null object
Loan Theme Type    764279 non-null object
Partner ID         764279 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 23.8+ MB


In [92]:
# Preview only the first rows; rendering the whole table bloats the notebook
# without adding information.
kiva_data['loan_themes_by_region'].head(10)

Unnamed: 0,Partner ID,Field Partner Name,sector,Loan Theme ID,Loan Theme Type,country,forkiva,region,geocode_old,ISO,...,amount,LocationName,geocode,names,geo,lat,lon,mpi_region,mpi_geo,rural_pct
0,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Banteay Meanchey,"(13.75, 103.0)",KHM,...,450,"Banteay Meanchey, Cambodia","[(13.6672596, 102.8975098)]",Banteay Meanchey Province; Cambodia,"(13.6672596, 102.8975098)",13.667260,102.897510,"Banteay Mean Chey, Cambodia","(13.6672596, 102.8975098)",90.0
1,9,KREDIT Microfinance Institution,General Financial Inclusion,a10500000068jPe,Vulnerable Populations,Cambodia,No,Battambang Province,,KHM,...,20275,"Battambang Province, Cambodia","[(13.0286971, 102.989615)]",Battambang Province; Cambodia,"(13.0286971, 102.989615)",13.028697,102.989615,"Banteay Mean Chey, Cambodia","(13.6672596, 102.8975098)",90.0
2,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Battambang Province,,KHM,...,9150,"Battambang Province, Cambodia","[(13.0286971, 102.989615)]",Battambang Province; Cambodia,"(13.0286971, 102.989615)",13.028697,102.989615,"Banteay Mean Chey, Cambodia","(13.6672596, 102.8975098)",90.0
3,9,KREDIT Microfinance Institution,General Financial Inclusion,a10500000068jPe,Vulnerable Populations,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,...,604950,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.098292,105.313119,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
4,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000002X1Uu,Sanitation,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,...,275,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.098292,105.313119,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
5,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,...,62225,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.098292,105.313119,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
6,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000007VvXr,Solar Home Systems,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,...,1300,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.098292,105.313119,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
7,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000weyk,General,Cambodia,No,Kampong Chhnang Province,"(12.0, 104.5)",KHM,...,237175,"Kampong Chhnang Province, Cambodia","[(12.1392352, 104.5655273)]",Kampong Chhnang Province; Cambodia,"(12.1392352, 104.5655273)",12.139235,104.565527,"Kampong Chhnang, Cambodia","(12.1392352, 104.5655273)",90.0
8,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000007VvXr,Solar Home Systems,Cambodia,No,Kampong Chhnang Province,"(12.0, 104.5)",KHM,...,3050,"Kampong Chhnang Province, Cambodia","[(12.1392352, 104.5655273)]",Kampong Chhnang Province; Cambodia,"(12.1392352, 104.5655273)",12.139235,104.565527,"Kampong Chhnang, Cambodia","(12.1392352, 104.5655273)",90.0
9,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Kampong Chhnang Province,"(12.0, 104.5)",KHM,...,31425,"Kampong Chhnang Province, Cambodia","[(12.1392352, 104.5655273)]",Kampong Chhnang Province; Cambodia,"(12.1392352, 104.5655273)",12.139235,104.565527,"Kampong Chhnang, Cambodia","(12.1392352, 104.5655273)",90.0
