In [1]:
import pandas as pd

# Base names (without extension) of the CSV files under kiva-data/.
kiva_datasets = [
    'kiva_loans',
    'kiva_mpi_region_locations',
    'loan_theme_ids',
    'loan_themes_by_region',
]

# Keep the raw loans table untouched; all cleaning below is done on a copy of it.
kiva_loans_orig = pd.read_csv(''.join(['kiva-data/', kiva_datasets[0], '.csv']))
kiva_loans = kiva_loans_orig.copy()
kiva_loans_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 20 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666973 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null object
disbursed_time        668809 non-null object
funded_time           622874 non-null object
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  499789 non-null object
borrower_genders      666984 non-null object
repayment_interval    671205 non-null object
date                  671205 non

- id: not too useful
- funded_amount: could be very useful, especially if this can be treated as a label
    * LABEL?
- loan_amount: useful to know how much they are asking for
- activity: could be useful, but there are a lot of categories.
- sector: only 15 categories, could be very useful
- use: terrible without any processing, could be used for NLP or parsing out keywords
    * OPEN ENDED
- country_code: useful to find country based trends
- country: basically the same as country_code, can drop 1
- region: very important for tracking the poverty down to the lowest level possible
- currency: could be interesting to see correlations between funding time and currency
- partner_id: not useful on its own, could be helpful to merge with other datasets that use the same id
    * USE FOR MERGE
- posted_time: time based features are useful
    * CONVERT
    * LABEL
- disbursed_time: would be useful to see if people's funding interests align with the disbursal time; however, the loan is occasionally disbursed before it is funded, so this might not be too useful
    * CONVERT
    * LABEL?
- funded_time: useful to determine amount of time between posting and funding
    * CONVERT
    * LABEL
- term_in_months: useful to determine how quickly someone will be able to pay back, which might indicate their poverty level (i.e. 200\$ loan over 12 months indicates higher poverty than 200\$ loan over 6 months)
- lender_count: more people in the loan might indicate lower or higher welfare need, depending on other features
- tags: not useful in current state, could use tags as a label in order to predict popular attraction to the loan (separated by ', ')
    * CONVERT
    * LABEL?
- borrower_genders: not useful in current state, could convert to something like num_males, num_females (separated by ', ')
    * CONVERT
- repayment_interval: could hold some trends of welfare need
- date: might be similar/equal to posted_time, might remove
    * CONVERT

In [2]:
import time as t

# Columns that hold timestamps as text in the raw CSV.
time_cols = ['posted_time', 'disbursed_time', 'funded_time', 'date']

# Parse them all from the untouched original frame, so re-running this cell
# always starts from the raw strings.
kiva_loans[time_cols] = kiva_loans_orig[time_cols].apply(pd.to_datetime)

In [3]:
ary_cols = ['tags', 'borrower_genders']

def to_ar(strg, sep = ', '):
    """Split a delimited string into a list of its parts.

    Missing values give an empty list. A value counts as missing when it is
    ``None``, a float NaN, or the literal text ``'nan'`` (which is what
    ``Series.astype(str)`` turns a NaN into).

    Parameters
    ----------
    strg : str, float or None
        The delimited text, or a missing-value marker.
    sep : str, default ', '
        Delimiter between the items.

    Returns
    -------
    list of str
        The items in their original order; ``[]`` for a missing value.
    """
    # NaN is the only float that compares unequal to itself.
    is_nan_float = isinstance(strg, float) and strg != strg
    if strg is None or is_nan_float or strg == 'nan':
        return []
    return strg.split(sep=sep)

# Replace each delimited-text column with a column of Python lists; missing
# values become the string 'nan' under astype(str), which to_ar maps to [].
for col in ary_cols:
    kiva_loans[col] = kiva_loans_orig[col].astype(str).map(to_ar)

In [4]:
kiva_loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 20 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666973 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null datetime64[ns]
disbursed_time        668809 non-null datetime64[ns]
funded_time           622874 non-null datetime64[ns]
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  671205 non-null object
borrower_genders      671205 non-null object
repayment_interval    671205 non-null object
date    

In [5]:
# Time between posting and full funding; NaT where funded_time is missing.
kiva_loans['time_till_funded'] = (kiva_loans['funded_time'] - kiva_loans['posted_time']).astype('timedelta64[ns]')
# Whole hours, rounded down, as float (NaN where the loan has no funded_time).
# astype('timedelta64[h]') is rejected by pandas >= 2.0, so floor-divide the
# total seconds instead; this yields the same floored values.
kiva_loans['hours_till_funded'] = kiva_loans['time_till_funded'].dt.total_seconds() // 3600

In [6]:
kiva_loans['hours_till_funded'][0]

27.0

In [7]:
kiva_loans['hours_till_funded'].describe()

count    622874.000000
mean        350.896796
std         345.578870
min        -421.000000
25%         124.000000
50%         230.000000
75%         540.000000
max       10093.000000
Name: hours_till_funded, dtype: float64

In [8]:
(kiva_loans['hours_till_funded'] < 0).sum()

1

For some reason, one of the rows has the funded time before the posted time.

In [9]:
def ior(i):
    """Fold an iterable with the ``|`` operator, left to right.

    Returns ``None`` when the iterable is empty, the single element itself
    when there is exactly one, and ``a | b | c | ...`` otherwise. The inputs
    are never modified in place.
    """
    # A private sentinel distinguishes "nothing seen yet" from any real value.
    nothing = object()
    acc = nothing
    for item in i:
        acc = item if acc is nothing else acc | item
    return None if acc is nothing else acc

# The union of every loan's tags is the full tag vocabulary.
all_tags = ior(set(tags) for tags in kiva_loans['tags'])
all_tags

{'#Animals',
 '#Biz Durable Asset',
 '#Eco-friendly',
 '#Elderly',
 '#Fabrics',
 '#Female Education',
 '#First Loan',
 '#Health and Sanitation',
 '#Hidden Gem',
 '#Inspiring Story',
 '#Interesting Photo',
 '#Job Creator',
 '#Low-profit FP',
 '#Orphan',
 '#Parent',
 '#Post-disbursed',
 '#Refugee',
 '#Repair Renew Replace',
 '#Repeat Borrower',
 '#Schooling',
 '#Single',
 '#Single Parent',
 '#Supporting Family',
 '#Sustainable Ag',
 '#Technology',
 '#Tourism',
 '#Trees',
 '#Unique',
 '#Vegan',
 '#Widowed',
 '#Woman Owned Biz',
 'user_favorite',
 'user_like',
 'volunteer_like',
 'volunteer_pick'}

Not too many possible tags, could just do a one hot encoding without too many problems.

Here is the one hot encoding for it

In [10]:
import numpy as np
# Element-wise `tag in tags_list`. np.vectorize is a convenience wrapper that
# still loops in Python; it is not a performance optimisation.
vecin = np.vectorize(list.__contains__)

# all_tags is a set of strings, whose iteration order can change from one
# kernel session to the next (string hashing is randomised per process).
# Sorting pins the order in which the indicator columns are appended, so the
# resulting frame has the same column layout on every run.
for tag in sorted(all_tags):
    kiva_loans['tag_' + tag] = vecin(kiva_loans['tags'], tag).astype(int)

kiva_loans['tag_#Trees'].describe()

count    671205.000000
mean          0.011342
std           0.105895
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: tag_#Trees, dtype: float64

In [11]:
kiva_loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 57 columns):
id                            671205 non-null int64
funded_amount                 671205 non-null float64
loan_amount                   671205 non-null float64
activity                      671205 non-null object
sector                        671205 non-null object
use                           666973 non-null object
country_code                  671197 non-null object
country                       671205 non-null object
region                        614405 non-null object
currency                      671205 non-null object
partner_id                    657698 non-null float64
posted_time                   671205 non-null datetime64[ns]
disbursed_time                668809 non-null datetime64[ns]
funded_time                   622874 non-null datetime64[ns]
term_in_months                671205 non-null float64
lender_count                  671205 non-null int64
tags           

In [12]:
# 'borrower_genders' holds a list of gender strings per loan (empty when the
# raw value was missing) -- presumably one entry per borrower.
kiva_loans['num_male_borrowers'] = kiva_loans['borrower_genders'].apply(lambda genders: genders.count('male'))
# Count 'female' explicitly rather than taking len(list) - males, which would
# book any other token (typo, blank, new category) as a female borrower.
kiva_loans['num_female_borrowers'] = kiva_loans['borrower_genders'].apply(lambda genders: genders.count('female'))

In [13]:
(kiva_loans['num_male_borrowers'] > 0).sum()

178903

In [14]:
(kiva_loans['num_female_borrowers'] > 0).sum()

528461

Interesting to see that the majority of loans have at least 1 female borrower

Below I use the dataset I created using the Google Places API to get a second region which hopefully better matches with the `kiva_mpi_region_locations.csv` dataset

In [129]:
# Per-loan lookup: loan id -> loan theme id / theme type / partner id.
loan_theme_ids = pd.read_csv(''.join(['kiva-data/', kiva_datasets[2], '.csv']))
loan_theme_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779092 entries, 0 to 779091
Data columns (total 4 columns):
id                 779092 non-null int64
Loan Theme ID      764279 non-null object
Loan Theme Type    764279 non-null object
Partner ID         764279 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 23.8+ MB


In [133]:
kiva_loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 59 columns):
id                            671205 non-null int64
funded_amount                 671205 non-null float64
loan_amount                   671205 non-null float64
activity                      671205 non-null object
sector                        671205 non-null object
use                           666973 non-null object
country_code                  671197 non-null object
country                       671205 non-null object
region                        614405 non-null object
currency                      671205 non-null object
partner_id                    657698 non-null float64
posted_time                   671205 non-null datetime64[ns]
disbursed_time                668809 non-null datetime64[ns]
funded_time                   622874 non-null datetime64[ns]
term_in_months                671205 non-null float64
lender_count                  671205 non-null int64
tags           

In [135]:
# Attach the loan theme columns to every loan (left join keeps all loans).
kiva_loan_theme_id = kiva_loans.merge(loan_theme_ids, how='left', on='id')
# A left join multiplies rows when the right-hand key is not unique; fail
# loudly instead of silently carrying duplicated loans forward.
assert len(kiva_loan_theme_id) == len(kiva_loans), 'merge on id duplicated loan rows'
kiva_loan_theme_id.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671205 entries, 0 to 671204
Data columns (total 62 columns):
id                            671205 non-null int64
funded_amount                 671205 non-null float64
loan_amount                   671205 non-null float64
activity                      671205 non-null object
sector                        671205 non-null object
use                           666973 non-null object
country_code                  671197 non-null object
country                       671205 non-null object
region                        614405 non-null object
currency                      671205 non-null object
partner_id                    657698 non-null float64
posted_time                   671205 non-null datetime64[ns]
disbursed_time                668809 non-null datetime64[ns]
funded_time                   622874 non-null datetime64[ns]
term_in_months                671205 non-null float64
lender_count                  671205 non-null int64
tags           

In [137]:
# Loan themes by region: one row per partner / theme / region, with geocoding
# and the matching MPI region name.
loan_theme_region = pd.read_csv(''.join(['kiva-data/', kiva_datasets[3], '.csv']))
loan_theme_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15736 entries, 0 to 15735
Data columns (total 21 columns):
Partner ID            15736 non-null int64
Field Partner Name    15736 non-null object
sector                15736 non-null object
Loan Theme ID         15736 non-null object
Loan Theme Type       15736 non-null object
country               15736 non-null object
forkiva               15736 non-null object
region                15736 non-null object
geocode_old           1200 non-null object
ISO                   15722 non-null object
number                15736 non-null int64
amount                15736 non-null int64
LocationName          15736 non-null object
geocode               13662 non-null object
names                 13661 non-null object
geo                   15736 non-null object
lat                   13662 non-null float64
lon                   13662 non-null float64
mpi_region            15722 non-null object
mpi_geo               9671 non-null object
rural_pct     

In [150]:
loan_theme_region.sample(5)

Unnamed: 0,Partner ID,Field Partner Name,sector,Loan Theme ID,Loan Theme Type,country,forkiva,region,geocode_old,ISO,...,amount,LocationName,geocode,names,geo,lat,lon,mpi_region,mpi_geo,rural_pct
6927,136,Gata Daku Multi-purpose Cooperative (GDMPC),General Financial Inclusion,a1050000000wf0h,General,Philippines,No,"p-7, casusan, oroquieta city, misamis occidental",,PHL,...,100,"p-7, casusan, oroquieta city, misamis occident...",,,"(1000.0, 1000.0)",,,PHL,,61.0
9021,169,SEF International,General Financial Inclusion,a1050000006TnqI,Agriculture (Women),Armenia,No,"Mrgastan village, Ejmiatsin region",,ARM,...,1050,"Mrgastan village, Ejmiatsin region, Armenia","[(40.2007545, 44.2770015)]",Mrgastan; Armavir Province; Armenia,"(40.2007545, 44.2770015)",40.200754,44.277002,ARM,,68.0
1893,81,Apoyo Integral,General Financial Inclusion,a1050000000SqOh,At-Risk Youth,El Salvador,No,Gotera,,SLV,...,9050,"Gotera, El Salvador","[(13.6927457, -88.1047287)]",San Francisco Gotera; MorazÍn Department; El ...,"(13.6927457, -88.1047287)",13.692746,-88.104729,"Morazan, El Salvador","(13.7682, -88.1291387)",57.0
580,40,Grounded and Holistic Approach for People's Em...,General Financial Inclusion,a1050000000shPp,Extreme Poverty,Cameroon,No,"Nanga Street, Bamenda",,CMR,...,350,"Nanga Street, Bamenda, Cameroon",,,"(1000.0, 1000.0)",,,CMR,,93.0
9108,169,SEF International,General Financial Inclusion,a1050000000daSp,Conflict Zone,Armenia,Yes,"Ptghavan village, Tavush region",,ARM,...,12775,"Ptghavan village, Tavush region, Armenia","[(41.2272725, 44.86106669999999)]",Ptghavan; Tavush Province; Armenia,"(41.2272725, 44.86106669999999)",41.227272,44.861067,ARM,,68.0


In [141]:
# Bring in the regional columns (including mpi_region) for each loan by
# matching on loan theme, partner and region. Columns present on both sides
# (sector, country) come out with _x / _y suffixes.
kiva_loans_subregion = kiva_loan_theme_id.merge(loan_theme_region, how='left', on=['Loan Theme ID', 'Partner ID', 'region'])
# A non-unique (theme, partner, region) key on the right would duplicate loans.
assert len(kiva_loans_subregion) == len(kiva_loan_theme_id), 'merge on theme/partner/region duplicated loan rows'
kiva_loans_subregion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671205 entries, 0 to 671204
Data columns (total 80 columns):
id                            671205 non-null int64
funded_amount                 671205 non-null float64
loan_amount                   671205 non-null float64
activity                      671205 non-null object
sector_x                      671205 non-null object
use                           666973 non-null object
country_code                  671197 non-null object
country_x                     671205 non-null object
region                        614405 non-null object
currency                      671205 non-null object
partner_id                    657698 non-null float64
posted_time                   671205 non-null datetime64[ns]
disbursed_time                668809 non-null datetime64[ns]
funded_time                   622874 non-null datetime64[ns]
term_in_months                671205 non-null float64
lender_count                  671205 non-null int64
tags           

In [151]:
# Keep only the loans for which an mpi_region was found.
kiva_loans_subregion_wmpi = kiva_loans_subregion.dropna(subset=['mpi_region'])
kiva_loans_subregion_wmpi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 1 to 671149
Data columns (total 80 columns):
id                            533461 non-null int64
funded_amount                 533461 non-null float64
loan_amount                   533461 non-null float64
activity                      533461 non-null object
sector_x                      533461 non-null object
use                           533456 non-null object
country_code                  533453 non-null object
country_x                     533461 non-null object
region                        533461 non-null object
currency                      533461 non-null object
partner_id                    533461 non-null float64
posted_time                   533461 non-null datetime64[ns]
disbursed_time                533461 non-null datetime64[ns]
funded_time                   499549 non-null datetime64[ns]
term_in_months                533461 non-null float64
lender_count                  533461 non-null int64
tags           

In [153]:
# Load the MPI region table and drop the rows that carry no MPI value.
kiva_mpi_locs = (
    pd.read_csv(''.join(['kiva-data/', kiva_datasets[1], '.csv']))
    .dropna(subset=['MPI'])
)
kiva_mpi_locs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 1007
Data columns (total 9 columns):
LocationName    984 non-null object
ISO             984 non-null object
country         984 non-null object
region          984 non-null object
world_region    984 non-null object
MPI             984 non-null float64
geo             984 non-null object
lat             892 non-null float64
lon             892 non-null float64
dtypes: float64(3), object(6)
memory usage: 76.9+ KB


In [155]:
kiva_mpi_locs.sample(5) 

Unnamed: 0,LocationName,ISO,country,region,world_region,MPI,geo,lat,lon
994,"Northern, Zambia",ZMB,Zambia,Northern,Sub-Saharan Africa,0.397,"(-9.7670177, 30.8958242)",-9.767018,30.895824
818,"Bonthe, Sierra Leone",SLE,Sierra Leone,Bonthe,Sub-Saharan Africa,0.478,"(30.585164, 36.238414)",30.585164,36.238414
328,"Huehuetenango, Guatemala",GTM,Guatemala,Huehuetenango,Latin America and Caribbean,0.187,"(15.3198766, -91.4918235)",15.319877,-91.491823
523,"El Gharb-Chrarda Bni Hssen, Morocco",MAR,Morocco,El Gharb-Chrarda Bni Hssen,Arab States,0.084,"(34.302131, -6.302078)",34.302131,-6.302078
734,"Sindh, Pakistan",PAK,Pakistan,Sindh,South Asia,0.294,"(25.8943018, 68.52471489999999)",25.894302,68.524715


In [156]:
# Look up the MPI row whose LocationName equals each loan's mpi_region.
kiva_loans_mpi = pd.merge(
    kiva_loans_subregion_wmpi,
    kiva_mpi_locs,
    how='left',
    left_on='mpi_region',
    right_on='LocationName',
)
kiva_loans_mpi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 89 columns):
id                            533461 non-null int64
funded_amount                 533461 non-null float64
loan_amount                   533461 non-null float64
activity                      533461 non-null object
sector_x                      533461 non-null object
use                           533456 non-null object
country_code                  533453 non-null object
country_x                     533461 non-null object
region_x                      533461 non-null object
currency                      533461 non-null object
partner_id                    533461 non-null float64
posted_time                   533461 non-null datetime64[ns]
disbursed_time                533461 non-null datetime64[ns]
funded_time                   499549 non-null datetime64[ns]
term_in_months                533461 non-null float64
lender_count                  533461 non-null int64
tags           

In [157]:
# Loans whose mpi_region did not match any LocationName have no MPI; drop them.
kiva_loans_mpi_final = kiva_loans_mpi.dropna(subset=['MPI'])
kiva_loans_mpi_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458894 entries, 0 to 533459
Data columns (total 89 columns):
id                            458894 non-null int64
funded_amount                 458894 non-null float64
loan_amount                   458894 non-null float64
activity                      458894 non-null object
sector_x                      458894 non-null object
use                           458889 non-null object
country_code                  458886 non-null object
country_x                     458894 non-null object
region_x                      458894 non-null object
currency                      458894 non-null object
partner_id                    458894 non-null float64
posted_time                   458894 non-null datetime64[ns]
disbursed_time                458894 non-null datetime64[ns]
funded_time                   431465 non-null datetime64[ns]
term_in_months                458894 non-null float64
lender_count                  458894 non-null int64
tags           

In [158]:
# Loans that have an MPI value but no funded_time.
kiva_loans_mpi_final[kiva_loans_mpi_final['funded_time'].isnull()]

Unnamed: 0,id,funded_amount,loan_amount,activity,sector_x,use,country_code,country_x,region_x,currency,...,rural_pct,LocationName_y,ISO_y,country,region_y,world_region,MPI,geo_y,lat_y,lon_y
486,653916,475.0,1050.0,Pub,Food,to buy stock wholesale,CO,Colombia,Andes,COP,...,16.0,"Medellin A.M., Colombia",COL,Colombia,Medellin A.M.,Latin America and Caribbean,0.005,"(6.244203, -75.5812119)",6.244203,-75.581212
495,653912,500.0,1050.0,Beverages,Food,buy a motorcycle along with cargo and packaged...,CO,Colombia,La Dorada,COP,...,16.0,"Medellin A.M., Colombia",COL,Colombia,Medellin A.M.,Latin America and Caribbean,0.005,"(6.244203, -75.5812119)",6.244203,-75.581212
591,654153,975.0,1475.0,Wedding Expenses,Personal Use,to put on a wedding party for her son,TJ,Tajikistan,Yavan,TJS,...,0.0,"Dushanbe, Tajikistan",TJK,Tajikistan,Dushanbe,Europe and Central Asia,0.021,"(38.5597722, 68.7870384)",38.559772,68.787038
663,653957,1725.0,2600.0,Agriculture,Agriculture,"to buy wholesale farming supplies (seeds, manu...",CO,Colombia,El Carmen de Viboral,COP,...,16.0,"Medellin A.M., Colombia",COL,Colombia,Medellin A.M.,Latin America and Caribbean,0.005,"(6.244203, -75.5812119)",6.244203,-75.581212
690,654104,550.0,1050.0,Wedding Expenses,Personal Use,pay for her daughter's wedding reception,TJ,Tajikistan,Isfara,TJS,...,0.0,"Districts of Republican Subordination, Tajikistan",TJK,Tajikistan,Districts of Republican Subordination,Europe and Central Asia,0.045,"(39.0857902, 70.2408325)",39.085790,70.240832
700,654005,825.0,1550.0,Agriculture,Agriculture,"To purchase inventory wholesale (snacks, soft ...",CO,Colombia,Sonson,COP,...,16.0,"Medellin A.M., Colombia",COL,Colombia,Medellin A.M.,Latin America and Caribbean,0.005,"(6.244203, -75.5812119)",6.244203,-75.581212
755,654478,750.0,925.0,Motorcycle Transport,Transportation,to repair and service his three motocycles,KE,Kenya,Narok,KES,...,65.0,"Nairobi, Kenya",KEN,Kenya,Nairobi,Sub-Saharan Africa,0.020,"(-1.2920659, 36.8219462)",-1.292066,36.821946
756,654582,3300.0,5525.0,Clothing Sales,Clothing,to buy clothes to resell.,TZ,Tanzania,Dar es Salaam,TZS,...,0.0,"Zanzibar, Tanzania, United Republic of",TZA,"Tanzania, United Republic of",Zanzibar,Sub-Saharan Africa,0.143,"(-6.135729500000001, 39.3621196)",-6.135730,39.362120
786,654721,550.0,925.0,Motorcycle Transport,Transportation,to purchase spare parts for his motorbike and ...,KE,Kenya,Samburu,KES,...,99.0,"North Eastern, Kenya",KEN,Kenya,North Eastern,Sub-Saharan Africa,0.509,"(-0.0190061, 37.6480812)",-0.019006,37.648081
794,654630,875.0,1500.0,Wedding Expenses,Personal Use,to hold a wedding for his son,TJ,Tajikistan,Panjakent,USD,...,0.0,"Dushanbe, Tajikistan",TJK,Tajikistan,Dushanbe,Europe and Central Asia,0.021,"(38.5597722, 68.7870384)",38.559772,68.787038


In [111]:
# Region -> subnational-region lookup; its 'id' column is not needed.
subnat_regions = pd.read_csv('subnat_regions.csv').drop(['id'], axis='columns')

In [112]:
subnat_regions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12821 entries, 0 to 12820
Data columns (total 3 columns):
region           12749 non-null object
country          12821 non-null object
subnat_region    12733 non-null object
dtypes: object(3)
memory usage: 300.6+ KB


In [113]:
# Add subnat_region to every loan by matching on (region, country).
kiva_subnat_loans = pd.merge(
    kiva_loans,
    subnat_regions,
    how='left',
    on=['region', 'country'],
)

In [114]:
kiva_subnat_loans.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671205 entries, 0 to 671204
Data columns (total 60 columns):
id                            671205 non-null int64
funded_amount                 671205 non-null float64
loan_amount                   671205 non-null float64
activity                      671205 non-null object
sector                        671205 non-null object
use                           666973 non-null object
country_code                  671197 non-null object
country                       671205 non-null object
region                        614405 non-null object
currency                      671205 non-null object
partner_id                    657698 non-null float64
posted_time                   671205 non-null datetime64[ns]
disbursed_time                668809 non-null datetime64[ns]
funded_time                   622874 non-null datetime64[ns]
term_in_months                671205 non-null float64
lender_count                  671205 non-null int64
tags           

In [25]:
# Raw MPI tables: Kiva's region/location table plus the national and
# subnational MPI files. The *_orig frames are kept unmodified.
kiva_mpi_region_locations_orig = pd.read_csv('kiva-data/' + kiva_datasets[1] + '.csv')
mpi_nat_orig = pd.read_csv('mpi-data/MPI_national.csv')
mpi_subnat_orig = pd.read_csv('mpi-data/MPI_subnational.csv')
# Inspect the frame that was just loaded. `kiva_mpi_region_locations` (no
# suffix) is only assigned in a later cell, so referencing it here raises
# NameError on a fresh kernel.
kiva_mpi_region_locations_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2772 entries, 0 to 2771
Data columns (total 9 columns):
LocationName    984 non-null object
ISO             1008 non-null object
country         1008 non-null object
region          984 non-null object
world_region    1008 non-null object
MPI             984 non-null float64
geo             2772 non-null object
lat             892 non-null float64
lon             892 non-null float64
dtypes: float64(3), object(6)
memory usage: 195.0+ KB


In [26]:
# Tag every column with its source table so that nothing collides when the
# three tables are joined: mr_ = Kiva MPI regions, mn_ = national, ms_ = subnational.
kiva_mpi_region_locations = kiva_mpi_region_locations_orig.add_prefix('mr_')
mpi_nat = mpi_nat_orig.add_prefix('mn_')
mpi_subnat = mpi_subnat_orig.add_prefix('ms_')

In [72]:
# Drop the rows without an MPI value; re-running this cell is harmless.
kiva_mpi_region_locations = kiva_mpi_region_locations.dropna(subset=['mr_MPI'])
kiva_mpi_region_locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 1007
Data columns (total 9 columns):
mr_LocationName    984 non-null object
mr_ISO             984 non-null object
mr_country         984 non-null object
mr_region          984 non-null object
mr_world_region    984 non-null object
mr_MPI             984 non-null float64
mr_geo             984 non-null object
mr_lat             892 non-null float64
mr_lon             892 non-null float64
dtypes: float64(3), object(6)
memory usage: 76.9+ KB


In [73]:
mpi_subnat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 8 columns):
ms_ISO country code                     984 non-null object
ms_Country                              984 non-null object
ms_Sub-national region                  984 non-null object
ms_World region                         984 non-null object
ms_MPI National                         984 non-null float64
ms_MPI Regional                         984 non-null float64
ms_Headcount Ratio Regional             984 non-null float64
ms_Intensity of deprivation Regional    983 non-null float64
dtypes: float64(4), object(4)
memory usage: 61.6+ KB


In [74]:
# Join Kiva's MPI location table with the subnational MPI table on
# country, region and world region.
subnat_info = kiva_mpi_region_locations.merge(
    mpi_subnat,
    how='outer',
    left_on=['mr_country', 'mr_region', 'mr_world_region'],
    right_on=['ms_Country', 'ms_Sub-national region', 'ms_World region'],
)
# An outer join keeps unmatched rows from either side, with NaN in the other
# side's key columns. The cells below collapse the duplicated keys by keeping
# only the mr_ copies, which loses nothing only if every row found a partner.
assert subnat_info['mr_country'].notnull().all(), 'mpi_subnat rows without a match in kiva_mpi_region_locations'
assert subnat_info['ms_Country'].notnull().all(), 'kiva_mpi_region_locations rows without a match in mpi_subnat'

In [75]:
subnat_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 17 columns):
mr_LocationName                         984 non-null object
mr_ISO                                  984 non-null object
mr_country                              984 non-null object
mr_region                               984 non-null object
mr_world_region                         984 non-null object
mr_MPI                                  984 non-null float64
mr_geo                                  984 non-null object
mr_lat                                  892 non-null float64
mr_lon                                  892 non-null float64
ms_ISO country code                     984 non-null object
ms_Country                              984 non-null object
ms_Sub-national region                  984 non-null object
ms_World region                         984 non-null object
ms_MPI National                         984 non-null float64
ms_MPI Regional                         984 non-nul

Did an outer join so that rows present in only one of the two tables are kept. The country, region and world-region columns are the join keys, so the two copies have to match and the duplicates can be removed/renamed. The `if` statements just prevent errors from repeated runs.

In [76]:
# Join-key columns that now exist twice (mr_ and ms_ copies).
rem = ['mr_country','mr_region','mr_world_region','ms_Country','ms_Sub-national region','ms_World region']
# Only act while all of them are still present, so a second run is a no-op
# instead of a KeyError.
if set(rem) <= set(subnat_info.columns):
    # Keep the mr_ copy under a plain name, appended at the end of the frame.
    subnat_info['country'] = subnat_info['mr_country']
    subnat_info['region'] = subnat_info['mr_region']
    subnat_info['world_region'] = subnat_info['mr_world_region']
    subnat_info.drop(columns=rem, inplace=True)

In [77]:
subnat_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 14 columns):
mr_LocationName                         984 non-null object
mr_ISO                                  984 non-null object
mr_MPI                                  984 non-null float64
mr_geo                                  984 non-null object
mr_lat                                  892 non-null float64
mr_lon                                  892 non-null float64
ms_ISO country code                     984 non-null object
ms_MPI National                         984 non-null float64
ms_MPI Regional                         984 non-null float64
ms_Headcount Ratio Regional             984 non-null float64
ms_Intensity of deprivation Regional    983 non-null float64
country                                 984 non-null object
region                                  984 non-null object
world_region                            984 non-null object
dtypes: float64(7), object(7)
memory usage: 115.

In [78]:
(subnat_info['mr_ISO'] != subnat_info['ms_ISO country code']).sum()

0

Features are identical, so remove one, rename the other

In [79]:
# The two ISO columns were just shown to be identical; keep one as 'ISO'.
rem = ['ms_ISO country code', 'mr_ISO']
# Guard against re-runs: skip once the source columns are gone.
if set(rem) <= set(subnat_info.columns):
    subnat_info['ISO'] = subnat_info['mr_ISO']
    subnat_info.drop(columns=rem, inplace=True)

In [80]:
subnat_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 13 columns):
mr_LocationName                         984 non-null object
mr_MPI                                  984 non-null float64
mr_geo                                  984 non-null object
mr_lat                                  892 non-null float64
mr_lon                                  892 non-null float64
ms_MPI National                         984 non-null float64
ms_MPI Regional                         984 non-null float64
ms_Headcount Ratio Regional             984 non-null float64
ms_Intensity of deprivation Regional    983 non-null float64
country                                 984 non-null object
region                                  984 non-null object
world_region                            984 non-null object
ISO                                     984 non-null object
dtypes: float64(7), object(6)
memory usage: 107.6+ KB


In [81]:
(subnat_info['mr_MPI'] != subnat_info['ms_MPI Regional']).sum()

0

Features identical, remove/rename them

In [82]:
# The two regional MPI columns were just shown to be identical; keep one as 'mpi_regional'.
rem = ['mr_MPI', 'ms_MPI Regional']
# Guard against re-runs: skip once the source columns are gone.
if set(rem) <= set(subnat_info.columns):
    subnat_info['mpi_regional'] = subnat_info['mr_MPI']
    subnat_info.drop(columns=rem, inplace=True)

In [83]:
subnat_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 12 columns):
mr_LocationName                         984 non-null object
mr_geo                                  984 non-null object
mr_lat                                  892 non-null float64
mr_lon                                  892 non-null float64
ms_MPI National                         984 non-null float64
ms_Headcount Ratio Regional             984 non-null float64
ms_Intensity of deprivation Regional    983 non-null float64
country                                 984 non-null object
region                                  984 non-null object
world_region                            984 non-null object
ISO                                     984 non-null object
mpi_regional                            984 non-null float64
dtypes: float64(6), object(6)
memory usage: 99.9+ KB


Rest of the features are unique, so I will rename them to something a little better

In [85]:
import re


def rn_lam(colname):
    """Strip a three-character 'm?_' prefix from a column name.

    A name matching the pattern 'm._.*' loses its first three characters, is
    lower-cased, and has spaces replaced by underscores. Any other name is
    returned unchanged.
    """
    if re.match('m._.*', colname) is None:
        return colname
    return colname[3:].lower().replace(' ', '_')


subnat_info.rename(rn_lam, axis=1, inplace=True)
subnat_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 12 columns):
locationname                         984 non-null object
geo                                  984 non-null object
lat                                  892 non-null float64
lon                                  892 non-null float64
mpi_national                         984 non-null float64
headcount_ratio_regional             984 non-null float64
intensity_of_deprivation_regional    983 non-null float64
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
ISO                                  984 non-null object
mpi_regional                         984 non-null float64
dtypes: float64(6), object(6)
memory usage: 99.9+ KB


In [87]:
mpi_nat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 8 columns):
mn_ISO                               102 non-null object
mn_Country                           102 non-null object
mn_MPI Urban                         102 non-null float64
mn_Headcount Ratio Urban             102 non-null float64
mn_Intensity of Deprivation Urban    102 non-null float64
mn_MPI Rural                         102 non-null float64
mn_Headcount Ratio Rural             102 non-null float64
mn_Intensity of Deprivation Rural    102 non-null float64
dtypes: float64(6), object(2)
memory usage: 6.5+ KB


In [88]:
# Left join: every sub-national row is kept and gains the national urban/rural
# columns of the mpi_nat row that has the same country name and ISO code.
subnat_info = subnat_info.merge(
    mpi_nat,
    how='left',
    left_on=['country', 'ISO'],
    right_on=['mn_Country', 'mn_ISO'],
)
subnat_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 20 columns):
locationname                         984 non-null object
geo                                  984 non-null object
lat                                  892 non-null float64
lon                                  892 non-null float64
mpi_national                         984 non-null float64
headcount_ratio_regional             984 non-null float64
intensity_of_deprivation_regional    983 non-null float64
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
ISO                                  984 non-null object
mpi_regional                         984 non-null float64
mn_ISO                               984 non-null object
mn_Country                           984 non-null object
mn_MPI Urban                         984 non-null float64
mn_Headcount Ratio Urban         

In [89]:
# 'mn_ISO' and 'mn_Country' were only the right-hand join keys of the merge above;
# 'ISO' and 'country' already carry that information, so drop them.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
subnat_info.drop(['mn_ISO', 'mn_Country'], axis=1, inplace=True, errors='ignore')

In [92]:
subnat_info[[key for key in subnat_info.keys() if 'mpi' in key.lower()]].head(5)

Unnamed: 0,mpi_national,mpi_regional,mn_MPI Urban,mn_MPI Rural
0,0.295,0.387,0.132,0.347
1,0.295,0.466,0.132,0.347
2,0.295,0.3,0.132,0.347
3,0.295,0.301,0.132,0.347
4,0.295,0.325,0.132,0.347


All mpi values look to be different, so I'll keep all of them

In [93]:
subnat_info[[key for key in subnat_info.keys() if 'head' in key.lower()]].head(5)

Unnamed: 0,headcount_ratio_regional,mn_Headcount Ratio Urban,mn_Headcount Ratio Rural
0,67.5,28.8,64.66
1,79.3,28.8,64.66
2,59.7,28.8,64.66
3,55.7,28.8,64.66
4,61.0,28.8,64.66


Headcount values all different, so I'll keep them all

In [94]:
subnat_info[[key for key in subnat_info.keys() if 'intens' in key.lower()]].head(5)

Unnamed: 0,intensity_of_deprivation_regional,mn_Intensity of Deprivation Urban,mn_Intensity of Deprivation Rural
0,57.3,45.8,53.6
1,58.8,45.8,53.6
2,50.3,45.8,53.6
3,54.1,45.8,53.6
4,53.3,45.8,53.6


Deprivation intensity also unique, so keep them all.

Rename new features so they have better names

In [95]:
import re


def rn_lam(colname):
    """Strip the 'mn_' prefix from a column name.

    A name matching the pattern 'mn_.*' loses its first three characters, is
    lower-cased, and has spaces replaced by underscores. Any other name is
    returned unchanged.
    """
    if re.match('mn_.*', colname) is None:
        return colname
    return colname[3:].lower().replace(' ', '_')


subnat_info.rename(rn_lam, axis=1, inplace=True)
subnat_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 18 columns):
locationname                         984 non-null object
geo                                  984 non-null object
lat                                  892 non-null float64
lon                                  892 non-null float64
mpi_national                         984 non-null float64
headcount_ratio_regional             984 non-null float64
intensity_of_deprivation_regional    983 non-null float64
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
ISO                                  984 non-null object
mpi_regional                         984 non-null float64
mpi_urban                            984 non-null float64
headcount_ratio_urban                984 non-null float64
intensity_of_deprivation_urban       984 non-null float64
mpi_rural                      

This dataset looks good. Next, I check whether the `subnat_region` feature I added to each loan has a matching (country, region) pair in this new dataset. Where there is no match, I will change `subnat_region` back to the loan's `region`, since the derived value cannot be used anyway.

In [118]:
# Build one (country, subnat_region) tuple per loan and test it for membership among
# the (country, region) tuples of subnat_info.
# NOTE(review): despite the name, `isin` yields True for loans whose pair IS present in
# subnat_info; the "not in MPI" rows are the negation of this mask (~subnat_not_in_mpi).
subnat_not_in_mpi = kiva_subnat_loans[['country','subnat_region']].apply(tuple, axis=1).isin(subnat_info[['country', 'region']].apply(tuple, axis=1))
subnat_not_in_mpi.describe()

count     671205
unique         2
top        False
freq      594868
dtype: object

In [119]:
# Same membership test as for subnat_region, but using the loans' original `region`
# column, so the two match rates can be compared.
# NOTE(review): despite the name, `isin` yields True for loans whose (country, region)
# pair IS present in subnat_info; negate the mask to get the unmatched rows.
region_not_in_mpi = kiva_subnat_loans[['country','region']].apply(tuple, axis=1).isin(subnat_info[['country', 'region']].apply(tuple, axis=1))
region_not_in_mpi.describe()

count     671205
unique         2
top        False
freq      620250
dtype: object

In [121]:
kiva_subnat_loans[~subnat_not_in_mpi][['country','subnat_region']]

Unnamed: 0,country,subnat_region
2,India,West Bengal
5,Kenya,
6,India,West Bengal
14,India,West Bengal
18,India,West Bengal
25,India,West Bengal
26,India,West Bengal
28,India,West Bengal
29,India,West Bengal
33,India,West Bengal


In [127]:
subnat_info[subnat_info['country'] == 'Paraguay'][['country', 'region']]

Unnamed: 0,country,region


In [163]:
# Left-join the national MPI table onto the loans by country name.
# NOTE(review): the right-hand key 'mpi_nat-Country' does not match the 'mn_Country'
# column that mpi_nat.info() reports earlier in this notebook. This cell presumably ran
# against a differently prefixed version of mpi_nat (execution counts are out of
# order) -- confirm the column name before relying on Restart & Run All.
kiva_loans_mpi_nat = kiva_loans.merge(mpi_nat, how='left', left_on='country', right_on='mpi_nat-Country')
kiva_loans_mpi_nat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671205 entries, 0 to 671204
Data columns (total 67 columns):
id                                        671205 non-null int64
funded_amount                             671205 non-null float64
loan_amount                               671205 non-null float64
activity                                  671205 non-null object
sector                                    671205 non-null object
use                                       666973 non-null object
country_code                              671197 non-null object
country                                   671205 non-null object
region                                    614405 non-null object
currency                                  671205 non-null object
partner_id                                657698 non-null float64
posted_time                               671205 non-null datetime64[ns]
disbursed_time                            668809 non-null datetime64[ns]
funded_time               

In [166]:
# Copy of the raw sub-national table with every column label prefixed by 'mpi_sub-'.
mpi_subnat = mpi_subnat_orig.add_prefix('mpi_sub-')
mpi_subnat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 8 columns):
mpi_sub-ISO country code                     984 non-null object
mpi_sub-Country                              984 non-null object
mpi_sub-Sub-national region                  984 non-null object
mpi_sub-World region                         984 non-null object
mpi_sub-MPI National                         984 non-null float64
mpi_sub-MPI Regional                         984 non-null float64
mpi_sub-Headcount Ratio Regional             984 non-null float64
mpi_sub-Intensity of deprivation Regional    983 non-null float64
dtypes: float64(4), object(4)
memory usage: 61.6+ KB


In [175]:
# Left join: keep every loan and attach the sub-national MPI row whose
# (sub-national region, country) equals the loan's (region, country).
kiva_loans_mpi = kiva_loans_mpi_nat.merge(
    mpi_subnat,
    how='left',
    left_on=['region', 'country'],
    right_on=['mpi_sub-Sub-national region', 'mpi_sub-Country'],
)
kiva_loans_mpi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671205 entries, 0 to 671204
Data columns (total 75 columns):
id                                           671205 non-null int64
funded_amount                                671205 non-null float64
loan_amount                                  671205 non-null float64
activity                                     671205 non-null object
sector                                       671205 non-null object
use                                          666973 non-null object
country_code                                 671197 non-null object
country                                      671205 non-null object
region                                       614405 non-null object
currency                                     671205 non-null object
partner_id                                   657698 non-null float64
posted_time                                  671205 non-null datetime64[ns]
disbursed_time                               668809 non-null da

In [359]:
unique_reg_con_pairs = kiva_loans_mpi[['region', 'country']].drop_duplicates()

In [364]:
unique_reg_con_pairs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12821 entries, 0 to 670844
Data columns (total 2 columns):
region     12749 non-null object
country    12821 non-null object
dtypes: object(2)
memory usage: 300.5+ KB


I extracted the unique (region, country) pairs above so that I can build a lookup table that maps each pair to a more appropriate subnational region, one that is more likely to have an associated MPI.

```python
from functools import lru_cache
import re

import os
import warnings

# Credentials must not live in the notebook: read the key from the environment.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')
if not api_key:
    # Make the misconfiguration visible; without a key the lookups will likely be
    # rejected and get_subregion would just fall back to returning its input.
    warnings.warn('GOOGLE_MAPS_API_KEY is not set; Google Places lookups will not work.')
autocomp_addr = 'https://maps.googleapis.com/maps/api/place/autocomplete/json?key=' + api_key + '&input='
search_addr = 'https://maps.googleapis.com/maps/api/place/textsearch/json?key=' + api_key + '&query='

s = reqs.Session()

#Caching not needed since each entry is unique
#@lru_cache(maxsize=None)
def get_subregion(region, country, verbose=False):
    """Look up the area that contains `region` using the Google Places web API.

    A text search is issued for "<region> <country>". If its status is not
    'OK', the autocomplete endpoint is asked for suggestions and the text
    search is retried with the first prediction. From the first result's
    `formatted_address`, the second-to-last comma-separated component is
    returned with any trailing digits stripped.

    Uses the module-level `s` (HTTP session), `search_addr` and `autocomp_addr`.

    Parameters
    ----------
    region, country : str
        Place name and its country. If either is not a string (e.g. NaN for a
        missing region), `region` is returned untouched without any request.
    verbose : bool, default False
        Print the queries and the raw responses of failed lookups.

    Returns
    -------
    The extracted component, or `region` unchanged when nothing usable is found.
    """
    from urllib.parse import quote_plus

    if verbose:
        print(str(region) + ' ' + str(country))
    if not (isinstance(region, str) and isinstance(country, str)):
        return region

    # quote_plus still turns spaces into '+', and additionally escapes characters
    # such as '&' or '#' that would otherwise corrupt the query string.
    search = quote_plus(region + ' ' + country)
    resp = s.get(search_addr + search).json()
    if resp['status'] != 'OK':
        if verbose:
            print('\t', resp)
        suggest_resp = s.get(autocomp_addr + search).json()
        predictions = suggest_resp.get('predictions') or []
        if suggest_resp['status'] != 'OK' or not predictions:
            if verbose:
                print('\tFAILED TO FIND', search)
                print('\t', suggest_resp)
            return region
        pred = predictions[0]['description']
        search = quote_plus(pred.replace(',', ''))
        if verbose:
            print('\tDid you mean "', search, '"?')
        resp = s.get(search_addr + search).json()
        if resp['status'] != 'OK':
            if verbose:
                print('\tFAILED TO FIND', search)
                print('\t', resp)
            return region

    results = resp.get('results') or []
    if not results or 'formatted_address' not in results[0]:
        return region
    addr = results[0]['formatted_address'].split(', ')
    if len(addr) > 1:
        # Drop trailing digits (e.g. a postal code) from the chosen component.
        return re.sub(r'\d+$', '', addr[-2]).strip()
    return region

# Pass the two columns explicitly. Splatting whole rows would hand any extra column
# (e.g. an already existing 'subnat_region' when the cell is re-run) to get_subregion
# as its third positional argument, `verbose`.
unique_reg_con_pairs['subnat_region'] = [
    get_subregion(region, country)
    for region, country in zip(unique_reg_con_pairs['region'], unique_reg_con_pairs['country'])
]
unique_reg_con_pairs.to_csv('subnat_regions.csv')
```

In [308]:
from functools import lru_cache
import requests as reqs

import os

# Credentials must not be stored in the notebook: read the key from the environment.
# (The get_subregion defined in this cell does not reference api_key.)
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')

@lru_cache(maxsize=None)
def get_subregion(region, country, verbose=False):
    """Guess the area containing `region` from a Google Maps search page.

    Fetches 'https://www.google.com/maps/search/<region>+<country>/', takes the
    first response line starting with 'cacheResponse', and searches it for a
    quoted string ending in '<word>, <country>'. That word is returned.

    Results are memoised per argument tuple by `lru_cache`.

    Parameters
    ----------
    region, country : str
        Place name and its country. If either has no `.split` (e.g. NaN or
        None), `region` is returned unchanged and no request is made.
    verbose : bool, default False
        Print the search string, the response object and details of bad input.

    Returns
    -------
    The matched word, or `region` unchanged when the page contains no match.
    """
    try:
        search = '+'.join(region.split(' ') + country.split(' '))
    except AttributeError:
        # Non-string input (missing values) cannot be searched for.
        if verbose:
            print(type(region), region, sep=": ")
            print(str(region) == 'nan')
            print(type(country), country, sep=": ")
        return region
    if verbose:
        print(search)
    resp = reqs.get('https://www.google.com/maps/search/' + search + '/')
    if verbose:
        print(resp)

    cache_resp = ''
    for raw_line in resp.iter_lines():
        line = raw_line.decode('ascii', errors='backslashreplace').strip()
        if line.startswith('cacheResponse'):
            # Keep what follows the 14-character 'cacheResponse(' prefix, minus the last two characters.
            cache_resp = line[14:-2]
            break

    # re.escape keeps regex metacharacters in the country name literal.
    pattern = r'(?<=")[\w\s]*(, )?(?P<subregion>[\w]+), ' + re.escape(country) + r'(?=")'
    m = re.search(pattern, cache_resp)
    if m is None:
        # Nothing recognisable on the page: fall back to the input instead of raising.
        return region
    return m.group('subregion')


    
#kiva_loans_mpi[['region', 'country']].apply(tuple, axis=1).apply(lambda x: get_subregion(*x))

'Sindh'

In [269]:
# Scan the response line by line for the first one that starts with 'cacheResponse'
# and keep its payload (characters 14 up to, but excluding, the last two).
cache_resp = None
for i in resp.iter_lines():
    i = i.decode('ascii', errors='backslashreplace').strip()
    if not i.startswith('cacheResponse'):
        continue
    cache_resp = i[14:-2]
    break
print(cache_resp)

[[[52525.68908656373,70.65149819999999,30.04420025],[0,0,0],[1024,730],13.10000038146973],"/maps-lite/js/2/ml_20180417_0",107,"!1b0!2b1!3s!4b1!5b0!6b1!7s!8b1!9i0!10b1!11b1!13b0!14b0!15b0!16b0!17b1!18b1!19b0!20b0!21b0!22s1!23zMSwxNywxOCwy!24s!25s!26b0!27b0!28b0!29i2000!30b1!32b0!34i0!36b0!37sNONE!38b0!39b1!40b0!41i0!42s!43b0!44i0!45i3600!46i0!47i10!48b1!49b0!50b0!52i0!53i2000000000!54b0!58i0!60i0!61i0!62b0!63b1",null,["en","us"],["/maps/lite/ApplicationService.GetEntityDetails","/maps/lite/ApplicationService.UpdateStarring","/maps/lite/ApplicationService.Search",null,"/maps/lite/suggest","/maps/lite/directions","/maps/lite/MapsLiteService.GetHotelAvailability","/maps/lite/MapsLiteService.GetSharedLocations","https://www.google.com/maps/api/js/reviews?key=AIzaSyCNWEtGyeVduDK_k5UOq8iBk-qP8G4TJL0\u0026language=en","/maps/lite/reviews","/maps/timeline/_rpc/mas","/maps/timeline/_rpc/pc","//maps.gstatic.com","//www.gstatic.com","/maps/preview/placeactions/writeaction"],[[[2,"spotlight",null,n

In [281]:
import re

country = 'Pakistan'
# Raw strings keep '\w' and '\s' as regex escapes without invalid-escape warnings;
# re.escape stops regex metacharacters in a country name from being interpreted.
m = re.search(r'(?<=")[\w\s]*(, )?(?P<subregion>[\w]+), ' + re.escape(country) + r'(?=")', cache_resp)

In [285]:
m.group(m.lastgroup)

AttributeError: '_sre.SRE_Match' object has no attribute 'match'

- mpi_LocationName: Not very useful with country and region features
- mpi_ISO: Similar to country_code, not very useful
- mpi_country: Same as country on `kiva_loans`
- mpi_region: Same as region on `kiva_loans`
- mpi_world_region: more broad than country, 6 values, could be useful in identifying global trends
- mpi_MPI: very useful for welfare, provides metric to the region for general poverty
- mpi_geo: basically useless with lat and lon features
- mpi_lat: might be useful to find linear trends with lat & lon, but not a very good metric for region based things
- mpi_lon: same as lat

In [144]:
# Keep only the rows that have an MPI value.
kiva_mpi = kiva_mpi.dropna(subset=['mpi_MPI'])

In [147]:
kiva_mpi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 1007
Data columns (total 9 columns):
mpi_LocationName    984 non-null object
mpi_ISO             984 non-null object
mpi_country         984 non-null object
mpi_region          984 non-null object
mpi_world_region    984 non-null object
mpi_MPI             984 non-null float64
mpi_geo             984 non-null object
mpi_lat             892 non-null float64
mpi_lon             892 non-null float64
dtypes: float64(3), object(6)
memory usage: 76.9+ KB


In [153]:
kiva_loans_mpi = kiva_loans.merge(kiva_mpi, how='left', left_on=['region', 'country'], right_on=['mpi_region', 'mpi_country'])

In [154]:
kiva_loans_mpi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671205 entries, 0 to 671204
Data columns (total 68 columns):
id                            671205 non-null int64
funded_amount                 671205 non-null float64
loan_amount                   671205 non-null float64
activity                      671205 non-null object
sector                        671205 non-null object
use                           666973 non-null object
country_code                  671197 non-null object
country                       671205 non-null object
region                        614405 non-null object
currency                      671205 non-null object
partner_id                    657698 non-null float64
posted_time                   671205 non-null datetime64[ns]
disbursed_time                668809 non-null datetime64[ns]
funded_time                   622874 non-null datetime64[ns]
term_in_months                671205 non-null float64
lender_count                  671205 non-null int64
tags           

In [114]:
kiva_data['kiva_loans']['region'].unique()

array(['Lahore', 'Maynaguri', 'Abdul Hakeem', ..., 'Gbenikoro Village',
       'Morimaraia', 'alejandria'], dtype=object)

In [126]:
import numpy as np
vec_lower = np.vectorize(str.lower)
# `'country' and 'region'` is a boolean expression that evaluates to just 'region',
# so only that column was ever selected; name it explicitly.
kiva_data['kiva_mpi_region_locations']['region'].unique()

array(['Badakhshan', 'Badghis', 'Baghlan', 'Balkh', 'Bamyan', 'Daykundi',
       'Farah', 'Faryab', 'Ghazni', 'Ghor', 'Helmand', 'Herat', 'Jawzjan',
       'Kabul', 'Kandahar', 'Kapisa', 'Khost', 'Kunarha', 'Kunduz',
       'Laghman', 'Logar', 'Nangarhar', 'Nimroz', 'Nooristan', 'Paktika',
       'Paktya', 'Panjsher', 'Parwan', 'Samangan', 'Sar-E-Pul', 'Takhar',
       'Urozgan', 'Wardak', 'Zabul', nan, 'Bujumbura Mairie', 'Nord',
       'Centre-Est', 'Ouest', 'Sud', 'Alibori', 'Atacora', 'Atlantique',
       'Borgou', 'Collines', 'Couffo', 'Donga', 'Littoral', 'Mono',
       'Ouðmð', 'Plateau', 'Zou', 'Boucle de mouhoun', 'Cascades',
       'Centre', 'Centre-est', 'Centre-nord', 'Centre-ouest',
       'Centre-sud', 'Est', 'Hauts basins', 'Plateau central', 'Sahel',
       'Sud-ouest', 'Barisal', 'Chittagong', 'Dhaka', 'Khulna',
       'Rajshahi', 'Rangpur', 'Sylhet', 'Corozal', 'Orange Walk',
       'Belize (excluding Belize City South Side)', 'Stann Creek',
       'Toledo', 'Cayo', '

In [43]:
import time as t

time_cols = ['posted_time', 'disbursed_time', 'funded_time', 'date']

# Convert each timestamp column to datetime and print the value at index label 0
# as a quick spot check of the parse.
for col in time_cols:
    parsed = pd.to_datetime(kiva_loan_data_region[col])
    kiva_loan_data_region[col] = parsed
    print(parsed[0])

2014-01-02 14:25:08
2013-12-17 08:00:00
2014-01-08 22:07:48
2014-01-02 00:00:00


In [45]:
kiva_loan_data_region.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52197 entries, 0 to 52196
Data columns (total 27 columns):
id                    52197 non-null int64
funded_amount         52197 non-null float64
loan_amount           52197 non-null float64
activity              52197 non-null object
sector                52197 non-null object
use                   52004 non-null object
country_code          52197 non-null object
country               52197 non-null object
region                50955 non-null object
currency              52197 non-null object
partner_id            52197 non-null float64
posted_time           52197 non-null datetime64[ns]
disbursed_time        52197 non-null datetime64[ns]
funded_time           48681 non-null datetime64[ns]
term_in_months        52197 non-null float64
lender_count          52197 non-null int64
tags                  39273 non-null object
borrower_genders      52005 non-null object
repayment_interval    52197 non-null object
date                  52197 n

In [86]:
%matplotlib inline
kiva_loan_data_region['geo'].describe()#.hist(xrot=90, figsize=(20, 7))

count                       52197
unique                        109
top       (10.5104642, 7.4165053)
freq                        10000
Name: geo, dtype: object

In [90]:
kiva_data['loan_themes_by_region'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15736 entries, 0 to 15735
Data columns (total 21 columns):
Partner ID            15736 non-null int64
Field Partner Name    15736 non-null object
sector                15736 non-null object
Loan Theme ID         15736 non-null object
Loan Theme Type       15736 non-null object
country               15736 non-null object
forkiva               15736 non-null object
region                15736 non-null object
geocode_old           1200 non-null object
ISO                   15722 non-null object
number                15736 non-null int64
amount                15736 non-null int64
LocationName          15736 non-null object
geocode               13662 non-null object
names                 13661 non-null object
geo                   15736 non-null object
lat                   13662 non-null float64
lon                   13662 non-null float64
mpi_region            15722 non-null object
mpi_geo               9671 non-null object
rural_pct     

In [91]:
kiva_data['loan_theme_ids'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779092 entries, 0 to 779091
Data columns (total 4 columns):
id                 779092 non-null int64
Loan Theme ID      764279 non-null object
Loan Theme Type    764279 non-null object
Partner ID         764279 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 23.8+ MB


In [92]:
# Preview the first rows only instead of rendering the whole 15,736-row table.
kiva_data['loan_themes_by_region'].head(10)

Unnamed: 0,Partner ID,Field Partner Name,sector,Loan Theme ID,Loan Theme Type,country,forkiva,region,geocode_old,ISO,...,amount,LocationName,geocode,names,geo,lat,lon,mpi_region,mpi_geo,rural_pct
0,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Banteay Meanchey,"(13.75, 103.0)",KHM,...,450,"Banteay Meanchey, Cambodia","[(13.6672596, 102.8975098)]",Banteay Meanchey Province; Cambodia,"(13.6672596, 102.8975098)",13.667260,102.897510,"Banteay Mean Chey, Cambodia","(13.6672596, 102.8975098)",90.0
1,9,KREDIT Microfinance Institution,General Financial Inclusion,a10500000068jPe,Vulnerable Populations,Cambodia,No,Battambang Province,,KHM,...,20275,"Battambang Province, Cambodia","[(13.0286971, 102.989615)]",Battambang Province; Cambodia,"(13.0286971, 102.989615)",13.028697,102.989615,"Banteay Mean Chey, Cambodia","(13.6672596, 102.8975098)",90.0
2,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Battambang Province,,KHM,...,9150,"Battambang Province, Cambodia","[(13.0286971, 102.989615)]",Battambang Province; Cambodia,"(13.0286971, 102.989615)",13.028697,102.989615,"Banteay Mean Chey, Cambodia","(13.6672596, 102.8975098)",90.0
3,9,KREDIT Microfinance Institution,General Financial Inclusion,a10500000068jPe,Vulnerable Populations,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,...,604950,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.098292,105.313119,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
4,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000002X1Uu,Sanitation,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,...,275,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.098292,105.313119,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
5,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,...,62225,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.098292,105.313119,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
6,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000007VvXr,Solar Home Systems,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,...,1300,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.098292,105.313119,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
7,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000weyk,General,Cambodia,No,Kampong Chhnang Province,"(12.0, 104.5)",KHM,...,237175,"Kampong Chhnang Province, Cambodia","[(12.1392352, 104.5655273)]",Kampong Chhnang Province; Cambodia,"(12.1392352, 104.5655273)",12.139235,104.565527,"Kampong Chhnang, Cambodia","(12.1392352, 104.5655273)",90.0
8,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000007VvXr,Solar Home Systems,Cambodia,No,Kampong Chhnang Province,"(12.0, 104.5)",KHM,...,3050,"Kampong Chhnang Province, Cambodia","[(12.1392352, 104.5655273)]",Kampong Chhnang Province; Cambodia,"(12.1392352, 104.5655273)",12.139235,104.565527,"Kampong Chhnang, Cambodia","(12.1392352, 104.5655273)",90.0
9,9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Kampong Chhnang Province,"(12.0, 104.5)",KHM,...,31425,"Kampong Chhnang Province, Cambodia","[(12.1392352, 104.5655273)]",Kampong Chhnang Province; Cambodia,"(12.1392352, 104.5655273)",12.139235,104.565527,"Kampong Chhnang, Cambodia","(12.1392352, 104.5655273)",90.0
