In [1]:
import pandas as pd
import numpy as np

# Datasets

In [2]:
# Kiva data
kiva_loans_o = pd.read_csv('kiva-data/kiva_loans.csv')
kiva_mpi_region_locations_o = pd.read_csv('kiva-data/kiva_mpi_region_locations.csv')
loan_theme_ids_o = pd.read_csv('kiva-data/loan_theme_ids.csv')
loan_themes_by_region_o = pd.read_csv('kiva-data/loan_themes_by_region.csv')

# OPHI MPI data
MPI_national_o = pd.read_csv('mpi-data/MPI_national.csv')
MPI_subnational_o = pd.read_csv('mpi-data/MPI_subnational.csv')

# Dataset Cleaning Process

## `kiva_loans`

In [3]:
kiva_loans = kiva_loans_o.copy()
kiva_loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 20 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666973 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null object
disbursed_time        668809 non-null object
funded_time           622874 non-null object
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  499789 non-null object
borrower_genders      666984 non-null object
repayment_interval    671205 non-null object
date                  671205 non

In [4]:
# Times
time_cols = ['posted_time', 'disbursed_time', 'funded_time', 'date']
for col in time_cols:
    kiva_loans[col] = pd.to_datetime(kiva_loans[col])
    
# Lists
def to_list(strg, sep = ', '):
    if strg == 'nan':
        return []
    return strg.split(sep=sep)

list_cols = ['tags', 'borrower_genders']
for col in list_cols:
    kiva_loans[col] = kiva_loans[col].astype(str).apply(to_list)
    
kiva_loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 20 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666973 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null datetime64[ns]
disbursed_time        668809 non-null datetime64[ns]
funded_time           622874 non-null datetime64[ns]
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  671205 non-null object
borrower_genders      671205 non-null object
repayment_interval    671205 non-null object
date    

In [5]:
(kiva_loans['posted_time'] - kiva_loans['date']).max()

Timedelta('0 days 23:59:59')

Note that the `date` is just a less precise version of `posted_time`, so `date` can be dropped

In [6]:
kiva_loans.drop('date', axis=1, inplace=True)
kiva_loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671205 entries, 0 to 671204
Data columns (total 19 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666973 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null datetime64[ns]
disbursed_time        668809 non-null datetime64[ns]
funded_time           622874 non-null datetime64[ns]
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  671205 non-null object
borrower_genders      671205 non-null object
repayment_interval    671205 non-null object
dtypes: 

The rest of the dataset looks good, some fields may be unusable/useless, but they can be removed later, since this is the main dataset.

## `kiva_mpi_region_locations`

In [7]:
kiva_mpi_region_locations = kiva_mpi_region_locations_o.copy()
kiva_mpi_region_locations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2772 entries, 0 to 2771
Data columns (total 9 columns):
LocationName    984 non-null object
ISO             1008 non-null object
country         1008 non-null object
region          984 non-null object
world_region    1008 non-null object
MPI             984 non-null float64
geo             2772 non-null object
lat             892 non-null float64
lon             892 non-null float64
dtypes: float64(3), object(6)
memory usage: 195.0+ KB


In [8]:
kiva_mpi_region_locations = kiva_mpi_region_locations[pd.notnull(kiva_mpi_region_locations['MPI'])]
kiva_mpi_region_locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 1007
Data columns (total 9 columns):
LocationName    984 non-null object
ISO             984 non-null object
country         984 non-null object
region          984 non-null object
world_region    984 non-null object
MPI             984 non-null float64
geo             984 non-null object
lat             892 non-null float64
lon             892 non-null float64
dtypes: float64(3), object(6)
memory usage: 76.9+ KB


In [9]:
kiva_mpi_region_locations[['geo', 'lat', 'lon']].sample(5)

Unnamed: 0,geo,lat,lon
824,"(13.8648288, -88.7493998)",13.864829,-88.7494
97,"(-15.7997654, -47.8644715)",-15.799765,-47.864472
279,"(1000.0, 1000.0)",,
647,"(-18.271048, 18.4276047)",-18.271048,18.427605
302,"(1000.0, 1000.0)",,


Note how `geo` is just a more complicated version of `lat` and `lon`, so it can be dropped

In [10]:
kiva_mpi_region_locations.drop('geo', axis=1, inplace=True)
kiva_mpi_region_locations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 1007
Data columns (total 8 columns):
LocationName    984 non-null object
ISO             984 non-null object
country         984 non-null object
region          984 non-null object
world_region    984 non-null object
MPI             984 non-null float64
lat             892 non-null float64
lon             892 non-null float64
dtypes: float64(3), object(5)
memory usage: 69.2+ KB


## `loan_theme_ids`

In [11]:
loan_theme_ids = loan_theme_ids_o.copy()
loan_theme_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779092 entries, 0 to 779091
Data columns (total 4 columns):
id                 779092 non-null int64
Loan Theme ID      764279 non-null object
Loan Theme Type    764279 non-null object
Partner ID         764279 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 23.8+ MB


In [12]:
loan_theme_ids[~pd.notnull(loan_theme_ids['Loan Theme ID'])].sample(10)

Unnamed: 0,id,Loan Theme ID,Loan Theme Type,Partner ID
423690,1082878,,,
435821,1095024,,,
422154,1081342,,,
421159,1080347,,,
425681,1084869,,,
679520,1341862,,,
429806,1088994,,,
428145,1087333,,,
431129,1090317,,,
420784,1079790,,,


In [13]:
loan_theme_ids = loan_theme_ids_o[pd.notnull(loan_theme_ids_o['Loan Theme ID'])]
loan_theme_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 764279 entries, 0 to 779091
Data columns (total 4 columns):
id                 764279 non-null int64
Loan Theme ID      764279 non-null object
Loan Theme Type    764279 non-null object
Partner ID         764279 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 29.2+ MB


## `loan_themes_by_region`

In [14]:
loan_themes_by_region = loan_themes_by_region_o.copy()
loan_themes_by_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15736 entries, 0 to 15735
Data columns (total 21 columns):
Partner ID            15736 non-null int64
Field Partner Name    15736 non-null object
sector                15736 non-null object
Loan Theme ID         15736 non-null object
Loan Theme Type       15736 non-null object
country               15736 non-null object
forkiva               15736 non-null object
region                15736 non-null object
geocode_old           1200 non-null object
ISO                   15722 non-null object
number                15736 non-null int64
amount                15736 non-null int64
LocationName          15736 non-null object
geocode               13662 non-null object
names                 13661 non-null object
geo                   15736 non-null object
lat                   13662 non-null float64
lon                   13662 non-null float64
mpi_region            15722 non-null object
mpi_geo               9671 non-null object
rural_pct     

Any data without a `mpi_region` is useless to us, so we'll drop all rows with null values for that field

In [15]:
loan_themes_by_region = loan_themes_by_region[pd.notnull(loan_themes_by_region['mpi_region'])]
loan_themes_by_region.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15722 entries, 0 to 15735
Data columns (total 21 columns):
Partner ID            15722 non-null int64
Field Partner Name    15722 non-null object
sector                15722 non-null object
Loan Theme ID         15722 non-null object
Loan Theme Type       15722 non-null object
country               15722 non-null object
forkiva               15722 non-null object
region                15722 non-null object
geocode_old           1200 non-null object
ISO                   15722 non-null object
number                15722 non-null int64
amount                15722 non-null int64
LocationName          15722 non-null object
geocode               13648 non-null object
names                 13647 non-null object
geo                   15722 non-null object
lat                   13648 non-null float64
lon                   13648 non-null float64
mpi_region            15722 non-null object
mpi_geo               9671 non-null object
rural_pct     

In [16]:
loan_themes_by_region[['geocode_old', 'geocode', 'geo', 'lat', 'lon', 'mpi_region', 'mpi_geo']].sample(10)

Unnamed: 0,geocode_old,geocode,geo,lat,lon,mpi_region,mpi_geo
1657,,"[(31.417139, 34.350931)]","(31.417139, 34.350931)",31.417139,34.350931,PSE,
3369,,"[(17.2928979, 121.7594384)]","(17.2928979, 121.7594384)",17.292898,121.759438,"Cagayan Valley, Philippines","(16.9753758, 121.8107079)"
13630,,"[(5.612781, -0.234345)]","(5.612781, -0.234345)",5.612781,-0.234345,"Greater Accra, Ghana","(5.8142836, 0.0746767)"
677,"(-16.5, -68.15)","[(-16.489689, -68.11929359999999)]","(-16.489689, -68.11929359999999)",-16.489689,-68.119294,"La Paz, Bolivia, Plurinational State of","(-16.489689, -68.11929359999999)"
14507,,"[(18.7338573, -72.41693769999999)]","(18.7338573, -72.41693769999999)",18.733857,-72.416938,"Centre, Haiti","(18.9582742, -72.0468164)"
3318,"(31.5, 34.4666667)","[(31.3546763, 34.3088255)]","(31.3546763, 34.3088255)",31.354676,34.308825,PSE,
15700,,"[(-17.4037042, -66.0404057)]","(-17.4037042, -66.0404057)",-17.403704,-66.040406,"Cochabamba, Bolivia, Plurinational State of","(-17.4139766, -66.1653224)"
900,"(-25.2666667, -57.5666667)","[(-25.2415033, -57.48535609999999)]","(-25.2415033, -57.48535609999999)",-25.241503,-57.485356,PRY,
4759,,"[(8.5919119, 123.365447)]","(8.5919119, 123.365447)",8.591912,123.365447,"Northern Mindanao, Philippines","(8.020163499999999, 124.6856509)"
8076,,"[(-1.9515418, 30.1098472)]","(-1.9515418, 30.1098472)",-1.951542,30.109847,"Kigali City, Rwanda","(-1.9705786, 30.1044288)"


Notice that `geocode` and `geo` are basically indentical, and they are both just more complex versions of `lat` and `lon`, so they can be dropped. `geocode_old` is also pretty useless, since it is mostly empty anyways, so it can also be dropped. `mpi_geo` might not be useful either, but it can be dropped later if need be.

In [17]:
loan_themes_by_region.drop(['geocode_old', 'geocode', 'geo'], axis=1, inplace=True)
loan_themes_by_region.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15722 entries, 0 to 15735
Data columns (total 18 columns):
Partner ID            15722 non-null int64
Field Partner Name    15722 non-null object
sector                15722 non-null object
Loan Theme ID         15722 non-null object
Loan Theme Type       15722 non-null object
country               15722 non-null object
forkiva               15722 non-null object
region                15722 non-null object
ISO                   15722 non-null object
number                15722 non-null int64
amount                15722 non-null int64
LocationName          15722 non-null object
names                 13647 non-null object
lat                   13648 non-null float64
lon                   13648 non-null float64
mpi_region            15722 non-null object
mpi_geo               9671 non-null object
rural_pct             14331 non-null float64
dtypes: float64(3), int64(3), object(12)
memory usage: 2.3+ MB


## `MPI_national`

In [18]:
MPI_national = MPI_national_o.copy()
MPI_national.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 8 columns):
ISO                               102 non-null object
Country                           102 non-null object
MPI Urban                         102 non-null float64
Headcount Ratio Urban             102 non-null float64
Intensity of Deprivation Urban    102 non-null float64
MPI Rural                         102 non-null float64
Headcount Ratio Rural             102 non-null float64
Intensity of Deprivation Rural    102 non-null float64
dtypes: float64(6), object(2)
memory usage: 6.5+ KB


## `MPI_subnational`

In [19]:
MPI_subnational = MPI_subnational_o.copy()
MPI_subnational.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 8 columns):
ISO country code                     984 non-null object
Country                              984 non-null object
Sub-national region                  984 non-null object
World region                         984 non-null object
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional    983 non-null float64
dtypes: float64(4), object(4)
memory usage: 61.6+ KB


# Data Cleaning Code (for reseting datasets)

In [20]:
# kiva_loans
kiva_loans = kiva_loans_o.copy()

# Times
time_cols = ['posted_time', 'disbursed_time', 'funded_time', 'date']
for col in time_cols:
    kiva_loans[col] = pd.to_datetime(kiva_loans[col])
    
# Lists
def to_list(strg, sep = ', '):
    if strg == 'nan':
        return []
    return strg.split(sep=sep)

list_cols = ['tags', 'borrower_genders']
for col in list_cols:
    kiva_loans[col] = kiva_loans[col].astype(str).apply(to_list)

kiva_loans.drop('date', axis=1, inplace=True)

# kiva_mpi_region_locations
kiva_mpi_region_locations = kiva_mpi_region_locations_o.copy()
kiva_mpi_region_locations = kiva_mpi_region_locations[pd.notnull(kiva_mpi_region_locations['MPI'])]
kiva_mpi_region_locations.drop('geo', axis=1, inplace=True)

# loan_theme_ids
loan_theme_ids = loan_theme_ids_o.copy()
loan_theme_ids = loan_theme_ids_o[pd.notnull(loan_theme_ids_o['Loan Theme ID'])]

# loan_themes_by_region
loan_themes_by_region = loan_themes_by_region_o.copy()
loan_themes_by_region = loan_themes_by_region[pd.notnull(loan_themes_by_region['mpi_region'])]
loan_themes_by_region.drop(['geocode_old', 'geocode', 'geo'], axis=1, inplace=True)

# MPI_national
MPI_national = MPI_national_o.copy()

# MPI_subnational
MPI_subnational = MPI_subnational_o.copy()

# Merging datasets

In [21]:
merged_data = kiva_loans.merge(
    loan_theme_ids,
    how='left',
    on='id',
    suffixes=('_kl','_lti')
)
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671205 entries, 0 to 671204
Data columns (total 22 columns):
id                    671205 non-null int64
funded_amount         671205 non-null float64
loan_amount           671205 non-null float64
activity              671205 non-null object
sector                671205 non-null object
use                   666973 non-null object
country_code          671197 non-null object
country               671205 non-null object
region                614405 non-null object
currency              671205 non-null object
partner_id            657698 non-null float64
posted_time           671205 non-null datetime64[ns]
disbursed_time        668809 non-null datetime64[ns]
funded_time           622874 non-null datetime64[ns]
term_in_months        671205 non-null float64
lender_count          671205 non-null int64
tags                  671205 non-null object
borrower_genders      671205 non-null object
repayment_interval    671205 non-null object
Loan The

In [22]:
merged_data = merged_data.merge(
    loan_themes_by_region,
    how='left',
    on=['Loan Theme ID', 'Partner ID', 'region'],
    suffixes=('', '_ltbr')
)
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 671205 entries, 0 to 671204
Data columns (total 37 columns):
id                      671205 non-null int64
funded_amount           671205 non-null float64
loan_amount             671205 non-null float64
activity                671205 non-null object
sector                  671205 non-null object
use                     666973 non-null object
country_code            671197 non-null object
country                 671205 non-null object
region                  614405 non-null object
currency                671205 non-null object
partner_id              657698 non-null float64
posted_time             671205 non-null datetime64[ns]
disbursed_time          668809 non-null datetime64[ns]
funded_time             622874 non-null datetime64[ns]
term_in_months          671205 non-null float64
lender_count            671205 non-null int64
tags                    671205 non-null object
borrower_genders        671205 non-null object
repayment_interva

Remove all data without a `mpi_region`, since we need the more granular data.

In [23]:
merged_data = merged_data[pd.notnull(merged_data['mpi_region'])]
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 1 to 671149
Data columns (total 37 columns):
id                      533461 non-null int64
funded_amount           533461 non-null float64
loan_amount             533461 non-null float64
activity                533461 non-null object
sector                  533461 non-null object
use                     533456 non-null object
country_code            533453 non-null object
country                 533461 non-null object
region                  533461 non-null object
currency                533461 non-null object
partner_id              533461 non-null float64
posted_time             533461 non-null datetime64[ns]
disbursed_time          533461 non-null datetime64[ns]
funded_time             499549 non-null datetime64[ns]
term_in_months          533461 non-null float64
lender_count            533461 non-null int64
tags                    533461 non-null object
borrower_genders        533461 non-null object
repayment_interva

In [24]:
(merged_data['sector'] != merged_data['sector_ltbr']).sum()

517133

These columns actually have rather different values, so I will leave both. Let's look at them though, to see if they are actually similar, just formatted differently

In [25]:
merged_data[['sector', 'sector_ltbr']].sample(15)

Unnamed: 0,sector,sector_ltbr
357580,Agriculture,Agriculture
465068,Arts,General Financial Inclusion
670239,Services,General Financial Inclusion
561466,Health,General Financial Inclusion
437891,Retail,General Financial Inclusion
47685,Agriculture,General Financial Inclusion
332434,Agriculture,Other
235003,Agriculture,General Financial Inclusion
649534,Retail,General Financial Inclusion
612482,Agriculture,General Financial Inclusion


In [26]:
temp = merged_data[merged_data['sector_ltbr'] != 'General Financial Inclusion']
temp = temp[temp['sector'] != temp['sector_ltbr']][['sector', 'sector_ltbr']]
temp.sample(10)

Unnamed: 0,sector,sector_ltbr
606715,Personal Use,Water and Sanitation
271534,Personal Use,Water and Sanitation
84856,Education,Clean Energy
421731,Personal Use,Clean Energy
360620,Services,Mobile Money and ICT
567539,Services,Mobile Money and ICT
502694,Personal Use,Water and Sanitation
309916,Personal Use,Water and Sanitation
471916,Food,Other
502501,Personal Use,Water and Sanitation


We can see here that many of the rows that don't have "General Financial Inclusion" and that `sector` and `sector_ltbr` don't match, have either "Personal Use" under `sector` or "other" under `sector_ltbr`. Let's see if any data does not follow this trend

In [27]:
temp = temp[temp['sector'] != 'Personal Use']
temp = temp[temp['sector_ltbr'] != 'other']
temp = temp[temp['sector_ltbr'] != 'Other']
temp.sample(10)

Unnamed: 0,sector,sector_ltbr
250490,Agriculture,Water and Sanitation
242135,Arts,Artisan
286928,Agriculture,Water and Sanitation
375440,Services,Mobile Money and ICT
142492,Education,Clean Energy
116056,Retail,Mobile Money and ICT
525466,Education,Agriculture
578987,Services,Mobile Money and ICT
71803,Agriculture,Education
198076,Arts,Artisan


In [28]:
len(temp)

11348

These last few (11,348) entries have rather conflicting results, so I'll leave both features here for now, but if we want to combine them, it shouldn't be too hard, and we would only lose 11k entries

In [29]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 1 to 671149
Data columns (total 37 columns):
id                      533461 non-null int64
funded_amount           533461 non-null float64
loan_amount             533461 non-null float64
activity                533461 non-null object
sector                  533461 non-null object
use                     533456 non-null object
country_code            533453 non-null object
country                 533461 non-null object
region                  533461 non-null object
currency                533461 non-null object
partner_id              533461 non-null float64
posted_time             533461 non-null datetime64[ns]
disbursed_time          533461 non-null datetime64[ns]
funded_time             499549 non-null datetime64[ns]
term_in_months          533461 non-null float64
lender_count            533461 non-null int64
tags                    533461 non-null object
borrower_genders        533461 non-null object
repayment_interva

In [30]:
(merged_data['Loan Theme Type'] != merged_data['Loan Theme Type_ltbr']).sum()

68

There seems to be some uniqueness within these features, so let's see what they are

In [31]:
merged_data[merged_data['Loan Theme Type'] != merged_data['Loan Theme Type_ltbr']][['Loan Theme Type','Loan Theme Type_ltbr']].drop_duplicates()

Unnamed: 0,Loan Theme Type,Loan Theme Type_ltbr
70912,Zaf̬n,ZafÍÎn


Since the `_ltbr` version of the dataset has more to do with merging, I will just keep it, but remove the 'lbtr' from the end

In [32]:
merged_data['Loan Theme Type'] = merged_data['Loan Theme Type_ltbr']
merged_data.drop('Loan Theme Type_ltbr', axis=1, inplace=True)
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 1 to 671149
Data columns (total 36 columns):
id                    533461 non-null int64
funded_amount         533461 non-null float64
loan_amount           533461 non-null float64
activity              533461 non-null object
sector                533461 non-null object
use                   533456 non-null object
country_code          533453 non-null object
country               533461 non-null object
region                533461 non-null object
currency              533461 non-null object
partner_id            533461 non-null float64
posted_time           533461 non-null datetime64[ns]
disbursed_time        533461 non-null datetime64[ns]
funded_time           499549 non-null datetime64[ns]
term_in_months        533461 non-null float64
lender_count          533461 non-null int64
tags                  533461 non-null object
borrower_genders      533461 non-null object
repayment_interval    533461 non-null object
Loan The

In [33]:
(merged_data['country'] != merged_data['country_ltbr']).sum()

0

These are identical so I can just drop one

In [34]:
merged_data.drop('country_ltbr', axis=1, inplace=True)
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 1 to 671149
Data columns (total 35 columns):
id                    533461 non-null int64
funded_amount         533461 non-null float64
loan_amount           533461 non-null float64
activity              533461 non-null object
sector                533461 non-null object
use                   533456 non-null object
country_code          533453 non-null object
country               533461 non-null object
region                533461 non-null object
currency              533461 non-null object
partner_id            533461 non-null float64
posted_time           533461 non-null datetime64[ns]
disbursed_time        533461 non-null datetime64[ns]
funded_time           499549 non-null datetime64[ns]
term_in_months        533461 non-null float64
lender_count          533461 non-null int64
tags                  533461 non-null object
borrower_genders      533461 non-null object
repayment_interval    533461 non-null object
Loan The

The rest of the features contain unique data, so I will just leave them as they are.

Below is the data flow for merging all of the MPI related data, which will later be merged with this main data to create one nice, clean dataset.

In [35]:
mpi_info = kiva_mpi_region_locations.merge(
    MPI_subnational,
    how='outer',
    left_on=['country', 'region', 'world_region'],
    right_on=['Country', 'Sub-national region', 'World region'],
    suffixes=('', '_ms')
)
mpi_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 16 columns):
LocationName                         984 non-null object
ISO                                  984 non-null object
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
MPI                                  984 non-null float64
lat                                  892 non-null float64
lon                                  892 non-null float64
ISO country code                     984 non-null object
Country                              984 non-null object
Sub-national region                  984 non-null object
World region                         984 non-null object
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional 

Remove merged on features `Country`, `Sub-national region`, and `World region`

In [36]:
mpi_info.drop(['Country', 'Sub-national region', 'World region'], axis=1, inplace=True)
mpi_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 13 columns):
LocationName                         984 non-null object
ISO                                  984 non-null object
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
MPI                                  984 non-null float64
lat                                  892 non-null float64
lon                                  892 non-null float64
ISO country code                     984 non-null object
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional    983 non-null float64
dtypes: float64(7), object(6)
memory usage: 107.6+ KB


In [37]:
(mpi_info['ISO'] != mpi_info['ISO country code']).sum()

0

`ISO` and `ISO country code` are identical, so one can be removed

In [38]:
mpi_info.drop('ISO country code', axis=1, inplace=True)
mpi_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 12 columns):
LocationName                         984 non-null object
ISO                                  984 non-null object
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
MPI                                  984 non-null float64
lat                                  892 non-null float64
lon                                  892 non-null float64
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional    983 non-null float64
dtypes: float64(7), object(5)
memory usage: 99.9+ KB


In [39]:
(mpi_info['MPI'] != mpi_info['MPI Regional']).sum()

0

`MPI` and `MPI Regional` are identical, so one can be removed

In [40]:
mpi_info.drop('MPI', axis=1, inplace=True)
mpi_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 11 columns):
LocationName                         984 non-null object
ISO                                  984 non-null object
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
lat                                  892 non-null float64
lon                                  892 non-null float64
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional    983 non-null float64
dtypes: float64(6), object(5)
memory usage: 92.2+ KB


In [41]:
mpi_info = mpi_info.merge(
    MPI_national,
    how='left',
    left_on=['country', 'ISO'],
    right_on=['Country', 'ISO'],
    suffixes=('', '_mn')
)
mpi_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 18 columns):
LocationName                         984 non-null object
ISO                                  984 non-null object
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
lat                                  892 non-null float64
lon                                  892 non-null float64
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional    983 non-null float64
Country                              984 non-null object
MPI Urban                            984 non-null float64
Headcount Ratio Urban                984 non-null float64
Intensity of Deprivation Urban       984 non-null float64
MPI Rural                      

Again, drop the columns that were merged on from the second dataset (not `ISO` since it was identical and there is still only 1 column with that value)

In [42]:
mpi_info.drop(['Country'], axis=1, inplace=True)

In [43]:
mpi_info[[key for key in mpi_info.keys() if 'mpi' in key.lower()]].sample(5)

Unnamed: 0,MPI National,MPI Regional,MPI Urban,MPI Rural
737,0.043,0.038,0.011,0.113
895,0.36,0.444,0.176,0.414
645,0.605,0.646,0.276,0.669
165,0.248,0.024,0.091,0.393
412,0.045,0.038,0.028,0.083


Notice that all features with *'mpi'* in the name are different, so they are all good to stay

In [44]:
mpi_info[[key for key in mpi_info.keys() if 'head' in key.lower()]].sample(5)

Unnamed: 0,Headcount Ratio Regional,Headcount Ratio Urban,Headcount Ratio Rural
323,24.2,10.7,35.05
873,88.1,64.8,93.41
76,13.6,1.6,6.98
440,1.8,1.7,1.81
807,7.4,2.4,12.51


The same goes for the features with *'head'* in the name

In [45]:
mpi_info[[key for key in mpi_info.keys() if 'intensity' in key.lower()]].sample(5)

Unnamed: 0,Intensity of deprivation Regional,Intensity of Deprivation Urban,Intensity of Deprivation Rural
157,54.0,46.8,55.0
684,41.1,47.0,59.5
974,36.8,39.3,43.7
214,47.3,45.2,48.5
8,51.6,45.8,53.6


And the same for *'intensity'*

So this is the final dataset for all MPI information, so now it's time to merge it with the main dataset. First, I want to convert the features into `snake_case` because this is python, and also what I prefer

In [46]:
mpi_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 17 columns):
LocationName                         984 non-null object
ISO                                  984 non-null object
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
lat                                  892 non-null float64
lon                                  892 non-null float64
MPI National                         984 non-null float64
MPI Regional                         984 non-null float64
Headcount Ratio Regional             984 non-null float64
Intensity of deprivation Regional    983 non-null float64
MPI Urban                            984 non-null float64
Headcount Ratio Urban                984 non-null float64
Intensity of Deprivation Urban       984 non-null float64
MPI Rural                            984 non-null float64
Headcount Ratio Rural         

In [47]:
snake_case = lambda colname: colname.lower().replace(' ','_')
mpi_info.rename(snake_case, axis=1, inplace=True)
mpi_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 0 to 983
Data columns (total 17 columns):
locationname                         984 non-null object
iso                                  984 non-null object
country                              984 non-null object
region                               984 non-null object
world_region                         984 non-null object
lat                                  892 non-null float64
lon                                  892 non-null float64
mpi_national                         984 non-null float64
mpi_regional                         984 non-null float64
headcount_ratio_regional             984 non-null float64
intensity_of_deprivation_regional    983 non-null float64
mpi_urban                            984 non-null float64
headcount_ratio_urban                984 non-null float64
intensity_of_deprivation_urban       984 non-null float64
mpi_rural                            984 non-null float64
headcount_ratio_rural         

I'll do the same to `merged_data` before joining the last 2 datasets

In [48]:
snake_case = lambda colname: colname.lower().replace(' ','_')
merged_data.rename(snake_case, axis=1, inplace=True)
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 1 to 671149
Data columns (total 35 columns):
id                    533461 non-null int64
funded_amount         533461 non-null float64
loan_amount           533461 non-null float64
activity              533461 non-null object
sector                533461 non-null object
use                   533456 non-null object
country_code          533453 non-null object
country               533461 non-null object
region                533461 non-null object
currency              533461 non-null object
partner_id            533461 non-null float64
posted_time           533461 non-null datetime64[ns]
disbursed_time        533461 non-null datetime64[ns]
funded_time           499549 non-null datetime64[ns]
term_in_months        533461 non-null float64
lender_count          533461 non-null int64
tags                  533461 non-null object
borrower_genders      533461 non-null object
repayment_interval    533461 non-null object
loan_the

In [66]:
final_data = merged_data.merge(
    mpi_info,
    how='left',
    left_on='mpi_region',
    right_on='locationname',
    suffixes=('', '_mi')
)
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 52 columns):
id                                   533461 non-null int64
funded_amount                        533461 non-null float64
loan_amount                          533461 non-null float64
activity                             533461 non-null object
sector                               533461 non-null object
use                                  533456 non-null object
country_code                         533453 non-null object
country                              533461 non-null object
region                               533461 non-null object
currency                             533461 non-null object
partner_id                           533461 non-null float64
posted_time                          533461 non-null datetime64[ns]
disbursed_time                       533461 non-null datetime64[ns]
funded_time                          499549 non-null datetime64[ns]
term_in_months         

Remove `locationname_mi`, since it was merged on

In [67]:
final_data.drop('locationname_mi', axis=1, inplace=True)
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 51 columns):
id                                   533461 non-null int64
funded_amount                        533461 non-null float64
loan_amount                          533461 non-null float64
activity                             533461 non-null object
sector                               533461 non-null object
use                                  533456 non-null object
country_code                         533453 non-null object
country                              533461 non-null object
region                               533461 non-null object
currency                             533461 non-null object
partner_id                           533461 non-null float64
posted_time                          533461 non-null datetime64[ns]
disbursed_time                       533461 non-null datetime64[ns]
funded_time                          499549 non-null datetime64[ns]
term_in_months         

In [68]:
(final_data['country'] != final_data['country_mi']).sum()

95476

A good amount of the data has different values for these features, let's take a closer look

In [69]:
temp = final_data[final_data['country'] != final_data['country_mi']]
temp.sample(10)

Unnamed: 0,id,funded_amount,loan_amount,activity,sector,use,country_code,country,region,currency,...,mpi_national,mpi_regional,headcount_ratio_regional,intensity_of_deprivation_regional,mpi_urban,headcount_ratio_urban,intensity_of_deprivation_urban,mpi_rural,headcount_ratio_rural,intensity_of_deprivation_rural
29327,698780,5075.0,5075.0,Knitting,Arts,to buy wool wholesale,BO,Bolivia,La Paz / El Alto,BOB,...,0.089,0.077,18.4,41.8,0.019,4.9,39.4,0.191,42.98,44.4
219527,964189,850.0,850.0,Services,Services,to open a car wash.,GE,Georgia,Khelvachauri,USD,...,,,,,,,,,,
488387,1282368,475.0,475.0,Dairy,Agriculture,to purchase cows and thus expand her milk busi...,IN,India,Khurda,INR,...,,,,,,,,,,
6044,662949,1775.0,1775.0,Farming,Agriculture,to install heating systems in her greenhouses ...,UA,Ukraine,Vinogradovo,UAH,...,,,,,,,,,,
460175,1250778,950.0,950.0,Used Clothing,Clothing,to buy a car to use for transporting items to ...,AL,Albania,Korce,ALL,...,,,,,,,,,,
163705,896290,1850.0,2700.0,Beauty Salon,Services,to buy a tattoo machine and hair dryer.,LB,Lebanon,Nabatieh,USD,...,,,,,,,,,,
57538,742892,3225.0,3225.0,Pharmacy,Health,"to buy boxes of modern medicine (antibiotics, ...",CD,The Democratic Republic of the Congo,"Goma, North Kivu province",USD,...,0.401,0.462,82.9,55.8,0.228,48.6,47.0,0.486,88.18,55.1
250274,1000142,1200.0,1200.0,Farming,Agriculture,to make some investments in the greenhouse he ...,AL,Albania,Lezhe,ALL,...,,,,,,,,,,
273207,1026196,550.0,550.0,Higher education costs,Education,to pay the university fees.,PY,Paraguay,Paraguari,PYG,...,,,,,,,,,,
475052,1266964,925.0,925.0,Higher education costs,Education,to pay her tuition fees.,PY,Paraguay,Villarrica,PYG,...,,,,,,,,,,


Since country_mi is NaN often, let's only check when that isn't true

In [70]:
temp = temp[pd.notnull(temp['country_mi'])][['country','country_mi']]
temp.drop_duplicates()

Unnamed: 0,country,country_mi
35,Tanzania,"Tanzania, United Republic of"
192,Bolivia,"Bolivia, Plurinational State of"
458,Vietnam,Viet Nam
1665,The Democratic Republic of the Congo,"Congo, Democratic Republic of the"
59866,Myanmar (Burma),Myanmar


All of the other cases just have alternate spellings, so dropping `country_mi` should cause no issues

In [71]:
final_data.drop('country_mi', axis=1, inplace=True)
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 50 columns):
id                                   533461 non-null int64
funded_amount                        533461 non-null float64
loan_amount                          533461 non-null float64
activity                             533461 non-null object
sector                               533461 non-null object
use                                  533456 non-null object
country_code                         533453 non-null object
country                              533461 non-null object
region                               533461 non-null object
currency                             533461 non-null object
partner_id                           533461 non-null float64
posted_time                          533461 non-null datetime64[ns]
disbursed_time                       533461 non-null datetime64[ns]
funded_time                          499549 non-null datetime64[ns]
term_in_months         

In [72]:
(final_data['mpi_region'] != final_data['region_mi']).sum()

533461

I have a strong feeling that `region_mi` is a substring of `mpi_region`, so let's see how often that is the case

In [73]:
vecin = np.vectorize(str.__contains__)
temp = final_data[['mpi_region', 'region_mi']].dropna()
(~vecin(temp['mpi_region'], temp['region_mi'])).sum()

0

This means that `region_mi` is always in `mpi_region`, so we can remove `region_mi` because it contains less info, right? Take a look at the data however

In [74]:
temp.sample(10)

Unnamed: 0,mpi_region,region_mi
5121,"Solola, Guatemala",Solola
109711,"Islamabad (ICT), Pakistan",Islamabad (ICT)
88691,"Central Visayas, Philippines",Central Visayas
237300,"Central Visayas, Philippines",Central Visayas
335666,"Nyanza, Kenya",Nyanza
425492,"Galapagos Island, Ecuador",Galapagos Island
178488,"Central, Jordan",Central
260973,"Eastern Visayas, Philippines",Eastern Visayas
488679,"Western, Kenya",Western
166163,"Khatlon, Tajikistan",Khatlon


If I add the country to this data, it should be even more clear

In [75]:
temp = final_data[['mpi_region', 'region_mi', 'country']].dropna()
temp.sample(10)

Unnamed: 0,mpi_region,region_mi,country
424680,"Western Visayas, Philippines",Western Visayas,Philippines
206501,"Punjab, Pakistan",Punjab,Pakistan
523802,"Khatlon, Tajikistan",Khatlon,Tajikistan
512622,"Eastern, Kenya",Eastern,Kenya
24375,"Punjab, Pakistan",Punjab,Pakistan
10815,"Usulutan, El Salvador",Usulutan,El Salvador
450432,"Western Visayas, Philippines",Western Visayas,Philippines
205041,"Calabarzon, Philippines",Calabarzon,Philippines
394648,"Northern Mindanao, Philippines",Northern Mindanao,Philippines
206678,"Punjab, Pakistan",Punjab,Pakistan


So it is better to keep `region_mi` over `mpi_region`, however I will change the names so it makes a little more sense. Later I realized that `region_mi` had some NaNs, so instead of just renaming, if `region_mi` is NaN, it takes the value from `mpi_region`

In [76]:
final_data['mpi_region'] = final_data['region_mi'].fillna(final_data['mpi_region'])
final_data.drop('region_mi', axis=1, inplace=True)
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 49 columns):
id                                   533461 non-null int64
funded_amount                        533461 non-null float64
loan_amount                          533461 non-null float64
activity                             533461 non-null object
sector                               533461 non-null object
use                                  533456 non-null object
country_code                         533453 non-null object
country                              533461 non-null object
region                               533461 non-null object
currency                             533461 non-null object
partner_id                           533461 non-null float64
posted_time                          533461 non-null datetime64[ns]
disbursed_time                       533461 non-null datetime64[ns]
funded_time                          499549 non-null datetime64[ns]
term_in_months         

In [77]:
final_data[['lat_mi', 'lon_mi', 'mpi_geo']].sample(10)

Unnamed: 0,lat_mi,lon_mi,mpi_geo
459912,10.932152,104.798771,"(10.9321519, 104.798771)"
111987,9.843207,118.736478,"(9.843206499999999, 118.7364783)"
150820,37.911356,69.097023,"(37.9113562, 69.097023)"
152282,-0.115003,34.851379,"(-0.115003, 34.851379)"
73966,-17.485103,29.788925,"(-17.4851029, 29.7889248)"
69950,0.258052,30.52791,"(0.2580521, 30.5279096)"
363763,,,
134647,-0.019006,37.648081,"(-0.0190061, 37.6480812)"
315618,-25.891968,32.605135,"(-25.891968, 32.6051351)"
215056,10.296856,123.888677,"(10.2968562, 123.8886774)"


As you can see here, `lat_mi` and `lon_mi` is just the decomposed information from `mpi_geo`, so we can drop `mpi_geo`. I will also rename the other features to `mpi_region_lat` and `mpi_region_lon` to make it a little more clear

In [78]:
final_data['mpi_region_lat'] = final_data['lat_mi']
final_data['mpi_region_lon'] = final_data['lon_mi']
final_data.drop(['mpi_geo', 'lat_mi', 'lon_mi'], axis=1, inplace=True)
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 48 columns):
id                                   533461 non-null int64
funded_amount                        533461 non-null float64
loan_amount                          533461 non-null float64
activity                             533461 non-null object
sector                               533461 non-null object
use                                  533456 non-null object
country_code                         533453 non-null object
country                              533461 non-null object
region                               533461 non-null object
currency                             533461 non-null object
partner_id                           533461 non-null float64
posted_time                          533461 non-null datetime64[ns]
disbursed_time                       533461 non-null datetime64[ns]
funded_time                          499549 non-null datetime64[ns]
term_in_months         

# TODO figure out what to do with data that has no MPI (just drop or try to salvage)

## This entire project takes around 3-4 minutes to run, so it's pretty efficient

In [79]:
def ior(i):
    i = iter(i)
    try:
        s = next(i)
    except StopIteration:
        return None
    for e in i:
        s = s | e
    return s

all_tags = ior(final_data['tags'].apply(set))
all_tags

{'#Animals',
 '#Biz Durable Asset',
 '#Eco-friendly',
 '#Elderly',
 '#Fabrics',
 '#Female Education',
 '#First Loan',
 '#Health and Sanitation',
 '#Hidden Gem',
 '#Inspiring Story',
 '#Interesting Photo',
 '#Job Creator',
 '#Low-profit FP',
 '#Orphan',
 '#Parent',
 '#Post-disbursed',
 '#Refugee',
 '#Repair Renew Replace',
 '#Repeat Borrower',
 '#Schooling',
 '#Single',
 '#Single Parent',
 '#Supporting Family',
 '#Sustainable Ag',
 '#Technology',
 '#Tourism',
 '#Trees',
 '#Unique',
 '#Vegan',
 '#Widowed',
 '#Woman Owned Biz',
 'user_favorite',
 'user_like',
 'volunteer_like',
 'volunteer_pick'}

In [82]:
vecin = np.vectorize(list.__contains__)

for tag in all_tags:
    final_data['tag_' + tag] = vecin(final_data['tags'], tag).astype(int)

In [83]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 83 columns):
id                                   533461 non-null int64
funded_amount                        533461 non-null float64
loan_amount                          533461 non-null float64
activity                             533461 non-null object
sector                               533461 non-null object
use                                  533456 non-null object
country_code                         533453 non-null object
country                              533461 non-null object
region                               533461 non-null object
currency                             533461 non-null object
partner_id                           533461 non-null float64
posted_time                          533461 non-null datetime64[ns]
disbursed_time                       533461 non-null datetime64[ns]
funded_time                          499549 non-null datetime64[ns]
term_in_months         

In [86]:
veclen = np.vectorize(list.__len__)

final_data['num_tags'] = veclen(final_data['tags'])
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 84 columns):
id                                   533461 non-null int64
funded_amount                        533461 non-null float64
loan_amount                          533461 non-null float64
activity                             533461 non-null object
sector                               533461 non-null object
use                                  533456 non-null object
country_code                         533453 non-null object
country                              533461 non-null object
region                               533461 non-null object
currency                             533461 non-null object
partner_id                           533461 non-null float64
posted_time                          533461 non-null datetime64[ns]
disbursed_time                       533461 non-null datetime64[ns]
funded_time                          499549 non-null datetime64[ns]
term_in_months         

In [87]:
final_data.drop('tags', axis=1, inplace=True)
final_data[[key for key in final_data.keys() if 'tag' in key]].sample(10)

Unnamed: 0,tag_#Refugee,tag_volunteer_pick,tag_#Female Education,tag_user_favorite,tag_#Eco-friendly,tag_#Sustainable Ag,tag_#Tourism,tag_#Orphan,tag_#Hidden Gem,tag_#First Loan,...,tag_#Elderly,tag_#Schooling,tag_#Supporting Family,tag_#Interesting Photo,tag_#Vegan,tag_#Biz Durable Asset,tag_#Post-disbursed,tag_#Single,tag_#Animals,num_tags
510553,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,3
27832,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66794,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
51997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193475,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,5
28612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
520481,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
95552,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,2
248000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
396696,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [88]:
final_data['num_male_borrowers'] = final_data['borrower_genders'].apply(lambda x: x.count('male'))
final_data['num_female_borrowers'] = veclen(final_data['borrower_genders']) - final_data['num_male_borrowers']
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 85 columns):
id                                   533461 non-null int64
funded_amount                        533461 non-null float64
loan_amount                          533461 non-null float64
activity                             533461 non-null object
sector                               533461 non-null object
use                                  533456 non-null object
country_code                         533453 non-null object
country                              533461 non-null object
region                               533461 non-null object
currency                             533461 non-null object
partner_id                           533461 non-null float64
posted_time                          533461 non-null datetime64[ns]
disbursed_time                       533461 non-null datetime64[ns]
funded_time                          499549 non-null datetime64[ns]
term_in_months         

In [89]:
final_data['time_till_funded'] = final_data['funded_time'] - final_data['posted_time']
final_data['hours_till_funded'] = final_data['time_till_funded'].astype('timedelta64[h]')
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533461 entries, 0 to 533460
Data columns (total 87 columns):
id                                   533461 non-null int64
funded_amount                        533461 non-null float64
loan_amount                          533461 non-null float64
activity                             533461 non-null object
sector                               533461 non-null object
use                                  533456 non-null object
country_code                         533453 non-null object
country                              533461 non-null object
region                               533461 non-null object
currency                             533461 non-null object
partner_id                           533461 non-null float64
posted_time                          533461 non-null datetime64[ns]
disbursed_time                       533461 non-null datetime64[ns]
funded_time                          499549 non-null datetime64[ns]
term_in_months         