In [2]:
import pandas as pd
import numpy as np
import datetime
import matplotlib as plt
import numpy as np

pd.set_option('display.float_format', lambda x: '%.5f' % x)

### Point 1: 
Normalize the loan_lenders table. In the normalized table, each row must have one loan_id and one lender.

In [3]:
loan_lenders = pd.read_csv('data/loans_lenders.csv')

In [4]:
loan_lenders.head()

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."


In [5]:
def normalize(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.
    
    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as it's own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df

In [6]:
%%time

norm_loan_lenders = normalize(loan_lenders, 'lenders', ',')

CPU times: user 12.4 s, sys: 715 ms, total: 13.2 s
Wall time: 13.1 s


In [7]:
norm_loan_lenders.head()

Unnamed: 0,loan_id,lenders
0,483693,muc888
0,483693,sam4326
0,483693,camaran3922
0,483693,lachheb1865
0,483693,rebecca3499


### Point 2: 
For each loan, add a column duration corresponding to the number of days between the disburse time and the planned expiration time. If any of those two dates is missing, also the duration must be missing.

In [8]:
%%time

loans = pd.read_csv('data/loans.csv')[['loan_id', 'country_code', 'loan_amount', 'num_lenders_total', 'disburse_time', 'planned_expiration_time']]

CPU times: user 20.2 s, sys: 1.82 s, total: 22.1 s
Wall time: 24.2 s


a lot of nan's!

In [9]:
print(len(loans.planned_expiration_time[loans.planned_expiration_time.isnull()]),
len(loans.disburse_time[loans.disburse_time.isnull()]))

371834 2813


let's convert to datetime all not nan values of disburse_time and planned_expiration_time;
#### note:
nan values in a pandas series are represented as float

In [10]:
formatter = '%Y-%m-%d %H:%M:%S.%f +0000'

In [11]:
%%time 

# TODO: find a more efficient way to do that, if any..

loans['disburse_time'] = loans['disburse_time'].apply(lambda str_date: datetime.datetime.strptime(str_date, formatter) if not (isinstance(str_date, float) and np.isnan(str_date)) else str_date)
loans['planned_expiration_time'] = loans['planned_expiration_time'].apply(lambda str_date: datetime.datetime.strptime(str_date, formatter) if not (isinstance(str_date, float) and np.isnan(str_date)) else str_date)

CPU times: user 34 s, sys: 0 ns, total: 34 s
Wall time: 34 s


In [12]:
%%time

loans['duration'] = loans.planned_expiration_time - loans.disburse_time

CPU times: user 34.8 ms, sys: 0 ns, total: 34.8 ms
Wall time: 36.4 ms


In [13]:
loans.duration

0         53 days 19:30:06
1         96 days 14:25:07
2         37 days 13:10:05
3         34 days 19:10:02
4         57 days 22:10:02
                ...       
1419602   39 days 17:00:03
1419603   39 days 08:40:07
1419604   51 days 14:20:04
1419605   63 days 00:50:02
1419606   61 days 12:50:06
Name: duration, Length: 1419607, dtype: timedelta64[ns]

There are 14935 loans which duration is negative: it means that the deadline was not satisfied

In [14]:
loans.duration[loans.duration < pd.Timedelta(0)]

63        -31 days +21:40:03
96        -25 days +06:30:11
177       -25 days +21:00:03
207       -33 days +06:30:08
217       -31 days +17:40:03
                 ...        
1419303   -35 days +21:30:03
1419304   -32 days +17:50:04
1419374   -34 days +21:40:03
1419443   -30 days +06:00:03
1419581   -30 days +16:50:04
Name: duration, Length: 14935, dtype: timedelta64[ns]

### Point 3: Find the lenders that have funded at least twice

In [15]:
%%time

loans_by_lenders = norm_loan_lenders.groupby('lenders')['loan_id'].count().reset_index(name='fund_count')

CPU times: user 9.42 s, sys: 0 ns, total: 9.42 s
Wall time: 9.41 s


In [16]:
loans_by_lenders[loans_by_lenders.fund_count > 1].head(15)

Unnamed: 0,lenders,fund_count
0,000,39
1,00000,39
2,0002,70
4,0101craign0101,71
5,0132575,4
6,0154884,4
7,0161130,2
8,0169713,3
9,0185429,2
11,0206338,2


### Point 4: For each country, compute how many loans have involved that country as borrowers

### Point 5: For each country, compute the overall amount of money borrowed

### Point 6: Like the previous point, but expressed as a percentage of the overall amount lent

In [17]:
def borrow_summary(dimension: str) -> pd.DataFrame:
    grouped = loans.groupby(dimension)
    borrow_summary = grouped['loan_id'].count().reset_index(name='borrow_count')
    borrow_summary['overall_amount_borrowed'] = grouped['loan_amount'].sum().reset_index()['loan_amount']
    overall_amount_lent = loans['loan_amount'].sum()
    borrow_summary['overall_amount_borrowed/total'] = (borrow_summary['overall_amount_borrowed'] / overall_amount_lent) * 100
    return borrow_summary

In [18]:
borrow_by_country = borrow_summary('country_code')

In [19]:
borrow_by_country.head(15)

Unnamed: 0,country_code,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
0,AF,2337,1967950.0,0.16657
1,AL,3075,4307350.0,0.36459
2,AM,13952,22950475.0,1.94259
3,AZ,10172,14784625.0,1.25141
4,BA,608,477250.0,0.0404
5,BF,3489,4085200.0,0.34578
6,BG,296,375300.0,0.03177
7,BI,1727,5233450.0,0.44297
8,BJ,5946,3865825.0,0.32721
9,BO,25250,44226725.0,3.74347


In [20]:
borrow_by_country.describe()

Unnamed: 0,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
count,95.0,95.0,95.0
mean,14943.13684,12435766.05263,1.0526
std,35149.88448,18263180.36163,1.54584
min,1.0,5000.0,0.00042
25%,545.0,752525.0,0.0637
50%,4681.0,4181100.0,0.3539
75%,15117.0,16728425.0,1.41594
max,285336.0,97984600.0,8.29368


### Point 7: Like the three previous points, but split for each year (with respect to disburse_time)

In [21]:
%%time

loans['disburse_year'] = loans['disburse_time'].map(lambda t: t.year)

CPU times: user 6.41 s, sys: 0 ns, total: 6.41 s
Wall time: 6.41 s


In [22]:
borrow_by_year = borrow_summary('disburse_year')

In [23]:
borrow_by_year

Unnamed: 0,disburse_year,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
0,2005.0,203,102850.0,0.00871
1,2006.0,2172,1376575.0,0.11652
2,2007.0,24400,15446525.0,1.30744
3,2008.0,54586,39423050.0,3.33687
4,2009.0,83076,59689475.0,5.05228
5,2010.0,93466,72609150.0,6.14583
6,2011.0,114540,93699300.0,7.93096
7,2012.0,133650,119977575.0,10.15522
8,2013.0,140167,132043925.0,11.17655
9,2014.0,172709,152270425.0,12.88857


### Point 8: For each lender, compute the overall amount of money lent. For each loan that has more than one lender, you must assume that all lenders contributed the same amount.


In [24]:
loans['base_contribution'] = loans.loan_amount / loans.num_lenders_total

In [25]:
ids = set(loans.loan_id) - set(loan_lenders.loan_id)
ids2 = set(loan_lenders.loan_id) - set(loans.loan_id)


In [26]:
loans_cleaned = loans[(loans.num_lenders_total > 0) & (~loans.loan_id.isin(ids))] ### verify maybe useless

In [27]:
norm_cleaned = norm_loan_lenders[~norm_loan_lenders.loan_id.isin(ids2)] ### verify maybe useless

In [28]:
%%time

joined = pd.concat([norm_cleaned.set_index('loan_id'), loans_cleaned.set_index('loan_id')], axis=1, join='inner')

CPU times: user 3.97 s, sys: 1.22 s, total: 5.19 s
Wall time: 5.78 s


In [29]:
overall_amount_per_lender = joined.groupby('lenders')['base_contribution'].sum().reset_index(name='overall_amount_lent')

In [30]:
overall_amount_per_lender.head(15)

Unnamed: 0,lenders,overall_amount_lent
0,000,1485.30966
1,00000,1249.94736
2,0002,2201.18046
3,00mike00,38.46154
4,0101craign0101,2424.08893
5,0132575,113.5877
6,0154884,122.07768
7,0161130,52.73973
8,0169713,90.68987
9,0185429,52.86378


In [31]:
overall_amount_per_lender.tail(15)

Unnamed: 0,lenders,overall_amount_lent
1639011,zvi,370.57695
1639012,zvi1263,28.84615
1639013,zvika5974,35.71429
1639014,zvonimir7460,34.375
1639015,zx147,177.17835
1639016,zyra9641,26.88679
1639017,zyrah8525,166.69643
1639018,zyrorl,92.59959
1639019,zzaba,25.0
1639020,zzaman,642.25094


In [32]:
overall_amount_per_lender.describe()

Unnamed: 0,overall_amount_lent
count,1639026.0
mean,637.10046
std,12587.00054
min,5.43478
25%,33.48962
50%,88.1722
75%,289.14772
max,6296800.48566


### Point 9: For each country, compute the difference between the overall amount of money lent and the overall amount of money borrowed. Since the country of the lender is often unknown, you can assume that the true distribution among the countries is the same as the one computed from the rows where the country is known.

In [33]:
borrow_by_country.head()

Unnamed: 0,country_code,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
0,AF,2337,1967950.0,0.16657
1,AL,3075,4307350.0,0.36459
2,AM,13952,22950475.0,1.94259
3,AZ,10172,14784625.0,1.25141
4,BA,608,477250.0,0.0404


In [34]:
lenders = pd.read_csv('data/lenders.csv')

In [35]:
sum(lenders['country_code'].isna()) ### lot of nan's. we must assume 

1458635

In [36]:
lenders['country_code'].dropna().value_counts()

US    591612
CA     67970
GB     38380
AU     37103
DE     16007
       ...  
GW         1
NU         1
TF         1
KM         1
IO         1
Name: country_code, Length: 234, dtype: int64

In [37]:
lenders['country_code'].dropna().describe()

count     890539
unique       234
top           US
freq      591612
Name: country_code, dtype: object

In [38]:
s = lenders.country_code.value_counts(normalize=True)

In [39]:
missing = lenders['country_code'].isnull()
lenders.loc[missing,'country_code'] = np.random.choice(s.index, size=len(lenders[missing]),p=s.values)

In [40]:
overall_amount_per_lender.lenders = overall_amount_per_lender.lenders.apply(lambda x: x.strip())

In [41]:
%%time

df4 = pd.concat([overall_amount_per_lender.set_index('lenders'), lenders.set_index('permanent_name')], axis=1, join='inner')

CPU times: user 6.52 s, sys: 4.22 ms, total: 6.53 s
Wall time: 6.53 s


In [42]:
lend_by_country = df4.groupby('country_code')['overall_amount_lent'].sum().reset_index(name='overall_amount_lent')

In [43]:
lend_by_country.head()

Unnamed: 0,country_code,overall_amount_lent
0,AD,8252.71562
1,AE,1796963.76547
2,AF,143169.74471
3,AG,717.10917
4,AI,448.87161


In [44]:
len(loans['country_code'].unique())

96

In [45]:
len(lenders['country_code'].unique())

234

In [84]:
overall_in_out = pd.concat([borrow_by_country.set_index('country_code'), lend_by_country.set_index('country_code')], axis = 1, join='inner')

In [85]:
overall_in_out['lent - borrow'] = overall_in_out['overall_amount_lent'] - overall_in_out['overall_amount_borrowed']

In [49]:
overall_in_out.sort_values('lent - borrow', ascending=False).head(20)

Unnamed: 0_level_0,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total,overall_amount_lent,lent - borrow
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
US,9180,46352000.0,3.92336,634751207.02787,588399207.02787
CA,1,50000.0,0.00423,85293677.64499,85243677.64499
CN,134,380525.0,0.03221,4568912.10922,4188387.10922
BR,482,1192325.0,0.10092,1938226.74603,745901.74603
TH,247,608925.0,0.05154,1264282.4873,655357.4873
VU,4,9250.0,0.00078,294790.19707,285540.19707
UY,1,8000.0,0.00068,117565.66648,109565.66648
VI,2,10000.0,0.00085,22941.50977,12941.50977
GU,4,17300.0,0.00146,29887.49953,12587.49953
BT,2,20000.0,0.00169,22526.18799,2526.18799


### Point 10: Which country has the highest ratio between the difference computed at the previous point and the population?

In [86]:
country_stats = pd.read_csv('data/country_stats.csv')

In [87]:
cols = list(overall_in_out.columns) + ['population', 'population_below_poverty_line']
print(cols)

['borrow_count', 'overall_amount_borrowed', 'overall_amount_borrowed/total', 'overall_amount_lent', 'lent - borrow', 'population', 'population_below_poverty_line']


In [88]:
overall_in_out = pd.concat([overall_in_out, country_stats.set_index('country_code')], axis=1, join='inner')[cols]

In [89]:
overall_in_out.head()

Unnamed: 0_level_0,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total,overall_amount_lent,lent - borrow,population,population_below_poverty_line
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AF,2337,1967950.0,0.16657,143169.74471,-1824780.25529,35530081,35.8
AL,3075,4307350.0,0.36459,27751.69918,-4279598.30082,2930187,14.3
AM,13952,22950475.0,1.94259,32686.84922,-22917788.15078,2930450,32.0
AZ,10172,14784625.0,1.25141,15871.59624,-14768753.40376,9827589,4.9
BA,608,477250.0,0.0404,70344.15762,-406905.84238,3507017,17.2


In [90]:
overall_in_out['lent - borrow / pop'] = overall_in_out['lent - borrow'] / overall_in_out['population']

In [92]:
overall_in_out.sort_values('lent - borrow / pop', ascending = False)['lent - borrow / pop ratio'].head(20)

country_code
CA    2.32752
US    1.81348
UY    0.03170
TH    0.00949
BR    0.00356
BT    0.00313
CN    0.00297
BW    0.00001
LK   -0.00111
MR   -0.00325
TR   -0.00483
IN   -0.00508
PG   -0.00512
ZA   -0.00544
EG   -0.01217
CI   -0.01369
SO   -0.02084
BG   -0.02212
PR   -0.02217
NP   -0.02454
Name: lent - borrow / pop ratio, dtype: float64

### 10: Which country has the highest ratio between the difference computed at point 9 and the population that is not below the poverty line?

In [102]:
overall_in_out['lent - borrow / pop_over_poverty_line'] = overall_in_out['lent - borrow'] / (overall_in_out['population'] * (100 - overall_in_out['population_below_poverty_line']))

In [103]:
overall_in_out.sort_values('lent - borrow / pop_over_poverty_line', ascending = False)['lent - borrow / pop_over_poverty_line'].head(20)

country_code
CA    0.02569
US    0.02136
UY    0.00035
TH    0.00010
BR    0.00004
BT    0.00004
CN    0.00003
BW    0.00000
LK   -0.00001
MR   -0.00005
TR   -0.00006
IN   -0.00007
ZA   -0.00007
PG   -0.00008
EG   -0.00016
CI   -0.00025
BG   -0.00028
ID   -0.00031
NP   -0.00033
PA   -0.00064
Name: lent - borrow / pop_over_poverty_line, dtype: float64