In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib as plt
import numpy as np

pd.set_option('display.float_format', lambda x: '%.5f' % x)

### Point 1: 
Normalize the loan_lenders table. In the normalized table, each row must have one loan_id and one lender.

In [2]:
loan_lenders = pd.read_csv('data/loans_lenders.csv')

In [3]:
loan_lenders.head()

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."


In [4]:
def normalize(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.
    
    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as it's own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df

In [5]:
%%time

norm_loan_lenders = normalize(loan_lenders, 'lenders', ',')

CPU times: user 15.6 s, sys: 1.45 s, total: 17.1 s
Wall time: 17.4 s


In [6]:
norm_loan_lenders.head()

Unnamed: 0,loan_id,lenders
0,483693,muc888
0,483693,sam4326
0,483693,camaran3922
0,483693,lachheb1865
0,483693,rebecca3499


### Point 2: 
For each loan, add a column duration corresponding to the number of days between the disburse time and the planned expiration time. If any of those two dates is missing, also the duration must be missing.

In [7]:
%%time

loans = pd.read_csv('data/loans.csv')[['loan_id', 'country_code', 'loan_amount', 'num_lenders_total', 'disburse_time', 'planned_expiration_time']]

CPU times: user 28.8 s, sys: 2.53 s, total: 31.3 s
Wall time: 32.1 s


a lot of nan's!

In [8]:
print(len(loans.planned_expiration_time[loans.planned_expiration_time.isnull()]),
len(loans.disburse_time[loans.disburse_time.isnull()]))

371834 2813


let's convert to datetime all not nan values of disburse_time and planned_expiration_time;
#### note:
nan values in a pandas series are represented as float

In [9]:
formatter = '%Y-%m-%d %H:%M:%S.%f +0000'

In [10]:
%%time 

loans['disburse_time'] =  pd.to_datetime(loans['disburse_time'], format=formatter)
loans['planned_expiration_time'] =  pd.to_datetime(loans['planned_expiration_time'], format=formatter)

CPU times: user 4.76 s, sys: 84.1 ms, total: 4.85 s
Wall time: 4.91 s


In [11]:
%%time

loans['duration'] = loans.planned_expiration_time - loans.disburse_time

CPU times: user 24.4 ms, sys: 8.93 ms, total: 33.3 ms
Wall time: 33.4 ms


In [12]:
loans.duration

0         53 days 19:30:06
1         96 days 14:25:07
2         37 days 13:10:05
3         34 days 19:10:02
4         57 days 22:10:02
                ...       
1419602   39 days 17:00:03
1419603   39 days 08:40:07
1419604   51 days 14:20:04
1419605   63 days 00:50:02
1419606   61 days 12:50:06
Name: duration, Length: 1419607, dtype: timedelta64[ns]

There are 14935 loans which duration is negative: it means that the deadline was not satisfied

In [13]:
loans.duration[loans.duration < pd.Timedelta(0)]

63        -31 days +21:40:03
96        -25 days +06:30:11
177       -25 days +21:00:03
207       -33 days +06:30:08
217       -31 days +17:40:03
                 ...        
1419303   -35 days +21:30:03
1419304   -32 days +17:50:04
1419374   -34 days +21:40:03
1419443   -30 days +06:00:03
1419581   -30 days +16:50:04
Name: duration, Length: 14935, dtype: timedelta64[ns]

### Point 3: 
Find the lenders that have funded at least twice

In [14]:
%%time

loans_by_lenders = norm_loan_lenders.groupby('lenders')['loan_id'].count().reset_index(name='fund_count')

CPU times: user 14.2 s, sys: 560 ms, total: 14.8 s
Wall time: 14.9 s


In [15]:
loans_by_lenders[loans_by_lenders.fund_count > 1].head(15)

Unnamed: 0,lenders,fund_count
0,000,39
1,00000,39
2,0002,70
4,0101craign0101,71
5,0132575,4
6,0154884,4
7,0161130,2
8,0169713,3
9,0185429,2
11,0206338,2


### Point 4: 
For each country, compute how many loans have involved that country as borrowers

### Point 5: 
For each country, compute the overall amount of money borrowed

### Point 6: 
Like the previous point, but expressed as a percentage of the overall amount lent

In [16]:
def borrow_summary(dimension: str) -> pd.DataFrame:
    grouped = loans.groupby(dimension)
    borrow_summary = grouped['loan_id'].count().reset_index(name='borrow_count')
    borrow_summary['overall_amount_borrowed'] = grouped['loan_amount'].sum().reset_index()['loan_amount']
    overall_amount_lent = loans['loan_amount'].sum()
    borrow_summary['overall_amount_borrowed/total'] = (borrow_summary['overall_amount_borrowed'] / overall_amount_lent) * 100
    return borrow_summary

In [17]:
borrow_by_country = borrow_summary('country_code')

In [18]:
borrow_by_country.head(15)

Unnamed: 0,country_code,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
0,AF,2337,1967950.0,0.16657
1,AL,3075,4307350.0,0.36459
2,AM,13952,22950475.0,1.94259
3,AZ,10172,14784625.0,1.25141
4,BA,608,477250.0,0.0404
5,BF,3489,4085200.0,0.34578
6,BG,296,375300.0,0.03177
7,BI,1727,5233450.0,0.44297
8,BJ,5946,3865825.0,0.32721
9,BO,25250,44226725.0,3.74347


In [19]:
borrow_by_country.describe()

Unnamed: 0,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
count,95.0,95.0,95.0
mean,14943.13684,12435766.05263,1.0526
std,35149.88448,18263180.36163,1.54584
min,1.0,5000.0,0.00042
25%,545.0,752525.0,0.0637
50%,4681.0,4181100.0,0.3539
75%,15117.0,16728425.0,1.41594
max,285336.0,97984600.0,8.29368


### Point 7: 
Like the three previous points, but split for each year (with respect to disburse_time)

In [20]:
borrow_by_year = borrow_summary(loans['disburse_time'].dt.year)

In [21]:
borrow_by_year

Unnamed: 0,disburse_time,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
0,2005.0,203,102850.0,0.00871
1,2006.0,2172,1376575.0,0.11652
2,2007.0,24400,15446525.0,1.30744
3,2008.0,54586,39423050.0,3.33687
4,2009.0,83076,59689475.0,5.05228
5,2010.0,93466,72609150.0,6.14583
6,2011.0,114540,93699300.0,7.93096
7,2012.0,133650,119977575.0,10.15522
8,2013.0,140167,132043925.0,11.17655
9,2014.0,172709,152270425.0,12.88857


### Point 8: 
For each lender, compute the overall amount of money lent. For each loan that has more than one lender, you must assume that all lenders contributed the same amount.


In [22]:
loans['base_contribution'] = loans.loan_amount / loans.num_lenders_total

In [23]:
ids = set(loans.loan_id) - set(loan_lenders.loan_id)
ids2 = set(loan_lenders.loan_id) - set(loans.loan_id)


In [24]:
loans_cleaned = loans[(loans.num_lenders_total > 0) & (~loans.loan_id.isin(ids))] ### verify maybe useless

In [25]:
norm_cleaned = norm_loan_lenders[~norm_loan_lenders.loan_id.isin(ids2)] ### verify maybe useless

In [26]:
%%time

joined = pd.concat([norm_cleaned.set_index('loan_id'), loans_cleaned.set_index('loan_id')], axis=1, join='inner')

CPU times: user 4.25 s, sys: 1.83 s, total: 6.07 s
Wall time: 6.42 s


In [27]:
overall_amount_per_lender = joined.groupby('lenders')['base_contribution'].sum().reset_index(name='overall_amount_lent')

In [28]:
overall_amount_per_lender.head(15)

Unnamed: 0,lenders,overall_amount_lent
0,000,1485.30966
1,00000,1249.94736
2,0002,2201.18046
3,00mike00,38.46154
4,0101craign0101,2424.08893
5,0132575,113.5877
6,0154884,122.07768
7,0161130,52.73973
8,0169713,90.68987
9,0185429,52.86378


In [29]:
overall_amount_per_lender.tail(15)

Unnamed: 0,lenders,overall_amount_lent
1639011,zvi,370.57695
1639012,zvi1263,28.84615
1639013,zvika5974,35.71429
1639014,zvonimir7460,34.375
1639015,zx147,177.17835
1639016,zyra9641,26.88679
1639017,zyrah8525,166.69643
1639018,zyrorl,92.59959
1639019,zzaba,25.0
1639020,zzaman,642.25094


In [30]:
overall_amount_per_lender.describe()

Unnamed: 0,overall_amount_lent
count,1639026.0
mean,637.10046
std,12587.00054
min,5.43478
25%,33.48962
50%,88.1722
75%,289.14772
max,6296800.48566


### Point 9: 
For each country, compute the difference between the overall amount of money lent and the overall amount of money borrowed. Since the country of the lender is often unknown, you can assume that the true distribution among the countries is the same as the one computed from the rows where the country is known.

In [31]:
borrow_by_country.head()

Unnamed: 0,country_code,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
0,AF,2337,1967950.0,0.16657
1,AL,3075,4307350.0,0.36459
2,AM,13952,22950475.0,1.94259
3,AZ,10172,14784625.0,1.25141
4,BA,608,477250.0,0.0404


In [32]:
lenders = pd.read_csv('data/lenders.csv')

In [33]:
sum(lenders['country_code'].isna()) ### lot of nan's. we must assume 

1458635

In [34]:
lenders['country_code'].dropna().value_counts()

US    591612
CA     67970
GB     38380
AU     37103
DE     16007
       ...  
KM         1
NU         1
MS         1
TF         1
GW         1
Name: country_code, Length: 234, dtype: int64

In [35]:
lenders['country_code'].dropna().describe()

count     890539
unique       234
top           US
freq      591612
Name: country_code, dtype: object

In [36]:
s = lenders.country_code.value_counts(normalize=True)

In [37]:
missing = lenders['country_code'].isnull()
lenders.loc[missing,'country_code'] = np.random.choice(s.index, size=len(lenders[missing]),p=s.values)

In [38]:
overall_amount_per_lender.lenders = overall_amount_per_lender.lenders.str.strip()

In [39]:
%%time

df4 = pd.concat([overall_amount_per_lender.set_index('lenders'), lenders.set_index('permanent_name')], axis=1, join='inner')

CPU times: user 9.1 s, sys: 402 ms, total: 9.5 s
Wall time: 9.61 s


In [40]:
lend_by_country = df4.groupby('country_code')['overall_amount_lent'].sum().reset_index(name='overall_amount_lent')

In [41]:
lend_by_country.head()

Unnamed: 0,country_code,overall_amount_lent
0,AD,6143.98773
1,AE,1820017.66116
2,AF,150896.02207
3,AG,1084.30342
4,AI,1678.48323


In [42]:
len(loans['country_code'].unique())

96

In [43]:
len(lenders['country_code'].unique())

234

In [44]:
overall_in_out = pd.concat([borrow_by_country.set_index('country_code'), lend_by_country.set_index('country_code')], axis = 1, join='inner')

In [45]:
overall_in_out['lent - borrow'] = overall_in_out['overall_amount_lent'] - overall_in_out['overall_amount_borrowed']

In [46]:
overall_in_out.sort_values('lent - borrow', ascending=False).head(20)

Unnamed: 0_level_0,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total,overall_amount_lent,lent - borrow
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
US,9180,46352000.0,3.92336,639902073.97117,593550073.97117
CA,1,50000.0,0.00423,86194321.66328,86144321.66328
BR,482,1192325.0,0.10092,2210492.80729,1018167.80729
CN,134,380525.0,0.03221,1214252.37336,833727.37336
TH,247,608925.0,0.05154,1307955.98905,699030.98905
VU,4,9250.0,0.00078,296884.26029,287634.26029
UY,1,8000.0,0.00068,102445.12745,94445.12745
VI,2,10000.0,0.00085,20862.87531,10862.87531
GU,4,17300.0,0.00146,26433.72682,9133.72682
BW,1,8000.0,0.00068,9876.96392,1876.96392


### Point 10: 
Which country has the highest ratio between the difference computed at the previous point and the population?

In [47]:
country_stats = pd.read_csv('data/country_stats.csv')

In [48]:
cols = list(overall_in_out.columns) + ['population', 'population_below_poverty_line']
print(cols)

['borrow_count', 'overall_amount_borrowed', 'overall_amount_borrowed/total', 'overall_amount_lent', 'lent - borrow', 'population', 'population_below_poverty_line']


In [49]:
overall_in_out = pd.concat([overall_in_out, country_stats.set_index('country_code')], axis=1, join='inner')[cols]

In [50]:
overall_in_out.head()

Unnamed: 0_level_0,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total,overall_amount_lent,lent - borrow,population,population_below_poverty_line
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AF,2337,1967950.0,0.16657,150896.02207,-1817053.97793,35530081,35.8
AL,3075,4307350.0,0.36459,20123.01496,-4287226.98504,2930187,14.3
AM,13952,22950475.0,1.94259,36853.99399,-22913621.00601,2930450,32.0
AZ,10172,14784625.0,1.25141,11704.08335,-14772920.91665,9827589,4.9
BA,608,477250.0,0.0404,70191.96861,-407058.03139,3507017,17.2


In [51]:
overall_in_out['lent - borrow / pop'] = overall_in_out['lent - borrow'] / overall_in_out['population']

In [52]:
overall_in_out.sort_values('lent - borrow / pop', ascending = False).iloc[0]

borrow_count                           1.00000
overall_amount_borrowed            50000.00000
overall_amount_borrowed/total          0.00423
overall_amount_lent             86194321.66328
lent - borrow                   86144321.66328
population                      36624199.00000
population_below_poverty_line          9.40000
lent - borrow / pop                    2.35211
Name: CA, dtype: float64

### Point 11: 
Which country has the highest ratio between the difference computed at point 9 and the population that is not below the poverty line?

In [53]:
overall_in_out['lent - borrow / pop_over_poverty_line'] = overall_in_out['lent - borrow'] / (overall_in_out['population'] * (100 - overall_in_out['population_below_poverty_line']))

In [54]:
overall_in_out.sort_values('lent - borrow / pop_over_poverty_line', ascending = False).iloc[0]

borrow_count                                   1.00000
overall_amount_borrowed                    50000.00000
overall_amount_borrowed/total                  0.00423
overall_amount_lent                     86194321.66328
lent - borrow                           86144321.66328
population                              36624199.00000
population_below_poverty_line                  9.40000
lent - borrow / pop                            2.35211
lent - borrow / pop_over_poverty_line          0.02596
Name: CA, dtype: float64

### Point 12
For each year, compute the total amount of loans. Each loan that has planned expiration time and disburse time in different years must have its amount distributed proportionally to the number of days in each year. For example, a loan with disburse time December 1st, 2016, planned expiration time January 30th 2018, and amount 5000USD has an amount of 5000USD * 31 / (31+365+30) = 363.85 for 2016, 5000USD * 365 / (31+365+30) = 4284.04 for 2017, and 5000USD * 30 / (31+365+30) = 352.11 for 2018

In [55]:
loans.head()

Unnamed: 0,loan_id,country_code,loan_amount,num_lenders_total,disburse_time,planned_expiration_time,duration,base_contribution
0,657307,PH,125.0,3,2013-12-22 08:00:00,2014-02-14 03:30:06,53 days 19:30:06,41.66667
1,657259,HN,400.0,11,2013-12-20 08:00:00,2014-03-26 22:25:07,96 days 14:25:07,36.36364
2,658010,PK,400.0,16,2014-01-09 08:00:00,2014-02-15 21:10:05,37 days 13:10:05,25.0
3,659347,KG,625.0,21,2014-01-17 08:00:00,2014-02-21 03:10:02,34 days 19:10:02,29.7619
4,656933,PH,425.0,15,2013-12-17 08:00:00,2014-02-13 06:10:02,57 days 22:10:02,28.33333


In [56]:
loans = loans.dropna(subset=['planned_expiration_time'], how='all')
loans = loans.dropna(subset=['disburse_time'], how='all')

In [57]:
grouped = loans.groupby(loans['disburse_time'].dt.year)[['disburse_time', 'planned_expiration_time', 'loan_amount']]

In [58]:
new_df = pd.DataFrame(columns=['loan_id', 'year', 'amount'])

In [122]:
%%time

def compute_amounts(start_year, row):
    amounts = []
    duration = row.duration.days
    disburse = row.disburse_time
    planned = row.planned_expiration_time
    last_day_of_start_year = datetime.date(year=start_year, month=12, day=31)
    first_day_of_planned_exp_year = datetime.date(year=planned.year, month=1, day=1)
    first_year_duration = (last_day_of_start_year - disburse.date()).days
    last_year_duration = (planned.date() - first_day_of_planned_exp_year).days
    
    first_year_amount = row.loan_amount * first_year_duration / duration
    last_year_amount = row.loan_amount * last_year_duration / duration
    
    amounts.extend([(start_year, first_year_amount), (planned.year, last_year_amount)])
    
    remaining_days = duration - first_year_duration - last_year_duration
    i = 1
    year = start_year
    while (remaining_days > 365):
        year = start_year + i
        amount = row.loan_amount * 365 / duration
        amounts.append((year, amount))
        i = i + 1
        remaining_days = remaining_days - 365
    
    last_amount = row.loan_amount * remaining_days / duration
    amounts.append((year + 1, last_amount))
    return amounts

new_df = pd.DataFrame(columns=['loan_id', 'year', 'amount'])
ids = []
years = []
amounts = []
for index, row in loans.iterrows():
    start_year = row.disburse_time.year
    years_tot = row.planned_expiration_time.year - start_year
    if (years_tot > 1):
        amts = compute_amounts(start_year, row)
        for year, amount in amts:
            ids.append(row.loan_id)
            years.append(year)
            amounts.append(amount)
    else:
        ids.append(row.loan_id)
        years.append(start_year)
        amounts.append(row.loan_amount)
        
new_df['loan_id'] = ids
new_df['year'] = years
new_df['amount'] = amounts

CPU times: user 2min 33s, sys: 1.03 s, total: 2min 34s
Wall time: 2min 39s


In [86]:
datetime.date(year=2014, month=12, day=31) - datetime.date(2014,1,9)

datetime.timedelta(days=356)

In [67]:
df = np.where(loans.planned_expiration_time.dt.year - loans.disburse_time.dt.year > 1, 'si', 'no')

Unnamed: 0,loan_id,year,amount
0,657307,2013,125.00000
1,657259,2013,400.00000
2,658010,2014,400.00000
3,659347,2014,625.00000
4,656933,2013,425.00000
...,...,...,...
95,961916,2015,175.00000
96,962191,2015,275.00000
97,962630,2015,175.00000
98,962668,2015,575.00000


In [125]:
new_df['loan_id'].value_counts()

1077955    7
1077941    7
1077943    7
1077936    7
1077940    7
          ..
890648     1
913175     1
911126     1
917269     1
379065     1
Name: loan_id, Length: 1044962, dtype: int64

In [133]:
max(new_df['loan_id'])

1444085

In [136]:
new_df[new_df.loan_id == 1077955]

Unnamed: 0,loan_id,year,amount
421742,1077955,2011,24.25713
421743,1077955,2016,545.78532
421744,1077955,2012,1106.73135
421745,1077955,2013,1106.73135
421746,1077955,2014,1106.73135
421747,1077955,2015,1106.73135
421748,1077955,2016,3.03214
