In [238]:
import pandas as pd
import numpy as np
import datetime

pd.set_option('display.float_format', lambda x: '%.5f' % x)

### Point 1: 
Normalize the loan_lenders table. In the normalized table, each row must have one loan_id and one lender.

In [110]:
loan_lenders = pd.read_csv('data/loans_lenders.csv')

In [111]:
loan_lenders.head()

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."


In [234]:
def normalize(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.
    
    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as it's own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df

In [235]:
%%time

norm_loan_lenders = normalize(loan_lenders, 'lenders', ',')

CPU times: user 16.8 s, sys: 2.28 s, total: 19.1 s
Wall time: 19.3 s


In [114]:
norm_loan_lenders.head()

Unnamed: 0,loan_id,lenders
0,483693,muc888
0,483693,sam4326
0,483693,camaran3922
0,483693,lachheb1865
0,483693,rebecca3499


### Point 2: 
For each loan, add a column duration corresponding to the number of days between the disburse time and the planned expiration time. If any of those two dates is missing, also the duration must be missing.

In [115]:
%%time

loans = pd.read_csv('data/loans.csv')

CPU times: user 32.1 s, sys: 7.14 s, total: 39.3 s
Wall time: 40.1 s


a lot of nan's!

In [116]:
print(len(loans.planned_expiration_time[loans.planned_expiration_time.isnull()]),
len(loans.disburse_time[loans.disburse_time.isnull()]))

371834 2813


let's convert to datetime all not nan values of disburse_time and planned_expiration_time;
#### note:
nan values in a pandas series are represented as float

In [117]:
formatter = '%Y-%m-%d %H:%M:%S.%f +0000'

In [118]:
%%time 

# TODO: find a more efficient way to do that, if any..

loans['disburse_time'] = loans['disburse_time'].apply(lambda str_date: datetime.datetime.strptime(str_date, formatter) if not (isinstance(str_date, float) and np.isnan(str_date)) else str_date)
loans['planned_expiration_time'] = loans['planned_expiration_time'].apply(lambda str_date: datetime.datetime.strptime(str_date, formatter) if not (isinstance(str_date, float) and np.isnan(str_date)) else str_date)

CPU times: user 35.4 s, sys: 722 ms, total: 36.2 s
Wall time: 36.4 s


In [119]:
%%time

loans['duration'] = loans.planned_expiration_time - loans.disburse_time

CPU times: user 29.1 ms, sys: 24.4 ms, total: 53.5 ms
Wall time: 63.7 ms


In [120]:
loans.duration

0         53 days 19:30:06
1         96 days 14:25:07
2         37 days 13:10:05
3         34 days 19:10:02
4         57 days 22:10:02
                ...       
1419602   39 days 17:00:03
1419603   39 days 08:40:07
1419604   51 days 14:20:04
1419605   63 days 00:50:02
1419606   61 days 12:50:06
Name: duration, Length: 1419607, dtype: timedelta64[ns]

There are 14935 loans which duration is negative: it means that the deadline was not satisfied

In [121]:
loans.duration[loans.duration < pd.Timedelta(0)]

63        -31 days +21:40:03
96        -25 days +06:30:11
177       -25 days +21:00:03
207       -33 days +06:30:08
217       -31 days +17:40:03
                 ...        
1419303   -35 days +21:30:03
1419304   -32 days +17:50:04
1419374   -34 days +21:40:03
1419443   -30 days +06:00:03
1419581   -30 days +16:50:04
Name: duration, Length: 14935, dtype: timedelta64[ns]

### Point 3: Find the lenders that have funded at least twice

In [122]:
%%time

loans_by_lenders = norm_loan_lenders.groupby('lenders')['loan_id'].count().reset_index(name='fund_count')

CPU times: user 11.7 s, sys: 1.16 s, total: 12.9 s
Wall time: 13.4 s


In [123]:
at_least_2_fund_lenders = loans_by_lenders[loans_by_lenders.fund_count > 1]

### Point 4: For each country, compute how many loans have involved that country as borrowers

### Point 5: For each country, compute the overall amount of money borrowed

### Point 6: Like the previous point, but expressed as a percentage of the overall amount lent

In [124]:
def borrow_summary(dimension: str) -> pd.DataFrame:
    grouped = loans.groupby(dimension)
    borrow_summary = grouped['loan_id'].count().reset_index(name='borrow_count')
    borrow_summary['overall_amount_borrowed'] = grouped['loan_amount'].sum().reset_index()['loan_amount']
    overall_amount_lent = loans['loan_amount'].sum()
    borrow_summary['overall_amount_borrowed/total'] = (borrow_summary['overall_amount_borrowed'] / overall_amount_lent) * 100
    return borrow_summary

In [125]:
borrow_by_country = borrow_summary('country_code')

In [126]:
borrow_by_country.head(15)

Unnamed: 0,country_code,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
0,AF,2337,1967950.0,0.166573
1,AL,3075,4307350.0,0.364586
2,AM,13952,22950475.0,1.942589
3,AZ,10172,14784625.0,1.25141
4,BA,608,477250.0,0.040396
5,BF,3489,4085200.0,0.345782
6,BG,296,375300.0,0.031766
7,BI,1727,5233450.0,0.442973
8,BJ,5946,3865825.0,0.327214
9,BO,25250,44226725.0,3.743468


In [127]:
borrow_by_country.describe()

Unnamed: 0,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
count,95.0,95.0,95.0
mean,14943.136842,12435770.0,1.052596
std,35149.884479,18263180.0,1.545844
min,1.0,5000.0,0.000423
25%,545.0,752525.0,0.063696
50%,4681.0,4181100.0,0.353899
75%,15117.0,16728420.0,1.415938
max,285336.0,97984600.0,8.293678


### Point 7: Like the three previous points, but split for each year (with respect to disburse_time)

In [128]:
%%time

loans['disburse_year'] = loans['disburse_time'].map(lambda t: t.year)

CPU times: user 5.84 s, sys: 6.79 s, total: 12.6 s
Wall time: 16.8 s


In [129]:
borrow_by_year = borrow_summary('disburse_year')

In [130]:
borrow_by_year

Unnamed: 0,disburse_year,borrow_count,overall_amount_borrowed,overall_amount_borrowed/total
0,2005.0,203,102850.0,0.008705
1,2006.0,2172,1376575.0,0.116517
2,2007.0,24400,15446525.0,1.307435
3,2008.0,54586,39423050.0,3.336872
4,2009.0,83076,59689475.0,5.052276
5,2010.0,93466,72609150.0,6.145832
6,2011.0,114540,93699300.0,7.930958
7,2012.0,133650,119977575.0,10.155222
8,2013.0,140167,132043925.0,11.17655
9,2014.0,172709,152270425.0,12.888574


In [131]:
loans['disburse_time']

0         2013-12-22 08:00:00
1         2013-12-20 08:00:00
2         2014-01-09 08:00:00
3         2014-01-17 08:00:00
4         2013-12-17 08:00:00
                  ...        
1419602   2015-11-23 08:00:00
1419603   2015-11-24 08:00:00
1419604   2015-11-13 08:00:00
1419605   2015-11-03 08:00:00
1419606   2015-11-03 08:00:00
Name: disburse_time, Length: 1419607, dtype: datetime64[ns]

### Point 8: For each lender, compute the overall amount of money lent. For each loan that has more than one lender, you must assume that all lenders contributed the same amount.


In [242]:
loans['base_contribution'] = loans.loan_amount / loans.num_lenders_total

In [243]:
ids = set(loans.loan_id) - set(loan_lenders.loan_id)
ids2 = set(loan_lenders.loan_id) - set(loans.loan_id)


In [244]:
loans_cleaned = loans[(loans.num_lenders_total > 0) & (~loans.loan_id.isin(ids))] ### verify maybe useless

In [247]:
norm_cleaned = norm_loan_lenders[~norm_loan_lenders.loan_id.isin(ids2)] ### verify maybe useless

In [203]:
%%time

df2 = pd.concat([norm_cleaned.set_index('loan_id'), loans_cleaned.set_index('loan_id')], axis=1, join='inner')

CPU times: user 52.7 s, sys: 23.4 s, total: 1min 16s
Wall time: 1min 19s


In [216]:
df3 = df2.groupby('lenders')['base_contribution'].sum().reset_index(name='overall_amount_lent')

In [239]:
df3.head(15)

Unnamed: 0,lenders,overall_amount_lent
0,000,1485.30966
1,00000,1249.94736
2,0002,2201.18046
3,00mike00,38.46154
4,0101craign0101,2424.08893
5,0132575,113.5877
6,0154884,122.07768
7,0161130,52.73973
8,0169713,90.68987
9,0185429,52.86378


In [240]:
df3.tail(15)

Unnamed: 0,lenders,overall_amount_lent
1639011,zvi,370.57695
1639012,zvi1263,28.84615
1639013,zvika5974,35.71429
1639014,zvonimir7460,34.375
1639015,zx147,177.17835
1639016,zyra9641,26.88679
1639017,zyrah8525,166.69643
1639018,zyrorl,92.59959
1639019,zzaba,25.0
1639020,zzaman,642.25094


In [241]:
df3.describe()

Unnamed: 0,overall_amount_lent
count,1639026.0
mean,637.10046
std,12587.00054
min,5.43478
25%,33.48962
50%,88.1722
75%,289.14772
max,6296800.48566
