In [1]:
import pandas as pd
import numpy as np
import datetime

### Point 1: 
Normalize the loan_lenders table. In the normalized table, each row must have one loan_id and one lender.

In [2]:
loan_lenders = pd.read_csv('data/loans_lenders.csv')

In [3]:
loan_lenders.head()

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."


In [4]:
def normalize(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.
    
    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as it's own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df

In [5]:
%%time

norm_loan_lenders = normalize(loan_lenders, 'lenders', ',')

CPU times: user 40.5 s, sys: 2.35 s, total: 42.9 s
Wall time: 42.9 s


In [6]:
norm_loan_lenders.head()

Unnamed: 0,loan_id,lenders
0,483693,muc888
0,483693,sam4326
0,483693,camaran3922
0,483693,lachheb1865
0,483693,rebecca3499


### Point 2: 
For each loan, add a column duration corresponding to the number of days between the disburse time and the planned expiration time. If any of those two dates is missing, also the duration must be missing.

In [7]:
%%time

loans = pd.read_csv('data/loans.csv')

CPU times: user 56.2 s, sys: 5.86 s, total: 1min 2s
Wall time: 1min 6s


a lot of nan's!

In [8]:
print(len(loans.planned_expiration_time[loans.planned_expiration_time.isnull()]),
len(loans.disburse_time[loans.disburse_time.isnull()]))

371834 2813


let's convert to datetime all not nan values of disburse_time and planned_expiration_time;
#### note:
nan values in a pandas series are represented as float

In [9]:
formatter = '%Y-%m-%d %H:%M:%S.%f +0000'

In [10]:
%%time 

# TODO: find a more efficient way to do that, if any..

loans['disburse_time'] = loans['disburse_time'].apply(lambda str_date: datetime.datetime.strptime(str_date, formatter) if not (isinstance(str_date, float) and np.isnan(str_date)) else str_date)
loans['planned_expiration_time'] = loans['planned_expiration_time'].apply(lambda str_date: datetime.datetime.strptime(str_date, formatter) if not (isinstance(str_date, float) and np.isnan(str_date)) else str_date)

CPU times: user 1min 15s, sys: 659 ms, total: 1min 15s
Wall time: 1min 16s


In [11]:
%%time

loans['duration'] = loans.planned_expiration_time - loans.disburse_time

CPU times: user 20.8 ms, sys: 4.03 ms, total: 24.9 ms
Wall time: 29.1 ms


In [12]:
loans.duration

0         53 days 19:30:06
1         96 days 14:25:07
2         37 days 13:10:05
3         34 days 19:10:02
4         57 days 22:10:02
                ...       
1419602   39 days 17:00:03
1419603   39 days 08:40:07
1419604   51 days 14:20:04
1419605   63 days 00:50:02
1419606   61 days 12:50:06
Name: duration, Length: 1419607, dtype: timedelta64[ns]

There are 14935 loans which duration is negative: it means that the deadline was not satisfied

In [13]:
loans.duration[loans.duration < pd.Timedelta(0)]

63        -31 days +21:40:03
96        -25 days +06:30:11
177       -25 days +21:00:03
207       -33 days +06:30:08
217       -31 days +17:40:03
                 ...        
1419303   -35 days +21:30:03
1419304   -32 days +17:50:04
1419374   -34 days +21:40:03
1419443   -30 days +06:00:03
1419581   -30 days +16:50:04
Name: duration, Length: 14935, dtype: timedelta64[ns]

### Point 3: Find the lenders that have funded at least twice

In [14]:
%%time

loans_by_lenders = norm_loan_lenders.groupby('lenders')['loan_id'].count().reset_index(name='fund_count')

CPU times: user 19.5 s, sys: 1.22 s, total: 20.8 s
Wall time: 20.8 s


In [15]:
at_least_2_fund_lenders = loans_by_lenders[loans_by_lenders.fund_count > 1]

### Point 4: For each country, compute how many loans have involved that country as borrowers

In [16]:
country_as_borrower = loans.groupby('country_code')['loan_id'].count().reset_index(name='loan_as_borrower_count')

In [17]:
country_as_borrower.head(15)

Unnamed: 0,country_code,loan_as_borrower_count
0,AF,2337
1,AL,3075
2,AM,13952
3,AZ,10172
4,BA,608
5,BF,3489
6,BG,296
7,BI,1727
8,BJ,5946
9,BO,25250


In [60]:
country_as_borrower = loans.groupby('country_code')['loan_amount'].sum().reset_index(name='overall_amount_')

Unnamed: 0,loan_id,loan_name,original_language,description,description_translated,funded_amount,loan_amount,status,activity_name,sector_name,...,lender_term,num_lenders_total,num_journal_entries,num_bulk_entries,tags,borrower_genders,borrower_pictured,repayment_interval,distribution_model,duration
781594,1322478,,,,,5625.0,10000.0,refunded,Weaving,Arts,...,14.0,210,1,1,"user_favorite, user_favorite, user_favorite, u...",,,irregular,field_partner,-43 days +21:34:26
923389,1323945,ANA by Karma,English,ANA by Karma’s Story:\n<p>Weavers in Bhutan a...,ANA by Karma’s Story:\n<p>Weavers in Bhutan a...,10000.0,10000.0,funded,Weaving,Arts,...,14.0,357,10,5,"user_favorite, user_favorite, user_favorite, u...",female,True,irregular,field_partner,-26 days +11:30:59
