In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder 
import warnings
# Silence library warnings for the whole notebook session.
warnings.filterwarnings('ignore')

# Display options for DataFrame output in this notebook.
# `None` means "do not truncate cell contents". The previous value of -1 has
# been deprecated since pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# Render floats with three decimals.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Shared label encoder instance; the listing import cell uses it to turn the
# BorrowerState column into integer codes.
le = LabelEncoder()

# Notebook metadata.
__author__ = 'HK Dambanemuya'
# NOTE(review): this holds an interpreter label rather than a release number —
# presumably the Python major version the notebook targets; confirm.
__version__ = 'Python 3'

## Aggregation Functions

In [2]:

def description_length(string):
    """Return the number of whitespace-separated words in a description.

    Parameters
    ----------
    string : object
        The description text. Non-string values are converted with ``str``
        before counting.

    Returns
    -------
    int
        Word count. Missing values (``None`` or a float NaN, which is how
        pandas represents an empty text cell) count as 0 words instead of
        being stringified to ``"None"`` / ``"nan"`` and counted as one word.
    """
    # NaN is the only float that is not equal to itself.
    if string is None or (isinstance(string, float) and string != string):
        return 0
    return len(str(string).split())

def interEventTime(arr):
    """Return the gaps, in whole seconds, between consecutive events.

    Parameters
    ----------
    arr : iterable of datetime-like
        Event timestamps in any order; they are sorted before differencing.

    Returns
    -------
    list of int
        One entry per pair of consecutive timestamps (empty when ``arr`` has
        fewer than two elements).
    """
    ordered = sorted(arr)  # sort once and reuse for both sides of the pairing
    # `timedelta.seconds` is only the sub-day component, so a gap of two days
    # would read as 0. `total_seconds()` covers the full duration; the values
    # are non-negative after sorting, so int() just drops sub-second fractions.
    return [int((later - earlier).total_seconds())
            for earlier, later in zip(ordered, ordered[1:])]

def avgInterEventTime(arr):
    """Return the mean gap between consecutive events in ``arr``.

    The gaps come from ``interEventTime``. When there are no gaps (fewer than
    two events) ``np.mean`` of the empty list yields NaN.
    """
    gaps = interEventTime(arr)
    return np.mean(gaps)

def gini(x, w=None):
    """Compute the Gini coefficient of ``x``, optionally weighted by ``w``.

    Parameters
    ----------
    x : array-like
        Values whose inequality is measured.
    w : array-like, optional
        Per-value weights, aligned with ``x``. When omitted every value has
        weight 1 and a simplified closed form is used.

    Returns
    -------
    float
        The Gini coefficient.
    """
    values = np.asarray(x)

    if w is None:
        # Unweighted case: the general formula with all weights equal to 1.
        ordered = np.sort(values)
        count = len(values)
        running_total = np.cumsum(ordered, dtype=float)
        return (count + 1 - 2 * np.sum(running_total) / running_total[-1]) / count

    weights = np.asarray(w)
    order = np.argsort(values)
    ordered_values = values[order]
    ordered_weights = weights[order]
    # Accumulate as float to avoid integer overflow.
    cum_weights = np.cumsum(ordered_weights, dtype=float)
    cum_weighted = np.cumsum(ordered_values * ordered_weights, dtype=float)
    numerator = np.sum(cum_weighted[1:] * cum_weights[:-1]
                       - cum_weighted[:-1] * cum_weights[1:])
    denominator = cum_weighted[-1] * cum_weights[-1]
    return numerator / denominator


# Aggregation Map
f = {
    # Mean number of seconds between consecutive bids of a listing.
    'CreationDate': avgInterEventTime,
    # Inequality of the bid amounts of a listing.
    'Amount': gini,
    # Largest per-bid `momentum_at_1` value of a listing.
    'momentum_at_1': 'max',
}

## Import Listing Data

In [3]:
# Load the pipe-delimited listing export and keep only the columns used below.
listings = pd.read_csv('../../Data/lending/listing.txt', sep="|")
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the raw import.
listings = listings[['Lst_Key', 'CreationDate', 'AmountRequested', 'CreditGrade',
                     'DebtToIncomeRatio', 'Description', 'BorrowerState', 'Status']].copy()
listings['CreationDate'] = pd.to_datetime(listings['CreationDate'])
# Replace the free-text description with its word count.
listings['Description'] = listings['Description'].apply(description_length)
# Encode the borrower state as an integer code.
listings['BorrowerState'] = le.fit_transform(listings['BorrowerState'])

# Ordinal encoding of the credit grade (higher = better); 9 is a sentinel for
# "no usable credit information" and is filtered out below.
creditmap = {'AA':8, 'A':7, 'B':6, 'C':5, 'D':4, 'E':3, 'HR':2, 'NC':1}
# Map ONLY the CreditGrade column. The previous frame-wide applymap visited
# every cell of every column (slow, and deprecated in recent pandas) and would
# have rewritten any other cell that happened to equal a grade code.
# Missing or unrecognised grades become the sentinel 9.
listings['CreditGrade'] = listings['CreditGrade'].map(creditmap).fillna(9).astype(int)

listings = listings.rename(index = str, columns={'Lst_Key': 'ListingKey',
                                                 'BorrowerState': 'Region',
                                                 'CreationDate': 'approved_timestamp'})
# Drop records with no credit information
listings = listings[listings['CreditGrade'] < 9]
# Remove current listings
listings = listings[listings['Status'].isin(["Expired", "Completed"])]

listings

Unnamed: 0,ListingKey,approved_timestamp,AmountRequested,CreditGrade,DebtToIncomeRatio,Description,Region,Status
0,C0933365069571441D6651D,2006-02-25 05:51:10,20000.000,3,0.142,1,7,Expired
2,9D5E336577848196944F197,2006-03-15 21:52:45,3000.000,2,0.169,348,20,Expired
13,F7C433651616107108B5E83,2006-04-11 00:52:59,15000.000,4,0.194,95,46,Expired
15,AFE53364876098574BEA11A,2006-04-19 14:20:05,5000.000,1,,47,7,Expired
16,ABA53364509248269675D6B,2006-04-24 18:04:52,3500.000,2,0.026,55,13,Expired
...,...,...,...,...,...,...,...,...
341834,519F3434500591392AA8BC3,2008-10-11 04:12:38,1100.000,2,,187,37,Completed
341983,CB193434007735493F21BFC,2008-10-11 14:55:57,1000.000,4,0.100,216,17,Completed
342044,134034338475907815A0699,2008-10-11 18:47:11,2000.000,4,0.070,127,5,Completed
342321,7BED3433972278236407AE5,2008-10-07 09:54:21,3600.000,8,,448,7,Completed


In [4]:
# Number of listings per final status.
listings['Status'].value_counts()

Expired      191408
Completed    29114 
Name: Status, dtype: int64

## Import Filtered Bids

In [5]:
# Import Bid Data: one filtered bid file per index 1..10, keyed as "bids_<i>".
bids_map = {
    f"bids_{i}": pd.read_csv(f"../../Data/lending/filtered/bids_{i}.csv")
    for i in range(1, 11)
}

# Inspect bid record counts: distinct listings and total rows per file.
for name, frame in tqdm_notebook(bids_map.items()):
    print (f"{name} campaigns = {frame.ListingKey.nunique()}")
    print (f"{name} bids = {len(frame)}")

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

bids_1 campaigns = 143770
bids_1 bids = 143770
bids_2 campaigns = 143770
bids_2 bids = 240403
bids_3 campaigns = 143770
bids_3 bids = 319220
bids_4 campaigns = 143770
bids_4 bids = 388381
bids_5 campaigns = 143770
bids_5 bids = 451249
bids_6 campaigns = 143770
bids_6 bids = 509451
bids_7 campaigns = 143770
bids_7 bids = 564030
bids_8 campaigns = 143770
bids_8 bids = 615760
bids_9 campaigns = 143770
bids_9 bids = 665097
bids_10 campaigns = 143770
bids_10 bids = 712335



## Aggregate Filtered Bids

In [6]:
def _aggregate_bids(bids, listings, first_bid_only=False):
    """Collapse bid-level rows into one feature row per listing.

    Parameters
    ----------
    bids : pandas.DataFrame
        Bid records; must contain ``ListingKey``, ``CreationDate``, ``Amount``
        and ``time_trunc`` columns. The frame is not modified.
    listings : pandas.DataFrame
        Cleaned listing table with ``ListingKey``, ``approved_timestamp``,
        ``CreditGrade``, ``DebtToIncomeRatio``, ``Description`` and ``Status``.
    first_bid_only : bool, default False
        When True, ``Momentum`` is always taken from the listing-to-bid
        elapsed time instead of the mean gap between bids.

    Returns
    -------
    pandas.DataFrame
        Columns ``ProjectID, Momentum, Diversity, CreditGrade,
        DebtToIncomeRatio, Description, Status`` with missing values set to -1.
    """
    # Keep one row per listing so that neither merge below can multiply rows.
    # Duplicated bid rows would add zero-length gaps to the momentum.
    unique_listings = listings.drop_duplicates(subset=['ListingKey'])

    # Work on a derived frame; the caller's bid frame is left untouched.
    bids = bids.drop(columns=['time_trunc']).assign(
        Amount=lambda d: d['Amount'].astype(float),
        CreationDate=lambda d: pd.to_datetime(d['CreationDate']),
    )
    bids = bids.merge(unique_listings[['ListingKey', 'approved_timestamp']],
                      on='ListingKey', how='left')

    # Elapsed seconds between the listing timestamp and each bid.
    # `.seconds` would keep only the sub-day component and, for a negative
    # difference, return its within-day complement. Use the magnitude of the
    # full duration instead, so the result does not depend on which of the
    # two timestamps comes first.
    elapsed = (bids['CreationDate'] - pd.to_datetime(bids['approved_timestamp'])).abs()
    bids['momentum_at_1'] = elapsed.dt.total_seconds()

    # Rename by column name rather than by position so the result does not
    # depend on the key order of the aggregation map.
    features = (
        bids.groupby('ListingKey').agg(f).reset_index()
        .rename(columns={'ListingKey': 'ProjectID',
                         'CreationDate': 'Momentum',
                         'Amount': 'Diversity',
                         'momentum_at_1': 'Momentum1'})
    )

    if first_bid_only:
        features['Momentum'] = features['Momentum1']
    else:
        # Listings with a single bid have no inter-bid gap (NaN); fall back to
        # the listing-to-bid elapsed time.
        features['Momentum'] = features['Momentum'].fillna(features['Momentum1'])
    features = features.drop(columns=['Momentum1'])

    features = features.merge(unique_listings, left_on='ProjectID',
                              right_on='ListingKey', how='left')
    # Assign the encoded column back explicitly; an in-place replace on a
    # column selection is not guaranteed to modify the frame.
    features['Status'] = features['Status'].map({'Expired': 0, 'Completed': 1})
    features = features[['ProjectID', 'Momentum', 'Diversity', 'CreditGrade',
                         'DebtToIncomeRatio', 'Description', 'Status']]
    # -1 marks values that are missing (e.g. bids whose listing was filtered out).
    return features.fillna(-1)


for k, v in tqdm_notebook(bids_map.items()):
    var_bids = _aggregate_bids(v, listings, first_bid_only=(k == 'bids_1'))
    var_bids.to_csv(f"../../Data/lending/aggregated/{k}.csv", index=False)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


