In [1]:
import sys
import numba
import swifter
import timeit
import pandas as pd
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

__author__ = 'HK Dambanemuya'
__version__ = 'Python 2'

In [2]:
project_data = pd.read_csv('../Data/ProjectLevelData.txt', sep="|")
listing_data = pd.read_csv('../Data/listing.txt', sep="|")
listing_data = listing_data.rename(index=str, columns={"Lst_Key": "ListingKey"})
borrower_data = project_data[["ListingKey", "FundedOrNot", "RepaidOrNot"]].merge(listing_data[["ListingKey", "MemberKey", "StartDate"]], on="ListingKey")
borrower_data.sample(5)

Unnamed: 0,ListingKey,FundedOrNot,RepaidOrNot,MemberKey,StartDate
75232,52023539254476097E5F8B3,True,,D07A353815634906381F7D8,2012-02-03 17:00:45
166717,B5233419412472610E8F52B,False,,5F4A341823207729253E67C,2008-04-28 06:28:38
225846,F57B35195812571519730DC,False,,E3013517720606600494049,2011-06-26 12:00:14
95309,67B5342694054840671068A,False,,972233791490488424EA064,2008-07-25 13:39:34
72400,4EEE34860289313013F98EF,True,True,B26D3467686303402E81ED1,2010-06-09 16:09:17


In [3]:
len(set(borrower_data.MemberKey))

128998

In [4]:
hashmap = dict()
memberkeys = sorted(list(set(borrower_data.MemberKey)))[100000:]
len(memberkeys)

28998

In [5]:
start = timeit.default_timer()
for key in memberkeys:
    hashmap[key] = borrower_data.query("MemberKey == '{0}'".format(key))
stop = timeit.default_timer()
print "Hashmap generated in {0} minutes".format((stop-start)/60) 

Hashmap generated in 10.0335017783 minutes


In [6]:
len(borrower_data)

235753

In [7]:
borrower_data = borrower_data[borrower_data['MemberKey'].isin(memberkeys)]

In [8]:
len(borrower_data)

52870

In [9]:
def borrower_completed_listings(x):
    return hashmap[x["MemberKey"]].query("FundedOrNot == True and StartDate < '{0}'".format(x["StartDate"]))["ListingKey"].nunique()

start = timeit.default_timer()    
borrower_data["BorrowerCompletedListings"] = borrower_data[["MemberKey", "StartDate"]].swifter.apply(borrower_completed_listings, axis=1)
stop = timeit.default_timer()
print "Feature engineering completed in {0} minutes".format((stop-start)/60) 

Feature engineering completed in 8.81185622 minutes


In [10]:
def borrower_repaid_listings(x):
    return hashmap[x["MemberKey"]].query("RepaidOrNot == True and StartDate < '{0}'".format(x["StartDate"]))["ListingKey"].nunique()

start = timeit.default_timer()    
borrower_data["BorrowerRepaidListings"] = borrower_data[["MemberKey", "StartDate"]].swifter.apply(borrower_repaid_listings, axis=1)
stop = timeit.default_timer()
print "Feature engineering completed in {0} minutes".format((stop-start)/60) 

Feature engineering completed in 8.788467935 minutes


In [11]:
def borrower_total_listings(x):
    return hashmap[x["MemberKey"]].query("StartDate < '{0}'".format(x["StartDate"]))["ListingKey"].nunique()

start = timeit.default_timer()    
borrower_data["BorrowerTotalListings"] = borrower_data[["MemberKey", "StartDate"]].swifter.apply(borrower_total_listings, axis=1)
stop = timeit.default_timer()
print "Feature engineering completed in {0} minutes".format((stop-start)/60) 

Feature engineering completed in 8.46764088833 minutes


In [29]:
borrower_data[borrower_data.MemberKey==memberkeys[np.random.randint(25000)]].sort_values("StartDate")

Unnamed: 0,ListingKey,FundedOrNot,RepaidOrNot,MemberKey,StartDate,BorrowerCompletedListings,BorrowerRepaidListings,BorrowerTotalListings
56527,3DA0346588771940210EE4F,True,True,DB16346765130182994B3CB,2009-10-28 16:08:42,0,0,0
98603,6B3C3486137275045A46769,False,,DB16346765130182994B3CB,2010-05-28 15:58:36,1,1,1
91489,63AC3485685085349980A99,True,True,DB16346765130182994B3CB,2010-06-10 15:34:45,1,1,2
132727,902035118287281640FBB5C,False,,DB16346765130182994B3CB,2011-08-08 17:00:15,2,2,3


In [20]:
borrower_data.to_csv("../Data/borrower_listing_fe/borrower_listing_fe_128k.csv", index=False)