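**Description:** Builds a listing-level dataset for funded loan listings that combines bid-derived features with lender herding networks (herding ranges 3, 5 and 7), and saves it to CSV.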

**Input:** 
- Bids
- Listings

**Output:** 
- ListingKey
- AmountRequested
- NoBids
- AvgInterBidTime
- CoV
- DebtToIncomeRatio
- DescriptionLength
- RepaidOrNot
- Network_3
- Network_5
- Network_7

In [1]:
import os
import json
import pandas as pd
import numpy as np
import networkx as nx
from scipy import stats
from tqdm import tqdm_notebook
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt
# Silence library warnings so they do not clutter the cell outputs
warnings.filterwarnings('ignore')
# Never truncate cell contents. None means "unlimited"; the former -1 sentinel
# was deprecated in pandas 1.0 and is rejected by pandas 2.x. Older pandas
# releases only accept integers, so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Render floats with three decimals
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Allow long and wide frames to be displayed without elision
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Lender bids coefficient of variation
def bid_cv(bid_amount_list):
    """Return the coefficient of variation (std / mean) of a listing's bid amounts.

    Parameters
    ----------
    bid_amount_list : sequence of numbers (list, NumPy array or pandas Series)
        Bid amounts to summarise.

    Returns
    -------
    float
        ``np.std(bid_amount_list) / np.mean(bid_amount_list)``, or NaN when the
        input is empty or its mean is zero, because the ratio is undefined in
        both cases.
    """
    # An empty input has neither a mean nor a standard deviation
    if len(bid_amount_list) == 0:
        return np.nan
    mean_amount = np.mean(bid_amount_list)
    # Guard the division explicitly instead of silently producing inf or NaN
    if mean_amount == 0:
        return np.nan
    return np.std(bid_amount_list) / mean_amount

# Columns kept in the listing-level feature table
bid_features = [
    "ListingKey",
    "AmountRequested",
    "NoBids",
    "AvgInterBidTime",
    "CoV",
    "DebtToIncomeRatio",
    "DescriptionLength",
    "RepaidOrNot",
]

# Per-column aggregations used when bids are grouped by listing:
# the listing key itself (max) and the coefficient of variation of the amounts
f = dict(Lst_Key=['max'], Amount=[bid_cv])

__author__ = 'HK Dambanemuya'

# Bids Data

In [2]:
# Load the pipe-delimited bid-level file and keep only the columns used downstream
bids = pd.read_csv("../../Data/bid_notick.txt", sep="|").loc[
    :,
    ['ListingKey', 'MemberKey', 'Bid_Key', 'CreationDate',
     'Amount', 'ParticipationAmount', 'Status', 'ListingStatus'],
]
# Preview five randomly chosen bids
bids.sample(n=5)

Unnamed: 0,ListingKey,MemberKey,Bid_Key,CreationDate,Amount,ParticipationAmount,Status,ListingStatus
6812190,413134781144085017C9BB9,321934097845511151641D1,44C235816756734983A6885,2010-03-09 20:18:04,25.0,25.0,Winning,Completed
5713356,61EF34299053151797308DE,33973412263058634D9090F,6C253535828709674C96164,2008-09-13 11:27:35,50.0,50.0,Winning,Completed
4090283,4B83341608464940140FDC3,84473416642323685087438,08933520259028951BF25E9,2008-04-01 21:11:12,100.0,100.0,Winning,Completed
825258,BF5A3379098150731B89E51,EFF63368815933830260B54,AAED33801973640936228E2,2007-02-04 08:53:36,50.33,50.33,Winning,Expired
1970531,846933938623159307DCB19,08AA3385767444417DD0690,8188349640523013836A522,2007-06-28 00:20:20,100.0,0.0,Outbid,Cancelled


## Bids Feature Engineering from Bid Level Data

In [3]:
# Coefficient of variation (CoV) of the bid amounts, one row per listing.
# Aggregating the Amount column directly keeps CoV numeric and removes the need
# for a duplicated key column and a NumPy round-trip, which stored both columns
# as generic objects and relied on positional column names.
bid_fe = (
    bids.groupby('ListingKey')['Amount']
        .agg(bid_cv)
        .rename('CoV')
        .reset_index()
)
# Sample 5 loans
bid_fe.sample(5)

Unnamed: 0,ListingKey,CoV
134088,913934244343752021AD3C2,0.0
149735,A22634126666823136B25F4,0.0
175541,BE343401733824788179BA6,0.0
171076,B9643387906045060F3595B,0.291
36750,27F334778369849541DF236,0.282


## Aggregate Bid Data to Loan Listing

In [4]:
# Read the listing-level data, keep only funded listings (repayment outcome is
# either True or False) and attach the engineered bid features; the inner join
# drops listings that have no entry in bid_fe
bid_data = (
    pd.read_csv("../../Data/ProjectLevelData.txt", sep="|")
      .query("RepaidOrNot==True or RepaidOrNot==False")
      .merge(bid_fe, on="ListingKey", how="inner")
)
# Restrict the table to the selected bid features
bid_data = bid_data[bid_features]
# Impute missing debt-to-income ratios with the column median
dtirm = bid_data['DebtToIncomeRatio'].median()
bid_data['DebtToIncomeRatio'] = bid_data['DebtToIncomeRatio'].fillna(dtirm)
# Coerce every column except the first one (the listing key) to a numeric dtype
value_columns = bid_data.columns[1:]
bid_data[value_columns] = bid_data[value_columns].apply(pd.to_numeric)
# Preview five randomly chosen listings
bid_data.sample(5)

Unnamed: 0,ListingKey,AmountRequested,NoBids,AvgInterBidTime,CoV,DebtToIncomeRatio,DescriptionLength,RepaidOrNot
17479,831B3376802213291BFA395,1800.0,65,13262.723,0.807,0.27,670,True
17070,7FF834323429674147A56A6,7500.0,235,5474.111,0.581,0.2,139,False
21506,A1D5337727275292155279B,4900.0,155,5604.058,0.592,0.14,117,True
11989,5A5D336623216190991FD1F,1000.0,6,94633.333,1.108,0.08,28,False
31379,ECBF35098714179940AFE1D,15000.0,313,4578.965,1.868,0.24,131,True


In [5]:
# Class balance of the repayment outcome, then summary statistics of the features
print(Counter(bid_data.RepaidOrNot))
bid_data.describe()

Counter({True: 22236, False: 11730})


Unnamed: 0,AmountRequested,NoBids,AvgInterBidTime,CoV,DebtToIncomeRatio,DescriptionLength
count,33966.0,33966.0,33966.0,33966.0,33966.0,33966.0
mean,5982.832,134.999,15628.643,1.109,0.304,210.656
std,5381.875,139.222,112657.723,0.697,0.85,144.735
min,1000.0,1.0,0.0,0.0,0.0,0.0
25%,2500.0,39.0,2293.446,0.701,0.12,102.0
50%,4200.0,89.0,4715.764,0.934,0.19,171.0
75%,7500.0,180.0,10133.63,1.27,0.29,286.0
max,25000.0,1206.0,11034125.0,8.148,10.01,813.0


In [6]:
# Pearson correlation between CoV and each of the other bid features
for other_feature in ('NoBids', 'AvgInterBidTime', 'DebtToIncomeRatio', 'DescriptionLength'):
    print(stats.pearsonr(bid_data['CoV'], bid_data[other_feature]))

(0.06745975341533694, 1.4617810426014224e-35)
(-0.026336080151550603, 1.2077137905265889e-06)
(0.04586861863186291, 2.727828759930367e-17)
(0.04925171742110739, 1.06398457855983e-19)


# Network Construction

In [7]:
def get_edgelist(steps = 6):
    """Build one herding edge list per loan listing in ``bid_data``.

    For every listing key in the global ``bid_data`` frame, the listing's rows of
    the global ``bids`` frame are ordered by ``CreationDate``. A directed edge
    ``[follower, herder]`` is recorded when a later bid, placed at most
    ``steps - 1`` positions after an earlier bid, comes from a different
    ``MemberKey`` and has exactly the same ``Amount``. Each (follower, herder)
    pair is recorded at most once per listing.

    Unlike the previous implementation, every bid of a listing is considered as
    a potential herder: the window is clipped at the last bid instead of
    skipping the final ``steps`` bids, so short listings and the tail of long
    listings are no longer silently ignored.

    Parameters
    ----------
    steps : int, default 6
        Window size counted from the earlier bid itself, i.e. each bid is
        compared with the ``steps - 1`` bids that follow it.

    Returns
    -------
    list of list
        One entry per row of ``bid_data`` (same order); each entry is a list of
        ``[follower MemberKey, herder MemberKey]`` pairs and is empty when the
        listing has no matching bids.
    """
    edge_list = [] # Series of edge lists for each loan listing
    # Group the bids once rather than scanning the whole frame for every listing;
    # maps each listing key to the row positions of its bids in original order
    positions_by_listing = bids.groupby('ListingKey', sort=False).indices
    # Iterate through all loan listings
    for key in tqdm_notebook(bid_data['ListingKey'].values):
        positions = positions_by_listing.get(key)
        if positions is None:
            # Listing without any bid rows: keep the output aligned with bid_data
            edge_list.append([])
            continue
        # Bids of this listing in chronological order
        listing_bids = bids.iloc[positions].sort_values(by='CreationDate')
        # Plain lists are much cheaper to index than repeated .loc lookups
        members = listing_bids['MemberKey'].tolist()
        amounts = listing_bids['Amount'].tolist()
        n_bids = len(members)
        # Edges of this listing and the set of pairs already recorded
        temp_edge_list, observed = [], set()
        # Iterate through all bids for the listing
        for i in range(n_bids):
            herder = members[i]
            # Later bids inside the herding range, clipped at the final bid
            for j in range(i + 1, min(i + steps, n_bids)):
                follower = members[j]
                # Skip bids placed by the same lender
                if follower == herder:
                    continue
                # Skip edges that have already been observed
                if (follower, herder) in observed:
                    continue
                # Only identical amounts count as herding
                if amounts[i] == amounts[j]:
                    # Add edge from follower to herder
                    temp_edge_list.append([follower, herder])
                    # Remember the edge to avoid duplicate edges
                    observed.add((follower, herder))
        # Add listing's edge list to main list
        edge_list.append(temp_edge_list)
    return edge_list

In [8]:
# Get edge_lists for different herding ranges
edge_list_3 = get_edgelist(steps = 4)
edge_list_5 = get_edgelist(steps = 6)
edge_list_7 = get_edgelist(steps = 8)

  0%|          | 0/33966 [00:00<?, ?it/s]

  0%|          | 0/33966 [00:00<?, ?it/s]

  0%|          | 0/33966 [00:00<?, ?it/s]

In [9]:
# Add edge lists to dataframe
bid_data['Network_3'] = edge_list_3
bid_data['Network_5'] = edge_list_5
bid_data['Network_7'] = edge_list_7

In [10]:
# Persist the listing-level table (features plus networks) without the index column
bid_data.to_csv(path_or_buf='../../Data/colending_data.csv', index=False)