In [3]:
from __future__ import division
import json
import itertools
import swifter
import operator
import numpy as np
from numpy import inf
from tqdm import tqdm_notebook
from scipy.stats import pearsonr
import pandas as pd
from datetime import datetime
from collections import Counter
import matplotlib.pyplot as plt
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('use_inf_as_na', True)
plt.style.use('seaborn-whitegrid')
%matplotlib inline

__author__ = 'HK Dambanemuya'
__version__ = 'Python 2'

'''
    Analysis originaly performed in Python 2 (deprecated)
    Seaborn, Statsmodel, and * imports broken in Python 3
'''

In [2]:
bid_fe = pd.read_csv("../Data/bid_notick.txt", sep="|")
bid_fe = bid_fe.rename(index=str, columns={"CreationDate": "BidCreationDate"})
bid_fe = bid_fe[["ListingKey", "Bid_Key", "BidCreationDate", "Amount"]]
print bid_fe.columns
bid_fe.head(5)

Index([u'ListingKey', u'Bid_Key', u'BidCreationDate', u'Amount'], dtype='object')


Unnamed: 0,ListingKey,Bid_Key,BidCreationDate,Amount
0,D6473365888221963456102,0F12336634427919902FE93,2006-02-21 20:23:20,50.0
1,D6473365888221963456102,87F333655285187004B2EE8,2006-02-21 22:50:00,50.0
2,3B09336533376614186EEF7,9DB53365565361098646F41,2006-02-28 00:16:01,50.0
3,601833660859080305A2AC3,7284336544796747049D125,2006-02-28 20:06:43,50.0
4,601833660859080305A2AC3,F3FD3365245307569E84B9F,2006-03-01 09:27:50,50.0


In [4]:
listing_data = pd.read_csv('../Data/listing.txt', sep="|")
listing_data = listing_data[["Lst_Key", "CreationDate"]]
listing_data = listing_data.rename(index=str, columns={"CreationDate": "ListingCreationDate", "Lst_Key": "ListingKey"})
listing_data.head(5)

Unnamed: 0,ListingKey,ListingCreationDate
0,C0933365069571441D6651D,2006-02-25 05:51:10
1,06413364714455220D3A5A5,2006-03-15 17:17:51
2,9D5E336577848196944F197,2006-03-15 21:52:45
3,06193365103109718753985,2006-03-22 11:34:35
4,1C0733660810131582558C3,2006-03-22 12:23:19


In [5]:
bid_fe = bid_fe.merge(listing_data, on="ListingKey")
del listing_data
bid_fe.head()

Unnamed: 0,ListingKey,Bid_Key,BidCreationDate,Amount,ListingCreationDate
0,D6473365888221963456102,0F12336634427919902FE93,2006-02-21 20:23:20,50.0,2006-02-15 14:21:25
1,D6473365888221963456102,87F333655285187004B2EE8,2006-02-21 22:50:00,50.0,2006-02-15 14:21:25
2,D6473365888221963456102,802633649243658388DBFF3,2006-02-16 17:31:46,50.0,2006-02-15 14:21:25
3,D6473365888221963456102,E0B73366046199037A96317,2006-02-15 14:21:25,100.0,2006-02-15 14:21:25
4,D6473365888221963456102,BEEC3366008631090775EBF,2006-02-21 22:45:10,100.0,2006-02-15 14:21:25


In [5]:
# with open("../Data/listing_keys.txt") as file_2:
#     keys = json.load(file_2)
# listing_keys = keys.get("Values")
# print len(listing_keys)

In [6]:
print len(bid_fe)
bid_fe = bid_fe[bid_fe['ListingKey'].isin(listing_keys)]
bid_fe["Lst_Key"] = bid_fe.ListingKey
bid_fe["Amount2"] = bid_fe.Amount
print len(bid_fe)

9732656
3948660


In [7]:
f = {'Lst_Key': ['max'],
     'Amount': ['mean'],
     'Amount2':['max'],
     'ListingCreationDate': ['max']
}

In [8]:
listing_fe = pd.DataFrame(bid_fe[["Lst_Key", "ListingKey", "Amount", "Amount2", "ListingCreationDate"]].groupby('ListingKey').agg(f).as_matrix())
listing_fe = listing_fe.rename(index=str, columns={0: "MaxLenderBidAmount", 
                                                   1: "MeanLenderBidAmount",
                                                   2: "ListingCreationDate",
                                                   3: "ListingKey"})
listing_fe.head()

Unnamed: 0,MaxLenderBidAmount,MeanLenderBidAmount,ListingCreationDate,ListingKey
0,100.0,66.737,2008-06-21 19:29:46,00033425227988088FA6752
1,300.0,85.0,2007-01-02 13:30:38,000433785890431972B4743
2,322.86,65.022,2008-05-26 11:46:21,00083422661625108817246
3,1500.0,775.0,2008-05-08 15:13:41,000A34209897973969CFA81
4,500.0,78.903,2008-01-25 21:13:15,000D3410451511356B08F17


In [9]:
# Declare Global Variables
keys = [] # Listing Keys
meanAmountRaisedPerSecond = [] # Mean Amount Raised Per Second
# Feature Engineering
for key in listing_keys:
    keys.append(str(key))
    # Initialize dates series with bid creation date
    dates = [listing_fe.query("ListingKey == '{0}'".format(key)).ListingCreationDate.item()]
    # Create temporary dataframe to store listing variables
    df = bid_fe.query("ListingKey == '{0}'".format(key)).sort_values("BidCreationDate")
    # Get listing's bid creation dates
    dates = dates + (list(df.BidCreationDate))
    # Get listing's bid amounts
    amounts = list(df.Amount)
    # Calcutate time deltas for bid transaction times
    deltas = [(pd.to_datetime(t) - pd.to_datetime(s)).seconds for s, t in zip(dates, dates[1:])]
    # Calculate bid amount raised per second
    amountRaisedPerSecond = np.array(amounts, dtype=np.float) / np.array(deltas, dtype=np.float)
    # Replace inf values with 0
    amountRaisedPerSecond[amountRaisedPerSecond == inf] = 0
    # Calculate mean bid amount raised per second
    meanAmountRaisedPerSecond.append(amountRaisedPerSecond.mean())

In [24]:
# 1 step herding coefficients
herding1Coefficients = []
for key in listing_keys:
    df = bid_fe.query("ListingKey == '{0}'".format(key)).sort_values("BidCreationDate")
    amounts = list(df.Amount)
    # create 2 dimension series to calculate 1-step herding coefficient
    a,b = [], []
    for i in range(len(amounts)-1):
        a.append(amounts[i])
        b.append(amounts[i+1])
    # calculate herding coefficient
    herding1Coefficients.append(pearsonr(a,b)[0])

In [25]:
# 2 step herding coefficients
herding2Coefficients = []
for key in listing_keys:
    df = bid_fe.query("ListingKey == '{0}'".format(key)).sort_values("BidCreationDate")
    amounts = list(df.Amount)
    # create 3 dimension series to calculate 2-step herding coefficient
    a,b,c = [], [], []
    for i in range(len(amounts)-2):
        a.append(amounts[i])
        b.append(amounts[i+1])
        c.append(amounts[i+2])
    # calculate herding coefficient
    herding2Coefficients.append( (pearsonr(a,b)[0] + pearsonr(b,c)[0] + pearsonr(a,c)[0]) / 3 )

In [26]:
# 3 step herding coefficients
herding3Coefficients = []
for key in listing_keys:
    df = bid_fe.query("ListingKey == '{0}'".format(key)).sort_values("BidCreationDate")
    amounts = list(df.Amount)
    # create 4 dimension series to calculate 3-step herding coefficient
    a,b,c,d = [], [], [], []
    for i in range(len(amounts)-3):
        a.append(amounts[i])
        b.append(amounts[i+1])
        c.append(amounts[i+2])
        d.append(amounts[i+3])
    # calculate herding coefficient
    herding3Coefficients.append( (pearsonr(a,b)[0] + pearsonr(a,c)[0] + pearsonr(a,d)[0] + pearsonr(b,c)[0] + pearsonr(b,d)[0] + pearsonr(c,d)[0]) / 6 )

In [27]:
final_df = pd.DataFrame()
final_df["ListingKey"] = keys
final_df["Herding1Coefficient"] = herding1Coefficients
final_df["Herding2Coefficient"] = herding2Coefficients
final_df["Herding3Coefficient"] = herding3Coefficients
final_df["MeanLenderBidAmountPerSecond"] = meanAmountRaisedPerSecond
final_df.head(5)

Unnamed: 0,ListingKey,Herding1Coefficient,Herding2Coefficient,Herding3Coefficient,MeanLenderBidAmountPerSecond
0,37973421768070663164DA4,-0.106,-0.052,-0.054,46.221
1,5F5E3402439483094CEC22C,0.073,0.046,0.026,6.511
2,8E8034314743326272D0845,-0.025,-0.023,-0.011,5.25
3,6C9A34233870080343742C3,-0.106,-0.093,0.036,0.214
4,3CC5342840983035130A319,-0.168,-0.204,-0.029,0.321


In [28]:
final_df = final_df.merge(listing_fe[["ListingKey", "MeanLenderBidAmount", "MaxLenderBidAmount"]], on="ListingKey")
final_df.head(5)

Unnamed: 0,ListingKey,Herding1Coefficient,Herding2Coefficient,Herding3Coefficient,MeanLenderBidAmountPerSecond,MeanLenderBidAmount,MaxLenderBidAmount
0,37973421768070663164DA4,-0.106,-0.052,-0.054,46.221,84.688,300.0
1,5F5E3402439483094CEC22C,0.073,0.046,0.026,6.511,89.419,500.0
2,8E8034314743326272D0845,-0.025,-0.023,-0.011,5.25,63.46,558.5
3,6C9A34233870080343742C3,-0.106,-0.093,0.036,0.214,84.746,328.35
4,3CC5342840983035130A319,-0.168,-0.204,-0.029,0.321,109.894,500.0


In [29]:
final_df["Herding1Coefficient"] = final_df.Herding1Coefficient.fillna(-2)
final_df["Herding2Coefficient"] = final_df.Herding2Coefficient.fillna(-2)
final_df["Herding3Coefficient"] = final_df.Herding3Coefficient.fillna(-2)
final_df.to_csv("../Data/prosper_lender_herding_dynamics.csv", index=False)