In [3]:
# install.packages('lubridate')
# install.packages('data.table')

In [4]:
library(data.table)
library(lubridate)


Attaching package: 'lubridate'

The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year

The following object is masked from 'package:base':

    date



In [None]:
# Load the raw bid file (pipe-separated, with header) used for the
# collective-intelligence signals, keeping only the four columns needed.
data <- read.csv(file = "../Data/bid_notick.txt", header = TRUE, sep = "|")
data <- data[, c("Amount", "CreationDate", "ListingKey", "ListingStatus")]
names(data) <- c("BidAmount", "BidCreationDate", "ListingKey", "ListingStatus")
# Keep bids with a positive amount whose listing is neither "Active" nor
# "Pending Completion"; rows where a condition evaluates to NA are dropped.
data <- subset(
  data,
  !(ListingStatus == "Active" | ListingStatus == "Pending Completion")
)
data <- subset(data, BidAmount > 0)
# Parse the bid timestamp into POSIXct.
data$BidCreationDate <- as.POSIXct(data$BidCreationDate)

In [None]:
#data <- data[which(year(data$BidCreationDate)>2005),] # eliminate entries for 2005
data <- droplevels(data)
# fund-raising label: TRUE when the listing status is "Completed"
data$FundedOrNot <- data$ListingStatus == "Completed"
# k: running index of each bid within its listing.
# The index must follow bid time, because later steps divide cumulative
# inter-bid times by it and pair it with a chronologically computed index.
# Sort first so the numbering does not depend on the row order of the file.
data <- data[order(data$ListingKey, data$BidCreationDate), ]
data$HowManythBid <- ave(
  as.numeric(data$BidCreationDate),
  data$ListingKey,
  FUN = seq_along
)
# keep only relevant parts
data <- data[, c("BidAmount", "BidCreationDate", "ListingKey",
                 "FundedOrNot", "HowManythBid")]

In [None]:
# compute Gini inequality at the bid level
# For every listing, bids are ordered by creation time and a running Gini
# coefficient of the bid amounts is computed after each bid.
dt <- data.table(data)
# chronological order within each listing
dt <- dt[order(dt$ListingKey,dt$BidCreationDate),]
# position of each bid within its listing, in the order established above
dt$HowManythBid <- ave(as.numeric(dt$BidCreationDate),dt$ListingKey,FUN=seq_along)
# total number of bids of the listing, repeated on every bid row
dt <- dt[,NoBids:=length(BidAmount),by=list(ListingKey)]
# listings with at least two bids: running Gini after the k-th bid.
# dist() gives the pairwise absolute differences of the bid amounts; the
# column-wise followed by row-wise cumsum turns entry (k,k) into the sum of
# |x_i - x_j| over all i,j <= k, and diag() extracts those values.
# Dividing by 2 * k * (sum of the first k amounts) yields the Gini coefficient
# of the first k bids. The distance matrix is n x n per listing.
dt_part <- dt[which(dt$NoBids>1),]
dt_part <- dt_part[,Gini:=diag(t(apply(apply(as.matrix(dist(BidAmount)),2,cumsum),1,cumsum)))
                   /(2*HowManythBid*cumsum(BidAmount)),by=list(ListingKey)]
# listings with a single bid: Gini is set to 0
dt_part2 <- cbind(dt[which(dt$NoBids==1),],Gini=rep(0,nrow(dt[which(dt$NoBids==1),])))
# recombine both groups (rows are no longer in listing order afterwards)
dt <- rbind(dt_part,dt_part2)
# release the intermediate tables
rm(dt_part)
rm(dt_part2)
gc()
# keep only the relevant parts of the data table
dt <- dt[,c("ListingKey","HowManythBid","Gini","NoBids"),with=FALSE] 

In [None]:
# Load the listing file (pipe-separated, with header); data2 is kept in full
# because later steps read further columns from it.
data2 <- read.csv(file = "../Data/listing.txt", header = TRUE, sep = "|")
# Listing key, requested amount and listing creation time, renamed to match
# the bid table.
data2a <- setNames(
  data2[, c("Lst_Key", "AmountRequested", "CreationDate")],
  c("ListingKey", "AmountRequested", "ListingStartDate")
)
# Inner join: bids whose listing key is absent from the listing file are dropped.
data <- merge(x = data, y = data2a, by = "ListingKey")

In [None]:
# momentum feature: average time between bids up to and including each bid
# chronological order within each listing; every step below relies on it
data <- data[order(data$ListingKey, data$BidCreationDate), ]
# time taken: minutes from listing start to the bid
# NOTE(review): both timestamps are parsed without an explicit time zone, so
# the session time zone is used for both - confirm this matches the raw data.
data$TimeTaken <- as.numeric(
  difftime(data$BidCreationDate, data$ListingStartDate, units = "mins")
)
# Bids time-stamped before the listing start are clamped to 0.
# Indexing the column vector (rather than `data[rows, ]$col <- value`) keeps
# this a no-op instead of an error when no row matches.
data$TimeTaken[which(data$TimeTaken < 0)] <- 0
# inter-bid time
data$NumericDate <- as.numeric(data$BidCreationDate) # unit is second
# Gap in seconds to the previous row. The first value (-1) and every value at
# a listing boundary are placeholders that are overwritten just below.
diffs <- c(-1, diff(data$NumericDate))
data$InterBidTime <- diffs
# replace the inter-bid-time for the first bid with the time-to-bid
first_bid <- !duplicated(data$ListingKey)
data$InterBidTime[first_bid] <- data$TimeTaken[first_bid] * 60 # we want it in seconds
# running sum of the gaps within each listing
data <- transform(data, CumInterBidTime = ave(InterBidTime, ListingKey, FUN = cumsum))
# average gap per bid so far; assumes HowManythBid numbers the bids of a
# listing in the same chronological order used here
data$AvgInterBidTime <- data$CumInterBidTime / data$HowManythBid
data <- data[, c("ListingKey", "HowManythBid", "AmountRequested", "ListingStartDate",
                 "AvgInterBidTime", "FundedOrNot")]

In [None]:
# load data for repayment label
data3 <- read.csv("../Data/loan.txt", header = TRUE, sep = "|")
data3 <- data3[, c("Status", "ListingKey", "CreationDate")] # keep only the relevant parts of the data frame
names(data3) <- c("LoanStatus", "ListingKey", "LoanCreationDate")
data3$LoanCreationDate <- as.POSIXct(data3$LoanCreationDate)
# one loan per listing: keep the earliest-created loan for each listing key
data3 <- data3[order(data3$LoanCreationDate), ]
data3 <- data3[!duplicated(data3$ListingKey), ]
#data3 <- data3[which(year(data3$LoanCreationDate)>2005),] # eliminate entries for 2005
# left join: bids of listings without a loan keep LoanStatus = NA
data <- merge(data, data3, by = "ListingKey", all.x = TRUE)
# label details: these statuses are blanked so they carry no repayment label.
# A single vectorised assignment replaces one `data[which(...), ]$col <- NA`
# per status; that form stops with "replacement has 1 row, data has 0"
# whenever a status does not occur in the data.
unlabelled_statuses <- c(
  "1 month late", "2 months late", "3 months late", "4+ months late",
  "Cancelled", "Current", "Late", "Payoff in progress", "Repurchased"
)
data$LoanStatus[data$LoanStatus %in% unlabelled_statuses] <- NA
# TRUE for "Paid", FALSE for any other remaining status, NA when blanked/absent
data$RepaidOrNot <- data$LoanStatus == "Paid"
data <- data[, c("ListingKey", "HowManythBid", "AmountRequested", "ListingStartDate",
                 "AvgInterBidTime", "FundedOrNot", "RepaidOrNot")]
data <- droplevels(data)

In [None]:
# Attach the running Gini coefficient and the bid count to every bid row.
# The join is keyed on (ListingKey, HowManythBid) instead of binding columns
# by position: `data` has been through two merges since `dt` was built, so a
# positional cbind silently misaligns (or fails) as soon as the two tables
# differ in row count or order. all.x = TRUE keeps every bid row of `data`;
# rows without a match in `dt` get NA for Gini and NoBids.
data <- merge(
  data,
  as.data.frame(dt)[, c("ListingKey", "HowManythBid", "Gini", "NoBids")],
  by = c("ListingKey", "HowManythBid"),
  all.x = TRUE
)
# same row order as before: by listing, then by bid index
data <- data[order(data$ListingKey, data$HowManythBid), ]

In [None]:
# Listing-level control variables taken from the full listing table.
data2b <- data2[, c("BorrowerState", "CreditGrade", "DebtToIncomeRatio",
                    "Description", "Lst_Key")]
# Year = first four characters of the listing creation timestamp
data2b$Year <- substr(data2$CreationDate, 1, 4)
#data2b <- data2b[which(data2b$Year>2005),]
# fixed column order (Description stays in 5th position), key renamed to
# match the bid table
data2b <- data2b[, c("BorrowerState", "Year", "CreditGrade",
                     "DebtToIncomeRatio", "Description", "Lst_Key")]
names(data2b)[names(data2b) == "Lst_Key"] <- "ListingKey"

# clean the description text
# Replace every HTML tag - the shortest span from "<" to the next ">" - with
# a single space.
#   htmlString: vector of text that may contain markup.
#   Returns a character vector of the same length.
cleanHTML <- function(htmlString) {
  tag_pattern <- "<.*?>"
  gsub(pattern = tag_pattern, replacement = " ", x = htmlString)
}
data2b$Description <- cleanHTML(data2b$Description) # replace each HTML tag with a single space

# Delete line-break characters (runs of "\n" or of "\r") from the text.
#   htmlString: vector of text.
#   Returns a character vector of the same length.
# NOTE(review): the breaks are removed, not replaced by a space, so two words
# separated only by a line break end up joined - confirm this is intended.
cleanWhites1 <- function(htmlString) {
  line_breaks <- "\n*|\r*"
  gsub(pattern = line_breaks, replacement = "", x = htmlString)
}
data2b$Description <- cleanWhites1(data2b$Description) # delete line-break characters (\n, \r); no space is inserted

# Strip whitespace from both ends of each string; interior whitespace is kept.
#   htmlString: vector of text.
#   Returns a character vector of the same length.
cleanWhites2 <- function(htmlString) {
  edge_whitespace <- "^\\s+|\\s+$"
  gsub(pattern = edge_whitespace, replacement = "", x = htmlString)
}
data2b$Description <- cleanWhites2(data2b$Description) # strip leading and trailing whitespace

# Normalise space characters: drop leading and trailing spaces and collapse
# every run of spaces to a single space. Only the literal space character is
# affected; tabs and other whitespace are left untouched.
#   x: vector of text.
#   Returns a character vector of the same length.
trim <- function(x) {
  # the look-behind (?<= ) needs the PCRE engine, hence perl = TRUE
  # (spelled out: `T` is an ordinary variable that can be reassigned)
  gsub("^ *|(?<= ) | *$", "", x, perl = TRUE)
}
data2b$Description <- trim(data2b$Description) # drop leading/trailing spaces and collapse runs of spaces to one
# get the length of the description, in words (number of single spaces + 1)
library(stringr)
data2b$DescriptionLength <- str_count(data2b$Description, " ") + 1
# An empty description has no words. Test for the empty string itself rather
# than for a count of 1, so that genuine one-word descriptions keep length 1.
# Indexing the column vector also makes this a no-op (not an error) when no
# description is empty. NA descriptions keep an NA length.
data2b$DescriptionLength[which(data2b$Description == "")] <- 0
# drop the raw text by name instead of by column position
data2b$Description <- NULL

In [None]:
# Join the listing-level controls onto the bid rows (inner join on the listing
# key) and drop factor levels that no longer occur.
data <- droplevels(merge(x = data, y = data2b, by = "ListingKey"))
# columns of the final bid-level table
t <- c("ListingKey", "AvgInterBidTime", "FundedOrNot", "HowManythBid",
       "RepaidOrNot", "Gini", "AmountRequested", "NoBids", "DescriptionLength",
       "BorrowerState", "Year", "CreditGrade", "DebtToIncomeRatio")
# keep the listed columns in their current order; anything else is dropped
data <- data[, names(data) %in% t]
# pipe-separated output with a header row and no row names
write.table(
  data,
  file = "../Data/BidLevelData.txt",
  sep = "|",
  row.names = FALSE,
  col.names = TRUE
)

In [None]:
# Project-level table: for each listing keep the row(s) whose bid index equals
# the listing's maximum, i.e. the state after its last bid.
dt <- data.table(data)
last_bid_rows <- dt[, .I[HowManythBid == max(HowManythBid)], by = ListingKey]$V1
pdata <- dt[last_bid_rows]
# pipe-separated output with a header row and no row names
write.table(
  pdata,
  file = "../Data/ProjectLevelData.txt",
  sep = "|",
  row.names = FALSE,
  col.names = TRUE
)