In [29]:
library(party)
library(randomForest)
library(readr)
library(dplyr)
library(tictoc)

In [30]:
# Read the raw credit data with readr, then convert the result to a plain
# base data.frame for the steps that follow.
df <- read_csv('credit_train.csv') %>%
  as.data.frame()

Parsed with column specification:
cols(
  `Loan ID` = col_character(),
  `Customer ID` = col_character(),
  `Loan Status` = col_character(),
  `Current Loan Amount` = col_integer(),
  Term = col_character(),
  `Credit Score` = col_integer(),
  `Annual Income` = col_integer(),
  `Years in current job` = col_character(),
  `Home Ownership` = col_character(),
  Purpose = col_character(),
  `Monthly Debt` = col_double(),
  `Years of Credit History` = col_double(),
  `Months since last delinquent` = col_integer(),
  `Number of Open Accounts` = col_integer(),
  `Number of Credit Problems` = col_integer(),
  `Current Credit Balance` = col_integer(),
  `Maximum Open Credit` = col_integer(),
  Bankruptcies = col_integer(),
  `Tax Liens` = col_integer()
)


In [31]:
# Show the class of every column in df.
# Author's note: the data contains a large number of NA values.
sapply(X = df, FUN = class)

In [32]:
# Keep only the rows that have no missing value in any column.
df <- stats::na.omit(object = df)

In [33]:
# Preview the first two remaining rows.
head(df, n = 2L)

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
3,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999,Short Term,741,2231892,8 years,Own Home,Debt Consolidation,29200.53,14.9,29,18,1,297996,750090,0,0
7,273581de-85d8-4332-81a5-19b04ce68666,90a75dde-34d5-419c-90dc-1e58b04b3e35,Fully Paid,217646,Short Term,730,1184194,< 1 year,Home Mortgage,Debt Consolidation,10855.08,19.6,10,13,1,122170,272052,1,0


In [34]:
# Remove the two identifier columns from df.
df <- df %>%
  select(-`Loan ID`, -`Customer ID`)

In [35]:
library(caret)
# Build a dummy-variable encoder over every column of df.
# fullRank = TRUE asks caret for a full-rank (one level dropped) encoding.
dummy <- dummyVars(
  "~ .",
  data = df,
  sep = ".",
  fullRank = TRUE
)

In [36]:
# Apply the encoder to df and store the encoded table as a data.frame.
features <- predict(dummy, newdata = df) %>%
  as.data.frame()

In [37]:
# Preview the first three encoded rows.
head(features, n = 3L)

Unnamed: 0,`Loan Status`Fully Paid,`Current Loan Amount`,TermShort Term,`Credit Score`,`Annual Income`,`Years in current job`1 year,`Years in current job`10+ years,`Years in current job`2 years,`Years in current job`3 years,`Years in current job`4 years,...,Purposewedding,`Monthly Debt`,`Years of Credit History`,`Months since last delinquent`,`Number of Open Accounts`,`Number of Credit Problems`,`Current Credit Balance`,`Maximum Open Credit`,Bankruptcies,`Tax Liens`
3,1,99999999,1,741,2231892,0,0,0,0,0,...,0,29200.53,14.9,29,18,1,297996,750090,0,0
7,1,217646,1,730,1184194,0,0,0,0,0,...,0,10855.08,19.6,10,13,1,122170,272052,1,0
9,1,548746,1,678,2559110,0,0,1,0,0,...,0,18660.28,22.6,33,4,0,437171,555038,0,0


In [38]:
# Tidy the encoded column names in four passes, applied in this order:
#   1. strip backticks
#   2. turn spaces into dots
#   3. spell out "+" as "plus"
#   4. strip forward slashes
# Regex metacharacters such as "+" must be escaped with a double backslash
# inside an R string.
colnames(features) <- colnames(features) %>%
  gsub("`", "", .) %>%
  gsub(" ", ".", .) %>%
  gsub("\\+", "plus", .) %>%
  gsub("\\/", "", .)
head(features, n = 1L)

Unnamed: 0,Loan.StatusFully.Paid,Current.Loan.Amount,TermShort.Term,Credit.Score,Annual.Income,Years.in.current.job1.year,Years.in.current.job10plus.years,Years.in.current.job2.years,Years.in.current.job3.years,Years.in.current.job4.years,...,Purposewedding,Monthly.Debt,Years.of.Credit.History,Months.since.last.delinquent,Number.of.Open.Accounts,Number.of.Credit.Problems,Current.Credit.Balance,Maximum.Open.Credit,Bankruptcies,Tax.Liens
3,1,100000000.0,1,741,2231892,0,0,0,0,0,...,0,29200.53,14.9,29,18,1,297996,750090,0,0


In [39]:
# List every column name after the cleanup.
names(features)

In [40]:
# Reproducible 75/25 row split of the encoded table.
set.seed(100)
# Row indices drawn without replacement for the training portion.
train <- sample(
  x = nrow(features),
  size = 0.75 * nrow(features),
  replace = FALSE
)
# Selected rows form the training set; all remaining rows are held out.
TrainSet <- features[train, ]
TestSet <- features[-train, ]

In [41]:
# Pull the response column out as a factor, then drop it from the
# predictor table.
labels <- as.factor(features[["Loan.StatusFully.Paid"]])
features <- features %>%
  select(-Loan.StatusFully.Paid)

In [46]:
# Print the dimensions (rows, columns) of the two splits.
# The hold-out split is stored under the name TestSet; no visible cell
# defines `ValidSet`, so referring to it fails in a fresh session.
cat('Training Set Shape:', dim(TrainSet), "\n")
cat('Testing Set Shape:', dim(TestSet))

Training Set Shape: 28507 43 
Testing Set Shape: 9503 43

In [None]:
# Fit a 200-tree random forest classifier and time the fit with tic()/toc().
#
# The model is trained on TrainSet only. Fitting on the full `features`
# table would let the forest see the held-out rows, which would make any
# accuracy later measured on those rows optimistically biased.
tic()
# TrainSet still carries the response column (the split is taken before the
# response is dropped from `features`), so predictors and response are
# separated here.
rf <- randomForest(
  x = select(TrainSet, -Loan.StatusFully.Paid),
  y = as.factor(TrainSet$Loan.StatusFully.Paid),
  ntree = 200,
  importance = TRUE
)
# The original chained assignment bound the model to this name as well.
random_forest <- rf
toc()

In [None]:
# Print the fitted model's summary.
print(x = rf)

In [None]:
# Draw the default plot for the fitted forest.
# TODO: find an equivalent way to produce this plot in Python.
plot(x = rf)

In [None]:
# Score the training rows with the fitted forest.
predTrain <- predict(object = rf, newdata = TrainSet, type = "class")
# Cross-tabulate predicted class (rows) against the actual label (columns).
table(predTrain, TrainSet[["Loan.StatusFully.Paid"]])

In [None]:
# Score the held-out rows and compare the predictions with the true labels.
# The hold-out split is stored under the name TestSet; no visible cell
# defines `ValidSet`, so referring to it fails in a fresh session.
predValid <- predict(rf, TestSet, type = "class")
# Share of held-out rows whose predicted class equals the actual label.
mean(predValid == TestSet$Loan.StatusFully.Paid)
# Cross-tabulate predicted class (rows) against the actual label (columns).
table(predValid, TestSet$Loan.StatusFully.Paid)

In [None]:
# Show the variable-importance table of the fitted forest, then plot it.
importance(x = rf)
varImpPlot(x = rf)

In [None]:
# Extract the structure of the first tree in the forest (k = 1 and
# labelVar = FALSE are the function's defaults, written out here).
getTree(rfobj = rf, k = 1, labelVar = FALSE)