# Imbalance Treatment

In [1]:
if(!require(imbalance)){install.packages("imbalance");require(imbalance)}
### ROSE = Random Over-Sampling Examples
if(!require(ROSE)){install.packages("ROSE");require(ROSE)}
if(!require(partykit)){install.packages("partykit");require(partykit)}
if(!require(dplyr)) {install.packages("dplyr");require(dplyr)}
if(!require(ggplot2)) {install.packages("ggplot2");require(ggplot2)}
if(!require(DBI)){install.packages("DBI"); require(DBI)}
if(!require(RSQLite)){install.packages("RSQLite"); require(RSQLite)}


Loading required package: imbalance
"package 'imbalance' was built under R version 3.5.2"Loading required package: ROSE
"package 'ROSE' was built under R version 3.5.2"Loaded ROSE 0.0-3

Loading required package: partykit
Loading required package: grid
Loading required package: libcoin
Loading required package: mvtnorm
Loading required package: dplyr

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Loading required package: ggplot2
Loading required package: DBI
Loading required package: RSQLite


In [2]:
##############################################################################
##########                DATABASE FUNCTIONS                     #############
##############################################################################
            
#### Run a SQL query against the SQLite database and return the result as a data.frame.
####   query: SQL statement to execute.
####   db:    path of the SQLite file to open (defaults to the global DB_FILE).
readSQL <- function(query, db=DB_FILE) {
    # Connect to the file the caller asked for; the previous version ignored `db`.
    con <- DBI::dbConnect(RSQLite::SQLite(), db)
    # Release the connection when the function exits, including on error.
    on.exit(DBI::dbDisconnect(con), add = TRUE)
    DBI::dbGetQuery(con, query)
}
#### Write a data.frame into a SQL table. Use overwrite=TRUE if you want to delete
#### first a pre-existent table with the same name. Use append=TRUE if you want to append
#### the data in the data.frame to a pre-existent table.
####   df:        data.frame to store.
####   tablename: name of the destination table.
####   db:        path of the SQLite file to open (defaults to the global DB_FILE).
writeSQL <- function(df,tablename,overwrite=FALSE, append=FALSE,db=DB_FILE) {
    # Connect to the file the caller asked for; the previous version ignored `db`.
    con <- DBI::dbConnect(RSQLite::SQLite(), db)
    # Release the connection when the function exits, including on error.
    on.exit(DBI::dbDisconnect(con), add = TRUE)
    # overwrite/append sit after `...` in dbWriteTable(), so they must be
    # passed by name; positional values would not reach those arguments.
    DBI::dbWriteTable(con, tablename, df, overwrite = overwrite, append = append)
}
#### Return the names of the tables stored in the SQLite database.
####   db: path of the SQLite file to open (defaults to the global DB_FILE).
listTables <- function(db=DB_FILE) {
    # Connect to the file the caller asked for; the previous version ignored `db`.
    con <- DBI::dbConnect(RSQLite::SQLite(), db)
    # Release the connection when the function exits, including on error.
    on.exit(DBI::dbDisconnect(con), add = TRUE)
    ### list the tables on the DB
    DBI::dbListTables(con)
}

In [3]:
### Point DB_FILE at the SQLite file under the working directory, show the
### tables it contains, then read the training features and the target column.
DB_FILE <- paste0(getwd(), "/Data/loans.db")
listTables()
loansX <- readSQL("SELECT * FROM X_train_scaled")
loansY <- readSQL("SELECT [default] FROM Y_train")


In [4]:
# Join features and target column-wise into a single training frame.
train <- cbind(loansX, loansY)

In [5]:
# Reuse loansX / loansY for the dev split: features first, then the target.
loansX <- readSQL("SELECT * FROM X_dev_scaled")
loansY <- readSQL("SELECT [default] FROM Y_dev")

In [6]:
# Join features and target column-wise into a single dev frame.
dev <- cbind(loansX, loansY)

In [7]:
# Remove the 'index' column from both splits.
train <- select(train, -one_of("index"))
dev <- select(dev, -one_of("index"))

In [8]:
# Class distribution of the target: absolute counts, then percentages.
table(train[["default"]])
table(train[["default"]]) / nrow(train) * 100


     0      1 
167067  36769 


       0        1 
81.96148 18.03852 

In [9]:
# Count each class (NA comparisons are ignored) and the gap between them.
numPositive <- sum(train$default == 1, na.rm = TRUE)
numNegative <- sum(train$default == 0, na.rm = TRUE)
nInstances <- numNegative - numPositive
# One-row summary; cbind() takes the column names from the variable names.
cbind(numPositive, numNegative, nInstances)

numPositive,numNegative,nInstances
36769,167067,130298


## Baseline model performance with the unbalanced dataset

In [12]:
# Fit a tree on the original (unbalanced) training data and compute the AUC
# of its predictions on the dev split.
mod1 <- ctree(formula = default ~ ., data = train)
auc1 <- pROC::auc(
    dev$default,
    predict(mod1, newdata = dev, type = "response")
)
auc1

"no non-missing arguments to min; returning Inf"

# Imbalance treatment
 
### Under Sampling

In [13]:
# Under-sampling: request a sample of total size N = 2 * numPositive.
# A fixed seed (the same one used by the other resampling cells) makes the
# random draw reproducible across notebook runs.
data_balanced_under <- ovun.sample(default ~ ., data = train, method = "under",
                                   N = numPositive * 2, seed = 1207)$data
table(data_balanced_under$default)


    0     1 
36769 36769 

In [14]:
# Fit a tree on the under-sampled training data and compute the AUC of its
# predictions on the dev split.
mod1 <- ctree(formula = default ~ ., data = data_balanced_under)
auc1 <- pROC::auc(
    dev$default,
    predict(mod1, newdata = dev, type = "response")
)
auc1

"no non-missing arguments to min; returning Inf"

### Over Sampling

In [15]:
# Over-sampling: request a sample of total size N = 2 * numNegative.
# A fixed seed (the same one used by the other resampling cells) makes the
# random draw reproducible across notebook runs.
data_balanced_over <- ovun.sample(default ~ ., data = train, method = "over",
                                  N = numNegative * 2, seed = 1207)$data
table(data_balanced_over$default)


     0      1 
167067 167067 

In [16]:
# Fit a tree on the over-sampled training data and compute the AUC of its
# predictions on the dev split.
mod1 <- ctree(formula = default ~ ., data = data_balanced_over)
auc1 <- pROC::auc(
    dev$default,
    predict(mod1, newdata = dev, type = "response")
)
auc1

"no non-missing arguments to min; returning Inf"

### OverUnder Sampling

In [17]:
# Combined over- and under-sampling with p = 0.5 and a fixed seed.
data_balanced_both <- ovun.sample(
    default ~ .,
    data = train,
    method = "both",
    p = 0.5,
    seed = 1207
)$data
table(data_balanced_both$default)


     0      1 
101941 101895 

In [18]:
# Fit a tree on the over/under-sampled training data and compute the AUC of
# its predictions on the dev split.
mod1 <- ctree(formula = default ~ ., data = data_balanced_both)
auc1 <- pROC::auc(
    dev$default,
    predict(mod1, newdata = dev, type = "response")
)
auc1

"no non-missing arguments to min; returning Inf"

### Generation of synthetic data by Randomly Over Sampling Examples (ROSE).

In [19]:
# ROSE: generate a synthetic training set with a fixed seed.
data.rose <- ROSE(
    default ~ .,
    data = train,
    seed = 1207
)$data
table(data.rose$default)


     0      1 
101941 101895 

In [20]:
# Fit a tree on the ROSE-generated training data and compute the AUC of its
# predictions on the dev split.
mod1 <- ctree(formula = default ~ ., data = data.rose)
auc1 <- pROC::auc(
    dev$default,
    predict(mod1, newdata = dev, type = "response")
)
auc1

## Conclusion

For our dataset, the best model was the one trained on <b>the base dataset</b>;
therefore, we continue to work with this dataset.