## Kaggle competition 


## Load libraries

In [1]:
Sys.setenv(LANG = "en")

# Data processing library
library(data.table)       # Data manipulation
library(plyr)             # Data manipulation
library(stringr)          # String, text processing             
library(dataPreparation)  # Data preparation library
library(woeBinning)       # Decision tree–based binning for numerical and categorical variables
library(Boruta)           # Variable selection

# Machine learning library
library(mlr)          # Machine learning framework
library(caret)         # Data processing and machine learning framework
library(MASS)          # LDA
library(randomForest)  # RF
library(gbm)           # Boosting Tree



ERROR: Error in library(dataPreparation): there is no package called 'dataPreparation'


## Read in the data 

In [None]:
# Read the labelled training data.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 read.csv() returns
# text columns as character, while the variable-type detection further down
# only recognises categorical variables when they are factors.
full_train <- read.csv('C:/Users/kkusterer/Documents/MBD Semester 2/Kaggle competition/Data & More/bank_mkt_train.csv',
                       stringsAsFactors = TRUE)
# Data set without the response column
test_No_rep <- read.csv('C:/Users/kkusterer/Documents/MBD Semester 2/Kaggle competition/Data & More/bank_mkt_test.csv',
                        stringsAsFactors = TRUE)

In [None]:
# Print the structure (column names, types, first values) of the data set without response
str(test_No_rep)

In [None]:
# Print the structure (column names, types, first values) of the full training data
str(full_train)

## preprocess the data 

In [None]:
# Shift the campaign counter down by one in both data sets
full_train$campaign <- full_train$campaign - 1
test_No_rep$campaign <- test_No_rep$campaign - 1

# Show the new minimum of each column to confirm the shift (previously 1)
min(full_train$campaign)
min(test_No_rep$campaign)

In [None]:
# Number of missing values per column of the training data
vapply(full_train, function(column) sum(is.na(column)), integer(1))

## split the data into train/validation/test

### Splitting the full_train set

In [None]:
# Fix the RNG state so the partitions are reproducible
set.seed(1)

# First cut: 70% of the rows go to train, the remaining 30% are held back
train_split <- caret::createDataPartition(y = full_train$subscribe, p = 0.7, list = FALSE)
train <- full_train[train_split, ]
valid_test <- full_train[-train_split, ]

# Second cut: the held-back 30% is halved into validation and test (15% each)
valid_split <- caret::createDataPartition(y = valid_test$subscribe, p = 0.5, list = FALSE)
valid <- valid_test[valid_split, ]
test <- valid_test[-valid_split, ]

In [None]:
# Count the rows per subscribe value within each split
table(train[["subscribe"]])
table(valid[["subscribe"]])
table(test[["subscribe"]])

## Checking the VarImp

In [None]:
# Fit a random forest on the training split and compute permutation-based
# importance p-values (PIMP) for its predictors.

# Predictors: every column from the 2nd up to the second-to-last
# (assumes column 1 is the id and the last column is the target - TODO confirm)
x <- train[, 2:(ncol(train) - 1)]
y <- as.factor(train[, "subscribe"])

# `seeds` is not a documented randomForest() argument, so the original
# `seeds=1` had no effect on the fit; seed R's RNG explicitly instead.
set.seed(1)
rf_model <- randomForest(x, y, mtry = 3, ntree = 100, importance = TRUE)

# NOTE(review): PIMP() is not provided by any package loaded at the top of this
# notebook; presumably it is vita::PIMP - confirm and add library(vita).
pimp_varImp <- PIMP(x, y, rf_model, S = 10, parallel = FALSE, seed = 123)

In [None]:
# List the PIMP importances from highest to lowest
importance_order <- order(pimp_varImp$VarImp[, 1], decreasing = TRUE)
pimp_varImp$VarImp[importance_order, ]

In [None]:
# Plot the month column of the training split
plot(train$month)

## Feature Engineering 

## Based on the VAR Imp 

### Group months into a column for spring 

In [None]:
# Flag rows whose month is a spring month (March, April, May)
spring_months <- c('mar', 'apr', 'may')
train$month_spring <- train$month %in% spring_months
valid$month_spring <- valid$month %in% spring_months
test$month_spring <- test$month %in% spring_months
# Same flag on the data set without response
test_No_rep$month_spring <- test_No_rep$month %in% spring_months

### Group months into a column for Summer 

In [None]:
# Flag rows whose month is a summer month (June, July, August)
summer_months <- c('jun', 'jul', 'aug')
# Train, valid, test
train$month_summer <- train$month %in% summer_months
valid$month_summer <- valid$month %in% summer_months
test$month_summer <- test$month %in% summer_months
# Data set without response (holdout)
test_No_rep$month_summer <- test_No_rep$month %in% summer_months

### Group months into a column for autumn

In [None]:
 # Flag rows whose month is an autumn month (September, October, November)
autumn_months <- c('sep', 'oct', 'nov')
# Train, valid, test
train$month_autumn <- train$month %in% autumn_months
valid$month_autumn <- valid$month %in% autumn_months
test$month_autumn <- test$month %in% autumn_months
# Data set without response (holdout)
test_No_rep$month_autumn <- test_No_rep$month %in% autumn_months

### Group months into a column for winter

In [None]:
# Flag rows whose month is a winter month (December, January, February)
winter_months <- c('dec', 'jan', 'feb')
# Train, valid, test
train$month_winter <- train$month %in% winter_months
valid$month_winter <- valid$month %in% winter_months
test$month_winter <- test$month %in% winter_months
# Data set without response (holdout)
test_No_rep$month_winter <- test_No_rep$month %in% winter_months

### Create a column flagging above-average age

In [None]:
# Flag customers whose age is strictly above the mean age.
# The threshold is the mean of the TRAINING split for every data set, so that
# the flag means the same thing everywhere (previously valid and test used
# their own means while the holdout already used the train mean).
age_mean_train <- mean(train$age)
# Train, valid, test
train[, 'age_ge_mean'] <- train$age > age_mean_train
valid[, 'age_ge_mean'] <- valid$age > age_mean_train
test[, 'age_ge_mean'] <- test$age > age_mean_train
# Data set without response (holdout)
test_No_rep[, 'age_ge_mean'] <- test_No_rep$age > age_mean_train

### Create a flag column for pdays equal to 999

In [None]:
# pdays == 999 is a special value, so expose it as its own boolean column
special_pdays <- 999
# Train, valid, test
train$pdays_999 <- train$pdays == special_pdays
valid$pdays_999 <- valid$pdays == special_pdays
test$pdays_999 <- test$pdays == special_pdays
# Data set without response (holdout)
test_No_rep$pdays_999 <- test_No_rep$pdays == special_pdays

### Processing the data to be used on the models

### Select the variables of the highest importance

In [None]:
# Dependent variable (DV): the target column
dv_list <- 'subscribe'
# Independent variables (IV): every column except the target and the client_id
iv_list <- setdiff(colnames(train), c(dv_list, 'client_id'))

In [None]:
# Split the independent variables into three name lists:
#   iv_cat_list  - factor columns (categorical variables)
#   iv_bool_list - logical columns (boolean variables)
#   iv_num_list  - everything else (treated as numerical variables)
# is.factor()/is.logical() are used instead of `class(x) == "..."` because
# class() can return more than one string (e.g. c("ordered", "factor")), which
# breaks a scalar if() condition.
is_cat <- vapply(iv_list, function(v) is.factor(train[[v]]), logical(1), USE.NAMES = FALSE)
is_bool <- vapply(iv_list, function(v) is.logical(train[[v]]), logical(1), USE.NAMES = FALSE)

iv_cat_list <- iv_list[is_cat]
iv_bool_list <- iv_list[is_bool]
iv_num_list <- iv_list[!is_cat & !is_bool]

### Binning related variables

In [None]:
# Learn a WOE-based grouping of the job categories on the training split
binning_cat <- woe.binning(df = train, target.var = "subscribe", pred.var = "job")
# Print the binning to inspect it
binning_cat

### Applying the binning 

In [None]:
# Deploy the job binning on a copy of the training split and preview the result
tmp <- woe.binning.deploy(df = train, binning = binning_cat, add.woe.or.dum.var = "woe")
preview_cols <- c("job", "job.binned", "woe.job.binned")
head(tmp[, preview_cols])

### Looping through the categorical variables, do this for each split

In [None]:
# WOE-bin every categorical variable: the binning is learned on the training
# split only and then deployed unchanged on all four data sets.
for (cat_var in iv_cat_list) {

    # Learn the regrouping of the categories on train
    binning_cat <- woe.binning(df = train, target.var = "subscribe", pred.var = cat_var)

    # Deploy on train, valid and test
    train <- woe.binning.deploy(df = train, binning = binning_cat, add.woe.or.dum.var = "woe")
    valid <- woe.binning.deploy(df = valid, binning = binning_cat, add.woe.or.dum.var = "woe")
    test <- woe.binning.deploy(df = test, binning = binning_cat, add.woe.or.dum.var = "woe")

    # Deploy on the data set without response (holdout)
    test_No_rep <- woe.binning.deploy(df = test_No_rep, binning = binning_cat, add.woe.or.dum.var = "woe")
}

### Performing the discretization on a numerical variable

In [None]:
# Learn a WOE-based discretization of age on the training split
binning_num <- woe.binning(df = train, target.var = "subscribe", pred.var = "age")
# Print the binning to inspect it
binning_num

In [None]:
# Deploy the age binning on a copy of the training split and preview the result
tmp <- woe.binning.deploy(df = train, binning = binning_num, add.woe.or.dum.var = "woe")
preview_cols <- c("age", "age.binned", "woe.age.binned")
head(tmp[, preview_cols])

### Apply to all numerical variables in the data

In [None]:
# WOE-discretize every numerical variable: the binning is learned on the
# training split only and then deployed unchanged on all four data sets.
for (num_var in iv_num_list) {

    # Learn the discretization on train
    binning_num <- woe.binning(df = train, target.var = "subscribe", pred.var = num_var)

    # Deploy on train, valid and test
    train <- woe.binning.deploy(df = train, binning = binning_num, add.woe.or.dum.var = "woe")
    valid <- woe.binning.deploy(df = valid, binning = binning_num, add.woe.or.dum.var = "woe")
    test <- woe.binning.deploy(df = test, binning = binning_num, add.woe.or.dum.var = "woe")

    # Deploy on the data set without response (holdout)
    test_No_rep <- woe.binning.deploy(df = test_No_rep, binning = binning_num, add.woe.or.dum.var = "woe")
}

### Discretization, focused on equal-frequency discretization

In [None]:
# Learn 5 equal-frequency bins for age on the training split
bins <- build_bins(dataSet = train, cols = "age", n_bins = 5, type = "equal_freq", verbose = FALSE)

# Inspect the bin boundaries
bins

### Applied to one variable

In [None]:
# Discretize the training split with the bins built above and preview age
tmp <- fastDiscretization(dataSet = train, bins = bins, verbose = FALSE)
# Make sure both objects are plain data.frames again (not data.tables)
setDF(tmp)
setDF(train)
head(tmp[["age"]])

### Applied to the full dataset

In [None]:
# Add an equal-frequency (5 bins) version of every numerical variable as a new
# "<variable>_freq_bin" column. Bins are built on train and reused elsewhere.
for (num_var in iv_num_list) {

    bin_col <- paste0(num_var, '_freq_bin')

    # Build the bins on the training split
    bins <- build_bins(dataSet = train, cols = num_var, n_bins = 5, type = "equal_freq", verbose = FALSE)

    # Train
    tmp <- fastDiscretization(dataSet = train, bins = bins, verbose = FALSE)
    setDF(tmp)
    setDF(train)  # back to data.frame
    train[[bin_col]] <- tmp[[num_var]]

    # Valid
    tmp <- fastDiscretization(dataSet = valid, bins = bins, verbose = FALSE)
    setDF(tmp)
    setDF(valid)  # back to data.frame
    valid[[bin_col]] <- tmp[[num_var]]

    # Test
    tmp <- fastDiscretization(dataSet = test, bins = bins, verbose = FALSE)
    setDF(tmp)
    setDF(test)  # back to data.frame
    test[[bin_col]] <- tmp[[num_var]]

    # Data set without response (holdout)
    tmp <- fastDiscretization(dataSet = test_No_rep, bins = bins, verbose = FALSE)
    setDF(tmp)
    setDF(test_No_rep)  # back to data.frame
    test_No_rep[[bin_col]] <- tmp[[num_var]]
}

### Build on one variable

In [None]:
# Learn 5 equal-width bins for age on the training split
bins <- build_bins(dataSet = train, cols = "age", n_bins = 5, type = "equal_width", verbose = FALSE)

# Inspect the bin boundaries
bins

### Apply to dataset

In [None]:
# Discretize the training split with the equal-width bins and preview age
tmp <- fastDiscretization(dataSet = train, bins = bins, verbose = FALSE)
# Make sure both objects are plain data.frames again (not data.tables)
setDF(tmp)
setDF(train)
head(tmp[["age"]])

### Loop through all data and apply

In [None]:
# Add an equal-width (5 bins) version of every numerical variable as a new
# "<variable>_width_bin" column. Bins are built on train and reused elsewhere.
for (num_var in iv_num_list) {

    bin_col <- paste0(num_var, '_width_bin')

    # Build the bins on the training split
    bins <- build_bins(dataSet = train, cols = num_var, n_bins = 5, type = "equal_width", verbose = FALSE)

    # Train
    tmp <- fastDiscretization(dataSet = train, bins = bins, verbose = FALSE)
    setDF(tmp)
    setDF(train)  # back to data.frame
    train[[bin_col]] <- tmp[[num_var]]

    # Valid
    tmp <- fastDiscretization(dataSet = valid, bins = bins, verbose = FALSE)
    setDF(tmp)
    setDF(valid)  # back to data.frame
    valid[[bin_col]] <- tmp[[num_var]]

    # Test
    tmp <- fastDiscretization(dataSet = test, bins = bins, verbose = FALSE)
    setDF(tmp)
    setDF(test)  # back to data.frame
    test[[bin_col]] <- tmp[[num_var]]

    # Data set without response (holdout)
    tmp <- fastDiscretization(dataSet = test_No_rep, bins = bins, verbose = FALSE)
    setDF(tmp)
    setDF(test_No_rep)  # back to data.frame
    test_No_rep[[bin_col]] <- tmp[[num_var]]
}

### Getting updated list of variables

In [None]:
# Refresh the variable lists now that new columns have been added
# Dependent variable (DV): the target column
dv_list <- 'subscribe'
# Independent variables (IV): every column except the target and the client_id
iv_list <- setdiff(colnames(train), c(dv_list, 'client_id'))

In [None]:
# Split the independent variables into three name lists:
#   iv_cat_list  - factor columns (categorical variables)
#   iv_bool_list - logical columns (boolean variables)
#   iv_num_list  - everything else (treated as numerical variables)
# is.factor()/is.logical() are used instead of `class(x) == "..."` because
# class() can return more than one string (e.g. c("ordered", "factor")), which
# breaks a scalar if() condition.
is_cat <- vapply(iv_list, function(v) is.factor(train[[v]]), logical(1), USE.NAMES = FALSE)
is_bool <- vapply(iv_list, function(v) is.logical(train[[v]]), logical(1), USE.NAMES = FALSE)

iv_cat_list <- iv_list[is_cat]
iv_bool_list <- iv_list[is_bool]
iv_num_list <- iv_list[!is_cat & !is_bool]

### Dummy encode a categorical variable

In [None]:
# Learn the dummy (one-hot) encoding of job on the training split
encoding <- build_encoding(dataSet = train, cols = "job", verbose = FALSE)

In [None]:
# One-hot encode job on a copy of the training split (original column kept)
tmp <- one_hot_encoder(dataSet = train, encoding = encoding, type = 'logical', drop = FALSE, verbose = FALSE)
setDF(tmp)
# Remove the last column of the encoded table
tmp <- tmp[, seq_len(ncol(tmp) - 1)]
# Preview the columns from position 84 onwards
head(tmp[, 84:ncol(tmp)])

### Using the above method, we apply it to all the variables in the dataset

In [None]:
# One-hot encode every categorical variable. The encoding is learned on the
# training split and applied to all four data sets; after each encoding the
# last column of the result is dropped.
for (cat_var in iv_cat_list) {

    # Learn the encoding on train
    encoding <- build_encoding(dataSet = train, cols = cat_var, verbose = FALSE)

    # Train
    train <- one_hot_encoder(dataSet = train, encoding = encoding, type = 'logical', drop = FALSE, verbose = FALSE)
    setDF(train)
    train <- train[, seq_len(ncol(train) - 1)]  # drop the last dummy column

    # Valid
    valid <- one_hot_encoder(dataSet = valid, encoding = encoding, type = 'logical', drop = FALSE, verbose = FALSE)
    setDF(valid)
    valid <- valid[, seq_len(ncol(valid) - 1)]  # drop the last dummy column

    # Test
    test <- one_hot_encoder(dataSet = test, encoding = encoding, type = 'logical', drop = FALSE, verbose = FALSE)
    setDF(test)
    test <- test[, seq_len(ncol(test) - 1)]  # drop the last dummy column

    # Data set without response (holdout)
    test_No_rep <- one_hot_encoder(dataSet = test_No_rep, encoding = encoding, type = 'logical', drop = FALSE, verbose = FALSE)
    setDF(test_No_rep)
    test_No_rep <- test_No_rep[, seq_len(ncol(test_No_rep) - 1)]  # drop the last dummy column
}

### Testing the variable representation on the categorical variables in the dataset

In [None]:
# Incidence rate per job category: share of rows with subscribe == 1
tb <- table(train$job, train$subscribe)
n_pos <- tb[, '1']
n_neg <- tb[, '0']
incidence_map <- data.frame('job' = rownames(tb), 'job_incidence' = n_pos / (n_neg + n_pos))
incidence_map

In [None]:
# Left-join the incidence rates onto a copy of the training split and preview
tmp <- plyr::join(x = train, y = incidence_map, by = 'job', type = "left", match = "all")
preview_cols <- c('job', 'job_incidence')
head(tmp[, preview_cols])

### Applying the above method to the entire dataset

In [None]:
# Add a "<variable>_incidence" column for every categorical variable: the
# share of subscribe == 1 per category, computed on the training split.
for (cat_var in iv_cat_list) {

    # Count table category x subscribe on train
    tb <- table(train[[cat_var]], train[['subscribe']])
    n_pos <- tb[, '1']
    n_neg <- tb[, '0']
    incidence_map <- data.frame('v1' = rownames(tb), 'v2' = n_pos / (n_neg + n_pos))
    # Name the columns so the map can be joined on the categorical variable
    colnames(incidence_map) <- c(cat_var, paste0(cat_var, '_incidence'))

    # Left-join onto train, valid and test
    train <- plyr::join(x = train, y = incidence_map, by = cat_var, type = "left", match = "all")
    valid <- plyr::join(x = valid, y = incidence_map, by = cat_var, type = "left", match = "all")
    test <- plyr::join(x = test, y = incidence_map, by = cat_var, type = "left", match = "all")

    # Left-join onto the data set without response (holdout)
    test_No_rep <- plyr::join(x = test_No_rep, y = incidence_map, by = cat_var, type = "left", match = "all")
}

### Testing the WOE variable representation on a categorical variable

In [None]:
# "WOE" per job category, computed here as log(count of 1s / count of 0s)
tb <- table(train$job, train$subscribe)
n_pos <- tb[, '1']
n_neg <- tb[, '0']
woe_map <- data.frame('job' = rownames(tb), 'job_woe' = log(n_pos / n_neg))
woe_map

In [None]:
# Left-join the WOE values onto a copy of the training split and preview
tmp <- plyr::join(x = train, y = woe_map, by = 'job', type = "left", match = "all")
preview_cols <- c('job', 'job_woe')
head(tmp[, preview_cols])

### Applying above method to the all the data

In [None]:
# Add a "<variable>_woe" column for every categorical variable:
# log(count of 1s / count of 0s) per category, computed on the training split.
for (cat_var in iv_cat_list) {

    # Count table category x subscribe on train
    tb <- table(train[[cat_var]], train[['subscribe']])
    n_pos <- tb[, '1']
    n_neg <- tb[, '0']
    woe_map <- data.frame('v1' = rownames(tb), 'v2' = log(n_pos / n_neg))
    # Name the columns so the map can be joined on the categorical variable
    colnames(woe_map) <- c(cat_var, paste0(cat_var, '_woe'))

    # Left-join onto train, valid and test
    train <- plyr::join(x = train, y = woe_map, by = cat_var, type = "left", match = "all")
    valid <- plyr::join(x = valid, y = woe_map, by = cat_var, type = "left", match = "all")
    test <- plyr::join(x = test, y = woe_map, by = cat_var, type = "left", match = "all")

    # Left-join onto the data set without response (Test_No_Rep)
    test_No_rep <- plyr::join(x = test_No_rep, y = woe_map, by = cat_var, type = "left", match = "all")
}

### Take the log of Age

In [None]:
# Add the natural logarithm of age as a new column
# Train, valid, test
train$age_log <- log(train$age)
valid$age_log <- log(valid$age)
test$age_log <- log(test$age)
# Data set without response (Test_No_Rep)
test_No_rep$age_log <- log(test_No_rep$age)

### Then to standardize the numerical variables

In [None]:
# Standardize age into a new column age_scaled.
# The centre and spread are estimated on the TRAINING split and reused for all
# data sets, so the same age maps to the same scaled value everywhere
# (previously each data set was standardized with its own mean and sd).
age_center <- mean(train[, 'age'], na.rm = TRUE)
age_spread <- sd(train[, 'age'], na.rm = TRUE)
# Train, valid, test
train[, 'age_scaled'] <- (train[, 'age'] - age_center) / age_spread  # mean = 0, sd = 1 on train
valid[, 'age_scaled'] <- (valid[, 'age'] - age_center) / age_spread
test[, 'age_scaled'] <- (test[, 'age'] - age_center) / age_spread
# Data set without response (Test_No_rep)
test_No_rep[, 'age_scaled'] <- (test_No_rep[, 'age'] - age_center) / age_spread

## Variable Selection 

### Updating the lists of categorical, boolean and numerical variables

In [None]:
# Refresh the variable lists now that new columns have been added
# Dependent variable (DV): the target column
dv_list <- 'subscribe'
# Independent variables (IV): every column except the target and the client_id
iv_list <- setdiff(colnames(train), c(dv_list, 'client_id'))

In [None]:
# Split the independent variables into three name lists:
#   iv_cat_list  - factor columns (categorical variables)
#   iv_bool_list - logical columns (boolean variables)
#   iv_num_list  - everything else (treated as numerical variables)
# is.factor()/is.logical() are used instead of `class(x) == "..."` because
# class() can return more than one string (e.g. c("ordered", "factor")), which
# breaks a scalar if() condition.
is_cat <- vapply(iv_list, function(v) is.factor(train[[v]]), logical(1), USE.NAMES = FALSE)
is_bool <- vapply(iv_list, function(v) is.logical(train[[v]]), logical(1), USE.NAMES = FALSE)

iv_cat_list <- iv_list[is_cat]
iv_bool_list <- iv_list[is_bool]
iv_num_list <- iv_list[!is_cat & !is_bool]

### check values for inf +- -> potential outliers in the data

In [None]:
# Count the +/-Inf cells in each data set
# Train, valid, test
sum(sapply(train, is.infinite))
sum(sapply(valid, is.infinite))
sum(sapply(test, is.infinite))
# Data set without response (holdout)
sum(sapply(test_No_rep, is.infinite))

In [None]:
# Replace every +/-Inf cell by NA
# Train, valid, test
inf_cells <- sapply(train, is.infinite)
train[inf_cells] <- NA
inf_cells <- sapply(valid, is.infinite)
valid[inf_cells] <- NA
inf_cells <- sapply(test, is.infinite)
test[inf_cells] <- NA
# Data set without response (holdout)
inf_cells <- sapply(test_No_rep, is.infinite)
test_No_rep[inf_cells] <- NA

### check and correct any potential NA values

In [None]:
# Total number of missing cells in each data set
# Train, valid, test
sum(is.na(train))
sum(is.na(valid))
sum(is.na(test))
# Data set without response (holdout)
sum(is.na(test_No_rep))

In [None]:
# Impute missing values of every numerical variable with the mean of that
# variable on the TRAINING split. The same fill value is used for all data
# sets (previously each data set was filled with its own mean, so the same
# missing value was encoded differently per split).
for (v in iv_num_list) {
    fill_value <- mean(train[, v], na.rm = TRUE)

    # Train, valid, test
    train[is.na(train[, v]), v] <- fill_value
    valid[is.na(valid[, v]), v] <- fill_value
    test[is.na(test[, v]), v] <- fill_value

    # Data set without response (holdout)
    test_No_rep[is.na(test_No_rep[, v]), v] <- fill_value
}

### Dropping catergorical variables

In [None]:
# Remove every categorical (factor) column from all four data sets
for (cat_var in iv_cat_list) {
    # Train, valid, test
    train[[cat_var]] <- NULL
    valid[[cat_var]] <- NULL
    test[[cat_var]] <- NULL

    # Data set without response (Test_No_Rep)
    test_No_rep[[cat_var]] <- NULL
}

### Converting boolean variables to integer variables

In [None]:
# Recode every boolean column as integer (FALSE -> 0, TRUE -> 1)
for (bool_var in iv_bool_list) {
    # Train, valid, test
    train[[bool_var]] <- as.integer(train[[bool_var]])
    valid[[bool_var]] <- as.integer(valid[[bool_var]])
    test[[bool_var]] <- as.integer(test[[bool_var]])

    # Data set without response (Test_No_Rep)
    test_No_rep[[bool_var]] <- as.integer(test_No_rep[[bool_var]])
}

### Find any constant variables

In [None]:
# Find the constant variables: numerical/boolean columns whose variance on the
# training split is exactly zero.
candidate_vars <- c(iv_num_list, iv_bool_list)
var_list <- vapply(candidate_vars,
                   function(v) as.numeric(var(train[, v], na.rm = TRUE)),
                   numeric(1), USE.NAMES = FALSE)
# var() returns NA when fewer than two non-missing values are available; such
# columns are skipped here so that constant_var never contains NA names.
constant_var <- candidate_vars[!is.na(var_list) & var_list == 0]
constant_var

### Drop the constant Variables

In [None]:
# Drop the constant variable
for (v in constant_var) {
    # Train, valid, test
    train[, v] <- NULL
    valid[, v] <- NULL
    test[, v] <- NULL
    
    # Test (Test_No_rep)
    test_No_rep[, v] <- NULL
}

### Compute the Fisher score as a measure of each variable's importance as a predictor

In [None]:
#' Fisher score of each independent variable against a two-class target
#'
#' For every variable in `IV_list` the score is
#' `|mean_1 - mean_2| / sqrt(var_1 + var_2)`, where group 1 and group 2 are the
#' rows whose `depvar` equals the first and the second unique value of `depvar`.
#' Missing values of a variable are ignored when computing its means and
#' variances. A variable that is constant within both groups yields NaN.
#'
#' Ref: Verbeke, W., Dejaeger, K., Martens, D., Hur, J., & Baesens, B. (2012).
#' New insights into churn prediction in the telecommunication sector: A profit
#' driven data mining approach. European Journal of Operational Research,
#' 218(1), 211-229.
#'
#' @param basetable Data frame holding the target and the variables.
#' @param depvar Name of the target column.
#' @param IV_list Character vector with the names of the variables to score.
#' @return A data frame with columns `IV` and `fisher_score`, one row per
#'   variable, in the order of `IV_list`.
FisherScore <- function(basetable, depvar, IV_list) {
  # Row positions of the two target groups (computed once, reused per variable)
  target <- basetable[[depvar]]
  DV <- unique(target)
  rows_first <- which(target == DV[1])
  rows_second <- which(target == DV[2])

  IV_FisherScore <- vapply(IV_list, function(v) {
    values <- basetable[[v]]
    first <- values[rows_first]
    second <- values[rows_second]
    abs(mean(first, na.rm = TRUE) - mean(second, na.rm = TRUE)) /
      sqrt(var(first, na.rm = TRUE) + var(second, na.rm = TRUE))
  }, numeric(1), USE.NAMES = FALSE)

  data.frame(IV = IV_list, fisher_score = IV_FisherScore)
}

#' Select the variables with the highest Fisher score
#'
#' Scores every variable in `IV_list` with `FisherScore()` and returns the
#' names of the best ones, highest score first.
#'
#' Assumption: all variables of the input dataset are converted into numeric
#' type.
#'
#' @param basetable Data frame holding the target and the variables.
#' @param depvar Name of the target column.
#' @param IV_list Character vector with the names of the candidate variables.
#' @param num_select Maximum number of variables to return.
#' @return Character vector with at most `num_select` variable names.
varSelectionFisher <- function(basetable, depvar, IV_list, num_select = 20) {
  fs <- FisherScore(basetable, depvar, IV_list)  # Fisher score of all IVs

  # Cap at the number of scored variables. Capping at ncol(basetable), as was
  # done before, padded the result with NA whenever fewer IVs were supplied
  # than the table has columns.
  num_select <- min(num_select, nrow(fs))

  ranked <- fs[order(fs$fisher_score, decreasing = TRUE), ]
  as.vector(ranked[seq_len(num_select), "IV"])
}

In [None]:
# Fisher score of every independent variable on the training split
dv_list <- 'subscribe'  # target column
# All columns except the target and the client_id
iv_list <- setdiff(names(train), c(dv_list, 'client_id'))
fs <- FisherScore(basetable = train, depvar = dv_list, IV_list = iv_list)
head(fs)

In [None]:
# Select the top 50 variables according to the Fisher score and show the first 10
best_fs_var <- varSelectionFisher(basetable = train, depvar = dv_list, IV_list = iv_list, num_select = 50)
head(best_fs_var, n = 10)

In [None]:
# Keep only client_id, the selected variables (in each data set's own column
# order) and, where present, the target.
# Train
is_selected <- names(train) %in% best_fs_var
var_select <- names(train)[is_selected]
train_processed <- train[, c('client_id', var_select, 'subscribe')]
# Valid
is_selected <- names(valid) %in% best_fs_var
var_select <- names(valid)[is_selected]
valid_processed <- valid[, c('client_id', var_select, 'subscribe')]
# Test
is_selected <- names(test) %in% best_fs_var
var_select <- names(test)[is_selected]
test_processed <- test[, c('client_id', var_select, 'subscribe')]
# Data set without response (holdout): there is no subscribe column to keep
is_selected <- names(test_No_rep) %in% best_fs_var
var_select <- names(test_No_rep)[is_selected]
test_No_rep_processed <- test_No_rep[, c('client_id', var_select)]

### Final data preprocess

In [None]:
# Check if train and test (holdout) have the same variables by comparing the
# dimensions (rows, columns) of the processed tables.
# Train, valid, test
dim(train_processed)
dim(valid_processed)
dim(test_processed)
# Test (holdout): built without the subscribe column
dim(test_No_rep_processed)

In [None]:
# Clean the column names: every character that is neither alphanumeric nor a
# space becomes "_", then all spaces are removed. The mapping is derived from
# the columns of train_processed and applied to the matching columns of the
# other tables.
raw_names <- colnames(train_processed)
clean_names <- gsub(' +', '', str_replace_all(raw_names, "[^[:alnum:] ]", "_"))

rename_like_train <- function(df) {
    position <- match(colnames(df), raw_names)
    known <- !is.na(position)
    colnames(df)[known] <- clean_names[position[known]]
    df
}

# Train, valid, test
train_processed <- rename_like_train(train_processed)
valid_processed <- rename_like_train(valid_processed)
test_processed <- rename_like_train(test_processed)

# Data set without response (holdout)
test_No_rep_processed <- rename_like_train(test_No_rep_processed)

## Rename Tables

In [None]:
# Shorter aliases for the processed tables used by the modelling cells below
train_pro <- train_processed
valid_pro <- valid_processed
test_pro <- test_processed
# Processed data set without response (holdout)
testNrep <- test_No_rep_processed

## Fit the model 

### Logistic Regression

In [None]:
# Logistic regression with 5-fold cross-validation.

# Resampling: 5-fold CV, predicting on both the train and the test folds
rdesc <- makeResampleDesc("CV", iters = 5, predict = "both")

# Learner: logistic regression returning class probabilities
learner <- makeLearner("classif.logreg", predict.type = "prob", fix.factors.prediction = TRUE)

# Task: all columns of train_pro except the first one, target subscribe
train_task <- makeClassifTask(id = "bank_train", data = train_pro[, -1], target = "subscribe")

# Hyper-parameter search space (empty: nothing is tuned for this learner)
tune_params <- makeParamSet(
)
ctrl <- makeTuneControlGrid()

if (length(tune_params$pars) > 0) {
    # Grid search with k-fold CV, optimizing AUC
    res <- tuneParams(learner, task = train_task, resampling = rdesc,
      par.set = tune_params, control = ctrl, measures = list(mlr::auc))

    # res$learner is the learner that was passed in, without the tuned values;
    # the best setting found is in res$x and must be applied explicitly.
    best_learner <- setHyperPars(learner, par.vals = res$x)

} else {
    # Plain cross-validation: AUC on the test folds and on the train folds
    res <- resample(learner, train_task, rdesc,
      measures = list(mlr::auc, setAggregation(mlr::auc, train.mean)))

    # Nothing to tune, so the learner itself is the best learner
    best_learner <- learner
}

In [None]:
# Refit the best learner on the complete training task
best_md <- mlr::train(learner = best_learner, task = train_task)

In [None]:
# Make prediction on valid data (all columns except the first one)
pred_log <- predict(best_md, newdata=valid_processed[, -1])
# NOTE(review): the call below is disabled because the author reports that it
# raises an error; the cause is not visible in this notebook - TODO investigate.
#performance(pred_log, measures=mlr::auc) <-error here

## RandomForest model

In [None]:
# Random forest with grid-search tuning under 10-fold cross-validation.

# Resampling: 10-fold CV
rdesc <- makeResampleDesc("CV", iters = 10)

# Learner: random forest returning class probabilities
learner <- makeLearner("classif.randomForest", predict.type = "prob", fix.factors.prediction = TRUE)

# Task: all columns of train_pro except the first one, target subscribe
train_task <- makeClassifTask(id = "bank_train", data = train_pro[, -1], target = "subscribe")

# Hyper-parameter grid: number of trees and number of variables tried per split
# (`values` is spelled out; the original relied on partial matching of `value`)
tune_params <- makeParamSet(
  makeDiscreteParam('ntree', values = c(100, 250, 500, 750, 1000)),
  makeDiscreteParam('mtry', values = round(sqrt((ncol(train_processed) - 1) * c(0.1, 0.25, 0.5, 1, 2, 4))))
)
ctrl <- makeTuneControlGrid()

if (length(tune_params$pars) > 0) {
    # Grid search with k-fold CV, optimizing AUC
    res <- tuneParams(learner, task = train_task, resampling = rdesc,
      par.set = tune_params, control = ctrl, measures = list(mlr::auc))

    # res$learner is the learner that was passed in, without the tuned values;
    # the best setting found is in res$x and must be applied explicitly,
    # otherwise the model is retrained with the default ntree/mtry.
    best_learner <- setHyperPars(learner, par.vals = res$x)

} else {
    # Plain cross-validation
    res <- resample(learner, train_task, rdesc, measures = list(mlr::auc))

    # Nothing to tune, so the learner itself is the best learner
    best_learner <- learner
}

In [None]:
# Refit the best learner on the complete training task
best_md <- mlr::train(learner = best_learner, task = train_task)

In [None]:
# Make prediction on valid data (all columns except the first one)
pred <- predict(best_md, newdata=valid_pro[, -1])
# NOTE(review): the call below is disabled because the author reports that it
# raises an error; the cause is not visible in this notebook - TODO investigate.
#performance(pred, measures=mlr::auc) <- error here

In [None]:
class(pred) # show the class of the object returned by predict()

In [None]:
# Predict on the test split (first column dropped) and report the AUC
test_features <- test_pro[, -1]
pred <- predict(best_md, newdata = test_features)
performance(pred, measures = mlr::auc)

In [None]:
# Predict on the data set without response (first column dropped) and show it
holdout_features <- testNrep[, -1]
pred <- predict(best_md, newdata = holdout_features)
pred

## K nearest neighbors

### We can use the class package in using the KNN 

In [None]:
#install.packages("class")
library(class)

In [None]:
# Number of columns of the processed training table
ncol(train_pro)

In [None]:
## Normalization

#' Min-max rescale a numeric vector to the range [0, 1]
#'
#' Missing values are ignored when computing the range and stay missing in the
#' result. A constant vector is mapped to zeros instead of dividing by zero,
#' and an empty or all-missing vector is returned unchanged (as numeric).
#'
#' @param x A numeric vector.
#' @return A numeric vector of the same length as `x`.
normalize <- function(x) {
  if (length(x) == 0 || all(is.na(x))) {
    return(as.numeric(x))
  }
  lowest <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lowest
  if (span == 0) {
    # Constant input: every non-missing value equals the minimum
    return(as.numeric(x - lowest))
  }
  (x - lowest) / span
}

# Min-max scale the predictor columns of the processed training table.
# Columns are chosen by name rather than by the hard-coded positions 1:51, so
# the identifier and the target are never scaled and the code keeps working
# when the number of selected variables changes.
knn_feature_cols <- setdiff(colnames(train_pro), c("client_id", "subscribe"))
norm_knn <- as.data.frame(lapply(train_pro[, knn_feature_cols, drop = FALSE], normalize))


In [None]:
set.seed(123)

# Kept from the original cell; the sampled indices are not used below
data_knn <- sample(seq_len(nrow(norm_knn)), size = nrow(norm_knn) * 0.7, replace = FALSE)

# Labels (selected by name instead of the hard-coded column 52)
train_knn <- train_pro[, "subscribe"]
test_knn <- test_pro[, "subscribe"]

# Features: everything except the identifier and the label. Passing the whole
# table, as before, would let the label and the id take part in the distance.
knn_features <- setdiff(colnames(train_pro), c("client_id", "subscribe"))
train_x <- train_pro[, knn_features, drop = FALSE]
test_x <- test_pro[, knn_features, drop = FALSE]

# Min-max scale both sets with the minimum and range of the TRAINING features
feature_min <- vapply(train_x, function(column) as.numeric(min(column)), numeric(1))
feature_range <- vapply(train_x, function(column) as.numeric(max(column)), numeric(1)) - feature_min
feature_range[feature_range == 0] <- 1  # constant column: avoid division by zero
train_x <- as.data.frame(scale(train_x, center = feature_min, scale = feature_range))
test_x <- as.data.frame(scale(test_x, center = feature_min, scale = feature_range))

# Running KNN on the test split (the function in package class is knn, not kNN)
knn_pred_test <- class::knn(train = train_x, test = test_x, cl = as.factor(train_knn), k = 10)

In [None]:
# Install gmodels (used for the cross table below) only when it is missing.
# The original call misspelled install.packages and passed the package name
# unquoted.
if (!requireNamespace("gmodels", quietly = TRUE)) {
    install.packages("gmodels")
}

In [None]:
library(gmodels)
# Cross-tabulate the actual test labels against the KNN predictions.
# The original line lacked the CrossTable() call (a syntax error) and
# misspelled the prop.chisq argument.
Crosstable <- CrossTable(x = test_knn, y = knn_pred_test, prop.chisq = FALSE)

In [None]:
# Model accuracy = (TN + TP) / Total_Observations



 ## Gradient Boosting

In [None]:
set.seed(123)

# Train a gradient boosting model for subscribe.
# client_id is removed from the data first: with `subscribe ~ .` every other
# column is a predictor, and the mlr tasks above also exclude the identifier.
gbm_train_data <- train_pro[, setdiff(colnames(train_pro), "client_id")]

subscribe_model <- gbm(formula = subscribe ~ .,
                    distribution = "bernoulli",
                    data = gbm_train_data,
                    n.trees = 10000)

# Print the model
print(subscribe_model)

# summary() shows the variable importance
summary(subscribe_model)


### Generating predictions with the model 

In [None]:
# Predictions on the test split on the default (link) scale
preds1 <- predict(object = subscribe_model,
                     newdata = test_pro,
                     n.trees = 10000)
# Predictions on the test split on the response (probability) scale.
# "response" belongs to the `type` argument; it was previously passed as
# n.trees, which must be the number of trees.
preds2 <- predict(object = subscribe_model,
                     newdata = test_pro,
                     n.trees = 10000,
                     type = "response")

# Compare the range of the two sets of predictions
range(preds1)
range(preds2)

### Generating the test Auc's for both sets of predications and compare

In [None]:
library(AUC)

In [None]:
# Test AUC for both sets of predictions.
# AUC::auc() takes a ROC object built by AUC::roc(predictions, labels), with
# labels as a factor; it has no `actual`/`predicted` arguments, which is why the
# original calls failed.
test_labels <- as.factor(test_pro$subscribe)
AUC::auc(AUC::roc(predictions = preds1, labels = test_labels))  # default (link) scale
AUC::auc(AUC::roc(predictions = preds2, labels = test_labels))  # response scale

## Linear Regression 

### Alternative Method using/ mlbench

In [None]:
# Install mlbench only when it is missing (the original call was misspelled
# with a capital "I"), then load it
if (!requireNamespace("mlbench", quietly = TRUE)) {
    install.packages("mlbench")
}
library(mlbench)

# Make a regression task on the target subscribe (the target name was
# misspelled as "subsrcibe"). client_id is excluded, as in the classification
# tasks above, so that the identifier is not used as a predictor.
regr.task <- makeRegrTask(data = train_pro[, setdiff(colnames(train_pro), "client_id")],
                          target = "subscribe")
regr.task


In [None]:
set.seed(1234)

# Resampling strategy: 5-fold cross-validation
rdesc <- makeResampleDesc("CV", iters = 5L)

# Performance measure: root mean squared error
meas <- rmse

# Tuning method: CMA-ES with a budget of 100
ctrl <- makeTuneControlCMAES(budget = 100L)

# Search space of the kernel SVM: sigma = 2^x for x in [-12, 12]
ps_ksvm <- makeParamSet(
  makeNumericParam("sigma", lower = -12, upper = 12, trafo = function(x) 2^x)
)

# Search space of the ranger random forest: 1 to 200 trees
ps_rf <- makeParamSet(
  makeIntegerParam("num.trees", lower = 1L, upper = 200L)
)

# Wrap each tunable learner so that tuning happens inside the resampling
tuned.ksvm <- makeTuneWrapper(learner = "regr.ksvm", resampling = rdesc, measures = meas,
  par.set = ps_ksvm, control = ctrl, show.info = FALSE)
tuned.rf <- makeTuneWrapper(learner = "regr.ranger", resampling = rdesc, measures = meas,
  par.set = ps_rf, control = ctrl, show.info = FALSE)

In [None]:
# Three learners to be compared: linear model, tuned kernel SVM, tuned random forest
linear_learner <- makeLearner("regr.lm")
lrns <- list(linear_learner, tuned.ksvm, tuned.rf)

# Run the benchmark experiment with RMSE as the measure
bmr <- benchmark(learners = lrns, tasks = regr.task, resamplings = rdesc,
  measures = rmse, show.info = FALSE)

In [None]:
# Get the aggregated performance of each learner in the benchmark
getBMRAggrPerformances(bmr)

### Simple Method for linear Regression 

In [None]:
library(readxl)

In [None]:
# Linear regression of subscribe on all processed variables, fitted separately
# on each split. client_id is removed first: with `subscribe ~ .` every other
# column is a predictor, and the mlr tasks above also exclude the identifier.
# Note that lmSubscribe is overwritten by each fit; only the summaries remain.
lm_columns <- setdiff(colnames(train_pro), "client_id")

# On training
lmSubscribe <- lm(subscribe ~ ., data = train_pro[, lm_columns])
summary(lmSubscribe)
# On validation
lmSubscribe <- lm(subscribe ~ ., data = valid_pro[, lm_columns])
summary(lmSubscribe)

# On test
lmSubscribe <- lm(subscribe ~ ., data = test_pro[, lm_columns])
summary(lmSubscribe)


### The above shows which features are good for predicting subscribe and which features are not 

In [None]:
# Running a linear regression with a smaller set of related features

lm_subscribe2 <- lm(subscribe ~ pdays + woe_cons_conf_idx_binned + woe_euribor3m_binned, data = train_pro)
summary(lm_subscribe2)
# Diagnostic plots of the fitted model. The object name was misspelled
# (lm_subscibe2), and the point colour is set with `col`, not `color`.
plot(lm_subscribe2, pch = 16, col = "red")
