In [1]:
# Import all required libraries
import os

import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

np.random.seed(0)
%matplotlib inline

## Loading Data

In [50]:
# Directory holding X_train.txt, Y_train.txt and X_test.txt.
# It can be overridden with the RAIN_MODEL_DATA_DIR environment variable so the notebook
# runs on other machines; the original hard-coded location remains the default.
path = os.environ.get("RAIN_MODEL_DATA_DIR",
                      "C:/Users/hyunm/OneDrive/Documents/GitHub/Rain-Model/data/")

# Load the training data (delimiter=None splits columns on any whitespace)
X = np.genfromtxt(os.path.join(path, 'X_train.txt'), delimiter=None)
Y = np.genfromtxt(os.path.join(path, 'Y_train.txt'), delimiter=None)

# Test features (no labels are loaded for these)
Xte = np.genfromtxt(os.path.join(path, 'X_test.txt'), delimiter=None)

# Split into train and validation
# NOTE(review): the split happens before the shuffle, so only the training part is
# shuffled; presumably splitData keeps file order, making validation the tail of the
# file -- confirm this is intended.
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y) # Default is 80% training/20% validation
Xtr, Ytr = ml.shuffleData(Xtr, Ytr)

## Random Forest

In [8]:
# computeRandomForest(num_bags, num_features, ...):
#     Returns bags (trees) using Random Forest algorithm given num_bags and num_features
def computeRandomForest(num_bags, num_features, min_parent=2**6, max_depth=25, seed=0, verbose=True):
    """Train `num_bags` decision trees, each on its own bootstrap sample of the training set.

    Reads the notebook-level arrays Xtr/Ytr (training) and Xva/Yva (validation).

    Args:
        num_bags: number of trees to train.
        num_features: value passed to treeClassify as `nFeatures`.
        min_parent: value passed to treeClassify as `minParent` (default 2**6).
        max_depth: value passed to treeClassify as `maxDepth` (default 25).
        seed: value passed to np.random.seed before any sampling (default 0).
        verbose: when True (default), compute and print each tree's train and
            validation AUC; when False, skip both the evaluation and the printing.

    Returns:
        List of the trained trees, in training order (empty if num_bags is 0).
    """
    np.random.seed(seed)  # Resetting the seed in case you ran other stuff.
    num_records = Xtr.shape[0]
    bags = []

    for l in range(num_bags):
        # Each bootstrap sample has as many records as the original training data
        Xi, Yi = ml.bootstrapData(Xtr, Ytr, num_records)

        # Train the model on that draw
        tree = ml.dtree.treeClassify(Xi, Yi, minParent=min_parent, maxDepth=max_depth, nFeatures=num_features)
        bags.append(tree)

        if verbose:
            # Per-tree scores only; the averaged ensemble is scored separately
            tr_auc = tree.auc(Xtr, Ytr)
            val_auc = tree.auc(Xva, Yva)

            print("Decision Tree : {0}".format(l))
            print("{0:>15}: {1:.4f}".format('Train AUC', tr_auc))
            print("{0:>15}: {1:.4f}".format('Validation AUC', val_auc))

    return bags

## Creating a Bagged Class

In [10]:
class BaggedTree(ml.base.classifier):
    """Ensemble classifier that averages the soft predictions of already-trained learners."""

    def __init__(self, learners):
        """Constructs a BaggedTree class with a set of learners.

        Args:
            learners: sequence of trained models, each exposing predictSoft(X).
        """
        self.learners = learners
        # Default the class labels from the first learner so the ensemble is usable
        # without the caller patching `classes` afterwards. Assumes learners expose a
        # `classes` attribute (TODO confirm); otherwise falls back to an empty list.
        # Callers may still overwrite `classes` after construction.
        first = learners[0] if len(learners) else None
        self.classes = getattr(first, 'classes', [])

    def predictSoft(self, X):
        """Predicts the probabilities with each bagged learner and average over the results.

        Args:
            X: feature data forwarded unchanged to every learner's predictSoft.

        Returns:
            Element-wise mean (over learners) of the per-learner soft predictions.

        Raises:
            ValueError: if the ensemble holds no learners (averaging would yield NaN).
        """
        if len(self.learners) == 0:
            raise ValueError("BaggedTree has no learners to average over")
        preds = [learner.predictSoft(X) for learner in self.learners]
        return np.mean(preds, axis=0)

In [40]:
# Best number of features found for the forest: 6. Per the results summary further
# down, this comes from sqrt(14 total features) plus 3. It is passed to
# computeRandomForest as num_features.
best_num_features = 6

In [43]:
# Train a 100-tree forest with best_num_features; the per-tree train/validation AUC
# lines below are printed by computeRandomForest as each tree is fitted.
bags_100 = computeRandomForest(100, best_num_features)

Decision Tree : 0
      Train AUC: 0.7926
 Validation AUC: 0.6968
Decision Tree : 1
      Train AUC: 0.7923
 Validation AUC: 0.6909
Decision Tree : 2
      Train AUC: 0.7949
 Validation AUC: 0.6937
Decision Tree : 3
      Train AUC: 0.7894
 Validation AUC: 0.6979
Decision Tree : 4
      Train AUC: 0.7916
 Validation AUC: 0.6989
Decision Tree : 5
      Train AUC: 0.7891
 Validation AUC: 0.6863
Decision Tree : 6
      Train AUC: 0.7899
 Validation AUC: 0.6971
Decision Tree : 7
      Train AUC: 0.7965
 Validation AUC: 0.6968
Decision Tree : 8
      Train AUC: 0.7887
 Validation AUC: 0.6874
Decision Tree : 9
      Train AUC: 0.7935
 Validation AUC: 0.6944
Decision Tree : 10
      Train AUC: 0.7907
 Validation AUC: 0.6938
Decision Tree : 11
      Train AUC: 0.7888
 Validation AUC: 0.6877
Decision Tree : 12
      Train AUC: 0.7894
 Validation AUC: 0.6930
Decision Tree : 13
      Train AUC: 0.7832
 Validation AUC: 0.6842
Decision Tree : 14
      Train AUC: 0.7849
 Validation AUC: 0.6821
Decis

## Compute AUC score for given bags

In [45]:
# AUC score for bag num 10
# print("Print AUC score for bag num: " + str(len(bags_10)))
# bt = BaggedTree(bags_10)
# bt.classes = np.unique(Y)

# print("{0:>15}: {1:.4f}".format('Train AUC', bt.auc(Xtr, Ytr)))
# print("{0:>15}: {1:.4f}".format('Validation AUC', bt.auc(Xva, Yva)))

# AUC score for bag num 30
# print("Print AUC score for bag num: " + str(len(bags_30)))
# bt2 = BaggedTree(bags_30)
# bt2.classes = np.unique(Y)

# print("{0:>15}: {1:.4f}".format('Train AUC', bt2.auc(Xtr, Ytr)))
# print("{0:>15}: {1:.4f}".format('Validation AUC', bt2.auc(Xva, Yva)))

# Score the averaged 100-tree ensemble (not the individual trees) on both splits.
print("Print AUC score for bag num: " + str(len(bags_100)))
bt3 = BaggedTree(bags_100)
bt3.classes = np.unique(Y)  # class labels are taken from the full label vector

for split_label, split_X, split_Y in (('Train AUC', Xtr, Ytr), ('Validation AUC', Xva, Yva)):
    print("{0:>15}: {1:.4f}".format(split_label, bt3.auc(split_X, split_Y)))

Print AUC score for bag num: 100
      Train AUC: 0.9076
 Validation AUC: 0.7839


best_num_features = 6 (chosen by taking the square root of the 14 total features, ≈ 3.7, and adding 3)
For bags = 10
    Train AUC: 0.8914
    Validation AUC: 0.7702 
    
For bags = 30
    Train AUC: 0.9022
    Validation AUC: 0.7793 
    
For bags = 100
    Train AUC: 0.9076
    Validation AUC: 0.7839 (BEST)

## Predicting probabilities on the test data: soft predictions from the 100-bag forest (6 random features)

In [46]:
# Averaged soft predictions of the 100-tree ensemble for the test features.
# Judging by the printed output, there are two columns per row that sum to 1;
# column 1 is later used as P(Y=1).
probs = bt3.predictSoft(Xte)
print(probs)

[[0.55053317 0.44946683]
 [0.12534233 0.87465767]
 [0.75782709 0.24217291]
 ...
 [0.75578807 0.24421193]
 [0.32089024 0.67910976]
 [0.71865865 0.28134135]]


## Submitting Predictions

In [47]:
# Build the submission table: a running row index in the first column and the
# P(Y=1) column of probs in the second.
row_ids = np.arange(Xte.shape[0])
Y_sub = np.column_stack((row_ids, probs[:, 1]))

# Write it out with an "ID,Prob1" header line. comments='' stops savetxt from
# prefixing the header with '#'.
np.savetxt('Y_sub_random_forest.txt', Y_sub, fmt='%d, %.5f', delimiter=',',
           header='ID,Prob1', comments='')