# Machine Learning
# Homework 2 : Ensemble Methods

## Q1

**a) Random Forest from Decision Tree **

In [282]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [283]:
from sklearn.datasets import load_iris
iris = load_iris()

X = iris.data
y = iris.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
from sklearn import tree

In [284]:
class RandomForestClassifier:
    
    def __init__(self):
        self.forest = []
    
    def bagging(self, X, y):
        cols = X.shape[1] - int(np.sqrt(X.shape[1]))
        indices = np.random.randint(0, len(X), size = (int(len(X)*0.6),))
        X_ = X[indices]
        features = np.random.randint(0, X.shape[1], size = (cols, ))
        X_ = np.delete(X_, features, axis = 1)
        return np.array([X_, indices, features])
            
    
    def fit(self, X, y, n_estimators = 10):
        for i in range(n_estimators):
            model = tree.DecisionTreeClassifier()
            bag = self.bagging(X, y)
            model.fit(bag[0], y[bag[1]])
            self.forest.append(np.array([model, bag[2]]))
        
    def predict(self, X_test):
        y_pred = []
        for i in range(len(self.forest)):
            X_ = np.array(np.delete(X_test, self.forest[i][1], axis = 1))
            y_pred.append(self.forest[i][0].predict(X_))
        y_pred = pd.DataFrame(data = y_pred)
        return np.array(y_pred.max())

class RandomForestRegressor:
    
    def __init__(self):
        self.forest = []
    
    def bagging(self, X, y):
        cols = X.shape[1] - int(np.sqrt(X.shape[1]))
        indices = np.random.randint(0, len(X), size = (int(len(X)*0.6),))
        X_ = X[indices]
        features = np.random.randint(0, X.shape[1], size = (cols, ))
        X_ = np.delete(X_, features, axis = 1)
        return np.array([X_, indices, features])
            
    
    def fit(self, X, y, n_estimators = 10):
        for i in range(n_estimators):
            model = tree.DecisionTreeRegressor()
            bag = self.bagging(X, y)
            model.fit(bag[0], y[bag[1]])
            self.forest.append(np.array([model, bag[2]]))
        
    def predict(self, X_test):
        y_pred = []
        for i in range(len(self.forest)):
            X_ = np.array(np.delete(X_test, self.forest[i][1], axis = 1))
            y_pred.append(self.forest[i][0].predict(X_))
        y_pred = pd.DataFrame(data = y_pred)
        return y_pred.mean()
            
            

** d) Applying the Algorithms on IRIS datset**

** Random forest with given Parameters **

In [221]:
forest = RandomForestClassifier()
forest.fit(X_train, y_train, 20)

In [222]:
y_pred = forest.predict(X_test)
y_pred

array([0, 2, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 0, 0, 2, 2, 0,
       2, 0, 0, 1, 2, 2, 1, 1, 0, 1, 0, 2, 2, 1, 2, 0, 2, 1, 0, 2, 2, 0,
       0])

In [223]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8888888888888888

** Decision Tree **

In [232]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred_ = model.predict(X_test)
accuracy_score(y_test, y_pred_)

0.9555555555555556

** Random Forest with optimized Parameters **

In [241]:
forest = RandomForestClassifier()
forest.fit(X_train, y_train, 5)
y_pred = forest.predict(X_test)
accuracy_score(y_test, y_pred)

0.9777777777777777

** Observation: ** we can observe that Random forest with given parameters has less accuracy than a normal Decision Tree. However, when the parameters in the Random Forest are optimized we are getting a much better accuracy.

** b) Parallel Version of Random Forest ** 

In [20]:
import multiprocessing as mp

In [21]:
class RandomForestClassifierMP:
    
    def __init__(self):
        self.forest = []
        self.output = mp.Queue()
        self.trees = []
        self.n_estimators = 10
    
    def bagging(self, X, y):
        cols = X.shape[1] - int(np.sqrt(X.shape[1]))
        indices = np.random.randint(0, len(X), size = (int(len(X)*0.6),))
        X_ = X[indices]
        features = np.random.randint(0, X.shape[1], size = (cols, ))
        X_ = np.delete(X_, features, axis = 1)
        return np.array([X_, indices, features])
    
    def mini_fit(self, model,i, y, bag,output):
        model.fit(bag[0], y[bag[1]])
        output.put((i, model))
        
        
    def fit(self, X, y, n_estimators = 10):
        self.n_estimators = n_estimators
        processes = []
        for i in range(n_estimators):
            model = tree.DecisionTreeClassifier()
            bag = self.bagging(X, y)
            processes.append(mp.Process(target = self.mini_fit, args = (model,i, y, bag, self.output)))
            self.forest.append(bag[2])
            processes[-1].start()
        for p in processes:
            p.join()
        
    def predict(self, X_test):
        y_pred = []
        for i in range((self.n_estimators)):
            tree = self.output.get()
            X_ = np.array(np.delete(X_test, self.forest[tree[0]], axis = 1))
            y_pred.append(tree[1].predict(X_))
        y_pred = pd.DataFrame(data = y_pred)
        return np.array(y_pred.max())
    
class RandomForestRegressorMP:
    
    def __init__(self):
        self.forest = []
        self.output = mp.Queue()
        self.trees = []
    
    def bagging(self, X, y):
        cols = X.shape[1] - int(np.sqrt(X.shape[1]))
        indices = np.random.randint(0, len(X), size = (int(len(X)*0.6),))
        X_ = X[indices]
        features = np.random.randint(0, X.shape[1], size = (cols, ))
        X_ = np.delete(X_, features, axis = 1)
        return np.array([X_, indices, features])
    
    def mini_fit(self, model,i, y, bag,output):
        model.fit(bag[0], y[bag[1]])
        output.put((i, model))
        
        
    def fit(self, X, y, n_estimators = 10):
        processes = []
        for i in range(n_estimators):
            model = tree.DecisionTreeRegressor()
            bag = self.bagging(X, y)
            processes.append(mp.Process(target = self.mini_fit, args = (model,i, y, bag, self.output)))
            self.forest.append(bag[2])
        for p in proceses:
            p.start()
        for p in processes:
            p.join()
        for i in range(n_estimators):
            self.trees.append(self.output.get())
        
    def predict(self, X_test):
        y_pred = []
        for i in range(len(self.trees)):
            X_ = np.array(np.delete(X_test, self.forest[self.trees[i][0]], axis = 1))
            y_pred.append(self.trees[i][1].predict(X_))
        y_pred = pd.DataFrame(data = y_pred)
        return np.array(y_pred.max())

In [70]:
forestMP = RandomForestClassifierMP()
forestMP.fit(X_train, y_train, 5)

In [71]:
y_predMP = forestMP.predict(X_test)

In [72]:
y_predMP

array([2, 1, 0, 0, 0, 0, 0, 1, 2, 1, 2, 1, 2, 1, 0, 0, 2, 0, 1, 2, 0, 2,
       0, 2, 1, 2, 0, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2,
       2])

In [73]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predMP)

0.9333333333333333

** d) Comparison of Serial and Parallel Versions of Random Forests **

In [135]:
class RandomForestClassifier:
    
    def __init__(self):
        self.forest = []
        self.predictions = []
    
    def bagging(self, X, y):
        cols = X.shape[1] - int(np.sqrt(X.shape[1]))
        indices = np.random.randint(0, len(X), size = (int(len(X)*0.6),))
        X_ = X[indices]
        features = np.random.randint(0, X.shape[1], size = (cols, ))
        X_ = np.delete(X_, features, axis = 1)
        return np.array([X_, indices, features])
            
    
    def fit(self, X, y, n_estimators = 10):
        for i in range(n_estimators):
            model = tree.DecisionTreeClassifier()
            bag = self.bagging(X, y)
            model.fit(bag[0], y[bag[1]])
            time.sleep(0.01)
            self.forest.append(np.array([model, bag[2]]))
        
    def predict(self, X_test):
        y_pred = []
        for i in range(len(self.forest)):
            X_ = np.array(np.delete(X_test, self.forest[i][1], axis = 1))
            y_pred.append(self.forest[i][0].predict(X_))
        y_pred = pd.DataFrame(data = y_pred)
        return np.array(y_pred.max())

class RandomForestClassifierMP:
    
    def __init__(self):
        self.forest = []
        self.output = []
        self.n_estimators = 10
    
    def bagging(self, X, y):
        cols = X.shape[1] - int(np.sqrt(X.shape[1]))
        indices = np.random.randint(0, len(X), size = (int(len(X)*0.6),))
        X_ = X[indices]
        features = np.random.randint(0, X.shape[1], size = (cols, ))
        X_ = np.delete(X_, features, axis = 1)
        return np.array([X_, indices, features])
    
    def mini_fit(self, model,i, y, bag,output):
        model.fit(bag[0], y[bag[1]])
        output.append((i, model))
        
        
    def fit(self, X, y, n_estimators = 10):
        self.n_estimators = n_estimators
        processes = []
        for i in range(n_estimators):
            model = tree.DecisionTreeClassifier()
            bag = self.bagging(X, y)
            processes.append(mp.Process(target = self.mini_fit, args = (model,i, y, bag, self.output)))
            self.forest.append(bag[2])
            processes[-1].start()
        for p in processes:
            p.join()
        
    def predict(self, X_test):
        y_pred = []
        for i in range(len(self.output)):
            tree = self.output[i]
            X_ = np.array(np.delete(X_test, self.forest[tree[0]], axis = 1))
            y_pred.append(tree[1].predict(X_))
        y_pred = pd.DataFrame(data = y_pred)
        return np.array(y_pred.max())

** Serial Implemenation Runtime **

In [136]:
import time
a = time.time()
forest = RandomForestClassifier()
forest.fit(X_train, y_train, 25)
b = time.time() - a
print(b)

0.323933124542


** Parallel Implementation Runtime **

In [137]:
c = time.time()
main_d={}
forestMP = RandomForestClassifierMP()
forestMP.fit(X_train, y_train, 25)
#forestMP.predict(X_test)
d = time.time() - c
print(d)

0.234215021133


### Observation:
We can clearly observe that the time taken for serial Implementaion is taking 0.4 sec to run whereas the parallel implementation of the same is taking around 0.24 seconds for 25 estimators.

** Checking the same for breast cancer dataset **

In [138]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [139]:
X = data.data
y = data.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [140]:
import time
a = time.time()
forest = RandomForestClassifier()
forest.fit(X_train, y_train, 20)
#forest.predict(X_test)
b = time.time() - a
print(b)

c = time.time()
main_d={}
forestMP = RandomForestClassifierMP()
forestMP.fit(X_train, y_train, 20)
#forestMP.predict(X_test)
d = time.time() - c
print(d)

0.23770403862
0.179099798203


** e) 5 - fold Cross Validation **

In [297]:
from sklearn.datasets import load_iris
iris = load_iris()

X = iris.data
y = iris.target
randomize = np.arange(len(X))
np.random.shuffle(randomize)
X = X[randomize]
y = y[randomize]

In [298]:
l = len(X)
l = l/5
l

30

In [299]:
foldsX = []
foldsy = []

for i in range(5):
    foldsX.append(X[i*l: (i+1)*l, :])
    foldsy.append(y[i*l:(i+1)*l])
from sklearn.metrics import accuracy_score

print len(foldsX)
print len(foldsy)
a = np.vstack(foldsX)
a.shape

5
5


(150, 4)

In [307]:
estimators = [1, 2, 5, 10, 20, 100]

for i in range(len(estimators)):
    for j in range(5):
        folds_X = foldsX[:]
        folds_y = foldsy[:]
        folds_X.pop(j)
        folds_y.pop(j)
        X_train = np.vstack(folds_X)
        y_train = np.vstack(folds_y)
        X_test = np.vstack(foldsX[j])
        y_test = np.vstack(foldsy[j])
        print(X_train, y_train, X_test, y_test)
        model = RandomForestClassifier()
        model.fit(X_train, y_train,estimators[i])
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_pred, y_test)
        print(estimators[i], accuracy)
        

(array([[5.9, 3.2, 4.8, 1.8],
       [5.1, 3.4, 1.5, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5. , 3.3, 1.4, 0.2],
       [7.2, 3.2, 6. , 1.8],
       [5. , 2.3, 3.3, 1. ],
       [5.1, 3.8, 1.5, 0.3],
       [5.8, 2.7, 4.1, 1. ],
       [4.8, 3.1, 1.6, 0.2],
       [6.4, 2.9, 4.3, 1.3],
       [6.7, 3.3, 5.7, 2.5],
       [6.7, 3. , 5. , 1.7],
       [6.7, 2.5, 5.8, 1.8],
       [5. , 3.5, 1.6, 0.6],
       [6.4, 3.2, 4.5, 1.5],
       [6. , 3.4, 4.5, 1.6],
       [5. , 3.4, 1.5, 0.2],
       [5.6, 3. , 4.5, 1.5],
       [6.8, 3. , 5.5, 2.1],
       [6.2, 2.9, 4.3, 1.3],
       [5.8, 2.7, 5.1, 1.9],
       [4.8, 3. , 1.4, 0.1],
       [6.3, 2.9, 5.6, 1.8],
       [5.7, 3. , 4.2, 1.2],
       [7.2, 3. , 5.8, 1.6],
       [6. , 2.7, 5.1, 1.6],
       [5.7, 3.8, 1.7, 0.3],
       [6.1, 2.9, 4.7, 1.4],
       [6.1, 3. , 4.6, 1.4],
       [6.1, 2.6, 5.6, 1.4],
       [5. , 3.2, 1.2, 0.2],
       [5.6, 2.8, 4.9, 2. ],
       [4.7, 3.2, 1.3, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [6.3, 

IndexError: index 32 is out of bounds for axis 0 with size 4