In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression , LinearRegression , SGDRegressor
from sklearn.neighbors import KNeighborsClassifier , KNeighborsRegressor
from sklearn.metrics import accuracy_score , r2_score
from mlxtend.plotting import plot_decision_regions
from sklearn.tree import DecisionTreeClassifier , DecisionTreeRegressor
from sklearn import svm

from scipy.stats import mode

In [2]:
df = pd.read_csv('data/heart.csv')

train_set , test_set = train_test_split(df , test_size=0.1 , random_state=0 , stratify=df.iloc[: , -1])
X_train = train_set.iloc[: , 0:-1].values
y_train = train_set.iloc[: , -1].values

X_test = test_set.iloc[: , 0:-1].values
y_test = test_set.iloc[: , -1].values

print(X_train.shape)
print(y_train.shape)
print(X_test.shape) 
print(y_test.shape)

(272, 13)
(272,)
(31, 13)
(31,)


## **Bagging Classifications**

In [3]:
class BaggingClassification():
    
    def __init__(self , estimator , n_estimator , subset_size , t_score , random_state=42 , task='classification'):
        self.estimator = estimator
        self.n_estimator = n_estimator
        self.subset_size = subset_size
        self.models = []
        self.all_preds = []
        self.task = task
        self.t_score = t_score
        self.random_state = random_state
        np.random.seed(self.random_state)
        
        
    def fit(self , X , y):
        
        for i in range(self.n_estimator):
            X_subset , _ , y_subset , _ = train_test_split(X , y , test_size=self.subset_size , stratify=y)
            clf = self.estimator.__class__()
            clf.fit(X_subset , y_subset)
            self.models.append(clf)
            
            
    def models_predict(self , x):
        for mdl in self.models:
            y_pred = mdl.predict(x)
            self.all_preds.append(y_pred)
        all_preds_array = np.array(self.all_preds)
        return all_preds_array
    
    
    def predict(self , xt):
        
        predictions = self.models_predict(xt)
        if self.task=='classification':
            ensemble_prediction = mode(predictions).mode.ravel() 
        if self.task=='regression':
            ensemble_prediction = np.mean(predictions , axis=0)
        return ensemble_prediction
    
    
    def models_score(self , xs , ys):
        models_predictions = self.models_predict(xs)
        for i,y_hat in enumerate(models_predictions):
            print(f'Score for model{i} is {self.t_score(ys , y_hat)}')
            
            
    def ensemble_score(self , xe , ye):
        y_hat_ensemble = self.predict(xe)
        print(f'Score for ensemble bagging is {self.t_score(ye , y_hat_ensemble)}')

In [4]:
model = DecisionTreeClassifier()

bagging = BaggingClassification(estimator=model , n_estimator=20 , subset_size=0.6 , t_score=accuracy_score , random_state=0)

bagging.fit(X_train , y_train)

bagging.models_score(X_test , y_test)

Score for model0 is 0.7419354838709677
Score for model1 is 0.7741935483870968
Score for model2 is 0.7096774193548387
Score for model3 is 0.7096774193548387
Score for model4 is 0.7419354838709677
Score for model5 is 0.6774193548387096
Score for model6 is 0.8064516129032258
Score for model7 is 0.7741935483870968
Score for model8 is 0.7096774193548387
Score for model9 is 0.7419354838709677
Score for model10 is 0.7741935483870968
Score for model11 is 0.5806451612903226
Score for model12 is 0.7096774193548387
Score for model13 is 0.7419354838709677
Score for model14 is 0.6129032258064516
Score for model15 is 0.8709677419354839
Score for model16 is 0.7419354838709677
Score for model17 is 0.6451612903225806
Score for model18 is 0.7741935483870968
Score for model19 is 0.8387096774193549


In [5]:
bagging.ensemble_score(X_test , y_test)

Score for ensemble bagging is 0.8064516129032258


## **Random Forest**

In [179]:
class RandomForest():
    
    def __init__(self , n_estimator , max_depth , subset_size , num_subspace , t_score , ccp , splitter , random_state=4 , task='classification'):
        self.n_estimator = n_estimator
        self.subset_size = subset_size
        self.models = []
        self.all_preds = []
        self.task = task
        self.t_score = t_score
        self.random_state = random_state
        self.max_depth = max_depth
        self.num_subspace = num_subspace
        self.ccp = ccp
        self.all_sub_features = []
        self.splitter = splitter
        np.random.seed(self.random_state)
        
        
    def fit(self , X , y):
        
        for i in range(self.n_estimator):
            X_subset , _ , y_subset , _ = train_test_split(X , y , test_size=self.subset_size , stratify=y)
            num_features = X_subset.shape[1]
            subspace = np.random.choice(num_features , self.num_subspace , replace=False)
            X_sub = X_subset[: , subspace]
            clf = DecisionTreeClassifier(max_depth=self.max_depth , random_state=0 , class_weight='balanced' , ccp_alpha=self.ccp , splitter=self.splitter)
            clf.fit(X_sub , y_subset)
            self.models.append(clf)
            self.all_sub_features.append(subspace)

            
    def models_predict(self , x):
        for mdl,sub_feature in zip(self.models , self.all_sub_features):
            x_new = x[: , sub_feature]
            y_pred = mdl.predict(x_new)
            self.all_preds.append(y_pred)
        all_preds_array = np.array(self.all_preds)
        return all_preds_array
    
    
    def predict(self , xt):
        
        predictions = self.models_predict(xt)
        if self.task=='classification':
            ensemble_prediction = mode(predictions).mode.ravel() 
        if self.task=='regression':
            ensemble_prediction = np.mean(predictions , axis=0)
        return ensemble_prediction
    
    
    def models_score(self , xs , ys):
        models_predictions = self.models_predict(xs)
        for i,y_hat in enumerate(models_predictions):
            print(f'Score for model{i} is {self.t_score(ys , y_hat)}')
            
            
    def ensemble_score(self , xe , ye):
        y_hat_ensemble = self.predict(xe)
        print(f'Score for ensemble is {self.t_score(ye , y_hat_ensemble)}')

In [184]:
rf = RandomForest(n_estimator=20 , max_depth=3 , subset_size=0.6 , num_subspace=13 , t_score=accuracy_score , ccp=0.0 , splitter='best')
rf.fit(X_train , y_train)
rf.models_score(X_test , y_test)

Score for model0 is 0.7096774193548387
Score for model1 is 0.7419354838709677
Score for model2 is 0.7419354838709677
Score for model3 is 0.7096774193548387
Score for model4 is 0.8709677419354839
Score for model5 is 0.6774193548387096
Score for model6 is 0.5806451612903226
Score for model7 is 0.8064516129032258
Score for model8 is 0.8709677419354839
Score for model9 is 0.8064516129032258
Score for model10 is 0.8064516129032258
Score for model11 is 0.8387096774193549
Score for model12 is 0.8064516129032258
Score for model13 is 0.7096774193548387
Score for model14 is 0.7741935483870968
Score for model15 is 0.7741935483870968
Score for model16 is 0.7741935483870968
Score for model17 is 0.7741935483870968
Score for model18 is 0.7419354838709677
Score for model19 is 0.7741935483870968


In [185]:
rf.ensemble_score(X_test , y_test)

Score for ensemble is 0.8709677419354839


## **Extremely Randomized Trees**

In [186]:
ert = RandomForest(n_estimator=20 , max_depth=3 , subset_size=0.6 , num_subspace=13 , t_score=accuracy_score , ccp=0.0 , splitter='random')
ert.fit(X_train , y_train)
ert.models_score(X_test , y_test)

Score for model0 is 0.7096774193548387
Score for model1 is 0.7741935483870968
Score for model2 is 0.7096774193548387
Score for model3 is 0.6774193548387096
Score for model4 is 0.8064516129032258
Score for model5 is 0.7096774193548387
Score for model6 is 0.6774193548387096
Score for model7 is 0.7741935483870968
Score for model8 is 0.8709677419354839
Score for model9 is 0.7096774193548387
Score for model10 is 0.7096774193548387
Score for model11 is 0.7419354838709677
Score for model12 is 0.8709677419354839
Score for model13 is 0.7419354838709677
Score for model14 is 0.7419354838709677
Score for model15 is 0.8709677419354839
Score for model16 is 0.8709677419354839
Score for model17 is 0.7741935483870968
Score for model18 is 0.6774193548387096
Score for model19 is 0.7419354838709677


In [187]:
ert.ensemble_score(X_test , y_test)

Score for ensemble is 0.8064516129032258
