# Bagging Tutorial
### Author: Michael Schwabe
### Version: 1.0

* DataSet 1 Intro: 
    * https://scikit-learn.org/stable/datasets/toy_dataset.html#breast-cancer-wisconsin-diagnostic-dataset
* DataSet 2 Intro: 
    * https://scikit-learn.org/stable/datasets/toy_dataset.html#boston-house-prices-dataset
    
* Referenzen
    * https://scikit-learn.org/
    * https://towardsdatascience.com/ensemble-learning-bagging-boosting-3098079e5422 --> mehr Hintergrundwissen


In [29]:
# Basisbibliotheken, die wir zur Ausführung benötigen

# Loader-Funktionen für die Datensätze
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris

# Splitter für die Daten 
from sklearn.model_selection import train_test_split

# Basis-Klassifizierer für das Bagging-Verfahren
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC

from sklearn.ensemble import BaggingClassifier

## Load Data Iris

In [30]:
# Load the iris data set: x receives the features, y the labels.
x, y = load_iris(return_X_y=True)

# Split into a training and a test portion. A quarter of the samples is held
# out for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

## Modelling

In [31]:
# Bagging classifiers
# Base classifiers that get wrapped by the bagging ensembles below
dtree = DecisionTreeClassifier(max_depth=2, random_state=23)
etree = ExtraTreeClassifier()
svm = SVC()

# Hand each base classifier to its own bagging ensemble.
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4, whereas the first positional slot is the base
# estimator under both names. Each ensemble trains 5 estimators on bootstrap
# samples of 50 training rows.
bagging = BaggingClassifier(etree, n_estimators=5, max_samples=50, bootstrap=True)
baggingsvc = BaggingClassifier(svm, n_estimators=5, max_samples=50, bootstrap=True)
baggingdtree = BaggingClassifier(dtree, n_estimators=5, max_samples=50, bootstrap=True)

# Fit the models.
# NOTE: the ensembles receive no random_state, so scores can vary between runs.
bagging.fit(x_train, y_train)
baggingsvc.fit(x_train, y_train)
# Kept as the last bare expression so the notebook still displays the model.
baggingdtree.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
                                                        random_state=23),
                  max_samples=50, n_estimators=5)

In [32]:
# Report the accuracy of every ensemble on the training and the test portion.
# The (features, labels) pairs are bound once and unpacked into each score call.
train_split = (x_train, y_train)
test_split = (x_test, y_test)

# Bagged extra-tree classifier
print(f"etree Train score: {bagging.score(*train_split)}")
print(f"etree Test score: {bagging.score(*test_split)}")

# Bagged support vector classifier
print(f"SVC/SVM Train score: {baggingsvc.score(*train_split)}")
print(f"SVC/SVM Test score: {baggingsvc.score(*test_split)}")

# Bagged decision tree
print(f"dtree Train score: {baggingdtree.score(*train_split)}")
print(f"dtree Test score: {baggingdtree.score(*test_split)}")

etree Train score: 0.9821428571428571
etree Test score: 1.0
SVC/SVM Train score: 0.9196428571428571
SVC/SVM Test score: 0.9736842105263158
dtree Train score: 0.9375
dtree Test score: 1.0


## Load Data BreastCancer

In [33]:
# Load the breast cancer data set: x receives the features, y the labels.
x, y = load_breast_cancer(return_X_y=True)

# Same split settings as for the iris data: a quarter of the samples is held
# out for testing and the random_state is fixed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

## Modelling

In [34]:
# Bagging classifiers
# Base classifiers that get wrapped by the bagging ensembles below
dtree = DecisionTreeClassifier(max_depth=2, random_state=23)
etree = ExtraTreeClassifier()
svm = SVC()

# Hand each base classifier to its own bagging ensemble.
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4, whereas the first positional slot is the base
# estimator under both names. Each ensemble trains 5 estimators on bootstrap
# samples of 50 training rows.
bagging = BaggingClassifier(etree, n_estimators=5, max_samples=50, bootstrap=True)
baggingsvc = BaggingClassifier(svm, n_estimators=5, max_samples=50, bootstrap=True)
baggingdtree = BaggingClassifier(dtree, n_estimators=5, max_samples=50, bootstrap=True)

# Fit the models.
# NOTE: the ensembles receive no random_state, so scores can vary between runs.
bagging.fit(x_train, y_train)
baggingsvc.fit(x_train, y_train)
# Kept as the last bare expression so the notebook still displays the model.
baggingdtree.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
                                                        random_state=23),
                  max_samples=50, n_estimators=5)

In [35]:
# Report the accuracy of every ensemble on the training and the test portion.
# The (features, labels) pairs are bound once and unpacked into each score call.
train_split = (x_train, y_train)
test_split = (x_test, y_test)

# Bagged extra-tree classifier
print(f"etree Train score: {bagging.score(*train_split)}")
print(f"etree Test score: {bagging.score(*test_split)}")

# Bagged support vector classifier
print(f"SVC/SVM Train score: {baggingsvc.score(*train_split)}")
print(f"SVC/SVM Test score: {baggingsvc.score(*test_split)}")

# Bagged decision tree
print(f"dtree Train score: {baggingdtree.score(*train_split)}")
print(f"dtree Test score: {baggingdtree.score(*test_split)}")

etree Train score: 0.92018779342723
etree Test score: 0.916083916083916
SVC/SVM Train score: 0.8873239436619719
SVC/SVM Test score: 0.9370629370629371
dtree Train score: 0.9366197183098591
dtree Test score: 0.958041958041958
