# Decision trees:

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from graphviz import Source
from sklearn.tree import export_graphviz
import numpy as np
from mglearn import datasets

## Decision trees for classification:

In [None]:
# Load scikit-learn's bundled breast cancer dataset.
cancer = load_breast_cancer()
# Show which entries the returned object provides.
cancer.keys()

In [None]:
# Pull the feature matrix, the class labels and the feature names out of the
# dataset so later cells can use them directly.
data = cancer["data"]
target = cancer["target"]
print(target)
fnames = cancer["feature_names"]

In [None]:
# Tabular view of the dataset: one column per feature plus the class label
# appended as the last column.
dfc = pd.DataFrame(data, columns=fnames).assign(target=target)
dfc

### Depth stops when all leaves are pure (default):

In [None]:
# Split into training and test sets; random_state=0 makes the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(data,target,random_state=0)

In [None]:
# Tree without a depth limit (no max_depth given); random_state fixed for reproducibility.
tree = DecisionTreeClassifier(random_state=0)

In [None]:
# Fit the tree on the training split only.
tree.fit(X_train,y_train)

In [None]:
# Accuracy on the data the tree was trained on.
tree.score(X_train,y_train)

As expected, the score on the training dataset is 100 % because `DecisionTreeClassifier` keeps splitting until all leaves are pure.

In [None]:
# Accuracy on the held-out test split.
tree.score(X_test,y_test)

#### Understanding which features are most important for the decision (the importances always sum to 1):

In [None]:
# Per-feature importance values of the fitted tree (one entry per input feature).
fimportnace = tree.feature_importances_

In [None]:
# One integer x position per feature (0, 1, ..., n-1) so that every marker sits
# exactly on its tick label. Positions spread with np.linspace(0, n, n) are
# n/(n-1) apart and therefore drift away from the integer tick positions.
x = np.arange(len(fimportnace))
plt.plot(x,fimportnace,"o")
# Label each position with the corresponding feature name.
plt.xticks(x,fnames,rotation=90)
plt.title("feature importance")
plt.grid()

### Depth stops at 3:

In [None]:
# Pre-pruned tree: splitting stops at a depth of 3.
tree = DecisionTreeClassifier(random_state=0,max_depth=3)

In [None]:
# Fit the depth-limited tree on the training split.
tree.fit(X_train,y_train)

In [None]:
# Training accuracy of the depth-limited tree.
tree.score(X_train,y_train)

In [None]:
# Test accuracy of the depth-limited tree.
tree.score(X_test,y_test)

In [None]:
# Feature importances of the depth-limited tree (replaces the values from the unrestricted tree).
fimportnace = tree.feature_importances_

In [None]:
# Integer positions 0..n-1, shared by the markers and the tick labels, so each
# importance value lines up with its feature name. np.linspace(0, n, n) would
# place the markers n/(n-1) apart and shift them off the ticks.
x = np.arange(len(fimportnace))
plt.plot(x,fimportnace,"o")
plt.xticks(x,fnames,rotation=90)
plt.title("feature importance")
plt.grid()

### Plotting the score for different depths:

In [None]:
# Test-set accuracy of a freshly fitted tree for every maximum depth from 1 to 5.
x = [
    DecisionTreeClassifier(random_state=0, max_depth=depth)
    .fit(X_train, y_train)
    .score(X_test, y_test)
    for depth in range(1, 6)
]

In [None]:
# `x` holds the test scores computed for max_depth = 1..5; plot them against the depth.
plt.plot(range(1,6),x,"o")
plt.xlabel("depth")
plt.ylabel("score")
# Title spelling corrected ("Score", "different").
plt.title("Score for different depths")
plt.grid()

### Visualizing the tree structure:

In [None]:
# Write the fitted tree to "tree.dot" in Graphviz format.
# class_names are the labels shown in the rendered nodes ("benign" spelled correctly);
# impurity=False hides the impurity value, filled=True colours the nodes.
export_graphviz(
    tree,
    out_file="tree.dot",
    class_names=["malignant","benign"],
    feature_names=fnames,
    impurity=False,
    filled=True,
)

In [None]:
# NOTE(review): only `Source` (imported in the first cell) is used in the cells shown here;
# the bare `graphviz` module name does not appear to be needed - confirm before removing.
import graphviz
# Read the exported tree description back in as text; the file is closed on leaving the block.
with open("tree.dot") as f:
	dot_graph = f.read()

In [None]:
# Wrap the dot text in a graphviz Source object; as the cell's last expression it is displayed.
Source(dot_graph)

## Decision trees for regression:

In [None]:
# Load the RAM price data from a CSV file located relative to the notebook.
# The cells below use its "date" and "price" columns.
ram_prices = pd.read_csv("./data/ram_price.csv")
ram_prices


In [None]:
# Shorthand handles for the two columns used in the plots below.
date, price = ram_prices["date"], ram_prices["price"]

In [None]:
# Price history on a logarithmic price axis.
plt.plot(date, price)
ax = plt.gca()
ax.set_yscale("log")
ax.set_xlabel("Year")
ax.set_ylabel("Price in $/Mbyte")
ax.set_title("Price of ram over the years")
ax.grid()

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# use historical data to forecast prices after the year 2000
data_train = ram_prices[ram_prices.date < 2000]
data_test = ram_prices[ram_prices.date >= 2000]
data_train

In [None]:
# predict prices based on date
X_train = np.array(data_train.date)[:, np.newaxis]
# we use a log-transform to get a simpler relationship of data to target
y_train = np.log(data_train.price)

In [None]:
# Fit both regressors on the pre-2000 data (date -> log price) so they can be compared.
# NOTE: `tree` now refers to the regressor and no longer to the classifier fitted above.
tree = DecisionTreeRegressor().fit(X_train,y_train)
linear_reg = LinearRegression().fit(X_train,y_train)

In [None]:
# predict on all data
X_all = np.array(ram_prices.date)[:, np.newaxis]

In [None]:
# Predictions of both models for every date; the values are in log-price space
# because the models were fitted on np.log(price).
pred_tree = tree.predict(X_all)
pred_linear_ref  = linear_reg.predict(X_all)

In [None]:
# The models were fitted on log(price), so np.exp maps their predictions back to prices.
plt.plot(date, np.exp(pred_tree), label="Tree Regressor", linewidth=2, linestyle="--")
plt.plot(date,np.exp(pred_linear_ref),label="Linear Regression")
# Actual prices for comparison.
plt.plot(date,price,label="Real Data")
# Logarithmic price axis, as in the plot of the raw data.
plt.yscale("log")
plt.xlabel("Year")
plt.ylabel("Price in $/Mbyte")
plt.title("Price of ram over the years with prediction")
plt.grid()
# Show the labels given in the plot calls above.
plt.legend()


- Note: within the training range the tree's prediction (blue, dashed) is identical to the real data (green), which is why we don't see it there.
- Decision trees *are not able to predict* data that is outside the training data range!

# Random Forests:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from mglearn.plots import plot_tree_partition
from mglearn.plots import plot_2d_separator
from mglearn import discrete_scatter

## Random Forest for moons:

In [None]:
# importing dataset
X,y = make_moons(n_samples=100,noise=0.25,random_state=3)
# crating a DataFrame to better visualize the data
moonsdf  = pd.DataFrame(X,columns=["Feature 1","Feature 2"])
moonsdf["target"]= y

In [None]:
# Display the generated samples together with their labels.
moonsdf

In [None]:
# creating the training and test data
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,random_state=42)

In [None]:
# Creating and fitting the random forest model
rforest = RandomForestClassifier(n_estimators=5,random_state=2).fit(X_train,y_train)

In [None]:
# Plot of the trees and the final solution
fig, axes = plt.subplots(2,3,figsize=(20,10))
for i, (ax,tree) in enumerate(zip(axes.ravel(),rforest.estimators_)):
    ax.set_title("Tree {}".format(i))
    plot_tree_partition(X_train,y_train,tree,ax=ax)
plot_2d_separator(rforest, X_train,fill=True,ax=axes[-1,-1],alpha=0.4)
axes[-1,-1].set_title("Random Forest") 
discrete_scatter(X_train[:,0],X_train[:,1],y_train)


- We see the mistakes some of the trees make on data points that were not included in their training data because of the *bootstrap method*.

In [None]:
# Test data score
rforest.score(X_test,y_test)

## Random forest for breast cancer:

In [None]:
# loading the data
cancer = load_breast_cancer()
cancer.keys()

In [None]:
# Feature matrix and class labels of the breast cancer dataset.
data, target = cancer.data, cancer.target

In [None]:
# Split as in the decision tree section (random_state=0); this replaces the moons split.
X_train,X_test,y_train,y_test = train_test_split(data,target,random_state=0)

In [None]:
# Forest of 100 trees fitted on the breast cancer training split; random_state fixed for reproducibility.
rforest = RandomForestClassifier(n_estimators=100,random_state=0).fit(X_train,y_train)

In [None]:
# Feature importances of the fitted forest (one value per input feature).
fimportnace = rforest.feature_importances_
fimportnace

In [None]:
# Integer bar positions 0..n-1, shared by the bars and the tick labels, so every
# bar is centred on its feature name. np.linspace(0, n, n) would space the bars
# n/(n-1) apart and shift them progressively away from the ticks.
x = np.arange(len(fimportnace))
plt.bar(x,fimportnace)
plt.xticks(x,fnames,rotation=90)
plt.title("feature importance")
plt.ylabel("importance")
plt.xlabel("features")
plt.grid()

In [None]:
# Training accuracy of the forest.
rforest.score(X_train,y_train)

In [None]:
# Test accuracy of the forest.
rforest.score(X_test,y_test)

# Gradient boosted regression trees:

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with scikit-learn's default settings (only random_state is set),
# fitted on the breast cancer training split from the previous section.
gbrt = GradientBoostingClassifier(random_state=0).fit(X_train,y_train)

In [None]:
# Training accuracy of the default model.
gbrt.score(X_train,y_train)

In [None]:
# Test accuracy of the default model.
gbrt.score(X_test,y_test)

In [None]:
# Feature importances of the default gradient boosting model.
fimportnace = gbrt.feature_importances_

In [None]:
# Integer bar positions 0..n-1 so every bar is centred on its feature label;
# positions from np.linspace(0, n, n) are n/(n-1) apart and do not match the ticks.
x = np.arange(len(fimportnace))
plt.bar(x,fimportnace)
plt.xticks(x,fnames,rotation=90)
plt.title("feature importance")
plt.ylabel("importance")
plt.xlabel("features")
plt.grid()

## Pre-pruning by reducing max_depth:

In [None]:
# Stronger pre-pruning: every tree in the ensemble is limited to depth 1.
gbrt = GradientBoostingClassifier(random_state=0,max_depth=1).fit(X_train,y_train)

In [None]:
# Training accuracy with max_depth=1.
gbrt.score(X_train,y_train)

As expected, pre-pruning reduces the overfitting.

In [None]:
# Test accuracy with max_depth=1.
gbrt.score(X_test,y_test)

In [None]:
# Feature importances of the max_depth=1 model.
fimportnace = gbrt.feature_importances_

Limiting `max_depth` significantly increases the accuracy on the test set.

In [None]:
# Integer bar positions 0..n-1, used for both the bars and the tick labels, so
# each importance bar lines up with its feature name (np.linspace(0, n, n)
# would place the bars n/(n-1) apart, off the integer tick positions).
x = np.arange(len(fimportnace))
plt.bar(x,fimportnace)
plt.xticks(x,fnames,rotation=90)
plt.title("feature importance")
plt.ylabel("importance")
plt.xlabel("features")
plt.grid()

## Pre-pruning by reducing the learning rate:

In [None]:
# Alternative way to reduce model complexity: a smaller learning rate (0.01).
gbrt = GradientBoostingClassifier(random_state=0,learning_rate=0.01).fit(X_train,y_train)

In [None]:
# Training accuracy with learning_rate=0.01.
gbrt.score(X_train,y_train)

As expected, lowering the learning rate also reduces the overfitting.

In [None]:
# Test accuracy with learning_rate=0.01.
gbrt.score(X_test,y_test)

Lowering the learning rate also significantly increases the accuracy on the test set.