# IS-02 Machine Learning - Data and Web Science
## Lecture 3: Decision Trees
## Project 2 - Decision Tree
### <i>Avgitidis Konstantinos </i>

In [1]:
# Importing all necessary libraries
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz/bin/'
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, metrics, tree, model_selection
import graphviz
import pandas as pd
import numpy as np
import random

In [2]:
# Load the breast cancer dataset; as_frame=True makes the loader return
# pandas containers for the features and the target.
breastCancer = datasets.load_breast_cancer(as_frame=True)

In [3]:
# Labels come straight from the dataset; the feature matrix is cut down to
# columns 5..11 (7 columns) so the trees have fewer features to overfit on.
target_df = breastCancer.target
breast_df = breastCancer.data.to_numpy()[:, 5:12]

In [4]:
# Build (or reload) the table of Decision Tree models and their scores.
# Fitting is cached on disk so a "Restart & Run All" does not refit every tree.
# Both the metrics table (`models`) and the fitted estimators (`models_`) are
# cached: the plotting code further down needs the estimators, not just the scores.
import pickle

METRICS_CACHE = "decision_tree.p"            # pickled metrics DataFrame (`models`)
ESTIMATORS_CACHE = "decision_tree_models.p"  # pickled list of fitted trees (`models_`)
N_MODELS = 20                                # number of random trees to train
SEED = 0                                     # makes the random hyper-parameters and splits repeatable

criterions = ['gini','entropy']
columns = ['Algorithm','Criterion','Max Depth','accuracy_score','precision_score','recall_score','f1_score','accuracy_score_train','precision_score_train','recall_score_train','f1_score_train']


def _load_pickle(path):
    """Return the object pickled at `path`, or None when it is missing or unreadable.

    NOTE: pickle.load executes arbitrary code from the file; only point this at
    cache files written by this notebook.
    """
    try:
        with open(path, "rb") as cache_file:
            return pickle.load(cache_file)
    except FileNotFoundError:
        # First run: there is simply no cache yet.
        return None
    except Exception as exc:
        # A corrupt or incompatible cache should trigger a retrain, not a crash,
        # but the reason is reported instead of being silently swallowed.
        print(f"Ignoring unreadable cache {path!r}: {exc!r}")
        return None


def _score_predictions(y_true, y_pred):
    """Return [accuracy, precision, recall, f1] for one set of predictions."""
    return [
        metrics.accuracy_score(y_true, y_pred),
        metrics.precision_score(y_true, y_pred),
        metrics.recall_score(y_true, y_pred),
        metrics.f1_score(y_true, y_pred),
    ]


models = _load_pickle(METRICS_CACHE)
models_ = _load_pickle(ESTIMATORS_CACHE)

# The cache is usable only if the scores and the fitted trees are both present
# and describe the same models; a metrics-only cache cannot be used because the
# fitted trees cannot be rebuilt from the scores.
cache_is_usable = (
    isinstance(models, pd.DataFrame)
    and isinstance(models_, list)
    and len(models_) > 0
    and len(models_) == len(models)
)

if not cache_is_usable:
    rng = random.Random(SEED)  # local generator: does not disturb the global `random` state

    # The split uses a fixed random_state, so it is identical for every model;
    # compute it once instead of once per iteration.
    x_train, x_test, y_train, y_test = model_selection.train_test_split(breast_df, target_df,random_state=0)

    rows = []
    models_ = []
    for i in range(N_MODELS):
        criterion = rng.choice(criterions)
        max_depth = rng.randint(1,10)
        # A distinct random_state per tree keeps splitter='random' reproducible
        # while still letting trees with equal hyper-parameters differ.
        model = DecisionTreeClassifier(criterion=criterion,max_depth=max_depth,splitter='random',random_state=SEED + i)
        model.fit(x_train, y_train)
        models_.append(model)

        y_predicted = model.predict(x_test)
        y_predicted_train = model.predict(x_train)
        rows.append(
            ['Decision Tree', criterion, max_depth]
            + _score_predictions(y_test, y_predicted)
            + _score_predictions(y_train, y_predicted_train)
        )

    # Build the frame in one go: rows get a clean 0..N-1 index and no
    # row-by-row DataFrame growth is needed.
    models = pd.DataFrame(rows, columns=columns)

    with open(METRICS_CACHE, "wb") as cache_file:
        pickle.dump(models, cache_file)
    with open(ESTIMATORS_CACHE, "wb") as cache_file:
        pickle.dump(models_, cache_file)

In [5]:
# Export the metrics table to a spreadsheet: no index column, floats written
# with six decimals, and the header row frozen.
models.to_excel(
    "DecisionTrees.xlsx",
    sheet_name="Decision Tree",
    index=False,
    float_format="%.6f",
    freeze_panes=(1, 0),
)

In [6]:
# Render every fitted tree held in `models_` to breastCancerTreePlot<k>.png
# (k starts at 1). Iterating over the list itself, rather than a fixed
# range(20), avoids an IndexError when fewer trees are available in memory.
if not models_:
    print("No fitted trees in `models_`; nothing to render.")

# Loop-invariant labels: columns 5..11 are the features the trees were trained on.
feature_names = breastCancer.data.columns[5:12]
class_names = breastCancer.target_names

for plot_number, fitted_tree in enumerate(models_, start=1):
    dot_data = tree.export_graphviz(fitted_tree,feature_names = feature_names,class_names = class_names)
    graph = graphviz.Source(dot_data)
    graph.format = 'png'
    graph.render(f"breastCancerTreePlot{plot_number}")