# Decision Tree

In [34]:
# import preprocessed data
%run Preprocessed_Data.ipynb

# change the pathology to discrete values (0, 1, 2)
label = LabelEncoder()
train_calc['pathology'] = label.fit_transform(train_calc['pathology'])
test_calc['pathology'] = label.fit_transform(test_calc['pathology'])
train_mass['pathology'] = label.fit_transform(train_mass['pathology'])
test_mass['pathology'] = label.fit_transform(test_mass['pathology'])

In [35]:
import sklearn.tree as tree
import graphviz

## Calc Models for 'overall BI-RADS assessment'

In [56]:
# Decision Tree with no pruning
assessment_calc = tree.DecisionTreeClassifier(criterion='entropy')
assessment_calc = assessment_calc.fit(train_calc.drop(['pathology', 'overall BI-RADS assessment'], axis=1), train_calc['overall BI-RADS assessment'])

print('Score Against Training Data: ', assessment_calc.score(train_calc.drop(['pathology', 'overall BI-RADS assessment'], axis=1), train_calc['overall BI-RADS assessment']))
print('Score Against Test Data: ', assessment_calc.score(test_calc.drop(['pathology', 'overall BI-RADS assessment'], axis=1), test_calc['overall BI-RADS assessment']))

Score Against Training Data:  0.924886191198786
Score Against Test Data:  0.6678700361010831


In [59]:
# print tree information
print("Number of Leaves: ", assessment_calc.get_n_leaves())
print("Depth of Tree: ", assessment_calc.get_depth())

Number of Leaves:  153
Depth of Tree:  15


In [61]:
# export the tree to a pdf file
graph = tree.export_graphviz(assessment_calc, out_file=None, feature_names=train_calc.drop(['pathology', 'overall BI-RADS assessment'], axis=1).columns, class_names=['1', '2', '3', '4', '5'], filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(graph)
graph.render('calc_tree')

'calc_tree.pdf'

In [77]:
# Decision Tree with pruning
assessment_calc_pruned = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf=10, max_depth=5)
assessment_calc_pruned = assessment_calc_pruned.fit(train_calc.drop(['pathology', 'overall BI-RADS assessment'], axis=1), train_calc['overall BI-RADS assessment'])

print('Score Against Training Data: ', assessment_calc_pruned.score(train_calc.drop(['pathology', 'overall BI-RADS assessment'], axis=1), train_calc['overall BI-RADS assessment']))
print('Score Against Test Data: ', assessment_calc_pruned.score(test_calc.drop(['pathology', 'overall BI-RADS assessment'], axis=1), test_calc['overall BI-RADS assessment']))

Score Against Training Data:  0.8437025796661608
Score Against Test Data:  0.6768953068592057


In [98]:
# export the tree to a pdf file
graph = tree.export_graphviz(assessment_calc_pruned, out_file=None, feature_names=train_calc.drop(['pathology', 'overall BI-RADS assessment'], axis=1).columns, class_names=['1', '2', '3', '4', '5'], filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(graph)
graph.render('calc_tree_pruned')

'calc_tree_pruned.pdf'

In [80]:
# Decision Tree without subtlety
assessment_calc_no_subtlety = tree.DecisionTreeClassifier(criterion='entropy')
assessment_calc_no_subtlety = assessment_calc_no_subtlety.fit(train_calc.drop(['pathology', 'overall BI-RADS assessment', 'subtlety'], axis=1), train_calc['overall BI-RADS assessment'])

print('Score Against Training Data: ', assessment_calc_no_subtlety.score(train_calc.drop(['pathology', 'overall BI-RADS assessment', 'subtlety'], axis=1), train_calc['overall BI-RADS assessment']))
print('Score Against Test Data: ', assessment_calc_no_subtlety.score(test_calc.drop(['pathology', 'overall BI-RADS assessment', 'subtlety'], axis=1), test_calc['overall BI-RADS assessment']))

Score Against Training Data:  0.8861911987860395
Score Against Test Data:  0.6624548736462094


Pruning and/or setting a max depth seem to have a minor benefit to score. Another way of looking at it is that having a huge/complex decision doesn't really improve prediction power over a simpler one.

## Mass Models for 'overall BI-RADS assessment'

In [91]:
assessment_mass = tree.DecisionTreeClassifier(criterion='entropy')
assessment_mass = assessment_mass.fit(train_mass.drop(['pathology', 'overall BI-RADS assessment'], axis=1), train_mass['overall BI-RADS assessment'])

print('Score Against Training Data: ', assessment_mass.score(train_mass.drop(['pathology', 'overall BI-RADS assessment'], axis=1), train_mass['overall BI-RADS assessment']))
print('Score Against Test Data: ', assessment_mass.score(test_mass.drop(['pathology', 'overall BI-RADS assessment'], axis=1), test_mass['overall BI-RADS assessment']))

Score Against Training Data:  0.7754172989377845
Score Against Test Data:  0.6031746031746031


In [101]:
# export the tree to a pdf file
graph = tree.export_graphviz(assessment_mass, out_file=None, feature_names=train_mass.drop(['pathology', 'overall BI-RADS assessment'], axis=1).columns, class_names=['1', '2', '3', '4', '5'], filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(graph)
graph.render('mass_tree')

'mass_tree.pdf'

In [96]:
# Decision Tree with pruning
assessment_mass_pruned = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5)
assessment_mass_pruned = assessment_mass_pruned.fit(train_mass.drop(['pathology', 'overall BI-RADS assessment'], axis=1), train_mass['overall BI-RADS assessment'])

print('Score Against Training Data: ', assessment_mass_pruned.score(train_mass.drop(['pathology', 'overall BI-RADS assessment'], axis=1), train_mass['overall BI-RADS assessment']))
print('Score Against Test Data: ', assessment_mass_pruned.score(test_mass.drop(['pathology', 'overall BI-RADS assessment'], axis=1), test_mass['overall BI-RADS assessment']))

Score Against Training Data:  0.6206373292867982
Score Against Test Data:  0.5978835978835979


In [97]:
# export the tree to a pdf file
graph = tree.export_graphviz(assessment_mass_pruned, out_file=None, feature_names=train_mass.drop(['pathology', 'overall BI-RADS assessment'], axis=1).columns, class_names=['1', '2', '3', '4', '5'], filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(graph)
graph.render('mass_tree_pruned')

IndexError: list index out of range

## Calc Model for 'pathology'

In [89]:
model = tree.DecisionTreeClassifier()
model.fit(train_calc.drop('pathology', axis=1), train_calc['pathology'])

model.score(test_calc.drop('pathology', axis=1), test_calc['pathology'])

0.720216606498195

In [90]:
# export the tree to a pdf file
data = tree.export_graphviz(model, out_file=None, feature_names=train_calc.drop('pathology', axis=1).columns, class_names=['benign without callback', 'benign', 'malignant'], filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(data)
graph.render("calc_pathology_tree")

'calc_pathology_tree.pdf'

## Mass Model for 'pathology'

In [62]:
model = tree.DecisionTreeClassifier()
model.fit(train_mass.drop('pathology', axis=1), train_mass['pathology'])

model.score(test_mass.drop('pathology', axis=1), test_mass['pathology'])

0.708994708994709

In [65]:
# export the tree to a pdf file
data = tree.export_graphviz(model, out_file=None, feature_names=train_mass.drop('pathology', axis=1).columns, class_names=['benign without callback', 'benign', 'malignant'], filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(data)
graph.render("mass_pathology_tree")

'mass_pathology_tree.pdf'