## Imports

In [9]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function

## Data

In [10]:
# Read the breast-cancer table from its CSV file (path is relative to the notebook).
data_path = './data/breast-cancer/breast-cancer.data'
df = pd.read_csv(data_path)

# Show the first rows as a quick check that the columns parsed as expected.
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


### Preprocessing

In [11]:
def _bin_midpoint(label):
    """Return the midpoint of a 'low-high' range label as a float (e.g. '30-39' -> 34.5)."""
    bounds = label.split('-')
    low = float(bounds[0])
    high = float(bounds[1])
    return low + (high - low) / 2


# NOTE: this cell rebinds `df` and removes the one-hot source columns, so it has
# to be run on a freshly loaded frame; re-running it as-is raises a KeyError.

# 1) Replace the '?' placeholder in these columns with the column's most frequent value.
attributes_missing = ['node-caps', 'breast-quad']
for column in attributes_missing:
    most_common = df[column].value_counts().idxmax()
    df[column] = df[column].replace('?', most_common)

# 2) One-hot encode the nominal columns. The source columns are dropped and the
#    '<column>_<value>' indicator columns are appended after the remaining ones.
attributes_to_onehot = ['menopause', 'breast', 'breast-quad']
df = pd.get_dummies(df, columns=attributes_to_onehot)

# 3) Encode the yes/no columns as 1/0.
binary_attributes = ['node-caps', 'irradiat']
yes_no_codes = {'yes': 1, 'no': 0}
for column in binary_attributes:
    df[column] = df[column].map(yes_no_codes)

# 4) Turn each 'low-high' range label into its midpoint, then min-max scale the
#    column to the [0, 1] interval.
attributes_to_normalize = ['age', 'tumor-size', 'inv-nodes']
for column in attributes_to_normalize:
    midpoints = df[column].map(_bin_midpoint)
    lowest, highest = midpoints.min(), midpoints.max()
    df[column] = (midpoints - lowest) / (highest - lowest)

# 5) Encode the target label as 0/1. The lookup is strict on purpose: a label
#    that is not in the dictionary raises KeyError instead of becoming NaN.
classes_dict = {'no-recurrence-events': 0, 'recurrence-events' : 1}
df['class'] = df['class'].map(classes_dict.__getitem__)

In [12]:
# The label is the 'class' column; the features are every other column of df.
Y = df['class']
X = df.drop('class', axis=1)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

## Tree model

In [13]:
# Create the decision tree classifier. scikit-learn randomly permutes the
# features at each split, so equally good splits can be chosen differently from
# run to run; fixing random_state (the same seed used for the train/test split)
# makes the fitted tree, and therefore the reported accuracy, reproducible.
clf = DecisionTreeClassifier(random_state=1)

# Train the classifier on the training partition (fit returns the estimator itself).
clf = clf.fit(X_train, Y_train)

## Model evaluation

In [14]:
# Score the classifier on the held-out set with a single vectorised predict call.
# Passing X_test as a DataFrame keeps the column names the model was fitted
# with, and no loop variable overwrites the notebook-level X / Y any more.
y_pred = clf.predict(X_test)

# Keep the original counters (as floats) in case later cells read them.
true_preds = float((y_pred == Y_test.to_numpy()).sum())
num_preds = float(len(Y_test))

# Accuracy = fraction of test rows whose predicted class equals the true class.
acc = true_preds / num_preds
print(f"Accuracy of the model: {100.0*acc:4.2f}%")

Accuracy of the model: 65.12%


