# Decision Tree

### Load Data

In [1]:
import pandas as pd
df = pd.read_csv("healthcare-dataset-stroke-data.csv", header=0)
print("rows of original dataset:", len(df))
print(df.head())

rows of original dataset: 5110
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  


### handle missing values

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="mean")
df["bmi"] = imputer.fit_transform(df[["bmi"]])

### Encode categorical variables

In [6]:
label_encoder = LabelEncoder()
df["gender"] = label_encoder.fit_transform(df["gender"])
df["ever_married"] = label_encoder.fit_transform(df["ever_married"])
df["work_type"] = label_encoder.fit_transform(df["work_type"])
df["Residence_type"] = label_encoder.fit_transform(df["Residence_type"])
df["smoking_status"] = label_encoder.fit_transform(df["smoking_status"])

### Split Train data and Test data

In [4]:
X = df.drop(columns=["id", "stroke"])
y = df["stroke"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### training classifier

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

### Evaluate

In [8]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, y_pred))

Accuracy: 89.89%
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      1444
           1       0.18      0.20      0.19        89

    accuracy                           0.90      1533
   macro avg       0.56      0.57      0.57      1533
weighted avg       0.91      0.90      0.90      1533



### Run the model for trees of depth 1, 2, 3, 4, 5, and 6 and for the Gini Impurity and Entropy impurity measures for each tree depth.

In [9]:
depths = [1, 2, 3, 4, 5, 6]
criteria = ['gini', 'entropy']
results = []
for criterion in criteria:
    for depth in depths:
        clf = DecisionTreeClassifier(criterion=criterion, max_depth=depth, random_state=42)
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        
        results.append({
            'Criterion': criterion,
            'Depth': depth,
            'Accuracy': accuracy
        })

results_df = pd.DataFrame(results)
print(results_df)

   Criterion  Depth  Accuracy
0       gini      1  0.941944
1       gini      2  0.941944
2       gini      3  0.941944
3       gini      4  0.936725
4       gini      5  0.936725
5       gini      6  0.932159
6    entropy      1  0.941944
7    entropy      2  0.941944
8    entropy      3  0.941944
9    entropy      4  0.940639
10   entropy      5  0.939335
11   entropy      6  0.934116


### Visualize the Tree

In [23]:
best_clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)
best_clf.fit(X_train, y_train)

In [24]:
from sklearn.tree import export_graphviz

class_names = list(map(str, y.unique()))

export_graphviz(
    best_clf,
    out_file='tree_nonlimited.dot',
    feature_names=X.columns,
    class_names=class_names,
    rounded=True,
    proportion=False,
    precision=2,
    filled=True
)


In [None]:
!dot -Tpng tree_nonlimited.dot -o tree_nonlimited.png -Gdpi=600

In [None]:
from IPython.display import Image
Image(filename = 'tree_nonlimited.png')