In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Training Dataset/training datalist.csv')
display(df.head())

Unnamed: 0,ID,Sex,Age,Disease category,Narrow pitch range,Decreased volume,Fatigue,Dryness,Lumping,heartburn,...,Onset of dysphonia,Noise at work,Occupational vocal demand,Diabetes,Hypertension,CAD,Head and Neck Cancer,Head injury,CVA,Voice handicap index - 10
0,1202f15,2,39,1,1,1,1,1,1,0,...,2,3,1,0,0,0,0,0,0,22.0
1,0600ve0,1,69,2,1,1,1,1,0,0,...,2,1,3,0,0,0,0,0,1,19.0
2,1001o7l,2,59,2,1,1,1,1,0,0,...,2,3,4,0,0,0,0,0,0,18.0
3,1201c1t,2,47,1,1,0,1,1,1,0,...,3,1,1,0,0,0,0,0,0,27.0
4,0402jvt,1,87,1,0,0,0,0,0,0,...,1,1,4,0,1,0,0,0,0,16.0


In [3]:
# data cleaning
df['PPD'].fillna(0, inplace=True)
df.dropna(subset=['Voice handicap index - 10'], inplace=True)
# df.info()
df.reset_index(drop=True, inplace=True)

In [4]:
top_10_features = ['Age', 'Voice handicap index - 10', 'Onset of dysphonia ', 'Occupational vocal demand', 'Noise at work', 'Diurnal pattern', 'Choking', 'Sex', 'Smoking', 'Decreased volume']

In [5]:
from sklearn.model_selection import train_test_split

X = df[top_10_features]
y = df['Disease category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)


In [6]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100)
forest.fit(X_train.values, y_train.values)

y_pred = forest.predict(X_test.values)

In [7]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test.values, y_pred)
precision = precision_score(y_test.values, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test.values, y_pred, average='unweighted', zero_division=1)
f1 = f1_score(y_test.values, y_pred, average='weighted', zero_division=1)

print(f"Accuracy: {accuracy.round(2)}")
print(f"Precision: {precision.round(2)}")
print(f"Recall: {recall.round(2)}")
print(f"F1: {f1.round(2)}")

print(classification_report(y_test.values, y_pred, zero_division=1))
print(confusion_matrix(y_test.values, y_pred))

Accuracy: 0.63
Precision: 0.6
Recall: 0.63
F1: 0.6
              precision    recall  f1-score   support

           1       0.67      0.87      0.76       153
           2       0.48      0.41      0.44        64
           3       0.66      0.46      0.54        59
           4       0.00      0.00      0.00        11
           5       0.50      0.09      0.15        11

    accuracy                           0.63       298
   macro avg       0.46      0.36      0.38       298
weighted avg       0.60      0.63      0.60       298

[[133  10   8   2   0]
 [ 31  26   6   0   1]
 [ 18  14  27   0   0]
 [  8   3   0   0   0]
 [  9   1   0   0   1]]


In [8]:
# print the error position
print("(test, pred)")
for i in range(len(y_test)):
    if y_test.values[i] != y_pred[i]:
        print((y_test.values[i], y_pred[i]))

(test, pred)
(2, 3)
(2, 1)
(2, 1)
(3, 1)
(4, 1)
(2, 1)
(2, 3)
(3, 2)
(2, 1)
(3, 1)
(5, 1)
(3, 1)
(2, 1)
(1, 4)
(1, 2)
(1, 2)
(3, 1)
(4, 1)
(3, 2)
(1, 2)
(3, 1)
(2, 3)
(2, 1)
(1, 3)
(2, 1)
(2, 1)
(2, 1)
(3, 2)
(1, 2)
(2, 1)
(2, 1)
(3, 2)
(1, 2)
(1, 2)
(5, 1)
(2, 1)
(5, 1)
(4, 1)
(4, 1)
(1, 3)
(2, 1)
(3, 2)
(3, 1)
(2, 1)
(5, 2)
(5, 1)
(3, 2)
(3, 1)
(3, 2)
(1, 3)
(2, 1)
(3, 1)
(2, 1)
(5, 1)
(2, 1)
(2, 1)
(3, 2)
(3, 1)
(3, 1)
(1, 3)
(2, 1)
(2, 5)
(3, 2)
(5, 1)
(1, 3)
(3, 1)
(4, 2)
(2, 3)
(1, 2)
(2, 1)
(4, 1)
(4, 2)
(3, 1)
(3, 2)
(1, 2)
(2, 1)
(4, 2)
(1, 4)
(1, 3)
(4, 1)
(5, 1)
(2, 1)
(3, 1)
(1, 2)
(3, 2)
(2, 1)
(3, 1)
(5, 1)
(3, 2)
(3, 2)
(3, 1)
(2, 1)
(1, 2)
(4, 1)
(2, 1)
(2, 1)
(2, 1)
(3, 1)
(3, 1)
(2, 1)
(3, 1)
(2, 1)
(2, 1)
(2, 3)
(1, 3)
(2, 1)
(3, 2)
(4, 1)
(2, 3)
(5, 1)
(1, 3)


In [9]:
# plot tree
from sklearn import tree
import graphviz

# Select an individual decision tree from the Random Forest
selected_tree = forest.estimators_[0]

# Calculate the deppth of the selected decision tree
depth = selected_tree.get_depth()
print(depth)

# Plot the selected decision tree
dot_data = tree.export_graphviz(selected_tree, out_file=None,
                                feature_names=X_train.columns,
                                class_names=y_train.unique().astype(str),
                                filled=True, rounded=True,
                                special_characters=True)

graph = graphviz.Source(dot_data)
graph.render("random_forest_tree_full_data")  # Save the tree visualization as a file
graph.view()  # Display the tree visualization

15


'random_forest_tree_full_data.pdf'