In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the CSV at the given notebook-relative path into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../0. Data/data_processed.csv')

In [3]:
# 'Unnamed: 0' is presumably the row index that was written out when the CSV
# was saved - TODO confirm against the preprocessing notebook.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

# Podela na ulazne i ciljne atribute

In [4]:
# Target vector: the 'Diabetes_012' column.
Y = data['Diabetes_012']
# Feature matrix: every remaining column.
X = data.drop(columns=['Diabetes_012'])

# Podela na trening i test skup

In [5]:
# Hold out 25% of the rows for testing; stratify on Y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    stratify=Y,
    random_state=44,
)

# Pomoćne funkcije

In [6]:
# Sa časova vežbi

# !pip install termcolor
from sklearn.metrics import classification_report
from termcolor import colored

def report(model, x, y, text = "training"):
    """Print a classification report and a confusion matrix for a fitted model.

    Parameters
    ----------
    model : object
        Fitted classifier; only its ``predict`` method and class name are used.
    x : array-like
        Feature matrix handed to ``model.predict``.
    y : array-like
        True labels corresponding to the rows of ``x``.
    text : str, default "training"
        Name of the data split; used only in the printed headings.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    separator = "---------------------------------------------------------------------------------"
    model_name = type(model).__name__
    y_pred = model.predict(x)

    # Sorted union of the true and predicted labels. Naming the rows/columns
    # from the data (instead of a fixed ['0', '1', '2']) keeps the DataFrame
    # construction valid for any number of classes.
    labels = np.unique(np.concatenate([np.asarray(y).ravel(), np.asarray(y_pred).ravel()]))
    label_names = [str(label) for label in labels]

    print(colored("Classification report for model {} on {} data".format(model_name, text), "green"))
    print(separator)
    # zero_division=0 reports 0.0 for classes that are never predicted without
    # emitting an UndefinedMetricWarning for each affected metric.
    print(classification_report(y, y_pred, zero_division=0))
    print(separator)

    print(colored("Confusion matrix for model {} on {} data ".format(model_name, text), "green"))
    print(separator)
    # Pass the same labels explicitly so the matrix axes match label_names.
    matrix = confusion_matrix(y, y_pred, labels=labels)
    print(pd.DataFrame(matrix, columns=label_names, index=label_names))
    print(separator)
    

def plot_decision_tree(model, feature_names, figsize=(7, 7)):
    """Draw a fitted decision tree and print the model's hyper-parameters.

    Parameters
    ----------
    model : object
        Fitted decision-tree estimator; must provide ``get_depth``,
        ``get_n_leaves`` and ``get_params``.
    feature_names : sequence of str
        Column names forwarded to ``plot_tree`` for labelling the splits.
    figsize : tuple of (float, float), default (7, 7)
        Size of the matplotlib figure in inches.

    Returns
    -------
    None
        The figure is shown and the parameters are printed.
    """
    separator = "---------------------------------------------------------------------------------"

    # Take the class names from the fitted model when it exposes them; fall
    # back to the original three-class labels otherwise.
    fitted_classes = getattr(model, "classes_", None)
    if fitted_classes is None:
        class_names = ['0', '1', '2']
    else:
        class_names = [str(label) for label in fitted_classes]

    plt.figure(figsize=figsize)
    plot_tree(model, class_names=class_names, feature_names=feature_names, filled=True)
    # get_n_leaves() counts leaves only, so the title says "leaves" rather
    # than "nodes".
    plt.title("Decision tree of depth {} with {} leaves".format(model.get_depth(), model.get_n_leaves()))

    plt.show()
    print(separator)
    print(colored("Parameters of model {}".format(type(model).__name__), "green"))
    for param_name, param_value in model.get_params().items():
        print(colored(param_name, 'blue'), param_value)

    print(separator)

# Slučajne šume

In [7]:
# Random forest with 100 trees, each limited to depth 6; the seed is fixed so
# repeated runs build the same forest.
# NOTE(review): the recorded reports below show class 1 is never predicted -
# consider whether class weighting or resampling is wanted here.
random_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=44,
)

In [8]:
# Fit the forest on the training split.
random_forest.fit(X=X_train, y=Y_train)

In [9]:
# Evaluate on the data the model was trained on.
report(model=random_forest, x=X_train, y=Y_train, text="training")

[32mClassification report for model RandomForestClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.85      1.00      0.92    160277
           1       0.00      0.00      0.00      3473
           2       0.66      0.03      0.06     26510

    accuracy                           0.84    190260
   macro avg       0.50      0.34      0.33    190260
weighted avg       0.80      0.84      0.78    190260

---------------------------------------------------------------------------------
[32mConfusion matrix for model RandomForestClassifier on training data [0m
---------------------------------------------------------------------------------
        0  1    2
0  159887  0  390
1    3437  0   36
2   25684  0  826
---------------------------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Evaluate on the held-out test split.
report(model=random_forest, x=X_test, y=Y_test, text="test")

[32mClassification report for model RandomForestClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.85      1.00      0.92     53426
           1       0.00      0.00      0.00      1158
           2       0.65      0.03      0.05      8836

    accuracy                           0.84     63420
   macro avg       0.50      0.34      0.32     63420
weighted avg       0.80      0.84      0.78     63420

---------------------------------------------------------------------------------
[32mConfusion matrix for model RandomForestClassifier on test data [0m
---------------------------------------------------------------------------------
       0  1    2
0  53305  0  121
1   1145  0   13
2   8585  0  251
---------------------------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
