# Sprendimų medžiai ir atsitiktiniai miškai

In [None]:
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
import graphviz
import time
import numpy as np
from matplotlib import pyplot as plt


In [None]:
def printData(data, header=None, topic=None):
    """Print ``data`` as a fully expanded (untruncated) pandas table.

    Args:
        data: Anything ``pd.DataFrame`` accepts (a DataFrame, a dict of
            columns, a list of rows, ...).
        header: Optional column labels. When ``None`` or empty, the columns
            of ``data`` are used if it has a ``columns`` attribute; otherwise
            pandas chooses the labels itself.
        topic: Optional title printed on its own line before the table.

    Note:
        The pandas display options are changed globally and remain changed
        after the call returns.
    """
    # Use len() instead of truthiness so array-like headers (Index, ndarray)
    # do not raise on an ambiguous boolean test.
    if header is not None and len(header) > 0:
        columns = header
    else:
        # Plain lists/dicts have no ``.columns``; None lets pandas infer labels.
        columns = getattr(data, "columns", None)

    data_frame = pd.DataFrame(data, columns=columns)

    # Lift every truncation limit so the whole table is printed.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)

    if topic:
        print(topic)
    print(data_frame)
    print()

def confusionMatrix(test_keepAttr, predicted, classNamesSize, dtc):
    """Draw the confusion matrix of ``predicted`` against ``test_keepAttr``.

    Args:
        test_keepAttr: True target values of the test samples.
        predicted: Values predicted for the same samples.
        classNamesSize: Number of tick labels to generate.
        dtc: Fitted classifier; its ``classes_`` fixes the row/column order.

    The tick labels are not the class values themselves: they are
    ``classNamesSize`` evenly spaced numbers from 0 to 1, rounded to two
    decimals.
    """
    tick_labels = np.linspace(0, 1, classNamesSize).round(2)
    matrix = confusion_matrix(test_keepAttr, predicted, labels=dtc.classes_)

    ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=tick_labels,
    ).plot(cmap='gist_earth')

    plt.title("Susimaišymo matrica")
    plt.xlabel('Spėtos reikšmės')
    plt.ylabel('Teisingos reikšmės')
    plt.show()

def generateClassNames(data, column_name):
    """Build one display label per distinct value of ``data[column_name]``.

    The labels depend only on position, not on the values themselves: with
    ``n`` distinct values, label ``i`` is ``i / n`` formatted with one
    decimal, except the final label, which is the literal ``'1'``.

    Args:
        data: Table (e.g. a DataFrame) indexable by ``column_name``.
        column_name: Column whose distinct values are counted.

    Returns:
        A list of ``n`` label strings; empty when the column has no values.
    """
    class_count = len(data[column_name].unique())
    # The previous check compared the index against ``n`` (never reached
    # inside ``range(n)``), so the closing '1' label was never produced.
    last_index = class_count - 1
    return [
        '1' if index == last_index else '{:.1f}'.format(index / class_count)
        for index in range(class_count)
    ]

In [None]:
# --- Experiment configuration -------------------------------------------
input_file = 'data.csv'
prediction_attribute = 'quality'   # column the models are trained to predict
stringCols = ['Type']              # columns converted to integer category codes
learning_percentage = 60           # percent of each class sampled for learning

maxDepth = 10                      # depth limit used for the random forests
depths = list(range(2, 11, 2))     # depth limits tried for single trees: 2, 4, ..., 10
Trees = list(range(3, 10))         # forest sizes tried: 3, 4, ..., 9

# --- Load the dataset -----------------------------------------------------
data = pd.read_csv(input_file)

# --- Per-column overview: name, non-null count, missing count, number of
# --- distinct values and dtype.
summary_columns = {
    'Atributo pavadinimas': data.columns,
    'Kiekis (Eilučių sk.)': data.count(),
    'Trūkstamos reikšmės': data.isnull().sum(),
    'Kardinalumnas': data.nunique(),
    'Tipas': data.dtypes,
}
df = pd.DataFrame(summary_columns)

df

In [None]:
# Replace every column listed in stringCols with its integer category codes.
for text_column in stringCols:
    as_category = data[text_column].astype('category')
    data[text_column] = as_category.cat.codes

# Save the converted table, but only when there were columns to encode.
if stringCols:
    data.to_csv('data_fixed.csv', index=False)

In [None]:
# Quick sanity check: show the first five and the last five rows.
for preview in (data.head(5), data.tail(5)):
    print(preview)


## Duomenų rinkinio suskaidymas į apmokymo ir testavimo poaibius

In [None]:
# Stratified split: draw learning_percentage % of the rows of every class of
# prediction_attribute for learning; the remaining rows form the prediction set.
# GroupBy.sample returns the sampled rows with all columns intact, unlike
# groupby(...).apply(...), whose handling of the grouping column is deprecated
# in newer pandas releases.
learning_set = data.groupby(prediction_attribute) \
    .sample(frac=learning_percentage / 100)
# Rows are removed by index label, so this relies on `data` having a unique
# index (the default for a frame produced by read_csv).
prediction_set = data.drop(learning_set.index)

## Duomenų paruošimas apmokymui ir testavimui

In [None]:
# Separate the target column (Y) from the feature columns (X) in both subsets.
Y_train = learning_set[prediction_attribute]
X_train = learning_set.drop(columns=prediction_attribute)

Y_test = prediction_set[prediction_attribute]
X_test = prediction_set.drop(columns=prediction_attribute)

# Labels passed to the tree export and used to size the confusion matrix.
featureNames = data.columns.drop(prediction_attribute).tolist()
classNames = generateClassNames(data, prediction_attribute)


# Sprendimo medis

## Pilnas gylis

In [None]:
# Fit a gini decision tree without a depth limit, render it to a PNG, plot
# its confusion matrix and print a one-row summary of the test results.
dtc = DecisionTreeClassifier(criterion="gini")
# perf_counter() is the clock intended for measuring short intervals.
start = time.perf_counter()
dtc.fit(X_train, Y_train)
stop = time.perf_counter()
predicted = dtc.predict(X_test)
# normalize=False returns the number of correct predictions, not the fraction.
correct = accuracy_score(Y_test, predicted, normalize=False)

# Keep the DOT text in its own variable: storing it in `data` would overwrite
# the loaded dataset and break any earlier cell that is run again.
dot_source = tree.export_graphviz(dtc, feature_names=featureNames, class_names=classNames, filled=True)
image = graphviz.Source(dot_source, format="png")
image.render("Sprendimu_medis", cleanup=True)
confusionMatrix(Y_test, predicted, len(classNames), dtc)

dataToWrite = pd.DataFrame({
    "Teisingi spėjimai": [correct],
    "Blogi spėjimai": [(len(Y_test) - correct)],
    "Tikslumas": [(correct / len(Y_test)) * 100],
    "Užtruktas laikas": [stop - start],
})
# Turn the numeric columns into display strings (percent / seconds).
dataToWrite["Tikslumas"] = dataToWrite["Tikslumas"].map("{:.2f}%".format)
dataToWrite["Užtruktas laikas"] = dataToWrite["Užtruktas laikas"].map("{}s".format)
printData(dataToWrite, [], "Rezultatai, gauti naudojant sprendimo medį")

## Keletas skirtingų gylių

In [None]:
# Train one gini decision tree per depth limit in `depths` and tabulate, for
# each, the correct/incorrect prediction counts, accuracy and fitting time.
result_rows = []
for depth in depths:
    dtc = DecisionTreeClassifier(criterion="gini", max_depth=depth)

    # Only the fit itself is timed.
    start = time.time()
    dtc.fit(X_train, Y_train)
    stop = time.time()

    predicted = dtc.predict(X_test)
    # normalize=False gives the count of correct predictions.
    correct = accuracy_score(Y_test, predicted, normalize=False)
    test_size = len(Y_test)

    result_rows.append({
        "Gylis": depth,
        "Teisingi spėjimai": correct,
        "Blogi spėjimai": test_size - correct,
        "Tikslumas": (correct / test_size) * 100,
        "Užtruktas laikas": stop - start,
    })

dataToPrint = pd.DataFrame(result_rows)
# Show the accuracy as a percentage string with two decimals.
dataToPrint["Tikslumas"] = dataToPrint["Tikslumas"].map("{:.2f}%".format)
printData(dataToPrint, [], "Rezultatai, gauti naudojant sprendimo medį")


## Atsitiktinis miškas

In [None]:
# Fit a 5-tree random forest limited to maxDepth, render each member tree to
# its own PNG and print a one-row summary of the test results.
rfc = RandomForestClassifier(criterion="gini", max_depth=maxDepth, n_estimators=5)
# perf_counter() is the clock intended for measuring short intervals.
start = time.perf_counter()
rfc.fit(X_train, Y_train)
stop = time.perf_counter()

for count, est in enumerate(rfc.estimators_, 1):
    # Keep the DOT text in its own variable: storing it in `data` would
    # overwrite the loaded dataset and break earlier cells run again.
    dot_source = tree.export_graphviz(est, feature_names=featureNames, class_names=classNames, filled=True)
    image = graphviz.Source(dot_source, format="png")
    image.render(f"Miškas_medis_nr_{count}", cleanup=True)

predicted = rfc.predict(X_test)
# normalize=False returns the number of correct predictions, not the fraction.
correct = accuracy_score(Y_test, predicted, normalize=False)

dataToPrint = pd.DataFrame({
    "Teisingi spėjimai": [correct],
    "Blogi spėjimai": [(len(Y_test) - correct)],
    "Tikslumas": [(correct / len(Y_test)) * 100],
    "Užtruktas laikas": [stop - start],
})
# Show the accuracy as a percentage string with two decimals.
dataToPrint["Tikslumas"] = dataToPrint["Tikslumas"].map("{:.2f}%".format)
printData(dataToPrint, [], "Rezultatai, gauti naudojant mišką")

## Skirtingi sudarančių medžių kiekiai

In [None]:
# Train one random forest per size in `Trees` (all limited to maxDepth) and
# tabulate, for each, the prediction counts, accuracy and fitting time.
result_rows = []
# The loop variable must not be called `tree`: that name is the imported
# sklearn.tree module, and rebinding it to an int breaks every later
# tree.export_graphviz(...) call in the session.
for tree_count in Trees:
    rfc = RandomForestClassifier(criterion="gini", max_depth=maxDepth, n_estimators=tree_count)
    # perf_counter() is the clock intended for measuring short intervals.
    start = time.perf_counter()
    rfc.fit(X_train, Y_train)
    stop = time.perf_counter()

    predicted = rfc.predict(X_test)
    # normalize=False returns the number of correct predictions.
    correct = accuracy_score(Y_test, predicted, normalize=False)
    result_rows.append({
        "Medžių kiekis": tree_count,
        "Teisingi spėjimai": correct,
        "Blogi spėjimai": len(Y_test) - correct,
        "Tikslumas": (correct / len(Y_test)) * 100,
        "Užtruktas laikas": stop - start,
    })

# Build the table once instead of re-concatenating a frame on every iteration.
dataToPrint = pd.DataFrame(result_rows)
# Turn the numeric columns into display strings (percent / seconds).
dataToPrint["Tikslumas"] = dataToPrint["Tikslumas"].map("{:.2f}%".format)
dataToPrint["Užtruktas laikas"] = dataToPrint["Užtruktas laikas"].map("{}s".format)
printData(dataToPrint, [], "Rezultatai, gauti naudojant mišką")