In [1]:
from numpy import mean
from numpy import std
import numpy as np
import os
import pandas as pd
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeRegressor
import networkx as nx

In [2]:
# Number of nodes per graph: selects which dataset_<N>_{train,val,test}.csv
# files are read and fixes the size of the adjacency matrices built later on.
NUMBER_NODES = 7

def load_data():
    """Read the train/val/test CSV splits and return them as feature/target arrays.

    Each CSV row is split into the first ``(N*N - N) // 2`` columns (features)
    and the columns from index ``(N*N - N) // 2 + 1`` onwards (targets), where
    ``N`` is ``NUMBER_NODES``. Both parts are cast to float32 per row.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``. The validation rows are
        appended to the training arrays; no separate validation set is returned.
    """
    train_df = pd.read_csv(os.path.join('..', 'datasets', f'dataset_{NUMBER_NODES}_train.csv'))
    val_df = pd.read_csv(os.path.join('..', 'datasets', f'dataset_{NUMBER_NODES}_val.csv'))
    test_df = pd.read_csv(os.path.join('..', 'datasets', f'dataset_{NUMBER_NODES}_test.csv'))

    # Size of the strict upper triangle of an N x N matrix.
    n_features = (NUMBER_NODES * NUMBER_NODES - NUMBER_NODES) // 2

    def to_arrays(frame, **array_kwargs):
        """Split every row of ``frame`` into (features, targets) and stack them."""
        features, targets = [], []
        for row in frame.to_numpy():
            features.append(row[:n_features].astype('float32'))
            # NOTE(review): the original comment said the optimal bandwidth is
            # included at position 0 of the target, yet this slice starts at
            # n_features + 1 and therefore leaves column n_features out.
            # Confirm which of the two is intended.
            targets.append(row[n_features + 1:].astype('float32'))
        return np.array(features, **array_kwargs), np.array(targets, **array_kwargs)

    # Train and test are stacked as object arrays, validation with the
    # inferred dtype -- kept exactly as before.
    x_train, y_train = to_arrays(train_df, dtype=object)
    x_test, y_test = to_arrays(test_df, dtype=object)
    x_val, y_val = to_arrays(val_df)

    # Validation rows are folded into the training set.
    x_train = np.concatenate((x_train, x_val))
    y_train = np.concatenate((y_train, y_val))

    return x_train, y_train, x_test, y_test

In [3]:
# Load the splits once; the training arrays already contain the validation rows.
x_train, y_train, x_test, y_test = load_data()

In [24]:
# One unfitted regressor per candidate depth (2, 4, ..., 22), keyed 'model_<depth>'.
max_depths = np.arange(2, 24, 2)
models = {
    f'model_{depth}': DecisionTreeRegressor(max_depth=depth)
    for depth in max_depths
}

print(models)

{'model_2': DecisionTreeRegressor(max_depth=2), 'model_4': DecisionTreeRegressor(max_depth=4), 'model_6': DecisionTreeRegressor(max_depth=6), 'model_8': DecisionTreeRegressor(max_depth=8), 'model_10': DecisionTreeRegressor(max_depth=10), 'model_12': DecisionTreeRegressor(max_depth=12), 'model_14': DecisionTreeRegressor(max_depth=14), 'model_16': DecisionTreeRegressor(max_depth=16), 'model_18': DecisionTreeRegressor(max_depth=18), 'model_20': DecisionTreeRegressor(max_depth=20), 'model_22': DecisionTreeRegressor(max_depth=22)}


In [25]:
# The splitter has a fixed integer seed, so a single instance produces the
# same 10-fold x 3-repeat partitions for every candidate model.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=2652124)
for model in models.values():
    # The scorer reports negated MAE; take the magnitude before summarising.
    n_scores = absolute(
        cross_val_score(model, x_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    )
    print(f'MAE Error: {mean(n_scores):.3f} ({std(n_scores):.3f}) - {model}')

MAE Error: 1.563 (0.026) - DecisionTreeRegressor(max_depth=2)
MAE Error: 1.524 (0.033) - DecisionTreeRegressor(max_depth=4)
MAE Error: 1.506 (0.048) - DecisionTreeRegressor(max_depth=6)
MAE Error: 1.499 (0.061) - DecisionTreeRegressor(max_depth=8)
MAE Error: 1.511 (0.087) - DecisionTreeRegressor(max_depth=10)
MAE Error: 1.525 (0.089) - DecisionTreeRegressor(max_depth=12)
MAE Error: 1.537 (0.094) - DecisionTreeRegressor(max_depth=14)
MAE Error: 1.554 (0.090) - DecisionTreeRegressor(max_depth=16)
MAE Error: 1.544 (0.096) - DecisionTreeRegressor(max_depth=18)
MAE Error: 1.552 (0.093) - DecisionTreeRegressor(max_depth=20)
MAE Error: 1.545 (0.097) - DecisionTreeRegressor(max_depth=22)


In [26]:
# Fit every candidate tree on the combined train+validation data so the
# evaluation cell can call predict() on each of them.
for model in models.values():
    model.fit(x_train, y_train)

In [27]:
def count_repeats(output):
    """Count how many labels of a prediction collide after rounding.

    Parameters
    ----------
    output : array-like
        Predicted labels for one graph, one value per node.

    Returns
    -------
    int
        Number of entries minus the number of distinct rounded values.
        ``[1, 1, 1, 1, 1, 1, 1]`` gives 6; a permutation gives 0.
    """
    rounded = np.round(np.asarray(output, dtype=float))
    # Measure against the prediction's own length rather than a module-level
    # constant, so the count is right for any graph size.
    return int(rounded.size - np.unique(rounded).size)

def get_valid_pred(pred):
    """Turn raw regression outputs into a valid labelling (a permutation).

    The smallest prediction receives label 0, the next smallest label 1, and
    so on. Tied predictions are labelled in index order (the earlier position
    gets the smaller label).

    Parameters
    ----------
    pred : array-like of shape (n,)
        Raw predicted values for one graph. It is not modified.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (n,) containing each of ``0 .. n-1`` exactly once.
    """
    scores = np.asarray(pred, dtype=float).ravel()
    # A stable sort keeps tied predictions in index order, and ranking this way
    # needs no sentinel value, so arbitrarily large predictions are handled.
    order = np.argsort(scores, kind='stable')
    valid = np.empty(scores.size, dtype=float)
    valid[order] = np.arange(scores.size)
    return valid
    
def get_bandwidth(Graph, nodelist=None):
    """Return the bandwidth of a graph under a given node ordering.

    The bandwidth is taken as the largest row/column index difference among
    the non-zero entries of the graph Laplacian built in ``nodelist`` order.

    Parameters
    ----------
    Graph : anything accepted by ``networkx.Graph``
        The graph, e.g. a dense adjacency matrix.
    nodelist : array-like, optional
        Node ordering for the Laplacian rows/columns. ``None`` (or the legacy
        ``np.array(None)`` marker) means the graph's own node order.

    Returns
    -------
    int
        0 for a graph without edges, otherwise the bandwidth.
    """
    Graph = nx.Graph(Graph)
    if not Graph.edges:
        return 0

    if nodelist is not None:
        nodelist = np.asarray(nodelist)
        # Existing callers signal "no ordering" with a 0-d array wrapping None;
        # detect that explicitly instead of relying on `.all() != None`.
        if nodelist.ndim == 0 and nodelist.item() is None:
            nodelist = None

    L = nx.laplacian_matrix(Graph, nodelist=nodelist)
    row_idx, col_idx = np.nonzero(L)
    # The graph is undirected, so the Laplacian is symmetric and the largest
    # signed difference equals the largest absolute difference.
    return (row_idx - col_idx).max()

def getGraph(upperTriangleAdjMatrix, number_nodes=None):
    """Rebuild a dense symmetric adjacency matrix from its strict upper triangle.

    Parameters
    ----------
    upperTriangleAdjMatrix : sequence of numbers
        Entries above the diagonal in row-major order:
        (0,1), (0,2), ..., (n-2,n-1). Trailing extra entries are ignored.
    number_nodes : int, optional
        Number of nodes ``n``. Defaults to the module-level ``NUMBER_NODES``.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (n, n), symmetric with a zero diagonal.

    Raises
    ------
    IndexError
        If fewer than ``n * (n - 1) / 2`` entries are supplied.
    """
    if number_nodes is None:
        number_nodes = NUMBER_NODES

    # Row-major coordinates of the strict upper triangle.
    rows, cols = np.triu_indices(number_nodes, k=1)
    upper = np.asarray(upperTriangleAdjMatrix, dtype=float)[:rows.size]
    if upper.size < rows.size:
        raise IndexError(
            f'expected at least {rows.size} upper-triangle entries, got {upper.size}'
        )

    dense_adj = np.zeros((number_nodes, number_nodes))
    dense_adj[rows, cols] = upper
    # Mirror into the lower triangle.
    dense_adj[cols, rows] = upper
    return dense_adj

In [28]:
for model in models.values():
    pred = model.predict(x_test)
    test_length = pred.shape[0]

    # Running bandwidth totals for the original, predicted and true orderings.
    sumTest_original, sumTest_pred, sumTest_true = 0, 0, 0
    # Total repeated labels, and number of predictions containing any repeat.
    count, cases_with_repetition = 0, 0

    for i, output in enumerate(pred):
        quantity_repeated = count_repeats(np.round(output))
        count += quantity_repeated
        if quantity_repeated != 0:
            cases_with_repetition += 1

        # Convert the raw outputs into a permutation usable as a node ordering.
        output = get_valid_pred(output)

        graph = getGraph(x_test[i])
        # np.array(None) is the "no nodelist" marker that get_bandwidth checks for.
        original_band = get_bandwidth(graph, np.array(None))
        pred_band = get_bandwidth(graph, output)
        true_band = get_bandwidth(graph, y_test[i])

        sumTest_original += original_band
        sumTest_pred += pred_band
        sumTest_true += true_band

    print(f"Modelo - {model}")
    print('Quantidade de rótulos repetidos, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 6 - ', count)
    print('Quantidade de saídas com repetição, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 1 - ', cases_with_repetition)
    print('Test length - ', test_length)
    print("Bandwidth mean")
    print(sumTest_original / test_length)
    print("Pred bandwidth mean")
    print(sumTest_pred / test_length)
    print("True bandwidth mean")
    print(sumTest_true / test_length)

Modelo - DecisionTreeRegressor(max_depth=2)
Quantidade de rótulos repetidos, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 6 -  252
Quantidade de saídas com repetição, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 1 -  63
Test length -  63
Bandwidth mean
5.904761904761905
Pred bandwidth mean
5.238095238095238
True bandwidth mean
3.1904761904761907
Modelo - DecisionTreeRegressor(max_depth=4)
Quantidade de rótulos repetidos, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 6 -  256
Quantidade de saídas com repetição, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 1 -  62
Test length -  63
Bandwidth mean
5.904761904761905
Pred bandwidth mean
4.698412698412699
True bandwidth mean
3.1904761904761907
Modelo - DecisionTreeRegressor(max_depth=6)
Quantidade de rótulos repetidos, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 6 -  203
Quantidade de saídas com repetição, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 1 -  61
Test length -  63
Bandwidth mean
5.904761904761905
Pred bandwidth mean
4.666666666666667
True bandwidth mean