In [56]:
# gradient boosting
import os
import pandas as pd
import numpy as np
import catboost
from catboost import CatBoostRegressor
import networkx as nx

# https://catboost.ai/en/docs/concepts/tutorials
# https://youtu.be/usdEWSDisS0

In [57]:
NUMBER_NODES = 7

def load_data(data_dir=None, number_nodes=None):
    """Read the train/val/test CSV splits and return them as int32 arrays.

    The files ``dataset_<n>_train.csv``, ``dataset_<n>_val.csv`` and
    ``dataset_<n>_test.csv`` are read from ``data_dir``. In every row the
    first ``n * (n - 1) // 2`` columns are the features, the next column is
    skipped, and all remaining columns are the labels.
    NOTE(review): the original author's comment suggests the skipped column
    holds the optimal bandwidth -- confirm against the dataset generator.

    The validation rows are appended to the training rows, so only two splits
    are returned.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the CSV files. Defaults to ``../datasets``.
    number_nodes : int, optional
        Number of graph nodes ``n``. Defaults to the module-level
        ``NUMBER_NODES`` (looked up at call time).

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_train, y_train, x_test, y_test)``, all of dtype int32.
    """
    if number_nodes is None:
        number_nodes = NUMBER_NODES
    if data_dir is None:
        data_dir = os.path.join('..', 'datasets')

    features_number = (number_nodes * number_nodes - number_nodes) // 2

    def _read_split(split):
        """Load one split and slice it column-wise into (features, labels)."""
        path = os.path.join(data_dir, f'dataset_{number_nodes}_{split}.csv')
        rows = pd.read_csv(path).to_numpy()
        # Column slicing keeps the arrays 2-D even when a split has no rows,
        # so the concatenation below cannot fail on an empty split.
        features = rows[:, :features_number].astype('int32')
        labels = rows[:, features_number + 1:].astype('int32')
        return features, labels

    x_train, y_train = _read_split('train')
    x_val, y_val = _read_split('val')
    x_test, y_test = _read_split('test')

    # The validation split is folded into the training data.
    x_train = np.concatenate((x_train, x_val))
    y_train = np.concatenate((y_train, y_val))

    return x_train, y_train, x_test, y_test

In [58]:
# Materialize the splits once; load_data() already appends the validation
# rows to the training arrays, so only train and test come back.
x_train, y_train, x_test, y_test = load_data()

In [59]:
# Indices of all feature columns; this list is later handed to fit() as
# `cat_features`, i.e. every input column is declared categorical.
cat_features = [column for column in range(x_test.shape[1])]
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [60]:
# y_train holds one column per label, hence the multi-target MultiRMSE objective;
# verbose=100 logs the metrics every 100 iterations.
# NOTE(review): the test split is used as eval_set here, so any best-iteration
# selection CatBoost performs on the eval set is driven by test data -- confirm
# this is intended, or evaluate on a held-out validation split instead.
model = CatBoostRegressor(objective='MultiRMSE', verbose=100)
model.fit(x_train, y_train, eval_set=(x_test, y_test), cat_features=cat_features, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 4.9685372	test: 5.0103356	best: 5.0103356 (0)	total: 8.67ms	remaining: 8.66s
100:	learn: 4.2595071	test: 4.6323238	best: 4.6323238 (100)	total: 1.13s	remaining: 10.1s
200:	learn: 3.9160863	test: 4.5308368	best: 4.5305104 (199)	total: 2.22s	remaining: 8.84s
300:	learn: 3.6589244	test: 4.4920674	best: 4.4920674 (300)	total: 3.51s	remaining: 8.15s
400:	learn: 3.4594657	test: 4.4632432	best: 4.4584854 (380)	total: 4.66s	remaining: 6.96s
500:	learn: 3.2926053	test: 4.4579057	best: 4.4513965 (466)	total: 5.89s	remaining: 5.87s
600:	learn: 3.1553447	test: 4.4583899	best: 4.4513965 (466)	total: 7.09s	remaining: 4.71s
700:	learn: 3.0330676	test: 4.4716848	best: 4.4513965 (466)	total: 8.36s	remaining: 3.56s
800:	learn: 2.9279096	test: 4.4923077	best: 4.4513965 (466)	total: 9.6s	remaining: 2.39s
900:	learn: 2.8332496	test: 4.5102409	best: 4.4513965 (466)	total: 10.8s	remaining: 1.18s
999:	learn: 2.7492686	test: 4.5257821	best: 4.4513965 (466)	total: 12s	remaining: 0us

bestTest = 4.4513

<catboost.core.CatBoostRegressor at 0x1a1e376ba00>

In [61]:
def count_repeats(output):
    """Count how many entries of ``output`` repeat an earlier value after rounding.

    Every entry is rounded to the nearest integer; the result is the number
    of entries minus the number of distinct rounded values. For example
    ``[1, 1, 1, 1, 1, 1, 1]`` yields 6.

    The count is taken relative to the length of ``output`` itself, so the
    function does not depend on any module-level size constant.

    Parameters
    ----------
    output : array-like
        Predicted (possibly fractional) labels.

    Returns
    -------
    int
        Number of repeated labels; 0 when all rounded values are distinct.
    """
    rounded = np.round(np.asarray(output))
    return int(rounded.size - np.unique(rounded).size)

def get_valid_pred(pred):
    """Turn raw predicted scores into a valid permutation of ``0..n-1``.

    The smallest score receives 0, the next smallest 1, and so on. Ties are
    broken by position (the earlier entry gets the smaller rank), so the
    result is always a permutation, even when scores are equal.

    The input is not modified, and there is no restriction on the magnitude
    of the scores or on the length of the vector.

    Parameters
    ----------
    pred : array-like
        1-D vector of predicted scores.

    Returns
    -------
    numpy.ndarray
        Float array with the same number of entries as ``pred`` holding each
        entry's rank.
    """
    scores = np.asarray(pred, dtype=float).ravel()
    # A stable sort makes the tie-breaking deterministic (by original index).
    order = np.argsort(scores, kind='stable')
    valid = np.empty(scores.size)
    # order[r] is the index of the r-th smallest score, so that index gets rank r.
    valid[order] = np.arange(scores.size)
    return valid
    
def get_bandwidth(Graph, nodelist=None):
    """Return the bandwidth of a graph under a given node ordering.

    The input is converted to an undirected ``nx.Graph``. Its Laplacian is
    built with rows/columns ordered by ``nodelist`` (or networkx's default
    order when no ordering is given), and the bandwidth is the largest
    distance between the row and column index of any non-zero entry.

    Parameters
    ----------
    Graph : anything accepted by ``nx.Graph(...)``
        The graph; callers in this notebook pass a dense adjacency matrix.
    nodelist : sequence, optional
        Node ordering used for the Laplacian rows/columns. ``None`` means
        "use the default order"; the legacy placeholder ``np.array(None)``
        is still accepted and treated the same way.

    Returns
    -------
    int
        The bandwidth, or 0 when the graph has no edges.
    """
    Graph = nx.Graph(Graph)
    if not Graph.edges:
        return 0

    if nodelist is not None:
        # Older call sites signal "no ordering" with a 0-d array wrapping None.
        probe = np.asarray(nodelist)
        if probe.ndim == 0 and probe.item() is None:
            nodelist = None

    if nodelist is not None:
        L = nx.laplacian_matrix(Graph, nodelist=nodelist)
    else:
        L = nx.laplacian_matrix(Graph)

    rows, cols = L.nonzero()
    # The Laplacian of an undirected graph is symmetric, so the absolute
    # difference has the same maximum as the signed one.
    return int(np.abs(rows - cols).max())

def getGraph(upperTriangleAdjMatrix, number_nodes=None):
    """Rebuild a dense symmetric adjacency matrix from its strict upper triangle.

    The input lists the entries above the diagonal in row-major order:
    (0, 1), (0, 2), ..., (0, n-1), (1, 2), ... Each value is written to
    both ``[i, j]`` and ``[j, i]``; the diagonal stays zero.

    Parameters
    ----------
    upperTriangleAdjMatrix : array-like
        At least ``n * (n - 1) // 2`` values; any extra trailing values are
        ignored.
    number_nodes : int, optional
        Number of nodes ``n``. Defaults to the module-level ``NUMBER_NODES``
        (looked up at call time).

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` float64 adjacency matrix.

    Raises
    ------
    ValueError
        If fewer than ``n * (n - 1) // 2`` values are supplied.
    """
    n = NUMBER_NODES if number_nodes is None else number_nodes

    # triu_indices enumerates the strict upper triangle in row-major order,
    # which is the order the flattened input is stored in.
    rows, cols = np.triu_indices(n, k=1)
    values = np.asarray(upperTriangleAdjMatrix).ravel()
    if values.size < rows.size:
        raise ValueError(
            f'expected at least {rows.size} upper-triangle entries for '
            f'{n} nodes, got {values.size}'
        )

    dense_adj = np.zeros((n, n))
    dense_adj[rows, cols] = values[:rows.size]
    dense_adj[cols, rows] = values[:rows.size]  # mirror below the diagonal
    return dense_adj

In [62]:
# Evaluate the fitted model on the test split. For every sample this reports
# the raw prediction, the permutation derived from it, and the bandwidth of
# the graph under three orderings: default, predicted and true.
pred = model.predict(x_test)

sumTest_original = 0
sumTest_pred = 0
sumTest_true = 0

count = 0
cases_with_repetition = 0

for i in range(len(pred)):

    output = pred[i]

    # count_repeats() rounds internally, so the raw prediction is passed as-is.
    quantity_repeated = count_repeats(output)
    print('Pred: ', output)
    print('True: ', y_test[i])
    if quantity_repeated != 0:
        cases_with_repetition += 1
    # `output` is a view into `pred`; rank a copy so that `pred` keeps the raw
    # model outputs even if get_valid_pred() modifies its argument in place.
    output = get_valid_pred(output.copy())
    print('Pred valid: ', output)
    count += quantity_repeated

    # The heading is printed twice (here and below); kept so the log format
    # stays unchanged.
    print("Bandwidth")
    graph = getGraph(x_test[i])
    # np.array(None) is the placeholder get_bandwidth() reads as "no explicit ordering".
    original_band = get_bandwidth(graph, np.array(None))
    sumTest_original += original_band
    pred_band = get_bandwidth(graph, output)
    sumTest_pred += pred_band
    true_band = get_bandwidth(graph, y_test[i])
    sumTest_true += true_band
    print("Bandwidth")
    print(original_band)
    print(pred_band)
    print(true_band)
print('Quantidade de rótulos repetidos, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 6 - ', count)
print('Quantidade de saídas com repetição, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 1 - ', cases_with_repetition)
test_length = pred.shape[0]
print('Test length - ', test_length)
print("Bandwidth mean")
print(sumTest_original / test_length)
print("Pred bandwidth mean")
print(sumTest_pred / test_length)
print("True bandwidth mean")
print(sumTest_true / test_length)

Pred:  [2.04922577 4.85535776 3.34922341 1.45359576 2.9127382  3.34101561
 3.0388435 ]
True:  [1 5 6 0 3 2 4]
Pred valid:  [1. 6. 5. 0. 2. 4. 3.]
Bandwidth
Bandwidth
6
2
2
Pred:  [1.57521545 1.69954669 3.66552972 4.17823601 5.00200548 2.00750928
 2.87195738]
True:  [3 0 6 4 5 2 1]
Pred valid:  [0. 1. 4. 5. 6. 2. 3.]
Bandwidth
Bandwidth
6
4
3
Pred:  [2.62136386 3.74249796 3.64491407 2.76031619 1.82559514 3.44653925
 2.95877352]
True:  [1 5 6 0 2 4 3]
Pred valid:  [1. 6. 5. 2. 0. 4. 3.]
Bandwidth
Bandwidth
6
3
2
Pred:  [3.67473548 2.95678946 2.93900905 2.86295607 3.697998   2.40441228
 2.46409966]
True:  [2 0 4 6 5 1 3]
Pred valid:  [5. 4. 3. 2. 6. 0. 1.]
Bandwidth
Bandwidth
6
6
2
Pred:  [1.8312402  3.20817215 3.06739794 3.17381489 4.03759283 2.12343512
 3.55834689]
True:  [1 2 6 4 0 5 3]
Pred valid:  [0. 4. 2. 3. 6. 1. 5.]
Bandwidth
Bandwidth
6
6
2
Pred:  [3.02236761 2.05934492 2.90927278 5.0489756  3.58087683 1.2234111
 3.15575116]
True:  [4 0 2 6 5 3 1]
Pred valid:  [3. 1. 2. 6. 5. 0.