In [1]:
import numpy as np
import os
import pandas as pd
from numpy import absolute
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR
import networkx as nx

In [2]:
NUMBER_NODES = 7

def load_data():
    train_df = pd.read_csv(os.path.join('..', 'datasets', f'dataset_{NUMBER_NODES}_train.csv'))
    val_df = pd.read_csv(os.path.join('..', 'datasets', f'dataset_{NUMBER_NODES}_val.csv'))
    test_df = pd.read_csv(os.path.join('..', 'datasets', f'dataset_{NUMBER_NODES}_test.csv'))

    featuresNumber = (NUMBER_NODES * NUMBER_NODES - NUMBER_NODES) // 2 
    def get_tuple_tensor_dataset(row):
        X = row[0 : featuresNumber].astype('float32')
        Y = row[featuresNumber + 1: ].astype('float32') # Pula a banda otima na posicao 0
        return X, Y

    train_dataset = list(map(get_tuple_tensor_dataset, train_df.to_numpy()))
    val_dataset = list(map(get_tuple_tensor_dataset, val_df.to_numpy()))
    test_dataset = list(map(get_tuple_tensor_dataset, test_df.to_numpy()))

    X = []
    Y = []
    for x, y in train_dataset:
        X.append(x)
        Y.append(y)
    x_train = np.array(X, dtype=object)
    y_train = np.array(Y, dtype=object)

    X = []
    Y = []
    for x, y in test_dataset:
        X.append(x)
        Y.append(y)
    x_test = np.array(X, dtype=object)
    y_test = np.array(Y, dtype=object)

    X = []
    Y = []
    for x, y in val_dataset:
        X.append(x)
        Y.append(y)
    x_val = np.array(X)
    y_val = np.array(Y)

    x_train = np.concatenate((x_train, x_val))
    y_train = np.concatenate((y_train, y_val))


    return x_train, y_train, x_test, y_test

In [3]:
x_train, y_train, x_test, y_test = load_data()

In [4]:
from sklearn.tree import DecisionTreeRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=2652124)
n_scores = cross_val_score(model, x_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
n_scores = absolute(n_scores)
print('MAE Error: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

# define the chained multioutput wrapper model
# wrapper = RegressorChain(model)

MAE Error: 1.535 (0.084)


In [5]:
model.fit(x_train, y_train)

DecisionTreeRegressor()

In [6]:
print(model.get_depth())

18


In [7]:
def count_repeats(output):
    counts = np.unique(np.round(output))
    repeated = NUMBER_NODES - counts.shape[0]
    return repeated

def get_valid_pred(pred):
    valid = np.ones(NUMBER_NODES)
    labels = np.arange(0, NUMBER_NODES)
    for i in labels:
        min_value = np.amin(pred)
        min_idx, = np.where(pred == min_value)
        min_idx = min_idx[0]
        pred[min_idx] = 100
        valid[min_idx] = i
    return valid
    
def get_bandwidth(Graph, nodelist):
    Graph = nx.Graph(Graph)
    if not Graph.edges:
        return 0
    if nodelist.all() != None:
        L = nx.laplacian_matrix(Graph, nodelist=nodelist)
    else:
        L = nx.laplacian_matrix(Graph)
    x, y = np.nonzero(L)
    return (x-y).max()

def getGraph(upperTriangleAdjMatrix):
    dense_adj = np.zeros((NUMBER_NODES, NUMBER_NODES))
    k = 0
    for i in range(NUMBER_NODES):
        for j in range(NUMBER_NODES):
            if i == j:
                continue
            elif i < j:
                dense_adj[i][j] = upperTriangleAdjMatrix[k]
                k += 1
            else:
                dense_adj[i][j] = dense_adj[j][i]
    return dense_adj

In [8]:
import time

pred = model.predict(x_test)

sumTest_original = []
sumTest_pred = []
sumTest_true = []

count = 0
cases_with_repetition = 0

start = time.time()
for i in range(len(pred)):

    output = pred[i]

    quantity_repeated = count_repeats(np.round(output))
    print('Pred: ', output)
    print('True: ', y_test[i])
    if quantity_repeated != 0:
        cases_with_repetition += 1
    output = get_valid_pred(output)
    print('Pred valid: ', output)
    count += quantity_repeated

    print("Bandwidth")
    graph = getGraph(x_test[i])
    original_band = get_bandwidth(graph, np.array(None))
    sumTest_original.append(original_band)

    pred_band = get_bandwidth(graph, output)
    sumTest_pred.append(pred_band)

    true_band = get_bandwidth(graph, y_test[i])
    sumTest_true.append(true_band)

    print("Bandwidth")
    print(original_band)
    print(pred_band)
    print(true_band)
end = time.time()

Pred:  [5. 1. 6. 0. 4. 2. 3.]
True:  [1.0 5.0 6.0 0.0 3.0 2.0 4.0]
Pred valid:  [5. 1. 6. 0. 4. 2. 3.]
Bandwidth
Bandwidth
6
3
2
Pred:  [3. 2. 6. 0. 5. 4. 1.]
True:  [3.0 0.0 6.0 4.0 5.0 2.0 1.0]
Pred valid:  [3. 2. 6. 0. 5. 4. 1.]
Bandwidth
Bandwidth
6
3
3
Pred:  [5. 1. 0. 6. 4. 2. 3.]
True:  [1.0 5.0 6.0 0.0 2.0 4.0 3.0]
Pred valid:  [5. 1. 0. 6. 4. 2. 3.]
Bandwidth
Bandwidth
6
2
2
Pred:  [1. 5. 6. 0. 4. 3. 2.]
True:  [2.0 0.0 4.0 6.0 5.0 1.0 3.0]
Pred valid:  [1. 5. 6. 0. 4. 3. 2.]
Bandwidth
Bandwidth
6
2
2
Pred:  [3. 2. 1. 5. 6. 4. 0.]
True:  [1.0 2.0 6.0 4.0 0.0 5.0 3.0]
Pred valid:  [3. 2. 1. 5. 6. 4. 0.]
Bandwidth
Bandwidth
6
3
2
Pred:  [2. 5. 6. 1. 0. 3. 4.]
True:  [4.0 0.0 2.0 6.0 5.0 3.0 1.0]
Pred valid:  [2. 5. 6. 1. 0. 3. 4.]
Bandwidth
Bandwidth
6
4
3
Pred:  [3. 2. 5. 6. 0. 1. 4.]
True:  [2.0 3.0 1.0 4.0 5.0 6.0 0.0]
Pred valid:  [3. 2. 5. 6. 0. 1. 4.]
Bandwidth
Bandwidth
6
3
3
Pred:  [1. 4. 6. 0. 2. 5. 3.]
True:  [1.0 4.0 6.0 0.0 2.0 5.0 3.0]
Pred valid:  [1. 4. 6. 0. 2. 5

In [9]:
print('Quantidade de rótulos repetidos, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 6 - ', count)
print('Quantidade de saídas com repetição, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 1 - ', cases_with_repetition)
test_length = pred.shape[0]

print('Test length - ', test_length)
print('Tempo medio - ', (end - start) / test_length)
print("Bandwidth mean±std")
print(f'{np.mean(sumTest_original)}±{np.std(sumTest_original)}')
print("Pred bandwidth mean±std")
print(f'{np.mean(sumTest_pred)}±{np.std(sumTest_pred)}')
print("True bandwidth mean±std")
print(f'{np.mean(sumTest_true)}±{np.std(sumTest_true)}')

Quantidade de rótulos repetidos, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 6 -  0
Quantidade de saídas com repetição, exemplo [1, 1, 1, 1, 1, 1, 1] conta como 1 -  0
Test length -  63
Tempo medio -  0.0052856528569781595
Bandwidth mean±std
5.904761904761905±0.2935435239509036
Pred bandwidth mean±std
3.634920634920635±0.9644734940294166
True bandwidth mean±std
3.1904761904761907±0.7095078297976829


# Notes

## No Chained Multioutput Regression
- Only DecisionTreeRegressor, counted repeats: 0
- Only LinearRegression, counted repeats: 238
- Only KNeighborsRegressor, counted repeats: I couldn't get the output (infinite loading)

## Chained Multioutput Regression

- DecisionTreeRegressor, counted repeats: 26, 30
- LinearRegression, counted repeats: 238
- KNeighborsRegressor, counted repeats: 119
- SVM, actually support vector regression: 213

## No Chained Multioutput Regression but treating eache output independently (wrong)
- SVM, actually support vector regression: 194
- KNeighborsRegressor, counted repeats: I couldn't get the output (infinite loading)
- LinearRegression, counted repeats: 238
- DecisionTreeRegressor, counted repeats: 116

### https://machinelearningmastery.com/multi-output-regression-models-with-python/

#### Essa abordagens eu testei:
https://machinelearningmastery.com/multi-label-classification-with-deep-learning/
(sigmoid no final, e binary_crossentropy como loss)

https://machinelearningmastery.com/how-to-develop-a-convolutional-neural-network-to-classify-satellite-photos-of-the-amazon-rainforest/