In [13]:
# Imports
import os
import platform
import random
from tkinter import Tk

from itertools import permutations
import math
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np
import torch
import torchvision
from torchvision import transforms, models,datasets
from torch.utils.data import Dataset, DataLoader

# Extend width of Jupyter Notebook Cell to the size of browser
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# OS related settings
if platform.system() == 'Windows':
    print('Windows')
#     %matplotlib tk
    %matplotlib qt
elif platform.system() == 'Darwin':
    print('macOS')
    Tk().withdraw()
    %matplotlib osx
elif platform == 'linux' or platform == 'linux2':
    print('Linux')
# This line of "print" must exist right after %matplotlib command, otherwise JN will hang on the first import statement after this.
print('Interactive plot activated')

macOS
Interactive plot activated


In [14]:
# ChINN

# Convert decimal to binary string
def sources_and_subsets_nodes(N):
    str1 = "{0:{fill}"+str(N)+"b}"
    a = []
    for i in range(1,2**N):
        a.append(str1.format(i, fill='0'))

    sourcesInNode = []
    sourcesNotInNode = []
    subset = []
    sourceList = list(range(N))
    # find subset nodes of a node
    def node_subset(node, sourcesInNodes):
        return [node - 2**(i) for i in sourcesInNodes]
    
    # convert binary encoded string to integer list
    def string_to_integer_array(s, ch):
        N = len(s) 
        return [(N - i - 1) for i, ltr in enumerate(s) if ltr == ch]
    
    for j in range(len(a)):
        # index from right to left
        idxLR = string_to_integer_array(a[j],'1')
        sourcesInNode.append(idxLR)  
        sourcesNotInNode.append(list(set(sourceList) - set(idxLR)))
        subset.append(node_subset(j,idxLR))

    return sourcesInNode, subset


def subset_to_indices(indices):
    return [i for i in indices]

class Choquet_integral(torch.nn.Module):
    
    def __init__(self, N_in, N_out):
        super(Choquet_integral,self).__init__()
        self.N_in = N_in
        self.N_out = N_out
        self.nVars = 2**self.N_in - 2
        
        # The FM is initialized with mean
        dummy = (1./self.N_in) * torch.ones((self.nVars, self.N_out), requires_grad=True)
#        self.vars = torch.nn.Parameter( torch.Tensor(self.nVars,N_out))
        self.vars = torch.nn.Parameter(dummy)
        
        # following function uses numpy vs pytorch
        self.sourcesInNode, self.subset = sources_and_subsets_nodes(self.N_in)
        
        self.sourcesInNode = [torch.tensor(x) for x in self.sourcesInNode]
        self.subset = [torch.tensor(x) for x in self.subset]
        
    def forward(self,inputs):    
        self.FM = self.chi_nn_vars(self.vars)
        sortInputs, sortInd = torch.sort(inputs,1, True)
        M, N = inputs.size()
        sortInputs = torch.cat((sortInputs, torch.zeros(M,1)), 1)
        sortInputs = sortInputs[:,:-1] -  sortInputs[:,1:]
        
        out = torch.cumsum(torch.pow(2,sortInd),1) - torch.ones(1, dtype=torch.int64)
        
        data = torch.zeros((M,self.nVars+1))
        
        for i in range(M):
            data[i,out[i,:]] = sortInputs[i,:] 
        
        
        ChI = torch.matmul(data,self.FM)
            
        return ChI
    
    # Converts NN-vars to FM vars
    def chi_nn_vars(self, chi_vars):
#        nVars,_ = chi_vars.size()
        chi_vars = torch.abs(chi_vars)
        #        nInputs = inputs.get_shape().as_list()[1]
        
        FM = chi_vars[None, 0,:]
        for i in range(1,self.nVars):
            indices = subset_to_indices(self.subset[i])
            if (len(indices) == 1):
                FM = torch.cat((FM,chi_vars[None,i,:]),0)
            else:
                #         ss=tf.gather_nd(variables, [[1],[2]])
                maxVal,_ = torch.max(FM[indices,:],0)
                temp = torch.add(maxVal,chi_vars[i,:])
                FM = torch.cat((FM,temp[None,:]),0)
              
        FM = torch.cat([FM, torch.ones((1,self.N_out))],0)
        FM = torch.min(FM, torch.ones(1))  
        
        return FM
    


In [15]:
# ChIQP

import numpy as np
import itertools
from cvxopt import solvers, matrix





# Silencing solvers.qp

from contextlib import redirect_stdout
from io import StringIO
class NullIO(StringIO):
    def write(self, txt):
        pass


def silent(fn):
    """Decorator to silence functions."""
    def silent_fn(*args, **kwargs):
        with redirect_stdout(NullIO()):
            return fn(*args, **kwargs)
    return silent_fn








class ChoquetIntegral:

    def __init__(self):
        """Instantiation of a ChoquetIntegral.
           This sets up the ChI. It doesn't take any input parameters
           because you may want to use pass your own values in(as opposed
           to learning from data). To instatiate, use
           chi = ChoquetIntegral.ChoquetIntegral()
        """
        self.trainSamples, self.trainLabels = [], []
        self.testSamples, self.testLabels = [], []
        self.N, self.numberConstraints, self.M = 0, 0, 0
        self.g = 0
        self.fm = []
        self.type = []


    def train_chi(self, x1, l1):
        """
        This trains this instance of your ChoquetIntegral w.r.t x1 and l1.
        :param x1: These are the training samples of size N x M(inputs x number of samples)
        :param l1: These are the training labels of size 1 x M(label per sample)
        """
        self.type = 'quad'
        self.trainSamples = x1
        self.trainLabels = l1
        self.N = self.trainSamples.shape[0]
        self.M = self.trainSamples.shape[1]
        print("Number Inputs : ", self.N, "; Number Samples : ", self.M)
        self.fm = self.produce_lattice()

        return self



    def chi_quad(self, x2):
        """
        This will produce an output for this instance of the ChI
        This will use the learned(or specified) Choquet integral to
        produce an output w.r.t. to the new input.
        :param x2: testing sample
        :return: output of the choquet integral.
        """
        if self.type == 'quad':
            n = len(x2)
            pi_i = np.argsort(x2)[::-1][:n] + 1
            ch = x2[pi_i[0] - 1] * (self.fm[str(pi_i[:1])])
            for i in range(1, n):
                latt_pti = np.sort(pi_i[:i + 1])
                latt_ptimin1 = np.sort(pi_i[:i])
                ch = ch + x2[pi_i[i] - 1] * (self.fm[str(latt_pti)] - self.fm[str(latt_ptimin1)])
            return ch
        else:
            print("If using sugeno measure, you need to use chi_sugeno.")


    def produce_lattice(self):
        """
            This method builds is where the lattice(or FM variables) will be learned.
            The FM values can be found via a quadratic program, which is used here
            after setting up constraint matrices. Refer to papers for complete overview.
        :return: Lattice, the learned FM variables.
        """

        fm_len = 2 ** self.N - 1  # nc
        E = np.zeros((fm_len, fm_len))  # D
        L = np.zeros(fm_len)  # f
        index_keys = self.get_keys_index()
        for i in range(0, self.M):  # it's going through one sample at a time.
            l = self.trainLabels[i]  # this is the labels
            fm_coeff = self.get_fm_class_img_coeff(index_keys, self.trainSamples[:, i], fm_len)  # this is Hdiff
            # print(fm_coeff)
            L = L + (-2) * l * fm_coeff
            E = E + np.matmul(fm_coeff.reshape((fm_len, 1)), fm_coeff.reshape((1, fm_len)))

        G, h, A, b = self.build_constraint_matrices(index_keys, fm_len)
        solvers_qp = silent(solvers.qp)
        sol = solvers_qp(matrix(2 * E, tc='d'), matrix(L.T, tc='d'), matrix(G, tc='d'), matrix(h, tc='d'),
                         matrix(A, tc='d'), matrix(b, tc='d'))
        g = sol['x']
        Lattice = {}
        for key in index_keys.keys():
            Lattice[key] = g[index_keys[key]]
        return Lattice


    def build_constraint_matrices(self, index_keys, fm_len):
        """
        This method builds the necessary constraint matrices.
        :param index_keys: map to reference lattice components
        :param fm_len: length of the fuzzy measure
        :return: the constraint matrices
        """

        vls = np.arange(1, self.N + 1)
        line = np.zeros(fm_len)
        G = line
        line[index_keys[str(np.array([1]))]] = -1.
        h = np.array([0])
        for i in range(2, self.N + 1):
            line = np.zeros(fm_len)
            line[index_keys[str(np.array([i]))]] = -1.
            G = np.vstack((G, line))
            h = np.vstack((h, np.array([0])))
        for i in range(2, self.N + 1):
            parent = np.array(list(itertools.combinations(vls, i)))
            for latt_pt in parent:
                for j in range(len(latt_pt) - 1, len(latt_pt)):
                    children = np.array(list(itertools.combinations(latt_pt, j)))
                    for latt_ch in children:
                        line = np.zeros(fm_len)
                        line[index_keys[str(latt_ch)]] = 1.
                        line[index_keys[str(latt_pt)]] = -1.
                        G = np.vstack((G, line))
                        h = np.vstack((h, np.array([0])))

        line = np.zeros(fm_len)
        line[index_keys[str(vls)]] = 1.
        G = np.vstack((G, line))
        h = np.vstack((h, np.array([1])))

        # equality constraints
        A = np.zeros((1, fm_len))
        A[0, -1] = 1
        b = np.array([1]);

        return G, h, A, b


    def get_fm_class_img_coeff(self, Lattice, h, fm_len):  # Lattice is FM_name_and_index, h is the samples, fm_len
        """
        This creates a FM map with the name as the key and the index as the value
        :param Lattice: dictionary with FM
        :param h: sample
        :param fm_len: fm length
        :return: the fm_coeff
        """

        n = len(h)  # len(h) is the number of the samples
        fm_coeff = np.zeros(fm_len)
        pi_i = np.argsort(h)[::-1][:n] + 1
        for i in range(1, n):
            fm_coeff[Lattice[str(np.sort(pi_i[:i]))]] = h[pi_i[i - 1] - 1] - h[pi_i[i] - 1]
        fm_coeff[Lattice[str(np.sort(pi_i[:n]))]] = h[pi_i[n - 1] - 1]
        np.matmul(fm_coeff, np.transpose(fm_coeff))
        return fm_coeff


    def get_keys_index(self):
        """
        Sets up a dictionary for referencing FM.
        :return: The keys to the dictionary
        """

        vls = np.arange(1, self.N + 1)
        count = 0
        Lattice = {}
        for i in range(0, self.N):
            Lattice[str(np.array([vls[i]]))] = count
            count = count + 1
        for i in range(2, self.N + 1):
            A = np.array(list(itertools.combinations(vls, i)))
            for latt_pt in A:
                Lattice[str(latt_pt)] = count
                count = count + 1
        return Lattice


In [16]:
# Creating data

num_train = 200
dim = 3
num_test = 1000

# Get the N! possible permutations
all_perms = list(permutations(list(range(dim))))
random.shuffle(all_perms)
print(all_perms)

# Get the sort of each data sample
train_data = np.random.rand(dim, num_train)
train_data_perms = np.argsort(train_data, 0)
print('Train data size')
print(train_data.shape)

# Divide data into N! sets according to its permutation
train_data_div_by_perm = []

print('All permutations')
for i, current_perm in enumerate(all_perms):
    print(i, current_perm)
    temp = np.where(train_data_perms[0, :]==current_perm[0])
    for j, idx in enumerate(current_perm):
#         print(j, idx)
        temp = np.intersect1d(temp, np.where(train_data_perms[j, :]==idx))
    train_data_div_by_perm.append(temp)
# print(train_data_div_by_perm)
# print(train_data_permutation[:, train_data_div_by_perm[0]])



# Test data
test_data = np.random.rand(dim, num_test)
test_data_perms = np.argsort(test_data, 0)

# Divide data into N! sets according to its permutation
test_data_div_by_perm = []

for i, current_perm in enumerate(all_perms):
    temp = np.where(test_data_perms[0, :]==current_perm[0])
    for j, idx in enumerate(current_perm):
        temp = np.intersect1d(temp, np.where(test_data_perms[j, :]==idx))
    test_data_div_by_perm.append(temp)

[(0, 1, 2), (2, 0, 1), (1, 0, 2), (2, 1, 0), (0, 2, 1), (1, 2, 0)]
Train data size
(3, 200)
All permutations
0 (0, 1, 2)
1 (2, 0, 1)
2 (1, 0, 2)
3 (2, 1, 0)
4 (0, 2, 1)
5 (1, 2, 0)


In [17]:
# Making N! train datasets with data percentage = i/N!
train_data_ioverN_percent = [train_data_div_by_perm[0]]
for i in range(1, len(all_perms)):
    train_data_iN = np.concatenate((train_data_ioverN_percent[i-1], train_data_div_by_perm[i]))
    random.shuffle(train_data_iN)
    train_data_ioverN_percent.append(train_data_iN)
    
print('train data size')
for i, data_idx in enumerate(train_data_ioverN_percent):
    train_d = train_data[:, data_idx]
    print(train_d.shape)
    


    
# Making N! test datasets with data percentage = i/N!
test_data_ioverN_percent = [test_data_div_by_perm[0]]
test_data_ioverN_percent_c = []
for i in range(1, len(all_perms)):
    test_data_iN = np.concatenate((test_data_ioverN_percent[i-1], test_data_div_by_perm[i]))
    test_data_iN_c = list(set(range(num_test)) - set(test_data_ioverN_percent[i-1]))
    test_data_ioverN_percent.append(test_data_iN)
    test_data_ioverN_percent_c.append(test_data_iN_c)
    
print('test data size')
for i, data_idx in enumerate(test_data_ioverN_percent):
    test_d = test_data[:, data_idx]
    print(test_d.shape)

print('Test data complement size')
for i, data_idx in enumerate(test_data_ioverN_percent_c):
    test_d_c = test_data[:, data_idx]
    print(test_d_c.shape)

train data size
(3, 41)
(3, 66)
(3, 92)
(3, 127)
(3, 179)
(3, 200)
test data size
(3, 165)
(3, 350)
(3, 514)
(3, 678)
(3, 828)
(3, 1000)
Test data complement size
(3, 835)
(3, 650)
(3, 486)
(3, 322)
(3, 172)


In [18]:
# Train

eva_func_num = 4

SSEs = np.zeros((len(all_perms), eva_func_num))
SSEs_c = np.zeros((len(all_perms)-1, eva_func_num))

for i, data_idx in enumerate(train_data_ioverN_percent):
    print('Seen data percentage:', (i+1)/math.factorial(dim))
    
    train_d = train_data[:, data_idx]
    test_d = test_data[:, test_data_ioverN_percent[i]]
    
    train_label_min = np.amin(train_d, 0)
    train_label_max = np.amax(train_d, 0)
    train_label_mean = np.mean(train_d, 0)
    train_label_gmean = np.cbrt(np.prod(train_d, 0))
    
    test_label_min = np.amin(test_d, 0)
    test_label_max = np.amax(test_d, 0)
    test_label_mean = np.mean(test_d, 0)
    test_label_gmean = np.cbrt(np.prod(test_d, 0))
    
    if i < len(all_perms)-1:
        test_d_c = test_data[:, test_data_ioverN_percent_c[i]]

        test_label_min_c = np.amin(test_d_c, 0)
        test_label_max_c = np.amax(test_d_c, 0)
        test_label_mean_c = np.mean(test_d_c, 0)
        test_label_gmean_c = np.cbrt(np.prod(test_d_c, 0))
    
    
    chi = ChoquetIntegral()
    chi.train_chi(train_d, train_label_min)
    SSE = 0
    for j in range(np.size(test_d, 1)):
        chi_min_test = chi.chi_quad(test_d[:, j])
        SSE += (chi_min_test - test_label_min[j]) ** 2
    SSEs[i, 0] = SSE / np.size(test_d)
    print('\nChi min', chi.fm)
    print('\nSSE', SSE)
    if i < len(all_perms)-1:
        SSE_c = 0
        for j in range(np.size(test_d_c, 1)):
            chi_min_test_c = chi.chi_quad(test_d_c[:, j])
            SSE_c += (chi_min_test_c - test_label_min_c[j]) ** 2
        SSEs_c[i, 0] = SSE_c / np.size(test_d_c)
        print('\nUnseen data SSE', SSE_c)
        

    chi = ChoquetIntegral()
    chi.train_chi(train_d, train_label_max)
    SSE = 0
    for j in range(np.size(test_d, 1)):
        chi_max_test = chi.chi_quad(test_d[:, j])
        SSE += (chi_max_test - test_label_max[j]) ** 2
    SSEs[i, 1] = SSE / np.size(test_d)
    print('\nChi max', chi.fm)
    print('\nSSE', SSE)
    if i < len(all_perms)-1:
        SSE_c = 0
        for j in range(np.size(test_d_c, 1)):
            chi_max_test_c = chi.chi_quad(test_d_c[:, j])
            SSE_c += (chi_max_test_c - test_label_max_c[j]) ** 2
        SSEs_c[i, 1] = SSE_c / np.size(test_d_c)
        print('\nUnseen data SSE', SSE_c)
        
        
    chi = ChoquetIntegral()
    chi.train_chi(train_d, train_label_mean)
    SSE = 0
    for j in range(np.size(test_d, 1)):
        chi_mean_test = chi.chi_quad(test_d[:, j])
        SSE += (chi_mean_test - test_label_mean[j]) ** 2
    SSEs[i, 2] = SSE / np.size(test_d)
    print('\nChi mean', chi.fm)
    print('\nSSE', SSE)
    if i < len(all_perms)-1:
        SSE_c = 0
        for j in range(np.size(test_d_c, 1)):
            chi_mean_test_c = chi.chi_quad(test_d_c[:, j])
            SSE_c += (chi_mean_test_c - test_label_mean_c[j]) ** 2
        SSEs_c[i, 2] = SSE_c / np.size(test_d_c)
        print('\nUnseen data SSE', SSE_c)
        
        
    chi = ChoquetIntegral()
    chi.train_chi(train_d, train_label_gmean)
    SSE = 0
    for j in range(np.size(test_d, 1)):
        chi_gmean_test = chi.chi_quad(test_d[:, j])
        SSE += (chi_gmean_test - test_label_gmean[j]) ** 2
    SSEs[i, 3] = SSE / np.size(test_d)
    print('\nChi geometric mean', chi.fm)
    print('\nSSE', SSE)
    if i < len(all_perms)-1:
        SSE_c = 0
        for j in range(np.size(test_d_c, 1)):
            chi_gmean_test_c = chi.chi_quad(test_d_c[:, j])
            SSE_c += (chi_gmean_test_c - test_label_gmean_c[j]) ** 2
        SSEs_c[i, 3] = SSE_c / np.size(test_d_c)
        print('\nUnseen data SSE', SSE_c)
    
    
    print('\n\n\n')
    

Seen data percentage: 0.16666666666666666
Number Inputs :  3 ; Number Samples :  41

Chi min {'[1]': 0.2354879142068756, '[2]': 0.0001849275946498831, '[3]': 1.8637436170893987e-05, '[1 2]': 0.719153104800557, '[1 3]': 0.7122960354239325, '[2 3]': 0.0003870308197371083, '[1 2 3]': 1.0}

SSE 2.3781459239824693e-06

Unseen data SSE 42.94300935129752
Number Inputs :  3 ; Number Samples :  41

Chi max {'[1]': 0.2794067654824745, '[2]': 0.287202606481952, '[3]': 0.9990570883064791, '[1 2]': 0.7646721188148462, '[1 3]': 0.9995489603566573, '[2 3]': 0.9999536491448996, '[1 2 3]': 1.0}

SSE 1.369233329855145e-05

Unseen data SSE 40.23260729702878
Number Inputs :  3 ; Number Samples :  41

Chi mean {'[1]': 0.23381603250379973, '[2]': 0.2150156433321878, '[3]': 0.33333330146158896, '[1 2]': 0.7656312561380358, '[1 3]': 0.7848534138080756, '[2 3]': 0.6666667011697853, '[1 2 3]': 1.0}

SSE 1.6662938156904184e-14

Unseen data SSE 1.0290310829847267
Number Inputs :  3 ; Number Samples :  41

Chi geo

In [19]:
fig, ax = plt.subplots()
x = (np.asarray(list(range(1, len(all_perms)+1)))) / len(all_perms)
print(x.shape, SSEs.shape)
print(x)
print(SSEs)
plt.plot(x, SSEs)
ax.set_title('SSE (Data with same pattern)')
ax.legend(['Min', 'Max', 'Mean', 'Geometric Mean'])
ax.set_xlabel('Percentage of Seen Data')
ax.set_ylabel('SSE')
ax.xaxis.set_major_formatter(FuncFormatter('{0:.0%}'.format))

plt.show()

(6,) (6, 4)
[0.16666667 0.33333333 0.5        0.66666667 0.83333333 1.        ]
[[4.80433520e-09 2.76612794e-08 3.36625013e-17 7.49089748e-04]
 [7.58025949e-09 4.90270237e-08 7.18705342e-16 6.96594024e-04]
 [8.18595506e-09 3.36938880e-08 5.51118888e-16 7.18994370e-04]
 [5.40009699e-09 3.93546091e-08 2.97662542e-16 7.38822614e-04]
 [4.13600519e-09 2.28304516e-08 6.60204868e-17 7.27941691e-04]
 [3.09944702e-09 1.99495912e-08 3.33667210e-17 7.30039683e-04]]


In [20]:
fig, ax = plt.subplots()
x = (np.asarray(list(range(1, len(all_perms))))) / len(all_perms)
print(x.shape, SSEs_c.shape)
print(x)
print(SSEs_c)
plt.plot(x, SSEs_c)
ax.set_title('SSE (Data with unseen pattern)')
ax.legend(['Min', 'Max', 'Mean', 'Geometric Mean'])
ax.set_xlabel('Percentage of Seen Data')
ax.set_ylabel('SSE')
ax.xaxis.set_major_formatter(FuncFormatter('{0:.0%}'.format))

plt.show()

(5,) (5, 4)
[0.16666667 0.33333333 0.5        0.66666667 0.83333333]
[[1.71429179e-02 1.60609211e-02 4.10790851e-04 3.31711800e-03]
 [8.05585302e-03 6.39065345e-03 3.52006954e-04 2.21867577e-03]
 [9.86391872e-09 8.31515837e-03 3.40203934e-04 8.41676878e-04]
 [6.95676317e-09 4.79632700e-08 2.94031058e-16 7.79052796e-04]
 [7.66568893e-09 4.09475874e-08 1.16152832e-16 8.24999427e-04]]


In [21]:
# NN test
    
# training samples size
M = 700

# number of inputs
N_in = 3

# number of outputs aka number of Choquet integral neurons
N_out = 2  

# Create a synthetic dataset via random sampling from a normal distribution with mean =-1 and std=2
X_train = np.random.rand(M,N_in)*2-1

# Let's specify the FMs  (There will be N_out number of FMs)
# Herein we adopt binary encoding instead of lexicographic encoding to represent a FM that is easier to code. 
# As for example, an FM for three inputs using lexicographic encoding is, g = {g_1, g_2, g_3, g_{12}, g_{13}, g_{23}, g_{123}}.
# whereas its binary encoding is g = {g_1, g_2, g_{12}, g_3 g_{13}, g_{23}, g_{123}}.

# For simplicity, here we use OWA. 

# OWA = np.array([[0.7, 0.2, 0.1], # this is soft-max
#                 [0.1,0.2,0.7]])  # soft-min

# The FMs of the above OWAs in binary encoding
# FM = [[0.7, 0.7, 0.9, 0.7, 0.9, 0.9, 1.0].
#      [0.1, 0.1, 0.3, 0.1, 0.3, 0.3, 1.0]]

# print('Actual/groundtruth FMs in binary encoding:')
# print('FM1 = ', np.array([0.7, 0.7, 0.9, 0.7, 0.9, 0.9, 1.0]))
# print('FM2 = ', np.array([0.1, 0.1, 0.3, 0.1, 0.3, 0.3, 1.0]))

# Generate the label or the groundtruth based on the provided FMs/OWAs. The labels are two dimentional
# label_train = np.matmul(np.sort(X_train), np.fliplr(OWA).T)
label_train = np.amax(X_train, 0)

# Now we want to recover the FMs from the training data and groundtruth
# First, build a Choquet integral neuron with N_in inputs and N_out outputs
net = Choquet_integral(N_in,N_out)

# set the optimization algorithms and paramters the learning
learning_rate = 0.3;

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
criterion = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)   

num_epochs = 300;

# convert from numpy to torch tensor
X_train = torch.tensor(X_train,dtype=torch.float)
label_train = torch.tensor(label_train,dtype=torch.float)

# optimize
for t in range(num_epochs):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = net(X_train)

    # Compute the loss
    loss = criterion(y_pred, label_train)

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()  

# Finally, the learned FMs
FM_learned = (net.chi_nn_vars(net.vars).cpu()).detach().numpy()
print('\n\nLearned FMs:')
print('FM1 = ', FM_learned[:,0])
print('FM2 = ',FM_learned[:,1])





RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1