# Analyzing x and scaling/normalizing it so that the GNN can run with this x

In [42]:
import numpy as np
import math

In [2]:
# Read the descriptor matrix from CSV as float64 and replace NaN entries by 0.
# Note: np.nan_to_num also maps +/-inf to the largest/smallest finite float.
load_x = np.nan_to_num(
    np.loadtxt("data/descriptors_x.csv", delimiter=",", dtype=np.float64),
    nan=0,
)

In [3]:
# Element-wise mask of the entries that are exactly zero.
np.equal(load_x, 0)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True]])

In [4]:
# Locate the columns whose entries are all zero so that they can be deleted.
index_cols = np.argwhere((load_x == 0).all(axis=0))
index_cols

array([[ 67],
       [ 80],
       [160],
       [195]], dtype=int64)

In [5]:
# Drop the columns listed in index_cols.
load_x = np.delete(arr=load_x, obj=index_cols, axis=1)

In [16]:
# Next: dealing with negative values.
# Columns containing negative entries are shifted so that all their values are
# positive. Very large or very small entries are handled later on, either by the
# [0, 1] scaling or by the log scaling, which treats the former negatives as
# small values and the big ones as the exception.
# The relation between the numbers is never lost, only shifted.
# Boolean mask: True for every column with at least one negative entry.
index_cols = np.any(load_x < 0, axis=0)
index_cols

array([False,  True, False, False, False, False, False, False, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False,  True, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False,  True, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False,

In [23]:
# Shift amount per flagged column: the column minimum lowered by a further 0.1.
min_entries = np.min(load_x[:, index_cols], axis=0) - 0.1
min_entries

array([ -9.869375  ,  -0.31888651,  -0.97516071,  -3.01910214,
        -3.7058597 ,  -1.24265491, -33.45      , -27.14      ,
        -0.16190162,  -0.16190162,  -0.16190162,  -0.16190162,
        -4.45069444, -12.240625  , -14.42262346,  -9.41635802,
       -38.70929934, -91.1707568 , -65.43988874, -89.98155098,
        -8.74040141, -51.28947068, -23.28585   ])

In [38]:
# Shift every flagged column by its offset so that no negative entry remains.
# Broadcasting subtracts min_entries[j] from column j directly; there is no need
# to materialise a (rows x columns) copy of the offsets via repeat/transpose.
# NOTE: this cell is not idempotent - re-running it shifts the columns again.
load_x[:, index_cols] -= min_entries

In [39]:
# Sanity check: number of negative entries in the matrix.
np.sum(load_x < 0)

0

In [44]:
# Scaling is still missing. For most columns dividing by the maximum entry is
# fine, but with very large entries that would either squash the information in
# the lower range or be driven by outliers.
# Rule: if a column has 5 or more entries above 10^10 we keep them and apply
# logarithmic scaling (base 10); otherwise they are assumed to be outliers and
# are deleted.
# Mask of the entries above 10^10.
idx_large = load_x > 1e10

In [48]:
# Count the large entries per column (sum over the rows, axis=0) and select the
# columns that get log scaling. The stated rule is "5 or more entries above
# 10^10", hence >= 5 (the previous "> 5" required at least 6).
apply_log_col_index = idx_large.sum(axis=0) >= 5
apply_log_col_index

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [55]:
# for the columns with many large entries use log10 scaling (first add 1 so that 0 entries will not produce -inf)

In [54]:
# Replace the selected columns by log10(1 + value).
load_x[:, apply_log_col_index] = np.log10(1 + load_x[:, apply_log_col_index])

In [56]:
# Every entry still above 10^10 is treated as an outlier and overwritten with 0.
load_x[np.greater(load_x, 1e10)] = 0

In [57]:
# Repeat the removal of columns that consist only of zeros.
load_x = np.delete(load_x, np.flatnonzero((load_x == 0).all(axis=0)), axis=1)

In [69]:
# [0, 1] scaling over all columns, step 1: the maximum entry of each column.
col_max_entry = load_x.max(axis=0)

In [70]:
# Step 2: divide each column in place by its maximum entry.
load_x /= col_max_entry[np.newaxis, :]

In [80]:
# Where no column had to be shifted there are now many all-zero rows; this is
# solved by appending one extra column of random values in [0, 1).
# A seeded generator is used so that re-running the notebook yields the same
# column (the previous np.random.rand call was unseeded and not reproducible).
# NOTE: this cell is not idempotent - every run appends another column.
load_x = np.column_stack((load_x, np.random.default_rng(0).random(load_x.shape[0])))

In [81]:
# Display the matrix (NumPy abbreviates large arrays in the output).
load_x

array([[0.        , 0.73820766, 0.        , ..., 0.        , 0.        ,
        0.60837226],
       [0.        , 0.73820766, 0.        , ..., 0.        , 0.        ,
        0.77614077],
       [0.        , 0.73820766, 0.        , ..., 0.        , 0.        ,
        0.70438978],
       ...,
       [0.58123546, 0.55479141, 0.58123546, ..., 0.        , 0.        ,
        0.43603667],
       [0.66394602, 0.71434246, 0.66394602, ..., 0.        , 0.        ,
        0.4158152 ],
       [0.60097239, 0.51088925, 0.60097239, ..., 0.        , 0.        ,
        0.3011869 ]])

## Testing section

In [43]:
# The 10^10 threshold as a float.
10.0 ** 10

10000000000.0

In [36]:
# Check of the repeat/transpose idiom: each row of the result is [1, 2, 3].
np.repeat(np.array([1, 2, 3]).reshape(-1, 1), 3, axis=1).T

array([[1, 2, 3],
       [1, 2, 3],
       [1, 2, 3]])

In [22]:
# Sum over all entries of column 67.
np.sum(load_x[:, 67])

2655047.4161205404

In [23]:
# Number of NaN entries in the matrix.
np.sum(np.isnan(load_x))

0

In [24]:
# (rows, columns) of the matrix.
np.shape(load_x)

(457560, 204)

In [7]:
# Per column: the number of zero, negative and positive entries. A column made
# up solely of zeros has x_zero equal to the number of rows.
x_zero = (load_x == 0).sum(axis=0)
x_neg = (load_x < 0).sum(axis=0)
x_pos = (load_x > 0).sum(axis=0)

In [8]:
# Number of columns in which every entry is zero.
np.sum(x_zero == len(load_x))

0

In [9]:
# Number of columns in which every entry is zero or negative.
np.sum((x_zero + x_neg) == len(load_x))

3

In [10]:
# Display the per-column count of negative entries.
x_neg

array([     0, 393134,      0,      0,      0,      0,      0,      0,
            0,      0,    310, 454905,      0,      0,      0,      0,
            0,      0,      0,      0, 454905,      0, 454905,      0,
       261813,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0, 454049,
            0,      0,      0,      3,      0,      0,      1,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            1,      0,      0,      0,      0,      0,      0,      0,
            1,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      1,      0,      0,
            0,      0,      0,    609,    222,   1360,   2031,  31623,
       196488,   4161,  38198,  19429,  84388,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [9]:
# Copy out the columns that contain at least one negative entry.
neg_column_entries = load_x[:, np.greater(x_neg, 0)]
np.shape(neg_column_entries)

(457560, 23)

In [10]:
# Minimal entry of every such column; the columns are shifted by it afterwards
# so that all values in the matrix are >= 0.
min_entries = np.min(neg_column_entries, axis=0)
min_entries

array([-9.76937500e+00, -2.18886515e-01, -8.75160711e-01, -2.91910214e+00,
       -3.60585970e+00, -1.14265491e+00, -3.33500000e+01, -2.70400000e+01,
       -6.19016157e-02, -6.19016157e-02, -6.19016157e-02, -6.19016157e-02,
       -4.35069444e+00, -1.21406250e+01, -1.43226235e+01, -9.31635802e+00,
       -3.86092993e+01, -9.10707568e+01, -6.53398887e+01, -8.98815510e+01,
       -8.64040141e+00, -5.11894707e+01, -2.31858500e+01])

In [11]:
# Shift each column by (its minimum - 0.1); the extra 0.1 avoids zero entries
# where the minimum occurs several times. Broadcasting applies the per-column
# offset directly instead of building a full-size repeated/transposed copy.
# NOTE: this cell is not idempotent - re-running it shifts the columns again.
neg_column_entries -= min_entries - 0.1

In [12]:
# Columns that have positive entries and no negative entry at all.
pos_column_entries = load_x[:, (x_pos > 0) & (x_neg == 0)]
np.shape(pos_column_entries)

(457560, 181)

In [13]:
# Put both column groups side by side again; every entry is now >= 0.
pos_x = np.hstack((pos_column_entries, neg_column_entries))
np.shape(pos_x)

(457560, 204)

In [14]:
# Remaining problems: rows that contain only zeros and entries that are too
# large and need scaling.
# Scaling first. Idea: divide by the maximum entry so that every entry ends up
# between 0 and 1; the maximum is taken per column (axis=0).
maximum_per_column = np.max(pos_x, axis=0)
maximum_per_column

array([1.74681989e+001, 1.74681989e+001, 8.76493197e+000, 9.48329259e-001,
       4.70734000e+003, 4.36663600e+003, 4.70450998e+003, 1.85600000e+003,
       2.00000000e+000, 1.79769313e+308, 6.01705788e-001, 2.00000000e+000,
       2.77777778e+000, 3.55555556e+000, 1.27904000e+002, 1.25904000e+002,
       3.31868517e+000, 3.12569820e+000, 1.50200000e+001, 7.10846639e+000,
       1.28277760e+004, 2.51177027e+002, 1.91115817e+002, 1.91115817e+002,
       1.55526275e+002, 1.09424866e+002, 1.09424866e+002, 8.49903232e+001,
       8.49903232e+001, 5.30649032e+001, 5.95720077e+001, 3.55820414e+001,
       7.47874836e+001, 4.75997012e+128, 2.89782514e+002, 1.48653503e+002,
       1.91500878e+003, 3.73694664e+002, 1.55945729e+002, 2.65823088e+002,
       8.43374801e+001, 3.96560434e+002, 2.71567814e+002, 7.21369131e+001,
       1.05369961e+002, 9.28075191e+001, 3.14145051e+002, 4.16183347e+002,
       1.56137987e+002, 3.13175989e+002, 3.10906253e+002, 3.31498818e+002,
       3.24557016e+001, 2

In [15]:
# Display the matrix before rescaling.
pos_x

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        8.74040141e+00, 5.12894707e+01, 2.32858500e+01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        8.74040141e+00, 5.12894707e+01, 2.32858500e+01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        8.74040141e+00, 5.12894707e+01, 2.32858500e+01],
       ...,
       [1.01531366e+01, 1.01531366e+01, 1.92523739e-01, ...,
        1.25343364e+01, 5.12894707e+01, 2.98756500e+01],
       [1.15979412e+01, 1.15979412e+01, 1.26046863e-01, ...,
        1.09318376e+01, 5.34793717e+01, 2.48403500e+01],
       [1.04979052e+01, 1.04979052e+01, 4.99433107e-03, ...,
        1.07558064e+01, 5.12894707e+01, 2.79793500e+01]])

In [16]:
# Rescale the matrix in place: divide every column by its maximum.
# Broadcasting divides column j by maximum_per_column[j] without materialising
# a full-size repeated/transposed copy of the maxima.
pos_x /= maximum_per_column
pos_x

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.45261625e-01, 7.10743590e-01, 5.07342969e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.45261625e-01, 7.10743590e-01, 5.07342969e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.45261625e-01, 7.10743590e-01, 5.07342969e-01],
       ...,
       [5.81235457e-01, 5.81235457e-01, 2.19652291e-02, ...,
        2.08315156e-01, 7.10743590e-01, 6.50918947e-01],
       [6.63946025e-01, 6.63946025e-01, 1.43808148e-02, ...,
        1.81682330e-01, 7.41090133e-01, 5.41211805e-01],
       [6.00972388e-01, 6.00972388e-01, 5.69808309e-04, ...,
        1.78756769e-01, 7.10743590e-01, 6.09603106e-01]])

In [17]:
# Verify that every entry is <= 1 after scaling. Checking all entries directly
# avoids the hard-coded row/column counts (457560, 204), which made the check
# fail for any input of a different shape.
np.all(pos_x <= 1)

True

In [18]:
# The matrix is scaled now and there are no zero columns. However, there are still 0 entries in the matrix (for example for the aid nodes). We fix this by adding a column containing ones, random values or the id, and scaling it.
# We take the last option (the id) so that columns and rows cannot be reduced (though it may fail regardless).

In [19]:
# Preview of the scaled id column: i / (rows + 1) for i = 1..rows.
np.arange(1, len(pos_x) + 1) / (len(pos_x) + 1)

array([2.18550095e-06, 4.37100190e-06, 6.55650285e-06, ...,
       9.99993443e-01, 9.99995629e-01, 9.99997814e-01])

In [20]:
# Column to append: the row id 1..rows scaled by (rows + 1).
# (Alternatives that were tried instead: a column of ones, or random values.)
column_to_append = np.arange(1, len(pos_x) + 1) / (len(pos_x) + 1)

In [21]:
# The same values arranged as a single column (shape (rows, 1)).
column_to_append.reshape(-1, 1)

array([[2.18550095e-06],
       [4.37100190e-06],
       [6.55650285e-06],
       ...,
       [9.99993443e-01],
       [9.99995629e-01],
       [9.99997814e-01]])

In [22]:
# Attach the id column as the new last column of the matrix.
# NOTE: this cell is not idempotent - every run appends another column.
pos_x = np.hstack((pos_x, column_to_append.reshape(-1, 1)))
pos_x

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        7.10743590e-01, 5.07342969e-01, 2.18550095e-06],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        7.10743590e-01, 5.07342969e-01, 4.37100190e-06],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        7.10743590e-01, 5.07342969e-01, 6.55650285e-06],
       ...,
       [5.81235457e-01, 5.81235457e-01, 2.19652291e-02, ...,
        7.10743590e-01, 6.50918947e-01, 9.99993443e-01],
       [6.63946025e-01, 6.63946025e-01, 1.43808148e-02, ...,
        7.41090133e-01, 5.41211805e-01, 9.99995629e-01],
       [6.00972388e-01, 6.00972388e-01, 5.69808309e-04, ...,
        7.10743590e-01, 6.09603106e-01, 9.99997814e-01]])

In [24]:
# Write the transformed matrix to disk as comma-separated text.
# NOTE(review): this stores pos_x (built in this testing section), not the
# load_x produced by the cells in the first part - confirm which one is intended.
np.savetxt(fname="data/descriptors_x_transformed.csv", X=pos_x, delimiter=",")