In [67]:
""" In this Notebook I shall investigate using Osborne's PIGEBaQ method on 
the test data obtained from my permutation algorithm. We shall see if the results agree, 
and learn a bit about the challenges of using this approach."""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

df = pd.read_csv("/Users/jacob/Documents/4YP data/ONET_18-10-21/scaled_total_set.csv") #This is all the data I will need
df.drop(columns=['O*NET-SOC Code', 'Title'], inplace=True)
df.head()

Unnamed: 0,Reading Comprehension,Active Listening,Writing,Speaking,Mathematics,Science,Critical Thinking,Active Learning,Learning Strategies,Monitoring,...,Repairing,Quality Control Analysis,Judgment and Decision Making,Systems Analysis,Systems Evaluation,Time Management,Management of Financial Resources,Management of Material Resources,Management of Personnel Resources,Auto label value
0,1.416147,1.658374,1.365633,1.917195,1.261042,0.220817,1.625616,1.970645,1.457906,3.373452,...,-0.462245,-0.559854,3.198755,3.231506,2.796083,3.000691,3.518175,3.370579,3.269982,0.0
1,0.471545,0.428166,0.442029,0.64073,0.75752,-0.36028,0.700087,0.603361,0.796184,1.331014,...,-0.360652,0.576425,0.525771,1.025229,1.216929,1.436145,1.634337,1.785321,1.403902,0.0
2,0.622682,0.481653,0.87131,0.696228,0.433029,-0.10495,0.589576,0.879327,1.457906,1.316106,...,-0.462245,0.327864,0.840981,0.634534,0.901098,1.436145,1.564887,1.735155,1.650365,0.0
3,0.622682,0.815949,0.87131,0.862724,0.701574,0.995613,0.755342,0.879327,1.285859,1.137206,...,-0.462245,0.173993,1.143583,1.358469,1.445634,1.651944,1.45203,0.992692,2.096347,0.0
4,0.547113,0.401422,0.546097,0.529733,0.634437,-0.377889,0.589576,0.164325,-0.28904,-0.174725,...,-0.462245,0.398881,0.513163,-0.146855,-0.122629,-0.254284,0.644672,0.169964,-0.403496,1.0


In [68]:
#Lets now drop the unseen observations to leave just the training data:

df1 = df.dropna()
df1.reset_index(drop=True,inplace=True)
df1.head()

''' Not necessary when using scaled training set'''

' Not necessary when using scaled training set'

In [69]:
#We first need to model the input variables of our data. We shall assume they are gaussian (non-mixture)

mu = list(df.mean(axis='index'))
del mu[-1]
m = len(mu)
print(m)

n = len(df1.index)
print(n)

np_mu = np.array(mu)
print(np_mu)
#We have our vector of means (mx1)

#Checked with excel, works

35
63
[ 0.03752818 -0.02286795  0.03277732 -0.00432976 -0.01632227  0.05702519
  0.00698149  0.00105528 -0.03947646 -0.02232934 -0.05512616 -0.04766119
 -0.04796171 -0.09966144 -0.04570855 -0.1407273  -0.00371802  0.05501268
  0.06092517  0.02039227  0.05932767  0.00446861 -0.0243302  -0.00667049
  0.0367995  -0.01393973  0.04027407  0.0486789  -0.01779188 -0.03212745
 -0.04864175 -0.00166131 -0.01951496 -0.01525377 -0.04618905]


In [70]:
#Now lets get our covariance matrix:

df2 = df1.drop(columns=['Auto label value']) #This dataframe removes the output of our GP so the covariance isn't effected

df_cov = df2.cov()
df_cov.head()

np_cov = df_cov.to_numpy()
print(np_cov)
#We have our covariance matrix (mxm)

[[1.212615   0.97899862 1.14550018 ... 0.49550273 0.46400925 0.50102343]
 [0.97899862 1.01154023 1.00482735 ... 0.41960006 0.36609006 0.49286511]
 [1.14550018 1.00482735 1.18477781 ... 0.43113451 0.39791495 0.49348872]
 ...
 [0.49550273 0.41960006 0.43113451 ... 0.98767927 0.95893632 0.77478643]
 [0.46400925 0.36609006 0.39791495 ... 0.95893632 1.01381003 0.81230896]
 [0.50102343 0.49286511 0.49348872 ... 0.77478643 0.81230896 0.98395153]]


In [71]:
print(np.shape(np_mu))
print(np.shape(np_cov))

(35,)
(35, 35)


In [72]:
#Now lets focus on the matrix of lengthscales and the output variance of the kernel:

output_var = 3.48
lengthscale = 8.13  #both values estimated while get_params method continues to not work
np_lengthscale = lengthscale*np.identity(m)



In [73]:
#We now have all the ingredients we need to make z_n (for each observation):
#Lets begin by just focusing on the first observation, and then creating a function for this
import math

x = np.array(df2.iloc[[1]])

element1 = (output_var*math.sqrt(np.linalg.det(np_lengthscale)))/(math.sqrt(np.linalg.det(np_lengthscale+np_cov)))
#print(element1)
element2 = np.linalg.inv(np_lengthscale+np_cov)
#print(element2)
element3 = np.transpose(np_mu - x)
#print(element3)
#temp = np.matmul(element2,-1*element3)
element4 = -0.5*np.matmul(np.transpose(-1*element3),np.matmul(element2,-1*element3))
#print(element4)

z = element1*math.exp(element4.item())*np.matmul(element2,element3)
print(z)

[[ 0.00276283]
 [ 0.00094172]
 [ 0.00315331]
 [-0.00267687]
 [-0.01184604]
 [ 0.03229108]
 [-0.0057681 ]
 [-0.00062749]
 [-0.01054357]
 [-0.02768055]
 [-0.00522451]
 [-0.02961399]
 [-0.01188937]
 [-0.02243728]
 [-0.00491342]
 [ 0.00136302]
 [ 0.00260466]
 [-0.0266213 ]
 [ 0.01060608]
 [ 0.01817409]
 [ 0.00797495]
 [ 0.01446153]
 [-0.02561375]
 [-0.02483259]
 [ 0.01698292]
 [-0.01144229]
 [ 0.01275354]
 [-0.01712443]
 [ 0.00345916]
 [-0.01543496]
 [-0.02293169]
 [-0.02879506]
 [-0.03818787]
 [-0.04390562]
 [-0.02999286]]


In [74]:
#Lets now create a function that does this for each observation:

def compute_zn(index):
    x = np.array(df2.iloc[[index]])

    element1 = (output_var*math.sqrt(np.linalg.det(np_lengthscale)))/(math.sqrt(np.linalg.det(np_lengthscale+np_cov)))
    element2 = np.linalg.inv(np_lengthscale+np_cov)
    element3 = np.transpose(np_mu - x)
    element4 = -0.5*np.matmul(np.transpose(-1*element3),np.matmul(element2,-1*element3))

    zn = element1*math.exp(element4.item())*np.matmul(element2,element3)
    return zn


In [75]:
Z = [0]*n
for i in range(0,n):
    Z[i] = compute_zn(i)

Z = np.array(Z)
Z = np.transpose(Z)
Z = Z.reshape(m,n)
print(Z)

[[ 0.00022248  0.00276283  0.00042028 ...  0.00727782  0.04306511
   0.00490365]
 [-0.00063314  0.00094172  0.00302241 ...  0.00715614  0.00993501
   0.00601285]
 [ 0.00030379  0.00315331 -0.00824936 ...  0.00399186  0.02768605
   0.00698272]
 ...
 [-0.00474205 -0.03818787 -0.03151992 ...  0.0030293  -0.00138965
  -0.00138328]
 [-0.00442558 -0.04390562 -0.03750254 ...  0.00315084 -0.00034172
   0.00053402]
 [-0.00406597 -0.02999286 -0.03326817 ... -0.00457978  0.00740965
   0.01007128]]


In [86]:
poop = compute_zn(1)
#np.shape(poop)
np.shape(Z)
#print(poop)

[[ 0.00276283]
 [ 0.00094172]
 [ 0.00315331]
 [-0.00267687]
 [-0.01184604]
 [ 0.03229108]
 [-0.0057681 ]
 [-0.00062749]
 [-0.01054357]
 [-0.02768055]
 [-0.00522451]
 [-0.02961399]
 [-0.01188937]
 [-0.02243728]
 [-0.00491342]
 [ 0.00136302]
 [ 0.00260466]
 [-0.0266213 ]
 [ 0.01060608]
 [ 0.01817409]
 [ 0.00797495]
 [ 0.01446153]
 [-0.02561375]
 [-0.02483259]
 [ 0.01698292]
 [-0.01144229]
 [ 0.01275354]
 [-0.01712443]
 [ 0.00345916]
 [-0.01543496]
 [-0.02293169]
 [-0.02879506]
 [-0.03818787]
 [-0.04390562]
 [-0.02999286]]


In [77]:
#We now need to find K, which is a matrix of the kernel outputs of the training data.
X = df2.to_numpy()
X_norm = np.sum(X ** 2, axis = -1)
K = output_var * np.exp(-(1/lengthscale) * (X_norm[:,None] + X_norm[None,:] - 2 * np.dot(X, X.T)))
print(K)
np.shape(K)

#I don't trust this code

[[3.48000000e+00 2.06787098e-03 9.74759995e-04 ... 1.08346600e-13
  6.96167650e-18 1.43109995e-19]
 [2.06787098e-03 3.48000000e+00 1.00992596e+00 ... 3.66008006e-05
  9.92392145e-07 4.85009871e-08]
 [9.74759995e-04 1.00992596e+00 3.48000000e+00 ... 3.32364577e-06
  2.10022450e-07 2.16432386e-09]
 ...
 [1.08346600e-13 3.66008006e-05 3.32364577e-06 ... 3.48000000e+00
  7.17187467e-04 5.59579155e-02]
 [6.96167650e-18 9.92392145e-07 2.10022450e-07 ... 7.17187467e-04
  3.48000000e+00 1.11639997e-01]
 [1.43109995e-19 4.85009871e-08 2.16432386e-09 ... 5.59579155e-02
  1.11639997e-01 3.48000000e+00]]


(63, 63)

In [78]:
# I do not trust the code that I've been presented with. I shall create my own function for computing the covariance matrix:

X = df2.to_numpy() #Each row is an abservation, each column is a variable

def compute_cov(x,y,var,l):  #x and y are each full observations (vectors)
    d = 0
    m = len(x)
    for i in range(m):
        d += (x[i]-y[i])**2
    val = var*np.exp(-0.5*d/(l**2))
    return val
#This function takes in two observations and outputs a scalar


num = df2.shape[0]
K = np.empty([num,num])

#Now lets start filling this matrix:

K[0,0] = compute_cov(X[0,:],X[0,:],output_var,lengthscale)
#K

for i in range(num):
    for j in range(num):
        K[i,j] = compute_cov(X[i,:],X[j,:],output_var,lengthscale)

K

#Which of these methods should I trust?

array([[3.48      , 2.20381344, 2.10420062, ..., 0.51393116, 0.2838495 ,
        0.22352901],
       [2.20381344, 3.48      , 3.22504364, ..., 1.71958532, 1.37741282,
        1.14403932],
       [2.10420062, 3.22504364, 3.48      , ..., 1.48370556, 1.25195011,
        0.94490637],
       ...,
       [0.51393116, 1.71958532, 1.48370556, ..., 3.48      , 2.0648631 ,
        2.69938296],
       [0.2838495 , 1.37741282, 1.25195011, ..., 2.0648631 , 3.48      ,
        2.81651534],
       [0.22352901, 1.14403932, 0.94490637, ..., 2.69938296, 2.81651534,
        3.48      ]])

In [79]:
#Final ingredient: we need a row vector of the outputs:

f = df1["Auto label value"].to_numpy()
f = f.reshape(n)
np.shape(f)

(63,)

In [80]:
#Moment of truth...

E = np.matmul(Z,np.matmul(np.linalg.inv(K),f))
print(E)

[-0.2573979  -0.05659378 -0.15506038 -0.18654817 -0.75427853 -0.18625383
  0.24173035  0.15150696  0.29413873 -0.01649291  0.11497358  0.17062347
  0.21726065 -0.33312263 -0.10107973  0.46224445  0.12928378 -0.26798706
 -0.00273377  0.2684818  -0.15783488 -0.24817751  0.03984263  0.31825804
 -0.07589788 -0.0662493  -0.13615566  0.4867432  -0.17784934 -0.55207722
  0.19629749 -0.06179917 -0.05521807 -0.01234103  0.00639952]


In [81]:
print(np.shape(Z))
print(np.shape(K))
print(np.shape(f))

(35, 63)
(63, 63)
(63,)


In [82]:
#Lets now hitch these values onto a new dataframe of skills:

output_df = pd.DataFrame({"Skills":df2.columns,"Importance":E})

In [83]:
output_df = output_df.sort_values(by="Importance")
output_df.reset_index(drop=True, inplace = True)
print(output_df)

                               Skills  Importance
0                         Mathematics   -0.754279
1                    Systems Analysis   -0.552077
2                         Negotiation   -0.333123
3                 Operations Analysis   -0.267987
4               Reading Comprehension   -0.257398
5                         Programming   -0.248178
6                            Speaking   -0.186548
7                             Science   -0.186254
8        Judgment and Decision Making   -0.177849
9                        Installation   -0.157835
10                            Writing   -0.155060
11                          Repairing   -0.136156
12                        Instructing   -0.101080
13              Equipment Maintenance   -0.075898
14                    Troubleshooting   -0.066249
15                    Time Management   -0.061799
16                   Active Listening   -0.056594
17  Management of Financial Resources   -0.055218
18                         Monitoring   -0.016493


In [88]:
output_df.to_csv("PIGEBaQ_Ex.csv")

In [12]:
X_norm

array([554.0729, 330.1999, 332.6104, 378.3367, 237.0379, 243.5691,
       299.7154, 187.9249, 301.2836, 298.1673, 283.1239, 234.4784,
       219.5566, 232.146 , 215.9703, 323.7935, 310.0904, 403.8916,
       207.7635, 241.111 , 308.8178, 504.3654, 308.5041, 310.5558,
       354.0708, 380.4842, 351.2378, 244.381 , 364.0085, 188.4971,
       205.7941, 286.1248, 186.5247, 222.3067, 210.6354, 391.0291,
       138.788 , 318.1841,  97.0642, 116.4398, 104.9929,  91.0192,
       124.5191, 176.4997, 177.2292, 177.4204, 126.6923, 112.8203,
       180.6432, 129.7653, 204.0797, 123.6887, 152.7537, 135.9679,
       247.3617, 205.4042, 170.0322,  99.6029, 199.6029, 139.1711,
       232.2723, 109.019 , 133.3697])

In [13]:
np.shape(X_norm)

(63,)

In [23]:
X_norm[:,None]

array([[554.0729],
       [330.1999],
       [332.6104],
       [378.3367],
       [237.0379],
       [243.5691],
       [299.7154],
       [187.9249],
       [301.2836],
       [298.1673],
       [283.1239],
       [234.4784],
       [219.5566],
       [232.146 ],
       [215.9703],
       [323.7935],
       [310.0904],
       [403.8916],
       [207.7635],
       [241.111 ],
       [308.8178],
       [504.3654],
       [308.5041],
       [310.5558],
       [354.0708],
       [380.4842],
       [351.2378],
       [244.381 ],
       [364.0085],
       [188.4971],
       [205.7941],
       [286.1248],
       [186.5247],
       [222.3067],
       [210.6354],
       [391.0291],
       [138.788 ],
       [318.1841],
       [ 97.0642],
       [116.4398],
       [104.9929],
       [ 91.0192],
       [124.5191],
       [176.4997],
       [177.2292],
       [177.4204],
       [126.6923],
       [112.8203],
       [180.6432],
       [129.7653],
       [204.0797],
       [123.6887],
       [152.

In [16]:
X_norm[None,:]

(1, 63)

In [18]:
np.dot(X, X.T)

array([[554.0729, 422.3088, 419.1619, ..., 308.5094, 226.0056, 228.3662],
       [422.3088, 330.1999, 327.3978, ..., 250.1323, 181.6586, 186.4566],
       [419.1619, 327.3978, 332.6104, ..., 246.0862, 181.8938, 182.6163],
       ...,
       [308.5094, 250.1323, 246.0862, ..., 232.2723, 149.9388, 173.43  ],
       [226.0056, 181.6586, 181.8938, ..., 149.9388, 109.019 , 113.1337],
       [228.3662, 186.4566, 182.6163, ..., 173.43  , 113.1337, 133.3697]])

In [22]:
X_norm[:,None] + X_norm[None,:]

array([[1108.1458,  884.2728,  886.6833, ...,  786.3452,  663.0919,
         687.4426],
       [ 884.2728,  660.3998,  662.8103, ...,  562.4722,  439.2189,
         463.5696],
       [ 886.6833,  662.8103,  665.2208, ...,  564.8827,  441.6294,
         465.9801],
       ...,
       [ 786.3452,  562.4722,  564.8827, ...,  464.5446,  341.2913,
         365.642 ],
       [ 663.0919,  439.2189,  441.6294, ...,  341.2913,  218.038 ,
         242.3887],
       [ 687.4426,  463.5696,  465.9801, ...,  365.642 ,  242.3887,
         266.7394]])

In [24]:
df2.shape[0]

63

In [34]:
compute_cov(X[0,:],X[1,:],1,8)

0.7335890753394024

In [36]:
test = X[0,:]
test[1]

4.88