In [1]:
""" In this Notebook I shall investigate using Osborne's PIGEBaQ method on 
the test data obtained from my permutation algorithm. We shall see if the results agree, 
and learn a bit about the challenges of using this approach."""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

df = pd.read_csv("/Users/jacob/Documents/4YP data/ONET_18-10-21/scaled_total_set.csv") #This is all the data I will need
df.drop(columns=['O*NET-SOC Code', 'Title'], inplace=True)
df.head()

Unnamed: 0,Reading Comprehension,Active Listening,Writing,Speaking,Mathematics,Science,Critical Thinking,Active Learning,Learning Strategies,Monitoring,...,Repairing,Quality Control Analysis,Judgment and Decision Making,Systems Analysis,Systems Evaluation,Time Management,Management of Financial Resources,Management of Material Resources,Management of Personnel Resources,Auto label value
0,1.416147,1.658374,1.365633,1.917195,1.261042,0.220817,1.625616,1.970645,1.457906,3.373452,...,-0.462245,-0.559854,3.198755,3.231506,2.796083,3.000691,3.518175,3.370579,3.269982,0.0
1,0.471545,0.428166,0.442029,0.64073,0.75752,-0.36028,0.700087,0.603361,0.796184,1.331014,...,-0.360652,0.576425,0.525771,1.025229,1.216929,1.436145,1.634337,1.785321,1.403902,0.0
2,0.622682,0.481653,0.87131,0.696228,0.433029,-0.10495,0.589576,0.879327,1.457906,1.316106,...,-0.462245,0.327864,0.840981,0.634534,0.901098,1.436145,1.564887,1.735155,1.650365,0.0
3,0.622682,0.815949,0.87131,0.862724,0.701574,0.995613,0.755342,0.879327,1.285859,1.137206,...,-0.462245,0.173993,1.143583,1.358469,1.445634,1.651944,1.45203,0.992692,2.096347,0.0
4,0.547113,0.401422,0.546097,0.529733,0.634437,-0.377889,0.589576,0.164325,-0.28904,-0.174725,...,-0.462245,0.398881,0.513163,-0.146855,-0.122629,-0.254284,0.644672,0.169964,-0.403496,1.0


In [2]:
#Lets now drop the unseen observations to leave just the training data:

df1 = df.dropna()
df1.reset_index(drop=True,inplace=True)
df1.head()

''' Not necessary when using scaled training set'''

' Not necessary when using scaled training set'

In [3]:
#We first need to model the input variables of our data. We shall assume they are gaussian (non-mixture)

mu = list(df.mean(axis='index'))
del mu[-1]
m = len(mu)
print(m)

n = len(df1.index)
print(n)

np_mu = np.array(mu)
print(np_mu)
#We have our vector of means (mx1)

#Checked with excel, works

35
63
[ 0.03752818 -0.02286795  0.03277732 -0.00432976 -0.01632227  0.05702519
  0.00698149  0.00105528 -0.03947646 -0.02232934 -0.05512616 -0.04766119
 -0.04796171 -0.09966144 -0.04570855 -0.1407273  -0.00371802  0.05501268
  0.06092517  0.02039227  0.05932767  0.00446861 -0.0243302  -0.00667049
  0.0367995  -0.01393973  0.04027407  0.0486789  -0.01779188 -0.03212745
 -0.04864175 -0.00166131 -0.01951496 -0.01525377 -0.04618905]


In [4]:
#Now lets get our covariance matrix:

df2 = df1.drop(columns=['Auto label value']) #This dataframe removes the output of our GP so the covariance isn't effected

df_cov = df2.cov()
df_cov.head()

np_cov = df_cov.to_numpy()
print(np_cov)
#We have our covariance matrix (mxm)

[[1.212615   0.97899862 1.14550018 ... 0.49550273 0.46400925 0.50102343]
 [0.97899862 1.01154023 1.00482735 ... 0.41960006 0.36609006 0.49286511]
 [1.14550018 1.00482735 1.18477781 ... 0.43113451 0.39791495 0.49348872]
 ...
 [0.49550273 0.41960006 0.43113451 ... 0.98767927 0.95893632 0.77478643]
 [0.46400925 0.36609006 0.39791495 ... 0.95893632 1.01381003 0.81230896]
 [0.50102343 0.49286511 0.49348872 ... 0.77478643 0.81230896 0.98395153]]


In [None]:
print(np.shape(np_mu))
print(np.shape(np_cov))

In [5]:
#Now lets focus on the matrix of lengthscales and the output variance of the kernel:

output_var = 2.3316000000000003
lengthscale = 14.390100000000004  #both values estimated while get_params method continues to not work
np_lengthscale = lengthscale*np.identity(m)



In [6]:
#We now have all the ingredients we need to make z_n (for each observation):
#Lets begin by just focusing on the first observation, and then creating a function for this
import math

x = np.array(df2.iloc[[1]])

element1 = (output_var*math.sqrt(np.linalg.det(np_lengthscale)))/(math.sqrt(np.linalg.det(np_lengthscale+np_cov)))
#print(element1)
element2 = np.linalg.inv(np_lengthscale+np_cov)
#print(element2)
element3 = np.transpose(np_mu - x)
#print(element3)
#temp = np.matmul(element2,-1*element3)
element4 = -0.5*np.matmul(np.transpose(-1*element3),np.matmul(element2,-1*element3))
#print(element4)

z = element1*math.exp(element4.item())*np.matmul(element2,element3)
print(z)

[[ 0.00011649]
 [-0.00149788]
 [ 0.00045762]
 [-0.00546622]
 [-0.01297132]
 [ 0.02545067]
 [-0.00766286]
 [-0.00354615]
 [-0.01228941]
 [-0.02775709]
 [-0.00786925]
 [-0.03023731]
 [-0.01487956]
 [-0.02389525]
 [-0.00798971]
 [-0.00240927]
 [-0.00138198]
 [-0.02623644]
 [ 0.00714309]
 [ 0.01480278]
 [ 0.00739952]
 [ 0.01130127]
 [-0.02356174]
 [-0.02225887]
 [ 0.01469697]
 [-0.00997662]
 [ 0.01097069]
 [-0.01575743]
 [-0.00089057]
 [-0.01731699]
 [-0.02384149]
 [-0.02925903]
 [-0.03793443]
 [-0.04280505]
 [-0.03065676]]


In [7]:
#Lets now create a function that does this for each observation:

def compute_zn(index):
    x = np.array(df2.iloc[[index]])

    element1 = (output_var*math.sqrt(np.linalg.det(np_lengthscale)))/(math.sqrt(np.linalg.det(np_lengthscale+np_cov)))
    element2 = np.linalg.inv(np_lengthscale+np_cov)
    element3 = np.transpose(np_mu - x)
    element4 = -0.5*np.matmul(np.transpose(-1*element3),np.matmul(element2,-1*element3))

    zn = element1*math.exp(element4.item())*np.matmul(element2,element3)
    return zn


In [8]:
Z = [0]*n
for i in range(0,n):
    Z[i] = compute_zn(i)

Z = np.array(Z)
Z = np.transpose(Z)
Z = Z.reshape(m,n)
print(Z)

[[-0.00054779  0.00011649 -0.00216038 ...  0.01101506  0.04148635
   0.00890875]
 [-0.00205622 -0.00149788 -0.00013263 ...  0.01101027  0.01308579
   0.01011851]
 [-0.00043766  0.00045762 -0.00997745 ...  0.00727168  0.02851336
   0.01124271]
 ...
 [-0.00951555 -0.03793443 -0.03249189 ...  0.00340099  0.00168985
  -0.00016125]
 [-0.0089571  -0.04280505 -0.03779664 ...  0.00284834  0.00276941
   0.00143169]
 [-0.00835241 -0.03065676 -0.03445337 ... -0.00639661  0.00983245
   0.01205588]]


In [None]:
poop = compute_zn(1)
#np.shape(poop)
np.shape(Z)
#print(poop)

In [None]:
#We now need to find K, which is a matrix of the kernel outputs of the training data.
X = df2.to_numpy()
X_norm = np.sum(X ** 2, axis = -1)
K = output_var * np.exp(-(1/lengthscale) * (X_norm[:,None] + X_norm[None,:] - 2 * np.dot(X, X.T)))
print(K)
np.shape(K)

#I don't trust this code

In [9]:
# I do not trust the code that I've been presented with. I shall create my own function for computing the covariance matrix:

X = df2.to_numpy() #Each row is an abservation, each column is a variable

def compute_cov(x,y,var,l):  #x and y are each full observations (vectors)
    d = 0
    m = len(x)
    for i in range(m):
        d += (x[i]-y[i])**2
    val = var*np.exp(-0.5*d/(l**2))
    return val
#This function takes in two observations and outputs a scalar


num = df2.shape[0]
K = np.empty([num,num])

#Now lets start filling this matrix:

K[0,0] = compute_cov(X[0,:],X[0,:],output_var,lengthscale)
#K

for i in range(num):
    for j in range(num):
        K[i,j] = compute_cov(X[i,:],X[j,:],output_var,lengthscale)

K

#Which of these methods should I trust?

array([[2.3316    , 2.01523051, 1.98569655, ..., 1.26621823, 1.04764759,
        0.97072806],
       [2.01523051, 2.3316    , 2.27565668, ..., 1.86179271, 1.7344988 ,
        1.63470596],
       [1.98569655, 2.27565668, 2.3316    , ..., 1.77614619, 1.68242147,
        1.53790662],
       ...,
       [1.26621823, 1.86179271, 1.77614619, ..., 2.3316    , 1.97377122,
        2.15001923],
       [1.04764759, 1.7344988 , 1.68242147, ..., 1.97377122, 2.3316    ,
        2.17936864],
       [0.97072806, 1.63470596, 1.53790662, ..., 2.15001923, 2.17936864,
        2.3316    ]])

In [10]:
#Final ingredient: we need a row vector of the outputs:

f = df1["Auto label value"].to_numpy()
f = f.reshape(n)
np.shape(f)

(63,)

In [11]:
#Moment of truth...

E = np.matmul(Z,np.matmul(np.linalg.inv(K),f))
print(E)

[-1.24571308 -0.42691972 -0.7364672   0.37694129 -5.90102583 -1.58464594
  1.75518411  0.76251649  1.03462635 -0.57934575  0.72122459  0.05398722
  0.63459162 -1.97177508 -0.5674741   3.61182461  0.21800034 -3.01548145
 -1.21767194  1.67737079 -1.56514624 -1.76174394  0.94215582  2.52243659
 -0.5864454  -1.01802825 -1.17275897  2.3269412  -1.46918052 -4.86203424
 -0.36162484 -1.28854735 -1.79537025 -1.48509259 -0.97351873]


In [None]:
print(np.shape(Z))
print(np.shape(K))
print(np.shape(f))

In [12]:
#Lets now hitch these values onto a new dataframe of skills:

output_df = pd.DataFrame({"Skills":df2.columns,"Importance":E})

In [13]:
output_df = output_df.sort_values(by="Importance")
output_df.reset_index(drop=True, inplace = True)
print(output_df)

                               Skills  Importance
0                         Mathematics   -5.901026
1                    Systems Analysis   -4.862034
2                 Operations Analysis   -3.015481
3                         Negotiation   -1.971775
4   Management of Financial Resources   -1.795370
5                         Programming   -1.761744
6                             Science   -1.584646
7                        Installation   -1.565146
8    Management of Material Resources   -1.485093
9        Judgment and Decision Making   -1.469181
10                    Time Management   -1.288547
11              Reading Comprehension   -1.245713
12                  Technology Design   -1.217672
13                          Repairing   -1.172759
14                    Troubleshooting   -1.018028
15  Management of Personnel Resources   -0.973519
16                            Writing   -0.736467
17              Equipment Maintenance   -0.586445
18                         Monitoring   -0.579346


In [None]:
output_df.to_csv("PIGEBaQ_Ex.csv")

In [None]:
X_norm

In [None]:
np.shape(X_norm)

In [None]:
X_norm[:,None]

In [None]:
X_norm[None,:]

In [None]:
np.dot(X, X.T)

In [None]:
X_norm[:,None] + X_norm[None,:]

In [None]:
df2.shape[0]

In [None]:
compute_cov(X[0,:],X[1,:],1,8)

In [None]:
test = X[0,:]
test[1]