In [18]:
""" In this Notebook I shall investigate using Osborne's PIGEBaQ method on 
the test data obtained from my permutation algorithm. We shall see if the results agree, 
and learn a bit about the challenges of using this approach."""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

# Load the unknown-jobs spreadsheet; this is all the data the notebook needs.
# NOTE: absolute, machine-specific path -- edit DATA_PATH when running elsewhere.
DATA_PATH = "/Users/jacob/Documents/4YP data/ONET_18-10-21/unknown_jobs.xlsx"

# Columns discarded straight after loading; none of them is referenced again
# in the cells that follow.
DROPPED_COLUMNS = ['O*NET-SOC Code', 'Title', 'Auto label value', 'Auto probability0', 'Auto probability1']

# Chain the drop instead of using inplace=True: df1 is bound exactly once and
# no frame is mutated after it has been created.
df1 = pd.read_excel(DATA_PATH).drop(columns=DROPPED_COLUMNS)
df1.head()

Unnamed: 0,Reading Comprehension,Active Listening,Writing,Speaking,Mathematics,Science,Critical Thinking,Active Learning,Learning Strategies,Monitoring,...,Repairing,Quality Control Analysis,Judgment and Decision Making,Systems Analysis,Systems Evaluation,Time Management,Management of Financial Resources,Management of Material Resources,Management of Personnel Resources,Auto probability
0,4.25,4.0,4.25,4.12,3.12,1.88,4.12,3.88,3.75,4.0,...,0.0,1.5,4.0,4.0,4.0,3.88,3.12,2.62,4.0,0.176306
1,4.0,4.0,3.88,4.0,2.5,1.12,4.0,3.62,3.25,4.0,...,0.0,2.12,3.75,3.0,3.12,3.75,3.38,3.25,3.88,0.090045
2,4.0,4.12,3.88,4.12,3.25,0.62,4.12,4.12,3.25,4.25,...,0.0,1.0,4.0,3.12,3.75,3.88,3.62,2.62,3.88,0.341019
3,4.25,4.12,3.88,4.12,3.12,1.5,4.25,4.12,3.5,4.25,...,0.0,1.38,4.0,3.75,3.75,3.75,3.75,2.75,3.88,0.267772
4,4.0,4.0,4.0,4.12,3.25,0.62,4.0,3.88,3.75,4.75,...,0.0,1.38,4.0,3.75,3.88,3.88,3.62,3.0,4.0,0.132264


In [31]:
#We first need to model the input variables of our data. We shall assume they are Gaussian (non-mixture)

# Column means of the input features only. 'Auto probability' is excluded by
# name (rather than by deleting the last list element), so the result does not
# depend on column order and matches the columns used for the covariance matrix.
mu = list(df1.drop(columns=['Auto probability']).mean(axis='index'))
m = len(mu)  # number of input dimensions

np_mu = np.array(mu)
print(np_mu)
#We have our vector of means (1-D array of length m)

[3.64277778 3.54018519 3.29003704 3.45434568 2.56759259 1.54771605
 3.56935802 3.19358025 2.85107407 3.38977778 3.0997284  3.15017284
 2.75749383 2.55220988 2.93035802 2.84587654 3.16495062 1.74250617
 1.02639506 1.12487654 0.37941975 0.82948148 2.32934568 1.82325926
 1.10159259 1.62637037 1.07355556 2.23146914 3.209      2.64719753
 2.63446914 3.04465432 1.33997531 1.4938642  2.58975309]


In [26]:
#Now let's get our covariance matrix:

# Remove the output of our GP so that it does not affect the covariance of the inputs.
df2 = df1.drop(columns=['Auto probability'])

# Pairwise covariance between the remaining columns.
df_cov = df2.cov()

np_cov = df_cov.to_numpy()
print(np_cov)
#We have our covariance matrix (m x m)

[[0.7023212  0.48004942 0.66776108 ... 0.31827003 0.26150273 0.35550056]
 [0.48004942 0.39169947 0.47694239 ... 0.22818567 0.1759779  0.26393503]
 [0.66776108 0.47694239 0.72033461 ... 0.33661323 0.27250134 0.37274834]
 ...
 [0.31827003 0.22818567 0.33661323 ... 0.79005266 0.62692222 0.41116798]
 [0.26150273 0.1759779  0.27250134 ... 0.62692222 0.62672609 0.36459366]
 [0.35550056 0.26393503 0.37274834 ... 0.41116798 0.36459366 0.45339129]]


In [30]:
# Confirm the dimensions of the mean vector and the covariance matrix, one per line.
print(np_mu.shape, np_cov.shape, sep="\n")

(35,)
(35, 35)


In [35]:
#Kernel hyperparameters: the output variance and the matrix built from the lengthscale.
#Both values are estimates, used while the get_params method continues to not work.
output_var = 125
lengthscale = 8

# Diagonal (m x m) matrix with `lengthscale` on every diagonal entry.
# NOTE(review): if `lengthscale` is the kernel's l in exp(-r^2 / (2 l^2)), the
# diagonal would presumably need l**2 -- confirm against the kernel definition.
np_lengthscale = np.eye(m) * lengthscale



(35, 35)

In [71]:
#We now have all the ingredients we need to make z_n (for each observation):
#Let's begin by just focusing on the first observation, and then creating a function for this
import math

# First observation, kept as a single-row 2-D array.
x = np.array(df2.iloc[[0]])

# Lengthscale matrix plus input covariance; formed once and reused below.
lengthscale_plus_cov = np_lengthscale + np_cov

# element1 = output_var * sqrt(det(Lambda)) / sqrt(det(Lambda + Sigma)).
# The ratio is evaluated through log-determinants so that neither determinant
# can overflow or underflow as the number of dimensions grows.
logdet_lengthscale = np.linalg.slogdet(np_lengthscale)[1]
logdet_lengthscale_plus_cov = np.linalg.slogdet(lengthscale_plus_cov)[1]
element1 = output_var * math.exp(0.5 * (logdet_lengthscale - logdet_lengthscale_plus_cov))

# element2 = (Lambda + Sigma)^-1
element2 = np.linalg.inv(lengthscale_plus_cov)

# element3 = (mu - x) as a column vector.
# NOTE(review): differentiating the kernel w.r.t. its first argument appears to
# give (x - mu) here instead -- confirm the sign against the PIGEBaQ derivation.
element3 = np.transpose(np_mu - x)

# element4 = -0.5 * d^T (Lambda + Sigma)^-1 d, a 1x1 array. The quadratic form
# is the same for d and -d, so element3 is used directly.
element4 = -0.5 * np.matmul(np.transpose(element3), np.matmul(element2, element3))

z = element1 * math.exp(element4.item()) * np.matmul(element2, element3)
print(z)

[[-0.01876064]
 [ 0.00322823]
 [-0.63572852]
 [-0.249215  ]
 [-0.1629253 ]
 [ 0.36663627]
 [-0.1776416 ]
 [-0.23814994]
 [-0.70359621]
 [-0.43108496]
 [-0.61152384]
 [-0.80854267]
 [-1.40358688]
 [-0.39985298]
 [ 0.0757246 ]
 [-0.03133469]
 [-0.95294061]
 [-1.48826056]
 [ 0.17465573]
 [ 0.68462686]
 [ 0.23317987]
 [ 0.02900624]
 [ 0.09286823]
 [ 0.3037086 ]
 [ 0.68643703]
 [ 1.98937267]
 [ 0.66013405]
 [ 0.80898793]
 [-0.59132452]
 [-1.48959956]
 [-1.44512639]
 [-0.9269268 ]
 [-2.39127602]
 [-1.3658371 ]
 [-1.82494356]]


In [63]:
# Dimensions of the (mu - x) column vector.
element3.shape

(35, 1)

In [67]:
# Scalar value of the exponent that is passed to math.exp when forming z.
element4.item()

-0.8985978831053343

In [70]:
# Leading scale factor of z: output_var times the square-root determinant ratio.
element1

37.973210892541736