In [27]:
""" In this Notebook I shall investigate using Osborne's PIGEBaQ method on 
the test data obtained from my permutation algorithm. We shall see if the results agree, 
and learn a bit about the challenges of using this approach."""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

df = pd.read_csv("/Users/jacob/Documents/4YP data/ONET_18-10-21/ControlData1.csv") #This is all the data I will need
#df.drop(columns=['O*NET-SOC Code', 'Title'], inplace=True)
df.head()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Output
0,0.418287,0.106315,0.367642,0.609699,0.0
1,0.35335,0.277904,-0.357989,0.899122,0.0
2,0.1202,0.397125,-0.023316,0.789937,0.0
3,0.395666,-0.163691,0.999948,0.734246,0.0
4,0.230537,0.238473,0.186635,0.045195,0.0


In [28]:
#Lets now drop the unseen observations to leave just the training data:

df1 = df.dropna()
df1.reset_index(drop=True,inplace=True)
df1.head()

''' Not necessary when using scaled training set'''

' Not necessary when using scaled training set'

In [29]:
#We first need to model the input variables of our data. We shall assume they are gaussian (non-mixture)

mu = list(df.mean(axis='index'))
del mu[-1]
m = len(mu)
print(m)

n = len(df1.index)
print(n)

np_mu = np.array(mu)
print(np_mu)
#We have our vector of means (mx1)

#Checked with excel, works

4
200
[0.4835983  0.49095875 0.50332774 0.47212716]


In [30]:
#Now lets get our covariance matrix:

df2 = df1.drop(columns=['Output']) #This dataframe removes the output of our GP so the covariance isn't effected

df_cov = df2.cov()
df_cov.head()

np_cov = df_cov.to_numpy()
print(np_cov)
#We have our covariance matrix (mxm)

[[ 0.08141584  0.06413026  0.05244204 -0.00666724]
 [ 0.06413026  0.12806973  0.06218848 -0.00364999]
 [ 0.05244204  0.06218848  0.32192593  0.00143457]
 [-0.00666724 -0.00364999  0.00143457  0.0889202 ]]


In [None]:
print(np.shape(np_mu))
print(np.shape(np_cov))

In [31]:
#Now lets focus on the matrix of lengthscales and the output variance of the kernel:

output_var = 9.933829376632232
lengthscale = 5.873398029288366	 #both values estimated while get_params method continues to not work
np_lengthscale = lengthscale*np.identity(m)



In [32]:
#We now have all the ingredients we need to make z_n (for each observation):
#Lets begin by just focusing on the first observation, and then creating a function for this
import math

x = np.array(df2.iloc[[1]])

element1 = (output_var*math.sqrt(np.linalg.det(np_lengthscale)))/(math.sqrt(np.linalg.det(np_lengthscale+np_cov)))
#print(element1)
element2 = np.linalg.inv(np_lengthscale+np_cov)
#print(element2)
element3 = np.transpose(np_mu - x)
#print(element3)
#temp = np.matmul(element2,-1*element3)
element4 = -0.5*np.matmul(np.transpose(-1*element3),np.matmul(element2,-1*element3))
#print(element4)

z = element1*math.exp(element4.item())*np.matmul(element2,element3)
print(z)

[[ 0.17599813]
 [ 0.2944133 ]
 [ 1.206494  ]
 [-0.62361823]]


In [33]:
#Lets now create a function that does this for each observation:

def compute_zn(index):
    x = np.array(df2.iloc[[index]])

    element1 = (output_var*math.sqrt(np.linalg.det(np_lengthscale)))/(math.sqrt(np.linalg.det(np_lengthscale+np_cov)))
    element2 = np.linalg.inv(np_lengthscale+np_cov)
    element3 = np.transpose(np_mu - x)
    element4 = -0.5*np.matmul(np.transpose(-1*element3),np.matmul(element2,-1*element3))

    zn = element1*math.exp(element4.item())*np.matmul(element2,element3)
    return zn


In [34]:
Z = [0]*n
for i in range(0,n):
    Z[i] = compute_zn(i)

Z = np.array(Z)
Z = np.transpose(Z)
Z = Z.reshape(m,n)
print(Z)

[[ 9.35108753e-02  1.75998133e-01  5.43219576e-01  1.26297131e-01
   3.80227372e-01  6.08227864e-01  6.23306170e-01 -3.68313684e-02
   6.36418116e-01  3.29122376e-01  4.50894684e-01  5.52839622e-01
   4.27558304e-01  2.50379728e-01  5.25147773e-02  1.00388766e-01
   1.76634020e-01  6.14369263e-01  1.96575693e-01  6.18690869e-02
   3.82446716e-01  4.63229440e-01  2.19678882e-02  2.17154533e-01
   4.16586433e-01  3.27070251e-01  6.63080785e-01  4.82915510e-01
   1.24887057e-01  4.68161577e-01  5.85998121e-03  6.01572541e-01
   4.94281617e-01  4.99910521e-01  4.17288588e-01  5.91717192e-01
   6.17647634e-01  4.02306895e-01  3.16333061e-01  3.01041891e-01
   7.09956321e-01  2.23197781e-01  4.64529947e-01  1.35251813e-02
   7.70256291e-02  3.27817550e-01  5.78919409e-01  2.44462998e-01
   6.33216756e-01  6.29680864e-01  1.41932439e-01  4.68495871e-01
   4.36328078e-01  2.65127852e-01  6.59118595e-01  7.16652402e-01
   2.08839516e-01  5.21825277e-02  4.97848442e-01  1.03347544e-01
   4.19477

In [None]:
poop = compute_zn(1)
#np.shape(poop)
np.shape(Z)
#print(poop)

In [35]:
#We now need to find K, which is a matrix of the kernel outputs of the training data.
X = df2.to_numpy()
X_norm = np.sum(X ** 2, axis = -1)
K = output_var * np.exp(-(1/lengthscale) * (X_norm[:,None] + X_norm[None,:] - 2 * np.dot(X, X.T)))
print(K)
np.shape(K)

#I don't trust this code

[[9.93382938 8.90225811 9.34520992 ... 7.97044532 8.63472031 7.84132461]
 [8.90225811 9.93382938 9.61352785 ... 6.11462264 6.57448741 5.54661998]
 [9.34520992 9.61352785 9.93382938 ... 7.14835125 7.57368869 6.33443415]
 ...
 [7.97044532 6.11462264 7.14835125 ... 9.93382938 9.66229731 8.70744553]
 [8.63472031 6.57448741 7.57368869 ... 9.66229731 9.93382938 9.41724731]
 [7.84132461 5.54661998 6.33443415 ... 8.70744553 9.41724731 9.93382938]]


(200, 200)

In [36]:
# I do not trust the code that I've been presented with. I shall create my own function for computing the covariance matrix:

X = df2.to_numpy() #Each row is an abservation, each column is a variable

def compute_cov(x,y,var,l):  #x and y are each full observations (vectors)
    d = 0
    m = len(x)
    for i in range(m):
        d += (x[i]-y[i])**2
    val = var*np.exp(-0.5*d/(l**2))
    return val
#This function takes in two observations and outputs a scalar


num = df2.shape[0]
K = np.empty([num,num])

#Now lets start filling this matrix:

K[0,0] = compute_cov(X[0,:],X[0,:],output_var,lengthscale)
#K

for i in range(num):
    for j in range(num):
        K[i,j] = compute_cov(X[i,:],X[j,:],output_var,lengthscale)

K

#Which of these methods should I trust?

array([[9.93382938, 9.84154135, 9.88230855, ..., 9.7493442 , 9.81601   ,
        9.73579825],
       [9.84154135, 9.93382938, 9.90615159, ..., 9.53182118, 9.59084357,
        9.45303777],
       [9.88230855, 9.90615159, 9.93382938, ..., 9.65941348, 9.70705841,
        9.56052228],
       ...,
       [9.7493442 , 9.53182118, 9.65941348, ..., 9.93382938, 9.91041979,
        9.82302108],
       [9.81601   , 9.59084357, 9.70705841, ..., 9.91041979, 9.93382938,
        9.88877076],
       [9.73579825, 9.45303777, 9.56052228, ..., 9.82302108, 9.88877076,
        9.93382938]])

In [37]:
#Final ingredient: we need a row vector of the outputs:

f = df1["Output"].to_numpy()
f = f.reshape(n)
np.shape(f)

(200,)

In [38]:
#Moment of truth...

E = np.matmul(Z,np.matmul(np.linalg.inv(K),f))
print(E)

[ 1.30668736e+08 -6.27977933e+08 -3.40389077e+08  1.00497611e+08]


In [None]:
print(np.shape(Z))
print(np.shape(K))
print(np.shape(f))

In [39]:
#Lets now hitch these values onto a new dataframe of skills:

output_df = pd.DataFrame({"Skills":df2.columns,"Importance":E})

In [40]:
output_df = output_df.sort_values(by="Importance")
output_df.reset_index(drop=True, inplace = True)
print(output_df)

     Skills    Importance
0  Feature2 -6.279779e+08
1  Feature3 -3.403891e+08
2  Feature4  1.004976e+08
3  Feature1  1.306687e+08


In [None]:
output_df.to_csv("PIGEBaQ_Ex.csv")

In [None]:
X_norm

In [None]:
np.shape(X_norm)

In [None]:
X_norm[:,None]

In [None]:
X_norm[None,:]

In [None]:
np.dot(X, X.T)

In [None]:
X_norm[:,None] + X_norm[None,:]

In [None]:
df2.shape[0]

In [None]:
compute_cov(X[0,:],X[1,:],1,8)

In [None]:
test = X[0,:]
test[1]