In [1]:
# Read in data from an Excel file. The Excel file needs to be in the format described below. The data can then be used to
# apply either of the two models (Model A or Model B).

# First sheet contains the number of participants, artefacts, and measurements as well as a table showing which participants
# measured which artefacts, and details of the linking. The second sheet has a column of the measurement values, and the 
# covariance matrix. Contents of other sheets don't matter.

In [2]:
import xlrd
import math
import xlwt
from scipy import stats
import numpy as np
import numpy.linalg as linalg

In [3]:
def read_data(filename):
    """Read a comparison workbook and assemble the inputs for the GLS fit.

    Workbook layout (0-based row/column indices, as passed to ``cell_value``):

    * Sheet 0: cell (2, 1) is the number of artefacts, (3, 1) the number of
      participants and (4, 1) the number of measurements. Artefact names are
      in row 8 starting at column 1; participant names are in column 0
      starting at row 9.
    * Sheet 1: one row per measurement starting at row 3. Column 1 is the
      measured value; columns 2, 3 and 4 are the measurement titles (column 2
      is matched against the participant names, column 3 against the artefact
      names, column 4 is stored but not interpreted here); the covariance
      matrix starts at column 7.

    Parameters
    ----------
    filename
        Path of the workbook, passed straight to ``xlrd.open_workbook``.

    Returns
    -------
    tuple
        ``(y, U, NumArtefacts, NumMeasurements, NumParticipants, ave_U, X, w,
        ParticipantNames, MeasurementTitles, count_per_lab, ArtefactNames)``

        * ``y`` -- (N, 1) array of measurement values.
        * ``U`` -- (N, N) array read from the covariance block.
        * ``ave_U`` -- length-P array, mean of ``sqrt(U[i, i])`` over each
          participant's measurements.
        * ``X`` -- (N, F + P) design matrix; row i has a 1 in the column of the
          matching artefact and a 1 in column ``F + k`` of the matching
          participant.
        * ``w`` -- (F + P, 1) weight vector: zeros for the artefacts and
          ``1 / ave_U**2`` for the participants, normalised to sum to 1 and
          then rescaled by ``sqrt(c)`` (see the comment in the body).
        * ``MeasurementTitles`` -- list of 3-element lists, one per measurement.
        * ``count_per_lab`` -- length-P array of measurements per participant.

    Raises
    ------
    ValueError
        If a listed participant has no matching measurement row, or if an
        average uncertainty is not strictly positive. Either case would
        otherwise propagate NaN/inf into the weights without any error.

    Notes
    -----
    A measurement whose participant or artefact name matches no listed name is
    not rejected here; the corresponding entries of its row in ``X`` simply
    stay zero.
    """
    # Open workbook
    book = xlrd.open_workbook(filename)
    sheet1 = book.sheet_by_index(0)

    # Size of the comparison: artefacts (F), participants (P), measurements (N)
    NumArtefacts = int(sheet1.cell_value(2, 1))
    NumParticipants = int(sheet1.cell_value(3, 1))
    NumMeasurements = int(sheet1.cell_value(4, 1))

    # Artefact names run along row 8, participant names down column 0
    ArtefactNames = [sheet1.cell_value(8, col + 1) for col in range(NumArtefacts)]
    ParticipantNames = [sheet1.cell_value(row + 9, 0) for row in range(NumParticipants)]

    # Measurement values (y), covariance matrix (U) and measurement titles
    sheet2 = book.sheet_by_index(1)
    y = np.zeros((NumMeasurements, 1))
    U = np.zeros((NumMeasurements, NumMeasurements))
    MeasurementTitles = []
    for i in range(NumMeasurements):
        row = i + 3
        y[i, 0] = sheet2.cell_value(row, 1)
        for j in range(NumMeasurements):
            U[i, j] = sheet2.cell_value(row, j + 7)
        MeasurementTitles.append([sheet2.cell_value(row, col) for col in (2, 3, 4)])

    # Design matrix X, plus per-participant running sums used for the weights
    X = np.zeros((NumMeasurements, NumArtefacts + NumParticipants))
    ave_U = np.zeros(NumParticipants)
    count_per_lab = np.zeros(NumParticipants)

    for i, title in enumerate(MeasurementTitles):
        lab_name, artefact_name = title[0], title[1]

        for j, name in enumerate(ArtefactNames):
            if artefact_name == name:
                X[i, j] = 1.0

        for k, name in enumerate(ParticipantNames):
            if lab_name == name:
                X[i, NumArtefacts + k] = 1.0
                # Accumulate the standard uncertainty of this measurement for lab k
                ave_U[k] += np.sqrt(U[i, i])
                count_per_lab[k] += 1

    # A participant without measurements would give 0/0 below and NaN weights
    unmeasured = [str(ParticipantNames[k]) for k in range(NumParticipants)
                  if count_per_lab[k] == 0]
    if unmeasured:
        raise ValueError(
            "No measurements found for participant(s): " + ", ".join(unmeasured))

    # Average uncertainty per participant
    ave_U = ave_U / count_per_lab

    # Zero (or NaN) average uncertainties would give infinite/NaN weights
    if not np.all(ave_U > 0):
        raise ValueError(
            "Average uncertainty of every participant must be strictly positive")

    # Weight vector of length F+P: first F entries are 0, the remaining P
    # entries are the inverse squared average uncertainty of each participant
    w = np.zeros((NumArtefacts + NumParticipants, 1))
    w[NumArtefacts:, 0] = 1.0 / ave_U**2

    # Standardise the weights so they sum to 1
    w = w / w.sum()

    # Scale the weights so that w.w^T has the same order of magnitude as
    # X^T.inv(U).X, which it is added to when GLS is applied
    c = np.mean(linalg.multi_dot([X.T, linalg.inv(U), X])) / np.mean(np.dot(w, w.T))
    w = np.sqrt(c) * w

    return (y, U, NumArtefacts, NumMeasurements, NumParticipants, ave_U, X, w,
            ParticipantNames, MeasurementTitles, count_per_lab, ArtefactNames)

In [4]:
def calculate_beta(y, U, X, w):
    """Generalised least squares solution with the weight constraint added.

    Parameters
    ----------
    y : measurements
    U : covariance matrix of the measurements
    X : design matrix
    w : weights; ``w . w^T`` is added to ``X^T . inv(U) . X`` before inversion

    Returns
    -------
    (gamma, beta, CovBeta)
        ``gamma = inv(X^T inv(U) X + w w^T) . X^T . inv(U)``,
        ``beta = gamma . y`` and ``CovBeta = gamma . U . gamma^T``.
    """
    # Inverse of the measurement covariance, needed in both factors of gamma
    U_inverse = linalg.inv(U)

    # Normal-equations matrix with the outer product of the weights added
    normal_matrix = linalg.multi_dot([X.T, U_inverse, X]) + np.dot(w, w.T)

    gamma = linalg.multi_dot([linalg.inv(normal_matrix), X.T, U_inverse])
    beta = np.dot(gamma, y)
    CovBeta = linalg.multi_dot([gamma, U, gamma.T])

    return gamma, beta, CovBeta

In [5]:
def model_B(y, U, X, beta):
    """Chi-squared statistic for Model B (y = theta + delta + e) under GLS.

    Parameters
    ----------
    y : measurements
    U : covariance matrix of the measurements
    X : design matrix
    beta : fitted parameter vector

    Returns
    -------
    ChiSq_B
        ``(y - X.beta)^T . inv(U) . (y - X.beta)``
    """
    # Residual of the fit, used on both sides of the quadratic form
    residual = y - np.dot(X, beta)

    ChiSq_B = linalg.multi_dot([residual.T, linalg.inv(U), residual])

    return ChiSq_B

In [1]:
def model_A(beta, ave_U, NumArtefacts=None):
    """Chi-squared statistic for Model A under GLS.

    The statistic is ``b^T . inv(diag(ave_U**2)) . b`` where ``b`` is the part
    of ``beta`` that follows the first ``NumArtefacts`` entries.

    Parameters
    ----------
    beta : fitted parameter vector; the entries after the first
        ``NumArtefacts`` are paired one-to-one with ``ave_U``.
    ave_U : 1-D sequence of average uncertainties, one per lab.
    NumArtefacts : int, optional
        Number of leading entries of ``beta`` to skip. When omitted it is
        inferred as ``len(beta) - len(ave_U)``, so the function no longer
        depends on a global ``NumArtefacts`` existing in the kernel.

    Returns
    -------
    ChiSq_A
        The quadratic form described above.

    Raises
    ------
    ValueError
        If ``beta``, ``ave_U`` and ``NumArtefacts`` are not consistent in size.
    """
    beta = np.asarray(beta)
    ave_U = np.asarray(ave_U, dtype=float)

    if NumArtefacts is None:
        NumArtefacts = beta.shape[0] - ave_U.shape[0]

    if NumArtefacts < 0 or beta.shape[0] - NumArtefacts != ave_U.shape[0]:
        raise ValueError(
            "beta must contain NumArtefacts entries followed by one entry per "
            "element of ave_U")

    # Entries of beta that follow the artefact entries
    lab_part = beta[NumArtefacts:]

    # Diagonal matrix of the squared average uncertainties, zeros elsewhere
    unc_diag = np.diag(ave_U**2)

    ChiSq_A = linalg.multi_dot([lab_part.T, linalg.inv(unc_diag), lab_part])

    return ChiSq_A