In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import statistics as stats
import scipy
import string as st
import lmfit
from lmfit import Model
from lmfit import models
from lmfit.models import *
from scipy import io
from scipy.fftpack import dct, idct
from itertools import combinations
import tkinter as tk
from tkinter import filedialog
import os
import inspect
import csv
import sys

In [2]:
speaker1 ={'var1':[10,11,12,10,12], 'var2':[14,15,13,12,15]}

speaker2 = {'var1':[20,17,15,16], 'var2':[18,19,17,19]}

background = {'speak1' :
                  {'var1':[10,11,12,10,13],
                   'var2':[14,15,13,18,15]},
             'speak2' :
                  {'var1':[19, 18, 17, 20, 17, 18],
                   'var2':[16, 19, 17, 18, 19, 17]},
             'speak3' :
                  {'var1':[13, 12, 15],
                   'var2':[14, 15, 17]},
             }

In [3]:
def likelihood_ratio(background, file1, file2):
    
    off_covar = None

    ######################################################
    ### some prior calculations on the background data ###
    ######################################################

    # Get the number of speakers
    num_speakers = len(background)

    # get the number of variables
    num_variables = len(next(iter(background.values())))

    # get the number of measurements 
    num_measur = 0
    for speaker in background.values():
        first_variable = next(iter(speaker.values()))
        num_measur += len(first_variable)

    # background mean : overall mean for each variable. 

    var_all = {}

    #first reshape the data to put all values together, regardless of speaker
    for speaker, variables in background.items(): 
        # loop through every variable 
        for variable, values in variables.items():
            # if the new variable has not yet been added to the new dictionary, add it.
            var_all.setdefault(variable, [])

            # add the values to the coefficient list
            var_all[variable].extend(values)

    #calculate the mean of each variable in the background data:

    background_means = {}

    for variable, values in var_all.items():
        background_means.setdefault(variable, )
        background_means[variable] = np.mean(values)

    #mean of each variable for eah of the speakers in the background data: 

    var_means_per_speaker = {}

    for speaker,var in background.items():
        var_means_per_speaker[speaker] = []
        for variable, values in background[speaker].items():
            var_means_per_speaker[speaker].append(np.mean(values))


    ######################################
    ### within group covariance matrix ###
    ######################################

    #calculated differently to make use of the cov function from the numpy package. 

    #create a covariance matrix for each speaker. 
                        
    cov_matrices = {}
    for speaker, values in background.items():
        speaker_data = np.vstack(list(values.values()))

        numeric_data = speaker_data.astype(np.number)
        cov_matrices[speaker] = np.cov(numeric_data)

    #adding all matrices

    matrices = [np.array(matrix) for matrix in cov_matrices.values()]
    sum_matrices = sum(matrix[:matrix.shape[0], :matrix.shape[1]] for matrix in matrices)

    # divide by the number of speakers.
    within_group = sum_matrices/num_speakers

    #######################################
    ### between group covariance matrix ###
    #######################################

    #according to the changes introduced by Morrison to accomodate for unequal n across speakers. 

    #calculate the first part of the group covariance matrix equation.
    Sstar= np.cov(np.array(list(var_means_per_speaker.values())), rowvar=False)

    #final calculations
    between_group_cov = Sstar - (sum_matrices/num_measur)

    #between group inverse covariance
    betw_inv = np.linalg.inv(between_group_cov)

    #between group square root
    betw_sqrt = scipy.linalg.sqrtm(between_group_cov)

    ####################################
    ### calulations on offender data ###
    ####################################

    # number of measures
    off_num_measures = len(next(iter(file1.values())))

    # offender mean
    off_mean = []
    for variable, values in file1.items():
        off_mean.append(stats.mean(file1[variable])) 

    #offender covariance matrix 

    off_covar= [[value / off_num_measures for value in row] for row in within_group]
        
    #offender inverse covariance matrix.
    off_inv = np.linalg.inv(off_covar)

    #offender square root 
    off_sqrt = scipy.linalg.sqrtm(off_covar)

    # offender square root inverse
    off_sqrt_inv = np.linalg.inv(off_sqrt)

     #################################
     ### suspect data calculations ###
    #################################

    # number of measures
    suspect_num_measures = len(next(iter(file2.values())))

    # suspect mean
    suspect_mean = []
    for variable, values in file2.items():
        suspect_mean.append(stats.mean(file2[variable])) 

    #suspect covariance matrix 

    suspect_covar = [[value / suspect_num_measures for value in row] for row in within_group]
    suspect_covar = np.array(suspect_covar)

    #suspect inverse covariance matrix.
    suspect_inv = np.linalg.inv(suspect_covar)

    #suspect square root 
    suspect_sqrt = scipy.linalg.sqrtm(suspect_covar)

    # suspect square root inverse
    suspect_sqrt_inv = np.linalg.inv(suspect_sqrt)

    ###########################
    ### smoothing parameter ###
    ###########################
    
    # as given by Silverman(1986)

    smooth_param= ((4/(2*num_variables+1))**(1/(num_variables+4)) 
                   * num_speakers**-(1/(num_variables+4)))

    ####################################
    ### a few other pre-calculations ###
    ####################################

    kernel = smooth_param**2 * between_group_cov

    inv_kernel = np.linalg.inv(kernel)

    kernel_typ = 0
    dist_back_off = 0
    dist_back_suspect = 0 
    suspect_off_mean_typ = 0

    suspect_off_mean_diff = np.subtract(suspect_mean, off_mean)
    suspect_off_mean_typ = np.linalg.solve(off_inv 
                                           + suspect_inv, np.linalg.solve(off_covar, off_mean) 
                                           + np.linalg.solve(suspect_covar, suspect_mean))
    
    for speaker, values in var_means_per_speaker.items():
        typicality = np.subtract(suspect_off_mean_typ, values)

        #kernel density at typicality.
        kernel_typ += (np.exp(-0.5 * typicality.T 
                             @ np.linalg.inv(np.linalg.inv(off_inv + suspect_inv) + kernel) 
                             @ typicality))

    #calculations of the distance between background and offender data
    dist_back_off += (np.exp(-0.5 * np.subtract(off_mean, values).T 
                                 @ np.linalg.inv(off_covar + kernel) 
                                 @ np.subtract(off_mean, values)))

    #calculations of the distance between background and suspect data
    dist_back_suspect += (np.exp(-0.5 * np.subtract(off_mean, values).T
                                 @ np.linalg.inv(off_covar + kernel) 
                                 @ np.subtract(off_mean, values)))

    #################
    ### numerator ###
    #################

    numerator = ((2 * np.pi) **(-num_variables) 
                     * np.linalg.det(off_sqrt_inv) 
                     * np.linalg.det(suspect_sqrt_inv)
                     * 1/np.sqrt(np.abs(np.linalg.det(between_group_cov))) 
                     * (num_speakers * smooth_param ** num_variables) ** (-1) 
                     * np.abs(np.linalg.det(off_inv + suspect_inv + inv_kernel)) ** (-0.5) 
                     * np.exp(-0.5 * 
                              suspect_off_mean_diff.T 
                              @ np.linalg.inv(suspect_covar + off_covar) 
                              @ suspect_off_mean_diff) 
                     * kernel_typ)

    ###################
    ### denominator ###
    ###################

    denominator = ( (2*np.pi)**-num_variables * 1/abs(np.linalg.det(between_group_cov))
                       * (num_speakers * smooth_param**num_variables)**-2 

                       * abs(np.linalg.det(suspect_sqrt_inv))
                       * abs(np.linalg.det(suspect_inv + inv_kernel))**-0.5
                       * dist_back_suspect

                       * abs(np.linalg.det(off_sqrt_inv))
                       * abs(np.linalg.det(off_inv + inv_kernel))**-0.5
                       * dist_back_off)

    ########################
    ### likelihood ratio ###
    ########################

    likelihood_ratio= numerator / denominator

    return(likelihood_ratio)

In [4]:
likelihood_ratio(background, speaker1, speaker2)

4.652194667121164e-10