In [1]:
import numpy as np
import matplotlib.pyplot as plt
import glob
import os.path
from pathlib import Path
import time
import pandas as pd
import pickle as pkl
import librosa as lb

In [2]:
def evalAllQueries(pairsFile, annotFile, pathsRoot, scoring_collar = 0.1, numThresholds = 1000):
    """
    Evaluate every query/reference pair and build one ROC curve per tamper type.

    Parameters
    ----------
    pairsFile : str
        Text file with one pair per line: two whitespace-separated entries
        (query, reference). Blank lines are skipped.
    annotFile : str
        Text file with one whitespace-separated annotation row per line. The
        first field of a row is matched against the query basename.
    pathsRoot : str or path-like
        Directory holding the pickled alignment paths, named
        ``<query basename>__<reference basename>.pkl``.
    scoring_collar : float
        Collar (seconds) forwarded to ``evalQuery``.
    numThresholds : int
        Number of thresholds forwarded to ``calc_ROC``.

    Returns
    -------
    list
        ``[insertionROC, deletionROC, replacementROC]`` as returned by
        ``calc_ROC``.
    """
    insertionAttributions = []
    deletionAttributions = []
    replacementAttributions = []

    # Materialise the annotations as a list: a lazy map() would be exhausted
    # by the first lookup and every later query would find nothing.
    with open(annotFile, 'r') as annotHandle:
        annotList = [row.split() for row in annotHandle if row.strip()]

    with open(pairsFile, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue  # tolerate blank lines (e.g. a trailing newline)
            assert len(parts) == 2, "Expected 'query reference' on each line of pairsFile"

            # Find correct annotations
            queryId = os.path.basename(parts[0])
            annot = findAnnot(annotList, queryId)
            if annot is None:
                # findAnnot has already reported the problem; the pair cannot be scored.
                continue

            # Find path
            pathBasename = queryId + '__' + os.path.basename(parts[1])  # This will need to change based on file structure
            pathFile = os.path.join(pathsRoot, pathBasename + '.pkl')
            # NOTE: pickle can execute arbitrary code on load; only use with trusted path files.
            with open(pathFile, 'rb') as pathHandle:
                path = pkl.load(pathHandle)

            tamperType, theseAttributions = evalQuery(path, annot, scoring_collar)

            # Pool the (ground truth, cost) pairs by tamper type
            if tamperType == "I":
                insertionAttributions += theseAttributions
            elif tamperType == "D":
                deletionAttributions += theseAttributions
            else:
                replacementAttributions += theseAttributions

    # Get ROCs; reshape keeps a well-formed (0, 2) array when a tamper type has no samples
    insertionROC = calc_ROC(np.array(insertionAttributions).reshape(-1, 2), numThresholds)
    deletionROC = calc_ROC(np.array(deletionAttributions).reshape(-1, 2), numThresholds)
    replacementROC = calc_ROC(np.array(replacementAttributions).reshape(-1, 2), numThresholds)

    return [insertionROC, deletionROC, replacementROC]

In [3]:
def findAnnot(annotList, queryId):
    """
    Return the annotation row whose first field equals ``queryId``.

    Parameters
    ----------
    annotList : iterable of sequences
        Annotation rows; the first element of each row is its query id.
    queryId : str
        Identifier to look up.

    Returns
    -------
    The first matching row, or ``None`` (after printing an error) when no
    row matches.
    """
    for annot in annotList:
        # Compare against the current row's id (not the first row of the list);
        # the truthiness check skips empty rows produced by blank lines.
        if annot and annot[0] == queryId:
            return annot

    print("Error: Annotations not found")
    return None

In [4]:
def evalQuery(path, annot, scoring_collar=0.1, hop_sec=None):
    """
    Pair every scored frame of one query with its ground-truth label and cost.

    Parameters
    ----------
    path : array-like, shape (N, >=3)
        Alignment path. Column 0 is read as the query frame index, column 1 as
        the reference frame index and column 2 as the cost at that step.
        NOTE(review): column meaning is taken from how the code indexes the
        array; confirm against the path format actually stored on disk.
    annot : sequence of str
        Annotation row. Field 4 is the tamper type ("I", "D", anything else is
        treated as replacement); fields 5/6 are read as tamper start/end,
        fields 2/3 as match offset/end (deletions) and fields 7/8 as the
        inserted segment's source start/end (insertions). Assumed to be in
        seconds -- TODO confirm against the annotation file.
    scoring_collar : float
        Collar (seconds) around each boundary that is excluded from scoring.
    hop_sec : float, optional
        Duration of one frame in seconds. When omitted, a module-level
        ``hop_sec`` is used (as the original code did) and a ``ValueError`` is
        raised if none is defined.

    Returns
    -------
    (tamperType, attributions)
        ``attributions`` is a list of ``[gt, cost]`` pairs for every frame
        whose ground truth is not inside a scoring collar (gt >= 0).
    """
    if hop_sec is None:
        hop_sec = globals().get('hop_sec')
        if hop_sec is None:
            raise ValueError("hop_sec must be passed in or defined at module level")

    tamperType = annot[4]

    # Work on a float copy so the caller's array is never modified.
    path = np.array(path, dtype=float)

    # set up boundaries list (in seconds, relative to modified query)
    # In form matchingRegionStart, tamperStart, tamperEnd, matchingRegionEnd
    # This will depend on tamperType
    if tamperType == "I":
        insertionStart = float(annot[5])
        insertionLength = float(annot[8]) - float(annot[7])
        insertionEnd = insertionStart + insertionLength
        offset = 0

        # +/- inf signals that the matching region extends to the first and
        # last frames, so no scoring collar is applied at the ends
        boundaries = [-float('inf'), insertionStart, insertionEnd, float('inf')]

    elif tamperType == "D": # For deletions, also flip query and reference
        # Fancy indexing on the right-hand side yields a copy, so the two
        # columns are swapped without one overwriting the other.
        path[:, [0, 1]] = path[:, [1, 0]]

        deletionStart = float(annot[5])
        deletionEnd = float(annot[6])
        offset = float(annot[2])
        matchEnd = float(annot[3]) - offset

        # Here, the matching region starts and ends at the boundaries of the query recording
        boundaries = [0, deletionStart, deletionEnd, matchEnd]

    else: # replacement
        replacementStart = float(annot[5])
        replacementEnd = float(annot[6])
        offset = 0

        # Again, the matching region will extend all the way to the first and last frames
        boundaries = [-float('inf'), replacementStart, replacementEnd, float('inf')]

    query_length = int(path[-1, 0]) # NOTE: May need to change this based on path format

    gt = getAttributionsGT(query_length, offset, boundaries, scoring_collar, hop_sec)

    # Impute cost scores
    # Interpolate here to fill in the frames that the path jumps over
    offsetFrames = int(round(offset / hop_sec))
    frames = np.arange(offsetFrames, query_length)
    costs = np.interp(frames, path[:, 0], path[:, 2])

    # Guard against a length mismatch between labels and costs, then keep only
    # frames outside the scoring collars (collar frames are marked with gt < 0).
    numFrames = min(len(gt), len(costs))
    attributions = [
        [float(gt[i]), float(costs[i])]
        for i in range(numFrames)
        if gt[i] >= 0
    ]

    return tamperType, attributions

In [5]:
def getAttributionsGT(query_length, offset, boundaries, scoring_collar, hop_sec):
    """
    Build the per-frame ground-truth labels for one query.

    Each frame is represented by a single timestamp (where the frame begins),
    ``frame * hop_sec - offset``, for frames ``round(offset / hop_sec)`` up to
    (but excluding) ``query_length``.

    Parameters
    ----------
    query_length : int or float
        Exclusive upper frame index; truncated to an integer.
    offset : float
        Start of the scored region in seconds.
    boundaries : sequence of 4 floats
        ``[matchStart, tamperStart, tamperEnd, matchEnd]`` in seconds; entries
        may be +/- inf to disable a boundary.
    scoring_collar : float
        Frames strictly closer than this (seconds) to any boundary are not scored.
    hop_sec : float
        Duration of one frame in seconds; must be positive.

    Returns
    -------
    numpy.ndarray
        One label per frame: -1 inside a scoring collar, 1 in a non-matching
        region (before matchStart, strictly between tamperStart and tamperEnd,
        or after matchEnd), 0 otherwise. Empty when there are no frames.

    Raises
    ------
    ValueError
        If ``hop_sec`` is not positive.
    """
    if hop_sec <= 0:
        raise ValueError("hop_sec must be positive")

    # Convert the offset to a whole frame index so it can size and index arrays.
    offsetFrames = int(round(offset / hop_sec))
    frames = np.arange(offsetFrames, int(query_length))
    t_query = frames * hop_sec - offset
    edges = np.asarray(boundaries, dtype=float)

    # A frame is in a collar when it is within scoring_collar of ANY boundary.
    inCollar = (np.abs(t_query[:, None] - edges[None, :]) < scoring_collar).any(axis=1)
    nonMatching = (
        (t_query < edges[0])
        | ((t_query > edges[1]) & (t_query < edges[2]))
        | (t_query > edges[3])
    )

    gt = np.zeros(frames.shape[0])
    gt[nonMatching] = 1
    gt[inCollar] = -1  # applied last: the collar takes precedence over the region label

    return gt

In [6]:
def withinCollar(t_query, boundaries, scoring_collar):
    """
    Report whether ``t_query`` lies strictly closer than ``scoring_collar``
    to at least one entry of ``boundaries``.

    Returns ``True`` on the first boundary that is close enough and ``False``
    when none is (including when ``boundaries`` is empty).
    """
    return any(np.abs(t_query - edge) < scoring_collar for edge in boundaries)

In [7]:
def calc_ROC(attributions, numThresholds):
    '''
    Calculates ROC curve for attributions based on number of thresholds

    Parameters
    ----------
    attributions : array-like, shape (N, 2)
        Column 0 holds the ground-truth label (0 or 1), column 1 the cost.
    numThresholds : int
        Number of thresholds, evenly spaced between the minimum and maximum cost.

    Returns
    -------
    numpy.ndarray, shape (numThresholds, 3)
        Rows of ``[threshold, false positive rate, false negative rate]``.
        The false positive rate is normalised by the number of label-0 samples
        and the false negative (miss) rate by the number of label-1 samples.
        A rate is NaN when its class has no samples; the whole array is NaN
        when ``attributions`` is empty.
    '''

    attributions = np.asarray(attributions, dtype=float)

    # No samples for this tamper type: there is nothing to threshold.
    if attributions.size == 0:
        return np.full((numThresholds, 3), np.nan)

    costs = attributions[:,1]
    gt = attributions[:,0]

    # Get minimum and max for thresholds
    thresholds = np.linspace(np.min(costs), np.max(costs), numThresholds)

    # Class sizes do not depend on the threshold, so count them once.
    TrueNegCount = np.sum(gt == 0)
    TruePosCount = np.sum(gt == 1)

    ROC = np.full((numThresholds, 3), np.nan)

    # For each threshold, calculate false positive and false negative (miss) rate
    for i, threshold in enumerate(thresholds):
        FPCount, FNCount = calcFPFN(costs, gt, threshold)

        # Avoid dividing by zero when one class is absent.
        FPRate = FPCount / TrueNegCount if TrueNegCount > 0 else np.nan
        FNRate = FNCount / TruePosCount if TruePosCount > 0 else np.nan

        # Threshold, False Positive, False Negative
        ROC[i,:] = [threshold, FPRate, FNRate]

    return ROC

In [8]:
def calcFPFN(costVec, gtAttribution, threshold):
    '''
    Count false positives and false negatives at a single cost threshold.

    A sample is hypothesised as 1 when its cost is strictly below
    ``threshold`` and as 0 otherwise (low cost -> 1). A false positive is a
    sample hypothesised 1 whose ground truth is 0; a false negative is a
    sample hypothesised 0 whose ground truth is 1.

    NOTE(review): ``gtAttribution`` must use the same label convention
    (1 = low-cost / non-tampered class). Verify that the ground truth passed
    in by callers agrees with this.

    Returns (FPCount, FNCount).
    '''
    belowThreshold = costVec < threshold
    predicted = belowThreshold.astype(int)

    # +1 where the hypothesis is 1 but truth is 0, -1 for the opposite error
    errors = predicted - gtAttribution

    falsePositives = np.clip(errors, 0, None).sum()
    falseNegatives = -np.clip(errors, None, 0).sum()

    return falsePositives, falseNegatives

## TODO: Run on data, generate plots, debug