In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sys

%matplotlib inline
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score

### 0. Ensure that you have `train` and `unknown` vocab matrices.
To do this, run `1_Preprocess.ipynb` on both your small set of ground-truth images and your larger set of unknown-label images to generate `train_vocab_matrix` and `unknown_vocab_matrix`, respectively.

In [None]:
train_vocab_matrix = np.load('/dfs/scratch0/vschen/mri-data/relative_train_vocab_matrix.npy')
train_labels = np.load('/dfs/scratch0/vschen/mri-data/train_labels.npy')
unknown_vocab_matrix = np.load('/dfs/scratch0/vschen/mri-data/relative_4k_unknown_vocab_matrix.npy')

print "Number of Labeled Datapoints: ", train_vocab_matrix.shape[1]
print "Number of Unlabeled Datapoints: ", full_unknown_vocab_matrix.shape[1]

### 1. Add Coral and Numbskull to your path.

In [None]:
# Append the Numbskull and Coral source directories (paths are relative to the
# notebook's working directory) to the import search path, in that order.
sys.path.extend([
    '../ukb/weak_supervision/numbskull',
    '../ukb/weak_supervision/coral',
])

### 2. Construct primitive matrices from your vocab matrices

In [None]:
class PrimitiveObject(object):
    """Container for a primitive matrix and its optional column names.

    Attributes set by the methods below:
        primitive_mtx: array indexed as (example, primitive).
        discrete_primitive_mtx: initially the same array object as
            primitive_mtx; callers may later replace it.
        num_primitives: number of columns of primitive_mtx.
        primitive_names: sequence with one name per primitive column.
    """

    def save_primitive_matrix(self,primitive_mtx):
        """Store the primitive matrix and record its number of columns.

        Args:
            primitive_mtx: 2-D array-like indexed as (example, primitive).
        """
        self.primitive_mtx = primitive_mtx
        # No copy is made: both attributes refer to the same object.
        self.discrete_primitive_mtx = primitive_mtx
        self.num_primitives = np.shape(self.primitive_mtx)[1]

    def save_primitive_names(self,names):
        """Store one name per primitive column.

        save_primitive_matrix must have been called first, because the
        length check reads self.num_primitives.

        Args:
            names: sequence of primitive names.

        Raises:
            ValueError: if len(names) differs from the number of primitives.
        """
        # Validate before storing so a bad call leaves the object unchanged.
        # The original built the exception without raising it, so a
        # wrong-length name list was silently accepted.
        if len(names) != self.num_primitives:
            raise ValueError('Incorrect number of Primitive Names')
        self.primitive_names = names
            
def create_primitives(vocab_matrix):
    """Build a PrimitiveObject holding five primitives per example.

    Args:
        vocab_matrix: 2-D array indexed as (vocab feature, example). Rows 0,
            1, 6 and 8 are read as area, eccentricity, perimeter and
            intensity respectively.

    Returns:
        A PrimitiveObject whose primitive matrix is indexed as
        (example, primitive) with columns area, eccentricity, perimeter,
        intensity and ratio, where ratio = area / perimeter**2.
    """
    # (primitive column, vocab_matrix row) for the primitives copied verbatim.
    copied_columns = (
        (0, 0),  # area
        (1, 1),  # eccentricity
        (2, 6),  # perimeter
        (3, 8),  # intensity
    )
    example_count = vocab_matrix.shape[1]
    primitives = np.zeros((example_count, 5))
    for column, vocab_row in copied_columns:
        primitives[:, column] = vocab_matrix[vocab_row, :]

    # ratio: area divided by squared perimeter
    primitives[:, 4] = primitives[:, 0]/(primitives[:, 2]**2.)

    holder = PrimitiveObject()
    holder.save_primitive_matrix(primitives)
    return holder

def create_primitives_bsa(vocab_matrix, normal_matrix):
    """Build a PrimitiveObject whose ratio primitive comes from a second matrix.

    Args:
        vocab_matrix: 2-D array indexed as (vocab feature, example). Rows 0,
            1, 6 and 8 are read as area, eccentricity, perimeter and
            intensity respectively.
        normal_matrix: 2-D array with the same example axis; its rows 0 and 6
            are used to compute the ratio column.

    Returns:
        A PrimitiveObject whose primitive matrix is indexed as
        (example, primitive) with columns area, eccentricity, perimeter,
        intensity and ratio, where
        ratio = normal_matrix[0] / normal_matrix[6]**2.
    """
    # (primitive column, vocab_matrix row) for the primitives copied verbatim.
    copied_columns = (
        (0, 0),  # area
        (1, 1),  # eccentricity
        (2, 6),  # perimeter
        (3, 8),  # intensity
    )
    example_count = vocab_matrix.shape[1]
    primitives = np.zeros((example_count, 5))
    for column, vocab_row in copied_columns:
        primitives[:, column] = vocab_matrix[vocab_row, :]

    # ratio: taken from normal_matrix rather than from the copied columns
    primitives[:, 4] = normal_matrix[0,:]/(normal_matrix[6,:]**2.)

    holder = PrimitiveObject()
    holder.save_primitive_matrix(primitives)
    return holder

# Column names of the primitive matrices, in column order.
primitive_names = ['area', 'eccentricity', 'perimeter', 'intensity', 'ratio']

# Primitive objects for the labeled (train) and unlabeled (unknown) sets.
P_train, P_unknown = (
    create_primitives(train_vocab_matrix),
    create_primitives(unknown_vocab_matrix),
)

### 3. Write heuristic functions over your chosen primitives

In [None]:
# Labeling function over the `area` primitive: votes -1 when area >= 2.13,
# votes 1 when area <= 0.9, and abstains (returns 0) in between.
# NOTE(review): this function is passed to coral's find_dependencies /
# discretize_primitives, which presumably inspect its parameter name and
# source - confirm before renaming the argument or restructuring the body.
def lf_area(area):
    if area >= 2.13:
        return -1 
    if area <= 0.9: 
        return 1
    return 0

# Labeling function over the `eccentricity` primitive: votes 1 when
# eccentricity >= 0.011, otherwise votes -1 when eccentricity <= 0.015.
# NOTE(review): the two thresholds overlap (0.011 < 0.015), so every non-NaN
# input satisfies one of the two tests and the final `return 0` (abstain) is
# never reached; the 0.015 bound is effectively unused. The thresholds look
# swapped or mistyped - confirm the intended values.
# NOTE(review): this function is passed to coral's find_dependencies /
# discretize_primitives, which presumably inspect its parameter name and
# source - confirm before renaming the argument or restructuring the body.
def lf_eccentricity(eccentricity):
    if eccentricity >= 0.011: 
        return 1 
    if eccentricity <= 0.015:
        return -1  
    return 0
        
# Labeling function over the `perimeter` primitive: votes 1 when
# perimeter <= 0.46 and abstains (returns 0) otherwise; it never votes -1.
# NOTE(review): this function is passed to coral's find_dependencies /
# discretize_primitives, which presumably inspect its parameter name and
# source - confirm before renaming the argument or restructuring the body.
def lf_perimeter(perimeter):
    if perimeter <= 0.46: 
        return 1 
    return 0
    
# Labeling function over the `intensity` primitive: votes 1 when
# intensity >= 3.05, votes -1 when intensity <= 2.0, and abstains
# (returns 0) in between.
# NOTE(review): this function is passed to coral's find_dependencies /
# discretize_primitives, which presumably inspect its parameter name and
# source - confirm before renaming the argument or restructuring the body.
def lf_intensity(intensity):
    if intensity >= 3.05: 
        return 1
    if intensity <= 2.0: 
        return -1
    return 0

# Labeling function over the `ratio` primitive: votes -1 when ratio >= 4.15,
# votes 1 when ratio <= 3.7, and abstains (returns 0) in between.
# NOTE(review): this function is passed to coral's find_dependencies /
# discretize_primitives, which presumably inspect its parameter name and
# source - confirm before renaming the argument or restructuring the body.
def lf_ratio(ratio):
    if ratio >= 4.15: 
        return -1
    if ratio <= 3.7:
        return 1
    return 0

### 4. Generate a label matrix for the train set by applying your labeling functions to each set of primitives
In this step, we depend on the Coral paradigm's ability to automatically find which primitives each labeling function depends on (`find_dependencies`); these dependencies are what we later leverage in our generative model.

In [None]:
from coral.static_analysis.dependency_learning import find_dependencies

# Labeling functions, in the row order of the label matrix L.
L_names = [lf_area, lf_eccentricity, lf_perimeter, lf_intensity, lf_ratio]
# L_deps[j] indexes the primitive columns that are passed to L_names[j].
L_deps = find_dependencies(L_names, primitive_names)

# L is indexed as (labeling function, example); 0 means the LF abstained.
num_examples_train = P_train.primitive_mtx.shape[0]
L = np.zeros((len(L_names), num_examples_train))
for i in range(num_examples_train):
    # Iterate over however many LFs are listed instead of a hard-coded 5, so
    # adding or removing a labeling function cannot leave rows unfilled.
    for j in range(len(L_names)):
        vocab_elems = P_train.primitive_mtx[i,L_deps[j]]
        L[j,i] = L_names[j](*vocab_elems)
# Per-example sum of |votes|; an entry of 0 means no LF labeled that example.
unlabeled = np.sum(np.abs(L), axis=0)

#### Visualize LF performance on the train set

In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score

unlabeled = np.sum(np.abs(L), axis=0)
print 'Coverage:', 1-float(np.sum(unlabeled == 0))/len(unlabeled)

total = float(num_examples_train)
stats_table = np.zeros((5,6))
for i in range(5):    
    predicted = L[i, :]
    stats_table[i,5] = precision_score(predicted[predicted != 0], train_labels[predicted != 0])
    stats_table[i,4] = recall_score(predicted[predicted != 0], train_labels[predicted != 0])
    stats_table[i,3] = f1_score(predicted[predicted != 0], train_labels[predicted != 0])
    stats_table[i,2] = np.sum(L[i,:] == train_labels)/float(np.sum(L[i,:] != 0))
    try: 
        stats_table[i,1] = roc_auc_score(predicted[predicted != 0], train_labels[predicted != 0])
    except ValueError as err: 
        stats_table[i,1] = None
        print 'LF:', i, err.args
    stats_table[i,0] = np.sum(np.abs(L[i,:]) != 0)/total
    
stats_table = pd.DataFrame(stats_table, index = [i.__name__ for i in L_names], columns = ["Coverage", "AUC", "Accuracy", "F1", "Recall", "Precision"])
stats_table

### 5. Discretize Primitives for Gibbs Sampler

In [None]:
def primitives_to_discrete(P, L_names):
    """Fill P.discrete_primitive_mtx with integer-coded primitives.

    The discretization code is generated by coral's discretize_primitives from
    the labeling functions and executed once per example.

    Args:
        P: PrimitiveObject with primitive_mtx and num_primitives set; its
            discrete_primitive_mtx attribute is replaced in place.
        L_names: list of labeling functions.

    Returns:
        A list with one int per primitive column: the maximum discrete value
        in that column plus one.
    """
    # The notebook only imports find_dependencies, so discretize_primitives
    # raised a NameError on a fresh kernel.
    # NOTE(review): assumed to live in the same coral module as
    # find_dependencies - confirm the import path.
    from coral.static_analysis.dependency_learning import discretize_primitives

    num_examples = P.primitive_mtx.shape[0]
    code = discretize_primitives(L_names)

    # Use the object's own column count rather than the notebook-level
    # `primitive_names` list, matching the cardinality loop below.
    P.discrete_primitive_mtx = np.zeros((num_examples, P.num_primitives))
    # NOTE(review): the generated statements are executed in this scope and
    # presumably refer to the local names `P` and `i`; do not rename them
    # without checking discretize_primitives' output.
    for i in range(num_examples):
        for j in range(len(code)):
            exec(code[j])

    P.discrete_primitive_mtx = P.discrete_primitive_mtx.astype(int)
    cardinality = []
    for v_idx in range(P.num_primitives):
        cardinality.append(int(np.max(P.discrete_primitive_mtx[:,v_idx])+1))
    return cardinality

# Discretize the train set first, then the unknown set; each call also
# rewrites the corresponding object's discrete_primitive_mtx.
cardinality_train, cardinality_unknown = (
    primitives_to_discrete(P_train, L_names),
    primitives_to_discrete(P_unknown, L_names),
)

### 6. Learn Generative Model

In [None]:
# NOTE(review): CoralModel, `ds` (the object providing .select), `thresh` and
# MRI_UDF_OFFSET are not defined or imported anywhere in this notebook. They
# must be brought into scope (presumably from the coral / numbskull packages
# added to sys.path in step 1) before this cell can run on a fresh kernel.
coral_model = CoralModel()
# Dependency selection and training both use the unlabeled set; the original
# referenced the undefined names `P` and `cardinality` here.
deps = ds.select(P_unknown.discrete_primitive_mtx, cardinality_unknown, L_deps, "HEART_MRI", threshold=thresh)
coral_model.train(P_unknown.discrete_primitive_mtx, cardinality_unknown, L_deps, MRI_UDF_OFFSET, deps=list(deps), epochs=1000, burn_in=0, reg_type=1, reg_param=0.01)
# Marginals are computed on the labeled train set: the next cell scores them
# against train_labels using the train-length `unlabeled` mask, and
# cardinality_train only matches P_train's discretization.
marginals_deps = coral_model.marginals(P_train.discrete_primitive_mtx, cardinality_train, L_deps, MRI_UDF_OFFSET, deps=list(deps), epochs=1000)

#### Measure and Visualize probabilistic labels

In [None]:
# AUC of the probabilistic labels against ground truth, restricted to the
# examples that received a vote from at least one labeling function.
curr_auc = roc_auc_score(train_labels[unlabeled != 0], marginals_deps[unlabeled != 0])
# Format into a single string: a parenthesised multi-argument print shows a
# tuple repr under the Python 2 print statement used elsewhere in this file.
print('auc: %s' % curr_auc)

# Distribution of the probabilistic labels.
plt.hist(marginals_deps)
plt.xlabel('marginal probability')
plt.ylabel('number of examples')
plt.title('Distribution of probabilistic labels')
# Indices of examples whose marginal is at least 0.75.
print('Indices: %s' % (np.where(marginals_deps >= 0.75),))

### 7. Save out probabilistic labels

In [None]:
# `marginals_deps` is the only marginals array this notebook computes; the
# name `marginals` is never defined and raised a NameError.
np.save('marginals.npy', marginals_deps)