In [149]:
### Import standard modules ###
import numpy as np
import pandas as pd
import json

In [150]:
### Define set of parsing functions ###

def CustomParser(data):
    j1 = json.loads(data)
    return j1

def filter_json(x):
    x=x[0]
    try:
        x['value']=x['value'][0]
    except:
        x['value'] = {u'answers': {}, u'choice': {}, u'filters': {}}
    return x

def extract_choice(x):
    y = []
    y.append((str(x['value']['choice'])))
    return y

def extract_tasks(x):
    x=x['task']
    return x

def extract_answers(x):
    x=x['value']['answers']
    return x

def extract_filters(x):
    x=['value']['filters']
    return x

def extract_zooID(x):
    x=int(list(x.keys())[0])
    return x

def extract_FileName1(x):
    try:
        x = str(x[list(x.keys())[0]]['Filename1'].split('_')[1])
    except:
        x = ''
    return x

def check_upload(x):
    if len(x.split(';')) == 4:
        x = True
    else:
        x = False
    return x

def check_anno(x):
    if len(x) == 1:
        x = True
    else:
        x = False
    return x

def convert_to_int(x):
    try:
        x=int(x)
    except:
        x=0
    return x

In [151]:
### Read in csv with custom read for those column in JSON format ###

# Define location of classification file
class_file = "gravity-spy-classifications.csv" 

# Create dataframe from csv
class_data = pd.read_csv(class_file,converters={'annotations':CustomParser,'subject_data':CustomParser})

# Change ID to int
class_data['userID']                   = class_data['user_id'].apply(convert_to_int)
# Doing a mild work around for the json format of the annontation column
class_data['annotations']              = class_data['annotations'].apply(filter_json)
# Extract choice and making it a column
class_data['choice']                   = class_data['annotations'].apply(extract_choice)
# Extract the task entry and making it a column
class_data['tasks']                    = class_data['annotations'].apply(extract_tasks)
# Extract answers and making it a column
class_data['answers']                  = class_data['annotations'].apply(extract_answers)
# Extract zooniverse ID it gave this subject and making it a column
class_data['zooID']                    = class_data['subject_data'].apply(extract_zooID) 
# Extract uniqueID assigned to the image during image creation and making it a column
class_data['imageID']                  = class_data['subject_data'].apply(extract_FileName1)
# Get cumulative count of number of prior classifications by user
class_data['classification_number']    = class_data.groupby('user_id').cumcount()
# Check that the subject_ids for a given classification is 4. If not I uploaded the images wrong for that subject
class_data['goodUpload']               = class_data['subject_ids'].apply(check_upload)
# Check that the number of annotation is of size 1 (i.e. they did not do multiple annotation)
class_data['numAnnotations']           = class_data['choice'].apply(check_anno)


# Dropping annotations,subject_data, and subject_ids
class_data = class_data.drop('annotations',1)
class_data = class_data.drop('user_id',1)
class_data = class_data.drop('subject_data',1)
class_data = class_data.drop('subject_ids',1)

In [152]:
### Check if workflow version is acceptable ###
versions = [714.11399999999992] # List of acceptable versions #692.102 is beta with 18 columns
class_data['goodWorkFlow'] = (class_data['workflow_version'].isin(versions)) # Add column of booleans, true means acceptable

In [153]:
### Version specific quality checks ###

# Data for converting old to new imageIDs
id_data = pd.read_csv('IDmatchall.txt',delim_whitespace=True,skiprows=1,names=['new_imageID','old_imageID'])

# Data for bad golden images
bad_data = pd.read_csv('bad_golden_images.csv',header=None)

# Remove Hanford and Livingston designations
def name_clean(x):
    x = x.split('_')[1]
    return x

# List of bad golden images
bad_images = list(bad_data[0].apply(name_clean))
bad_images.append('4qQfqJV6sH')

not_beta_check = ~class_data['workflow_version'].isin([692.102, 714.11399999999992]) # Check if classification from beta 2.0
new_id_check = class_data['imageID'].isin(id_data['old_imageID']) # Check if imageID has a new ID
not_bad_id = ~class_data['imageID'].isin(bad_images)

class_data['goodID'] = (not_beta_check | new_id_check) & not_bad_id # Apply bitwise boolean operators,, append to dataframe

In [154]:
### Apply data quality cuts ###
final_check = class_data.goodUpload & class_data.numAnnotations & class_data.goodWorkFlow & class_data.goodID & class_data.userID != 0
class_data  = class_data[final_check]

# Drop unnecessary columns
class_data = class_data.drop('user_ip',1)
class_data = class_data.drop('workflow_name',1)
class_data = class_data.drop('created_at',1)
class_data = class_data.drop('gold_standard',1)
class_data = class_data.drop('expert',1)
class_data = class_data.drop('tasks',1)
class_data = class_data.drop('answers',1)
class_data = class_data.drop('goodUpload',1)
class_data = class_data.drop('numAnnotations',1)
class_data = class_data.drop('goodWorkFlow',1)
class_data = class_data.drop('goodID',1)
class_data = class_data.drop('metadata',1)

In [155]:
### Convert alpha labels to int labels and old to new imageIDs ###

label_dict = {'45MHZLGHTMDLTN':5,'LGHTMDLTN':5,'50HZ':8,'RCMPRSSR50HZ':8,'BLP':9,'CHRP':2,'XTRMLLD':6,'HLX':14,'KFSH':18,
              'LWFRQNCBRST':1,'LWFRQNCLN':7,'NGLTCH':19,'DNTSGLTCH':19,'NNFTHBV':16,'PRDDVS':11,'60HZPWRLN':10,'60HZPWRMNS':10,
              'PWRLN60HZ':10,'RPTNGBLPS':3,'SCTTRDLGHT':4,'SCRTCH':15,'TMT':12,'VLNHRMNC500HZ':17,'VLNMDHRMNC500HZ':17,
              'HRMNCS':17,'WNDRNGLN':13,'WHSTL':0}

def choice_replace(x):
    return label_dict[x[0]]

old_imageID = list(id_data['old_imageID'])
new_imageID = list(id_data['new_imageID'])
id_dict = {}

for a,b in zip(old_imageID,new_imageID):
    id_dict[a] = b

def imageID_replace(x):
    try:
        x = id_dict[x]
        return x
    except:
        return x
    
class_data['choice']      = class_data['choice'].apply(choice_replace)
class_data['imageID']     = class_data['imageID'].apply(imageID_replace)

In [156]:
### Pivot dataframe to make index imageID and get choice, user_id, and workflow_version ###

# Function to aggregate data
def lister(x):
    return list(x)

# Use pandas pivot_table, create columns corresponding to image type and true label
image_values         = ['choice', 'userID','workflow_version','classification_number','zooID']
images               = pd.pivot_table(class_data,index='imageID',values=image_values,aggfunc=lister)
images['zooID']      = images['zooID'].apply(np.unique)
images['type']       = ['T']*len(images)
images['true_label'] = [-1]*len(images)

In [157]:
### Append ML_posterior matrix ###

ML_scores_L       = pd.read_csv('scores_L.csv')
ML_scores_H       = pd.read_csv('scores_H.csv')
ML_scores         = ML_scores_L.append(ML_scores_H)
ML_scores['Name'] = ML_scores['Name'].apply(name_clean)

# Get number of classes
classes = len(ML_scores.columns[2:])

# Create posterior matrix from dataframe columns
ML_posterior = ML_scores['confidence of class 0']

# Iterate over columns of dataframe
for i in range(1,classes): 
    ML_posterior = np.vstack((ML_posterior,ML_scores['confidence of class %s' % str(i)]))

ML_posterior = ML_posterior.T
ML_posterior = list(ML_posterior)
imageIDs = list(ML_scores['Name'])

# Map imageID to ML_posterior
ML_dict = {}
for a,b in zip(imageIDs,ML_posterior):
    ML_dict[a] = b
    
def ML_append(x):
    try:
        return ML_dict[x]
    except:
        return []

images_index = pd.Series(images.index)
ML_posterior = images_index.apply(ML_append)

# Append ML_posterior matrix to corresponding imageID
images['ML_posterior'] = list(ML_posterior)

In [158]:
### Get ML_label and ML_confidence ###

# Function to get index of max value in ML_posterior
def max_index(x):
    x = np.array(x)
    try:
        return np.argmax(x)
    except:
        return -1

# Function to get max confidence value in ML_posterior    
def get_max(x):
    x = np.array(x)
    try:
        return max(x)
    except:
        return -1
    
images['ML_label']          = images['ML_posterior'].apply(max_index)
images['ML_confidence']     = images['ML_posterior'].apply(get_max)

In [159]:
### Read classification of golden images ###

goldendata = pd.read_csv('GLabel.csv')

# Map zooID to true_label
gold_dict = {}
for a,b in zip(goldendata['ZooID'],goldendata['Classification']):
    gold_dict[int(a)] = int(b)

# Change type of golden images 
def type_map(x):
    x = int(x)
    if x in list(gold_dict.keys()):
        return 'G'
    else:
        return 'T'

# Change true_label of golden images  
def label_map(x):
    x = int(x)
    try:
        return gold_dict[x]
    except:
        return -1

images['type']       = images['zooID'].apply(type_map)
images['true_label'] = images['zooID'].apply(label_map)

In [160]:
### Code to check label options for each workflow version ###

for iV in versions:
    version = np.unique(class_data[class_data['workflow_version'] == iV]['choice'])
    print("version {0}".format(iV))
    print("length {0}".format(len(class_data[class_data['workflow_version'] == iV])))
    print(version)
    print("end")

version 714.1139999999999
length 2463
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
end


In [161]:
### CC_classifier ###

# Import standard modules
import numpy as np
import pandas as pd
import pickle as pk
from scipy.io import loadmat
import random
from pdb import set_trace

In [162]:
### Initialize constants ###
retired_images = pd.DataFrame({ 'imageID' : [], 'class' : []})
retired_images.set_index('imageID')
pp_matrices    = pd.DataFrame({ 'imageID' : [],'pp_matrix' : []})
pp_matrices.set_index('imageID')

r_lim = 4 # Make 23         # Max citizens who can look at image before it is given to upper class if threshold not reached
c = 20                      # Classes
priors = np.ones((1,c))/c   # Flat priors b/c we do not know what category the image is in
alpha = .4*np.ones((c,1))   # Threshold vector for user promotion
g_c = .6*np.ones((1,c))     # Threshold vector for updating confusion matrix
t = .8*np.ones((c,1))       # Threshold vector for image retirement

In [163]:
more_bad_images = ['BlLWO4kuCi','4qQfqJV6sH']
images = images.drop(more_bad_images)

In [164]:
conf_matrices = {}


def make_conf_matrices(x):
    
    for userID in x:
        
        if userID not in list(conf_matrices.keys()):
            
            conf_matrices[userID] = np.zeros((c,c))


def cc_classifier(x):
    
    if x['type'] == 'G': # Update confusion matrices for users who labeled golden images
        
        for userID,user_label in zip(x['userID'],x['choice']):
        
            true_label = x['true_label']
            conf_matrices[userID][true_label,user_label] += 1
            
            return 0
            
    else: # If image type is T
        
        num_annotators = len(x['userID'])
        pp_matrix = np.zeros((c,num_annotators))
        
        for k in range(num_annotators):
            
            conf_matrix = conf_matrices[x['userID'][k]]
        
            conf_divided,a1,a2,a3 = np.linalg.lstsq(np.diag(np.sum(conf_matrix,axis=1)),conf_matrix)            
            
            for j in range(c):
                
                if sum(conf_divided[:,x['choice'][k]]*priors[0]) == 0:
                    
                    pp_matrix[j,k] = priors[0][j]
                    
                    pass
                    
                else:
                
                    pp_matrix[j,k] = (conf_divided[j,x['choice'][k]]*priors[0][j])/sum(conf_divided[:,x['choice'][k]]*priors[0])          

        pp_matrix2 = np.hstack((pp_matrix, np.array(x['ML_posterior']).reshape(c,1))) # Append ML_posterior to pp_matri
        v = np.sum(pp_matrix2, axis=1)/np.sum(np.sum(pp_matrix2)) # Create vector of normalized sums of pp_matrix2
        maximum = np.amax(v) # Initialize maximum, max value of v
        maxIdx = np.argmax(v) # Initialize maxIdx, index of max value of v

        if maximum >= t[maxIdx]: # If maximum is above threshold for given class, retire image
            
            true_label = maxIdx
            images.set_value(x.name, 'true_label', true_label)
            
            #for userID,user_label in zip(x['userID'],x['choice']):
                
                #conf_matrices[userID][true_label,user_label] += 1
            
            print('Image is retired to class', true_label)
            return 1

        elif num_annotators >= r_lim: # Pass to upper class if more than r_lim annotators and no decision reached
            
            print('Image is given to the upper class')
            return 2
            

        else: # If fewer than r_lim annotators have looked at image, keep image
            
            print('More labels are needed for the image')
            return 3
                           
            
a = images['userID'].apply(make_conf_matrices)                
images['decision'] = images[['userID','true_label','choice','type','ML_posterior','ML_label']].apply(cc_classifier, axis = 1)

Image is retired to class 5
Image is retired to class 12
More labels are needed for the image
Image is retired to class 19
More labels are needed for the image
Image is retired to class 1
Image is retired to class 19
More labels are needed for the image
Image is retired to class 7
More labels are needed for the image
More labels are needed for the image
More labels are needed for the image
More labels are needed for the image
Image is retired to class 9
Image is retired to class 19
Image is retired to class 7
Image is retired to class 18
Image is retired to class 1
Image is retired to class 1
More labels are needed for the image
Image is retired to class 10
Image is retired to class 1
Image is retired to class 1
Image is retired to class 10
Image is retired to class 15
More labels are needed for the image
Image is retired to class 1
Image is retired to class 1
Image is retired to class 1
More labels are needed for the image
More labels are needed for the image
Image is retired to class

In [165]:
import matplotlib.pyplot as plt
import matplotlib.colors as colors

sample_conf = conf_matrices[4261] # Melina_t confusion matrix
plt.matshow(sample_conf, cmap='Blues',norm=colors.LogNorm(vmin=1, vmax=100))
plt.colorbar()
plt.xlabel('classes')
plt.ylabel('classes')
plt.title('Visualization of confusion matrix \n')
ax = plt.gca()
ax.set_xticks(np.arange(0,20,1))
ax.set_yticks(np.arange(0,20,1))
plt.show()

In [134]:
#conf_matrices[4261] #Whoandwhatitis
a = np.array([[  0.00000000e+00,   0.00000000e+00,   3.83000000e-10],
 [  0.00000000e+00,   0.00000000e+00,   1.74000000e-16],
 [  0.00000000e+00,   0.00000000e+00,   6.24000000e-10],
 [  0.00000000e+00,   0.00000000e+00,   6.01000000e-10],
 [  2.71843221e-02,   0.00000000e+00,   2.69000000e-14],
 [  0.00000000e+00,   0.00000000e+00,   4.76000000e-15],
 [  0.00000000e+00,   0.00000000e+00,   1.82000000e-11],
 [  1.01941208e-02,   0.00000000e+00,   1.81000000e-13],
 [  0.00000000e+00,   0.00000000e+00,   1.20000000e-11],
 [  0.00000000e+00,   0.00000000e+00,   6.89000000e-10],
 [  7.33976696e-01,   6.18181818e-01,   9.59808469e-01],
 [  0.00000000e+00,   0.00000000e+00,   2.27000000e-14],
 [  0.00000000e+00,   0.00000000e+00,   4.22000000e-15],
 [  0.00000000e+00,   0.00000000e+00,   2.20000000e-14],
 [  0.00000000e+00,   0.00000000e+00,   3.81000000e-05],
 [  1.17436271e-01,   0.00000000e+00,   3.50305070e-02],
 [  0.00000000e+00,   0.00000000e+00,   1.22000000e-09],
 [  0.00000000e+00,   0.00000000e+00,   1.66000000e-13],
 [  0.00000000e+00,   0.00000000e+00,   4.72000000e-15],
 [  1.11208590e-01,   3.81818182e-01,   5.12292400e-03]])

plt.matshow(a, cmap='Blues',norm=colors.LogNorm(vmin=.1, vmax=1))
plt.colorbar()
plt.xlabel('p(i|j)')
plt.ylabel('classes')
ax = plt.gca()
ax.set_yticks(np.arange(0,20,1))
plt.show()