In [117]:
### Import standard modules ###
import numpy as np
import pandas as pd
import json

In [118]:
### Define set of parsing functions ###

def CustomParser(data):
    j1 = json.loads(data)
    return j1

def filter_json(x):
    x=x[0]
    try:
        x['value']=x['value'][0]
    except:
        x['value'] = {u'answers': {}, u'choice': {}, u'filters': {}}
    return x

def extract_choice(x):
    y = []
    y.append((str(x['value']['choice'])))
    return y

def extract_tasks(x):
    x=x['task']
    return x

def extract_answers(x):
    x=x['value']['answers']
    return x

def extract_filters(x):
    x=['value']['filters']
    return x

def extract_zooID(x):
    x=int(list(x.keys())[0])
    return x

def extract_FileName1(x):
    try:
        x = x[list(x.keys())[0]]['Filename1'].split('_')[1]
    except:
        x = ''
    return x

def check_upload(x):
    if len(x.split(';')) == 4:
        x = True
    else:
        x = False
    return x

def check_anno(x):
    if len(x) == 1:
        x = True
    else:
        x = False
    return x

def convert_to_int(x):
    try:
        x=int(x)
    except:
        x=0
    return x

In [126]:
### Read in csv with custom read for those column in JSON format ###

# Define location of classification file
class_file = "gravity-spy-classifications.csv" 

# Create dataframe from csv
class_data = pd.read_csv(class_file,converters={'annotations':CustomParser,'subject_data':CustomParser})

# Change ID to int
class_data['userID']                   = class_data['user_id'].apply(convert_to_int)
# Doing a mild work around for the json format of the annontation column
class_data['annotations']              = class_data['annotations'].apply(filter_json)
# Extract choice and making it a column
class_data['choice']                   = class_data['annotations'].apply(extract_choice)
# Extract the task entry and making it a column
class_data['tasks']                    = class_data['annotations'].apply(extract_tasks)
# Extract answers and making it a column
class_data['answers']                  = class_data['annotations'].apply(extract_answers)
# Extract zooniverse ID it gave this subject and making it a column
class_data['zooID']                    = class_data['subject_data'].apply(extract_zooID) 
# Extract uniqueID assigned to the image during image creation and making it a column
class_data['imageID']                  = class_data['subject_data'].apply(extract_FileName1)
# Get cumulative count of number of prior classifications by user
class_data['classification_number']    = class_data.groupby('user_id').cumcount()
# Check that the subject_ids for a given classification is 4. If not I uploaded the images wrong for that subject
class_data['goodUpload']               = class_data['subject_ids'].apply(check_upload)
# Check that the number of annotation is of size 1 (i.e. they did not do multiple annotation)
class_data['numAnnotations']           = class_data['choice'].apply(check_anno)


# Dropping annotations,subject_data, and subject_ids
class_data = class_data.drop('annotations',1)
class_data = class_data.drop('user_id',1)
class_data = class_data.drop('subject_data',1)
class_data = class_data.drop('subject_ids',1)

In [127]:
### Check if workflow version is acceptable ###
versions = [692.102,714.11399999999992] # List of acceptable versions
class_data['goodWorkFlow'] = (class_data['workflow_version'].isin(versions)) # Add column of booleans, true means acceptable

In [128]:
### Version specific quality checks ###

# Data for converting old to new imageIDs
id_data = pd.read_csv('IDmatchall.txt',delim_whitespace=True,skiprows=1,names=['new_imageID','old_imageID'])

beta_check = ~class_data['workflow_version'].isin([692.102, 714.11399999999992]) # Check if classification from beta 2.0
id_check = class_data['imageID'].isin(id_data['old_imageID']) # Check if imageID has a new ID

class_data['goodID'] = beta_check | id_check # Apply 'bitwise-or' to checks, append to dataframe

In [129]:
### Apply data quality cuts ###
final_check= class_data.goodUpload & class_data.numAnnotations & class_data.goodWorkFlow & class_data.goodID & class_data.userID != 0
class_data = class_data[final_check]

# Drop unnecessary columns
class_data = class_data.drop('user_ip',1)
class_data = class_data.drop('workflow_name',1)
class_data = class_data.drop('created_at',1)
class_data = class_data.drop('gold_standard',1)
class_data = class_data.drop('expert',1)
class_data = class_data.drop('tasks',1)
class_data = class_data.drop('answers',1)
class_data = class_data.drop('goodUpload',1)
class_data = class_data.drop('numAnnotations',1)
class_data = class_data.drop('goodWorkFlow',1)
class_data = class_data.drop('goodID',1)
class_data = class_data.drop('metadata',1)

In [130]:
### Convert alpha labels to int labels and old to new imageIDs ###

label_dict = {'45MHZLGHTMDLTN':5,'LGHTMDLTN':5,'50HZ':8,'RCMPRSSR50HZ':8,'BLP':9,'CHRP':2,'XTRMLLD':6,'HLX':14,'KFSH':18,
              'LWFRQNCBRST':1,'LWFRQNCLN':7,'NGLTCH':19,'DNTSGLTCH':19,'NNFTHBV':16,'PRDDVS':11,'60HZPWRLN':10,'60HZPWRMNS':10,
              'PWRLN60HZ':10,'RPTNGBLPS':3,'SCTTRDLGHT':4,'SCRTCH':15,'TMT':12,'VLNHRMNC500HZ':17,'VLNMDHRMNC500HZ':17,
              'HRMNCS':17,'WNDRNGLN':13,'WHSTL':0}

def choice_replace(x):
    return label_dict[x[0]]

old_imageID = list(id_data['old_imageID'])
new_imageID = list(id_data['new_imageID'])
id_dict = {}

for a,b in zip(old_imageID,new_imageID):
    id_dict[a] = b

def imageID_replace(x):
    try:
        x = id_dict[x]
        return x
    except:
        return x
    
class_data['choice']      = class_data['choice'].apply(choice_replace)
class_data['imageID']     = class_data['imageID'].apply(imageID_replace)

In [131]:
### Pivot dataframe to make index imageID and get choice, user_id, and workflow_version ###

# Function to aggregate data
def lister(x):
    return list(x)

# Use pandas pivot_table, create columns corresponding to image type and true label
image_values         = ['choice', 'userID','workflow_version','classification_number','zooID']
images               = pd.pivot_table(class_data,index='imageID',values=image_values,aggfunc=lister)
images['zooID']      = images['zooID'].apply(np.unique)
images['type']       = ['T']*len(images)
images['true_label'] = [-1]*len(images)

In [132]:
### Read in ML_scores ###

# Remove Hanford and Livingston designations
def name_clean(x):
    x = x.split('_')[1]
    return x

ML_scores_L       = pd.read_csv('scores_L.csv')
ML_scores_H       = pd.read_csv('scores_H.csv')
ML_scores         = ML_scores_L.append(ML_scores_H)
ML_scores['Name'] = ML_scores['Name'].apply(name_clean)

In [133]:
### Append ML_posterior matrix ###

# Get number of classes
classes = len(ML_scores.columns[2:])

# Create posterior matrix from dataframe columns
ML_posterior = ML_scores['confidence of class 0']

# Iterate over columns of dataframe
for i in range(1,classes): 
    ML_posterior = np.vstack((ML_posterior,ML_scores['confidence of class %s' % str(i)]))

ML_posterior = ML_posterior.T
ML_posterior = list(ML_posterior)
imageIDs = list(ML_scores['Name'])

# Map imageID to ML_posterior
ML_dict = {}
for a,b in zip(imageIDs,ML_posterior):
    ML_dict[a] = b
    
def ML_append(x):
    try:
        return ML_dict[x]
    except:
        return []

images_index = pd.Series(images.index)
ML_posterior = images_index.apply(ML_append)

# Append ML_posterior matrix to corresponding imageID
images['ML_posterior'] = list(ML_posterior)

In [134]:
### Get ML_label and ML_confidence ###

# Function to get index of max value in ML_posterior
def max_index(x):
    x = np.array(x)
    try:
        return np.argmax(x)
    except:
        return -1

# Function to get max confidence value in ML_posterior    
def get_max(x):
    x = np.array(x)
    try:
        return max(x)
    except:
        return -1
    
images['ML_label']          = images['ML_posterior'].apply(max_index)
images['ML_confidence']     = images['ML_posterior'].apply(get_max)

In [135]:
### Read classification of golden images ###

goldendata = pd.read_csv('GLabel.csv')

# Map zooID to true_label
gold_dict = {}
for a,b in zip(goldendata['ZooID'],goldendata['Classification']):
    gold_dict[int(a)] = int(b)

# Change type of golden images 
def type_map(x):
    x = int(x)
    if x in list(gold_dict.keys()):
        return 'G'
    else:
        return 'T'

# Change true_label of golden images  
def label_map(x):
    x = int(x)
    try:
        return gold_dict[x]
    except:
        return -1

images['type']       = images['zooID'].apply(type_map)
images['true_label'] = images['zooID'].apply(label_map)

In [136]:
### Code to check label options for each workflow version ###

for iV in versions:
    version = np.unique(data1[data1['workflow_version'] == iV]['choice'])
    print("version {0}".format(iV))
    print("length {0}".format(len(data1[data1['workflow_version'] == iV])))
    print(version)
    print("end")

version 692.102
length 2687
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
end
version 714.1139999999999
length 2463
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
end


In [281]:
### CC_classifier ###

# Import standard modules
import numpy as np
import pandas as pd
import pickle as pk
from scipy.io import loadmat
import random
from pdb import set_trace

In [302]:
retired_images = pd.DataFrame({ 'imageID' : [], 'class' : []})
pp_matrices    = pd.DataFrame({ 'imageID' : [],'pp_matrix' : []})
conf_matrices  = pd.DataFrame({ 'userID' : [],'conf_matrix' : []})

### Initialize constants ###

r_lim = 23                  # Max citizens who can look at image before it is given to upper class if threshold not reached
c = 20                      # Classes
priors = np.ones((1,C))/C   # Flat priors b/c we do not know what category the image is in
alpha = .4*np.ones((C,1))   # Threshold vector for user promotion
g_c = .7*np.ones((1,C))     # Threshold vector for updating confusion matrix
t = .4*np.ones((C,1))       # Threshold vector for image retirement

golden

Unnamed: 0_level_0,choice,classification_number,userID,workflow_version,zooID,type,true_label,ML_posterior,ML_label,ML_confidence
imageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0l9skbQAf0,"[1, 1]","[55, 1134]","[18049, 5209]","[692.102, 714.114]",[2212786],G,1,"[0.0, 1.0, 0.0, 0.0, 1.08e-21, 2.25e-24, 2.64e...",1,1.0
0twxOhYrgp,"[1, 1, 1]","[179, 198, 262]","[346679, 1217655, 5209]","[692.102, 692.102, 714.114]",[2212664],G,1,"[3.02e-14, 0.998267174, 1.17e-22, 1.62e-23, 2....",1,0.998267
16uv4leiTt,"[10, 10, 10, 10]","[266, 8, 7, 537]","[123, 346679, 243841, 5209]","[692.102, 692.102, 692.102, 714.114]",[2221780],G,10,"[7.64e-07, 5.83e-13, 6.01e-10, 6.84e-11, 2.14e...",19,0.979778
1JYCFrLMp6,[1],[995],[5209],[714.114],[2221107],G,1,"[0.0, 1.0, 0.0, 0.0, 2.45e-20, 3.83e-23, 7.69e...",1,1.0
1YXuAAkctA,"[11, 11, 11]","[101, 80, 870]","[1498433, 18049, 5209]","[692.102, 692.102, 714.114]",[2199468],G,11,"[5.59e-18, 2.56e-11, 7.15e-09, 1.47e-05, 0.000...",8,0.731112
1r1NEtCJK0,"[1, 1, 1, 1]","[35, 231, 1050, 455]","[529667, 123, 5209, 530281]","[692.102, 692.102, 714.114, 714.114]",[2223326],G,1,"[2.12e-19, 0.999999881, 1.3e-28, 5.97e-26, 4.5...",1,1.0
2v5Y7X23GR,"[9, 9, 9]","[231, 104, 444]","[1498433, 386563, 530281]","[714.114, 714.114, 714.114]",[2219340],G,9,"[3.03e-14, 1.08e-20, 1.22e-11, 1.94e-06, 3.93e...",9,0.999998
33p5uLQJpw,"[7, 16]","[27, 444]","[530281, 4261]","[692.102, 692.102]",[2199202],G,7,"[8.26e-08, 6.31e-07, 7.01e-11, 1.09e-13, 2.55e...",7,0.99929
3CunqHpG7l,[18],[754],[4261],[692.102],[2202627],G,9,"[7.98e-29, 1.42e-20, 3.78e-18, 0.000442051, 1....",18,0.999557
3UH4FMaNKC,"[4, 7, 7]","[120, 542, 348]","[2909, 4261, 1498433]","[692.102, 692.102, 714.114]",[2194178],G,7,"[1.31e-09, 0.005104722, 1.76e-13, 3.68e-14, 0....",7,0.980644


In [303]:
def get_conf_matrix(x):    
    
    for userID,label in zip(x['userID'],x['choice']):
        
        true_label = x['true_label']
        
        if userID in list(conf_matrices['userID']):
            
            conf_matrices['conf_matrix'][true_label,label][conf_matrices['userID'] == userID] += 1
        
        else:
            
            dummy_matrix = np.zeros((c,c))
            dummy_matrix[true_label,label] += 1
            dummy_df = pd.Series({'userID' : userID,'conf_matrix' : dummy_matrix})
            conf_matrices.loc[len(conf_matrices)] = dummy_df
            
golden = images[images['type']=='G'][:10]
a = golden[['userID','true_label','choice']].apply(get_conf_matrix, axis = 1)

#for i in conf_matrices['conf_matrix']:
    #print(i)

KeyError: ((1, 1), 'occurred at index 0l9skbQAf0')