In [125]:
### Import standard modules ###
import numpy as np
import pandas as pd
import json

In [126]:
### Define set of parsing functions ###

def CustomParser(data):
    j1 = json.loads(data)
    return j1

def filter_json(x):
    x=x[0]
    try:
        x['value']=x['value'][0]
    except:
        x['value'] = {u'answers': {}, u'choice': {}, u'filters': {}}
    return x

def extract_choice(x):
    y = []
    y.append((str(x['value']['choice'])))
    return y

def extract_tasks(x):
    x=x['task']
    return x

def extract_answers(x):
    x=x['value']['answers']
    return x

def extract_filters(x):
    x=['value']['filters']
    return x

def extract_zooID(x):
    x=int(list(x.keys())[0])
    return x

def extract_FileName1(x):
    try:
        x = x[list(x.keys())[0]]['Filename1'].split('_')[1]
    except:
        x = ''
    return x

def check_upload(x):
    if len(x.split(';')) == 4:
        x = True
    else:
        x = False
    return x

def check_anno(x):
    if len(x) == 1:
        x = True
    else:
        x = False
    return x

def convert_to_int(x):
    try:
        x=int(x)
    except:
        x=0
    return x

In [127]:
### Read in csv with custom read for those column in JSON format ###

# Define location of classification file
class_file = "gravity-spy-classifications.csv" 

# Create dataframe from csv
data1 = pd.read_csv(class_file,converters={'annotations':CustomParser,'subject_data':CustomParser})

# Change ID to int
data1['user_id']        = data1['user_id'].apply(convert_to_int)
# Doing a mild work around for the json format of the annontation column
data1['annotations']    = data1['annotations'].apply(filter_json)
# Extract choice and making it a column
data1['choice']         = data1['annotations'].apply(extract_choice)
# Extract the task entry and making it a column
data1['tasks']          = data1['annotations'].apply(extract_tasks)
# Extract answers and making it a column
data1['answers']        = data1['annotations'].apply(extract_answers)
# Extract zooniverse ID it gave this subject and making it a column
data1['zooID']          = data1['subject_data'].apply(extract_zooID) 
# Extract uniqueID assigned to the image during image creation and making it a column
data1['imageID']        = data1['subject_data'].apply(extract_FileName1)
# Get cumulative count of number of prior classifications by user
data1['classification_number'] = data1.groupby('user_id').cumcount()
# Check that the subject_ids for a given classification is 4. If not I uploaded the images wrong for that subject
data1['goodUpload']     = data1['subject_ids'].apply(check_upload)
# Check that the number of annotation is of size 1 (i.e. they did not do multiple annotation)
data1['numAnnotations'] = data1['choice'].apply(check_anno)


# Dropping annotations,subject_data, and subject_ids
data1 = data1.drop('annotations',1)
data1 = data1.drop('subject_data',1)
data1 = data1.drop('subject_ids',1)

In [128]:
### Check if workflow version is acceptable ###
versions = [36.7,692.102,714.11399999999992] # List of acceptable versions
data1['goodWorkFlow'] = (data1['workflow_version'].isin(versions)) # Add column of booleans, true means acceptable

In [129]:
### Version specific quality checks ###

# Data for converting old to new imageIDs
id_data = pd.read_csv('IDmatchall.txt',delim_whitespace=True,skiprows=1,names=['new_imageID','old_imageID'])

beta_check = ~data1['workflow_version'].isin([692.102, 714.11399999999992]) # Check if classification from beta 2.0
id_check = data1['imageID'].isin(id_data['old_imageID']) # Check if imageID has a new ID

data1['goodID'] = beta_check | id_check # Apply 'bitwise-or' to checks, append to dataframe

In [130]:
### Apply data quality cuts ###
data1 = data1[data1.goodUpload & data1.numAnnotations & data1.goodWorkFlow & data1.goodID & data1.user_id != 0]

# Drop unnecessary columns
data1 = data1.drop('user_ip',1)
data1 = data1.drop('workflow_name',1)
data1 = data1.drop('created_at',1)
data1 = data1.drop('gold_standard',1)
data1 = data1.drop('expert',1)
data1 = data1.drop('tasks',1)
data1 = data1.drop('answers',1)
data1 = data1.drop('goodUpload',1)
data1 = data1.drop('numAnnotations',1)
data1 = data1.drop('goodWorkFlow',1)
data1 = data1.drop('goodID',1)
data1 = data1.drop('metadata',1)

In [131]:
### Convert alpha labels to int labels and old to new imageIDs ###

label_dict = {'45MHZLGHTMDLTN':5,'LGHTMDLTN':5,'50HZ':8,'RCMPRSSR50HZ':8,'BLP':9,'CHRP':2,'XTRMLLD':6,'HLX':14,'KFSH':18,
              'LWFRQNCBRST':1,'LWFRQNCLN':7,'NGLTCH':19,'DNTSGLTCH':19,'NNFTHBV':16,'PRDDVS':11,'60HZPWRLN':10,'60HZPWRMNS':10,
              'PWRLN60HZ':10,'RPTNGBLPS':3,'SCTTRDLGHT':4,'SCRTCH':15,'TMT':12,'VLNHRMNC500HZ':17,'VLNMDHRMNC500HZ':17,
              'HRMNCS':17,'WNDRNGLN':13,'WHSTL':0}

def choice_replace(x):
    return label_dict[x[0]]

old_imageID = list(id_data['old_imageID'])
new_imageID = list(id_data['new_imageID'])
id_dict = {}

for a,b in zip(old_imageID,new_imageID):
    id_dict[a] = b

def imageID_replace(x):
    try:
        x = id_dict[x]
        return x
    except:
        return x
    
data1['choice']      = data1['choice'].apply(choice_replace)
data1['imageID']     = data1['imageID'].apply(imageID_replace)

In [137]:
### Pivot dataframe to make index imageID and get choice, user_id, and workflow_version ###

# Function to aggregate data
def lister(x):
    return list(x)

# Use pandas pivot_table
image_values         = ['choice', 'user_id','workflow_version','classification_number','zooID']
images               = pd.pivot_table(data1,index='imageID',values=image_values,aggfunc=lister)
images['zooID']      = images['zooID'].apply(np.unique)
images['type']       = ['T']*len(images)
images['true_label'] = [-1]*len(images)

In [138]:
### Read in ML_scores ###

# Remove Hanford and Livingston designations
def name_clean(x):
    x = x.split('_')[1]
    return x

ML_scores_L       = pd.read_csv('scores_L.csv')
ML_scores_H       = pd.read_csv('scores_H.csv')
ML_scores         = ML_scores_L.append(ML_scores_H)
ML_scores['Name'] = ML_scores['Name'].apply(name_clean)

In [139]:
### Append ML_posterior matrix ###

# Get number of classes
classes = len(ML_scores.columns[2:])

# Create posterior matrix from dataframe columns
ML_posterior = ML_scores['confidence of class 0']

# Iterate over columns of dataframe
for i in range(1,classes): 
    ML_posterior = np.vstack((ML_posterior,ML_scores['confidence of class %s' % str(i)]))

ML_posterior = ML_posterior.T
ML_posterior = list(ML_posterior)
imageIDs = list(ML_scores['Name'])

# Map imageID to ML_posterior
ML_dict = {}
for a,b in zip(imageIDs,ML_posterior):
    ML_dict[a] = b
    
def ML_append(x):
    try:
        return ML_dict[x]
    except:
        return []

images_index = pd.Series(images.index)
ML_posterior = images_index.apply(ML_append)

# Append ML_posterior matrix to corresponding imageID
images['ML_posterior'] = list(ML_posterior)

In [140]:
### Code to check label options for each workflow version ###

for iV in versions:
    version = np.unique(data1[data1['workflow_version'] == iV]['choice'])
    print("version {0}".format(iV))
    print("length {0}".format(len(data1[data1['workflow_version'] == iV])))
    print(version)
    print("end")

version 36.7
length 2372
[ 0  2  4  9 10 18]
end
version 692.102
length 2687
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
end
version 714.1139999999999
length 2463
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
end


In [145]:
#data1.dtypes
#images.dtypes
#len(images) == len(np.unique(data1['imageID']))
#len(images)
images

Unnamed: 0_level_0,choice,classification_number,user_id,workflow_version,zooID,type,true_label,ML_posterior
imageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
001riNKlIk,[5],[153],[1502635],[714.114],[2201499],T,-1,"[0.0, 3.49e-16, 6.87e-44, 1.03e-25, 1.22e-34, ..."
00LbZLJfCn,[18],[195],[289539],[36.7],[2180894],T,-1,[]
00cweQNJbb,"[9, 12, 12, 9]","[34, 63, 172, 1219]","[637439, 31153, 1217655, 4261]","[692.102, 692.102, 692.102, 714.114]",[2199741],T,-1,"[2.04e-22, 6.64e-11, 1.69e-10, 1.46e-08, 2.67e..."
03HBLIDhQy,"[19, 19]","[31, 8]","[1831, 530281]","[692.102, 692.102]",[2199170],T,-1,"[7.81e-09, 1.96e-11, 2.42e-13, 1.01e-08, 0.001..."
03qNWhxN7S,"[9, 9]","[196, 14]","[123, 1499083]","[36.7, 36.7]",[2187788],T,-1,[]
06QGka2Lah,"[4, 16, 7, 7]","[220, 653, 388, 370]","[123, 4261, 5209, 1498433]","[692.102, 692.102, 714.114, 714.114]",[2223935],T,-1,"[1.55e-06, 1.46e-06, 1.9e-12, 7.91e-15, 3.44e-..."
06d3eIYFDU,[19],[991],[5209],[714.114],[2208904],T,-1,"[0.000392942, 5.73e-06, 1.78e-08, 1.98e-08, 2...."
08ClYsqQbW,"[7, 7, 7, 7, 4]","[58, 704, 11, 305, 458]","[679725, 4261, 59701, 530281, 1498433]","[692.102, 692.102, 714.114, 714.114, 714.114]",[2221959],T,-1,"[2.08e-06, 3.38e-05, 1.63e-12, 2.11e-14, 2.67e..."
08pHp9bdqf,[9],[188],[123],[36.7],[2187791],T,-1,[]
097G1fXvLO,[9],[547],[289539],[36.7],[2187792],T,-1,[]


In [24]:
#function to create lists of empty lists
def emptylist(x):
    elist = []
    for i in range(x):
        elist.append([])
    return elist

In [267]:
### Read classification of golden images ###

goldendata = pd.read_csv('GLabel.csv')

for i in range(len(goldendata)): #iterate over data
    
    try:
        images_idx = np.where(int(goldendata['zooID'][i]) == np.unique(images['zooID']))[0][0] #find location in images dataframe
        images['truelabel'][images_idx] = int(goldendata['Classification'][i]) #change true label to golden classification
        images['type'][images_idx] = 'G' #change image type to golden
        
    except:
        
        pass #to catch errors caused by images in goldendata not being in images dataframe

In [264]:
images

Unnamed: 0_level_0,choice,classification_number,user_id,workflow_version,zooID,type,true_label
imageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
001riNKlIk,[5],[153],[1502635],[714.114],[2201499],T,-1
00LbZLJfCn,[18],[195],[289539],[36.7],[2180894],T,-1
00cweQNJbb,"[9, 12, 12, 9]","[34, 63, 172, 1219]","[637439, 31153, 1217655, 4261]","[692.102, 692.102, 692.102, 714.114]","[2199741, 2199741, 2199741, 2199741]",T,-1
03HBLIDhQy,"[19, 19]","[31, 8]","[1831, 530281]","[692.102, 692.102]","[2199170, 2199170]",T,-1
03qNWhxN7S,"[9, 9]","[196, 14]","[123, 1499083]","[36.7, 36.7]","[2187788, 2187788]",T,-1
06QGka2Lah,"[4, 16, 7, 7]","[220, 653, 388, 370]","[123, 4261, 5209, 1498433]","[692.102, 692.102, 714.114, 714.114]","[2223935, 2223935, 2223935, 2223935]",T,-1
06d3eIYFDU,[19],[991],[5209],[714.114],[2208904],T,-1
08ClYsqQbW,"[7, 7, 7, 7, 4]","[58, 704, 11, 305, 458]","[679725, 4261, 59701, 530281, 1498433]","[692.102, 692.102, 714.114, 714.114, 714.114]","[2221959, 2221959, 2221959, 2221959, 2221959]",T,-1
08pHp9bdqf,[9],[188],[123],[36.7],[2187791],T,-1
097G1fXvLO,[9],[547],[289539],[36.7],[2187792],T,-1
