In [93]:
### Import standard modules ###
import numpy as np
import pandas as pd
import json

In [94]:
### Define set of parsing functions ###

def CustomParser(data):
    j1 = json.loads(data)
    return j1

def filter_json(x):
    x=x[0]
    try:
        x['value']=x['value'][0]
    except:
        x['value'] = {u'answers': {}, u'choice': {}, u'filters': {}}
    return x

def extract_choice(x):
    y = []
    y.append((str(x['value']['choice'])))
    return y

def extract_tasks(x):
    x=x['task']
    return x

def extract_answers(x):
    x=x['value']['answers']
    return x

def extract_filters(x):
    x=['value']['filters']
    return x

def extract_zooID(x):
    x=int(list(x.keys())[0])
    return x

def extract_FileName1(x):
    try:
        x = x[list(x.keys())[0]]['Filename1'].split('_')[1]
    except:
        x = ''
    return x

def check_upload(x):
    if len(x.split(';')) == 4:
        x = True
    else:
        x = False
    return x

def check_anno(x):
    if len(x) == 1:
        x = True
    else:
        x = False
    return x

def convert_to_int(x):
    try:
        x=int(x)
    except:
        x=False
    return x

In [95]:
### Reading in csv with custom read for those column in JSON format ###

# Define location of classification file
class_file = "gravity-spy-classifications.csv" 

# Create dataframe from csv
data1 = pd.read_csv(class_file,converters={'annotations':CustomParser,'subject_data':CustomParser})

# Change ID to int
data1['user_id']        = data1['user_id'].apply(convert_to_int)
# Doing a mild work around for the json format of the annontation column
data1['annotations']    = data1['annotations'].apply(filter_json)
# Extracting choice and making it a column
data1['choice']         = data1['annotations'].apply(extract_choice)
# Extracting the task entry and making it a column
data1['tasks']          = data1['annotations'].apply(extract_tasks)
# Extracting answers and making it a column
data1['answers']        = data1['annotations'].apply(extract_answers)
# Extracting zooniverse ID it gave this subject and making it a column
data1['zooID']          = data1['subject_data'].apply(extract_zooID) 
# Extracting uniqueID assigned to the image during image creation and making it a column
data1['imageID']        = data1['subject_data'].apply(extract_FileName1)
# Get cumulative count of number of prior classifications by user
data1['classification_number'] = data1.groupby('user_id').cumcount()
# Making sure that the subject_ids for a given classification is 4. If not I uploaded the images wrong for that subject
data1['goodUpload']     = data1['subject_ids'].apply(check_upload)
# Making sure the number of annotation is of size 1 (i.e. they did not do multiple annotation)
data1['numAnnotations'] = data1['choice'].apply(check_anno)


# Dropping annotations,subject_data, and subject_ids
data1 = data1.drop('annotations',1)
data1 = data1.drop('subject_data',1)
data1 = data1.drop('subject_ids',1)

In [96]:
### Check if workflow version is acceptable ###
versions = [36.7,380.7,692.102,714.11399999999992] # List of acceptable versions
data1['goodWorkFlow'] = (data1['workflow_version'].isin(versions)) # Add column of booleans, true means acceptable

In [97]:
### Version specific quality checks ###

# Data for converting old to new imageIDs
id_data = pd.read_csv('IDmatchall.txt',delim_whitespace=True,skiprows=1,names=['new_imageID','old_imageID'])

beta_check = ~data1['workflow_version'].isin([692.102, 714.11399999999992]) # Check if classification from beta 2.0
id_check = data1['imageID'].isin(id_data['old_imageID']) # Check if imageID has a new ID

data1['goodID'] = beta_check | id_check # Apply 'bitwise-or' to checks, append to dataframe

In [98]:
### Apply data quality cuts ###
data1 = data1[data1.goodUpload & data1.numAnnotations & data1.goodWorkFlow & data1.goodID & data1.user_id != False]

# Drop unnecessary columns
data1 = data1.drop('user_ip',1)
data1 = data1.drop('workflow_name',1)
data1 = data1.drop('created_at',1)
data1 = data1.drop('gold_standard',1)
data1 = data1.drop('expert',1)
data1 = data1.drop('tasks',1)
data1 = data1.drop('answers',1)
data1 = data1.drop('goodUpload',1)
data1 = data1.drop('numAnnotations',1)
data1 = data1.drop('goodWorkFlow',1)
data1 = data1.drop('goodID',1)
data1 = data1.drop('metadata',1)

In [101]:
def lister(x):
    return list(x)

#data1.pivot(index='imageID',columns='choice',values='choice')
#data1.groupby('imageID').sum()
images = pd.pivot_table(data1,index='imageID',values=['choice', 'user_id','workflow_version'],aggfunc=lister)
images

Unnamed: 0_level_0,choice,user_id,workflow_version
imageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
001JFg1zZn,"[[BLP], [DNTSGLTCH]]","[1461442, 322252]","[380.7, 380.7]"
001PElpkuv,"[[BLP], [BLP], [BLP], [BLP], [BLP], [BLP], [BL...","[322285, 1464491, 322252, 5209, 3462, 1430492,...","[380.7, 380.7, 380.7, 380.7, 380.7, 380.7, 380..."
001rVzbSPh,"[[PWRLN60HZ], [DNTSGLTCH]]","[1460714, 322252]","[380.7, 380.7]"
002U5q9uFK,[[BLP]],[322252],[380.7]
008zHCoBcj,"[[DNTSGLTCH], [DNTSGLTCH], [DNTSGLTCH]]","[1428840, 704538, 322252]","[380.7, 380.7, 380.7]"
009CGIYfPp,"[[BLP], [BLP]]","[1347018, 322252]","[380.7, 380.7]"
009S1uci6w,[[BLP]],[322252],[380.7]
009SMTyP3u,"[[DNTSGLTCH], [DNTSGLTCH], [DNTSGLTCH]]","[129311, 1430522, 322252]","[380.7, 380.7, 380.7]"
00By4Fnnm6,"[[LWFRQNCBRST], [NNFTHBV], [LWFRQNCBRST], [LWF...","[1431517, 5209, 1464491, 1347018, 322252, 1396...","[380.7, 380.7, 380.7, 380.7, 380.7, 380.7]"
00CGhqJj1a,"[[DNTSGLTCH], [DNTSGLTCH]]","[317055, 322252]","[380.7, 380.7]"


In [55]:
### Code to check label options for each workflow version ###

for iV in versions:
    version = np.unique(data1[data1['workflow_version'] == iV]['choice'])
    print("version {0}".format(iV))
    print("length {0}".format(len(data1[data1['workflow_version'] == iV])))
    print(version)
    print("end")

version 36.7
length 4184
[['BLP'] ['CHRP'] ['KFSH'] ['PWRLN60HZ'] ['SCTTRDLGHT'] ['WHSTL']]
end
version 380.7
length 26047
[['45MHZLGHTMDLTN'] ['BLP'] ['CHRP'] ['CLBRTNLN300HZ'] ['DNTSGLTCH']
 ['KFSH'] ['LWFRQNCBRST'] ['NNFTHBV'] ['PWRLN60HZ'] ['SCRTCH']
 ['SCTTRDLGHT'] ['VLNHRMNC500HZ'] ['WHSTL'] ['WNDRNGLN'] ['XTRMLLD']]
end
version 692.102
length 9755
[['45MHZLGHTMDLTN'] ['50HZ'] ['BLP'] ['CHRP'] ['HLX'] ['KFSH']
 ['LWFRQNCBRST'] ['LWFRQNCLN'] ['NGLTCH'] ['NNFTHBV'] ['PRDDVS']
 ['PWRLN60HZ'] ['RPTNGBLPS'] ['SCRTCH'] ['SCTTRDLGHT'] ['TMT']
 ['VLNMDHRMNC500HZ'] ['WHSTL'] ['WNDRNGLN'] ['XTRMLLD']]
end
version 714.1139999999999
length 5223
[['BLP'] ['CHRP'] ['HLX'] ['KFSH'] ['LGHTMDLTN'] ['LWFRQNCBRST']
 ['LWFRQNCLN'] ['NGLTCH'] ['NNFTHBV'] ['PRDDVS'] ['PWRLN60HZ']
 ['RCMPRSSR50HZ'] ['RPTNGBLPS'] ['SCRTCH'] ['SCTTRDLGHT'] ['TMT']
 ['VLNMDHRMNC500HZ'] ['WHSTL'] ['WNDRNGLN'] ['XTRMLLD']]
end


In [56]:
### Dict for converting alpha labels to int labels ###
label_dict = {'45MHZLGHTMDLTN':0,'LGHTMDLTN':0,'50HZ':1,'RCMPRSSR50HZ':1,'BLP':2,'CHRP':3,'XTRMLLD':4,'HLX':5,'KFSH':6,
              'LWFRQNCBRST':7,'LWFRQNCLN':8,'NGLTCH':9,'DNTSGLTCH':9,'NNFTHBV':10,'PRDDVS':11,'60HZPWRLN':12,'60HZPWRMNS':12,
              'PWRLN60HZ':12,'RPTNGBLPS':13,'SCTTRDLGHT':14,'SCRTCH':15,'TMT':16,'VLNHRMNC500HZ':17,'VLNMDHRMNC500HZ':17,
              'HRMNCS':17,'WNDRNGLN':18,'WHSTL':19}

In [91]:
data1.dtypes
images.dtypes
#len(images) == len(np.unique(data1['imageID']))

choice              object
user_id             object
workflow_version    object
dtype: object

In [47]:
### READ FILE FOR CONVERTING imageIDs ###
id_match = pd.read_csv('IDmatchall.txt') #read in file
tmp_new = []
tmp_old = []
no_match = [] #list of imageIDs with no match (timestamp errors)
id_dict = {} #empty dict for converting imageIDs

for i in id_match['# New       Old']:
    if len(i)>10:
        i = i.split(' ')
        tmp_new.append(i[0])
        tmp_old.append(i[1])

id_match = pd.DataFrame({'new':tmp_new,'old':tmp_old})

for a,b in zip(id_match['new'],id_match['old']):
    id_dict[b] = a

In [30]:
### APPEND ALL PERTINENT DATA TO LIST ###

tmp_user= []
tmp_user_id = []
tmp_workflow = []
tmp_task = []
tmp_choice = []
tmp_retired = []
tmp_unique_id = []
tmp_zoo_id = []

for i in range(len(data)):
    # create list to hold to output information of each classification
    output = []
    # Check that there was only 1 choice made...
    annotations = json.loads(data[i][11])
    idcheck = data[i][2]
    if str(annotations).count('choice') == 1 and not np.isnan(idcheck):
        user = data[i,1]
        user_id = data[i,2]
        workflow = data[i,5]
        
        # annotations
        task = annotations[0]["task"]
        choice = annotations[0]["value"][0]["choice"]
        
        # subject data
        subject_data = json.loads(data[i][12])
        for key in subject_data:
            zoo_id = key
            retired = subject_data[key]['retired']
            unique_id = subject_data[key]['subject_id']
        
        # Append this information into a temporary output file
        tmp_user_id.append(user_id)
        tmp_workflow.append(workflow)
        tmp_task.append(task)
        tmp_choice.append(choice)
        tmp_retired.append(retired)
        tmp_unique_id.append(unique_id)
        tmp_zoo_id.append(zoo_id)
        
# Store each of the classification data
classifications = pd.DataFrame({'imageID':tmp_unique_id,'userID':tmp_user_id,'workflow':tmp_workflow,
                                'task':tmp_task,'label':tmp_choice,'type':tmp_retired, 'zooID':tmp_zoo_id})

In [67]:
data1

Unnamed: 0,classification_id,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,created_at,gold_standard,expert,metadata,choice,tasks,answers,zooID,imageID,classification_number,goodUpload,numAnnotations
0,9525037,0 lmaesampson\n1 crow...,1414392,5f156a8c2b8d034ee3ad,1479,Apprentice - Jump in here for advanced Glitch ...,20.300,2016-01-28 19:43:53 UTC,,,"{""session"":""11eb1a69d6e760c329984c2e8315e0b93d...",[{}],T1,{},1075876,Lqw3QB36UT,0,True,True
1,9604193,0 lmaesampson\n1 crow...,336603,dbc82e33a3b04ead00b1,1479,Apprentice - Jump in here for advanced Glitch ...,44.600,2016-02-02 16:10:39 UTC,,,"{""session"":""75c5ec8ed51acbe57ebbf4832889138f12...",[BLP],T1,{},1076285,HTlgwFjWZn,0,True,True
2,9604196,0 lmaesampson\n1 crow...,336603,b520b59e40070c7f7b20,1479,Apprentice - Jump in here for advanced Glitch ...,44.600,2016-02-02 16:10:50 UTC,,,"{""session"":""75c5ec8ed51acbe57ebbf4832889138f12...",[60HZPWRMNS],T1,{},1077316,suKWBhith1,1,True,True
3,9604454,0 lmaesampson\n1 crow...,336603,b520b59e40070c7f7b20,1479,Apprentice - Jump in here for advanced Glitch ...,44.600,2016-02-02 16:21:35 UTC,,,"{""session"":""cb966ac7ba0eaf72b90dd9b3a38a850bf9...",[BLP],T1,{},1076889,6rkFgYSBKl,2,True,True
4,9604601,0 lmaesampson\n1 crow...,336603,b520b59e40070c7f7b20,1479,Apprentice - Jump in here for advanced Glitch ...,44.600,2016-02-02 16:26:48 UTC,,,"{""session"":""cb966ac7ba0eaf72b90dd9b3a38a850bf9...",[BLP],T1,{},1079043,IbPP81GsPI,3,True,True
5,9604606,0 lmaesampson\n1 crow...,336603,b520b59e40070c7f7b20,1479,Apprentice - Jump in here for advanced Glitch ...,44.600,2016-02-02 16:26:59 UTC,,,"{""session"":""cb966ac7ba0eaf72b90dd9b3a38a850bf9...",[NNFTHBV],T1,{},1077431,q9qihm2DAZ,4,True,True
6,9604614,0 lmaesampson\n1 crow...,336603,dbc82e33a3b04ead00b1,1479,Apprentice - Jump in here for advanced Glitch ...,44.600,2016-02-02 16:27:14 UTC,,,"{""session"":""cb966ac7ba0eaf72b90dd9b3a38a850bf9...",[SCTTRDLGHT],T1,{},1078615,wIKCHx7qng,5,True,True
7,9625530,0 lmaesampson\n1 crow...,1303502,dbc82e33a3b04ead00b1,1479,Apprentice - Jump in here for advanced Glitch ...,44.600,2016-02-03 16:57:34 UTC,,,"{""session"":""7b60c2f156e7ae1a05fe243bcea06c82ba...",[HRMNCS],T1,{},1077431,q9qihm2DAZ,0,True,True
8,9625533,0 lmaesampson\n1 crow...,1303502,dbc82e33a3b04ead00b1,1479,Apprentice - Jump in here for advanced Glitch ...,44.600,2016-02-03 16:57:44 UTC,,,"{""session"":""7b60c2f156e7ae1a05fe243bcea06c82ba...",[SCTTRDLGHT],T1,{},1075876,Lqw3QB36UT,1,True,True
9,10295716,0 lmaesampson\n1 crow...,796717,e0e08dbd7223d98f05c1,1479,Apprentice - Jump in here for advanced Glitch ...,77.360,2016-03-04 16:40:18 UTC,,,"{""session"":""65e4559affff1d4e2f0592b12c50a37aaf...",[NNFTHBV],T1,{},1688528,,0,False,True


In [85]:
uniques = set(np.unique(classifications['imageID'])) #create set of unique imageIDs
keys = set(id_dict.keys()) #create set of new imageIDs from id_dict
uniques = list(uniques.intersection(keys)) #find intersection of sets, convert to list

In [24]:
#function to create lists of empty lists
def emptylist(x):
    elist = []
    for i in range(x):
        elist.append([])
    return elist

In [120]:
### READ CLASSIFICATIONS FROM GRAVSPY BETA ###

#turn off unnecessary warning about setting values to slice of dataframe
pd.options.mode.chained_assignment = None  # default='warn'

#create dataframe, length of uniques, without labels or userIDs
images = pd.DataFrame({'type':['T']*len(uniques),'labels':emptylist(len(uniques)),
                        'userIDs':emptylist(len(uniques)),'ML_posterior':emptylist(len(uniques)),
                        'truelabel':[-1]*len(uniques),'imageID':uniques,'zooID':emptylist(len(uniques))})

for i in range(len(uniques)): #iterate over unique imageIDs
    
    classifications_idx = np.where((uniques[i] == classifications['imageID']))[0][0]
    
    images['zooID'][i] = int(classifications.loc[[classifications_idx], 'zooID'])
    
    for locations in np.where(uniques[i] == classifications['imageID']): #iterate over arrays of where unique imageID appears
        
        images_idx = np.where(uniques[i] == images['imageID'])[0][0] #find index of line in images where unique imageID appears
        
        for location in locations: #iterate over elements in array of locations in classifications where unique imageID appears
        
            images['labels'][images_idx].append(label_dict[classifications['label'][location]]) #append numeric label
            images['userIDs'][images_idx].append(int(classifications['userID'][location])) #append userID
            
for imageID in images['imageID']:

    imageID = id_dict[imageID]

In [133]:
### READ CLASSIFICATIONS OF GOLDEN IMAGES ###

goldendata = pd.read_csv('GLabel.csv')

for i in range(len(goldendata)): #iterate over data
    
    try:
        images_idx = np.where(int(goldendata['zooID'][i]) == images['zooID'])[0][0] #find location in images dataframe
        images['truelabel'][images_idx] = int(goldendata['Classification'][i]) #change true label to golden classification
        images['type'][images_idx] = 'G' #change image type to golden
        
    except:
        pass #to catch errors caused by images in goldendata not being in images dataframe

In [134]:
images

Unnamed: 0,ML_posterior,imageID,labels,truelabel,type,userIDs,zooID
0,[],HauRDnEd8q,"[10, 10, 10, 10, 10, 10, 9, 10]",-1,T,"[835158, 1498433, 637439, 239792, 132, 82, 227...",2210216
1,[],qOOEUMNyGi,[7],-1,T,[239792],2207683
2,[],GGfpUmYtJD,"[9, 9, 9, 9, 9]",-1,T,"[1498519, 1498701, 530281, 239792, 243841]",2209323
3,[],N70PSqtshJ,[11],-1,T,[322252],2200834
4,[],j99whZS7hL,"[9, 15, 15, 15]",-1,T,"[1047240, 239792, 82, 497320]",2211647
5,[],tRBVjF55Ah,"[11, 8, 8, 11, 8]",-1,T,"[1498519, 239792, 82, 322252, 2274]",2219760
6,[],NCCpMZbVi0,[9],-1,T,[102001],2215367
7,[],EPTjErvJJi,"[15, 13, 13, 11]",-1,T,"[835158, 239792, 2909, 82]",2209896
8,[],FBQL7ephxf,"[9, 9, 9, 9, 9]",-1,T,"[1498519, 1498433, 239792, 82, 322252]",2196270
9,[],RFhohs0FxA,"[13, 13, 13, 11]",-1,T,"[82, 239792, 2274, 322252]",2215595
