In [12]:
### Import standard functions ###
import os, time, csv, optparse
import numpy as np
from pylab import *
import json
import pandas as pd

In [107]:
#####################################
### Define command line arguments ###
#####################################

def parse_commandline():
    parser = optparse.OptionParser()
    parser.add_option("--file", help="Type in the path to the csv file \
    containing the classifications from the vet training set \
    workflow",default="example.csv")
    
    opts, args = parser.parse_args()
    
    return opts

#opts = parse_commandline()
class_file = "gravity-spy-classifications.csv"

In [3]:
### READ IN DATA ###
data = pd.read_csv(class_file, skiprows=29573, nrows=20008) # This starts with Beta 2.0
data = np.asarray(data)

In [5]:
### READ FILE FOR CONVERTING imageIDs ###
id_match = pd.read_csv('IDmatchall.txt') #read in file
tmp_new = []
tmp_old = []
no_match = [] #list of imageIDs with no match (timestamp errors)
id_dict = {} #empty dict for converting imageIDs

for i in id_match['# New       Old']:
    if len(i)>10:
        i = i.split(' ')
        tmp_new.append(i[0])
        tmp_old.append(i[1])

id_match = pd.DataFrame({'new':tmp_new,'old':tmp_old})

for a,b in zip(id_match['new'],id_match['old']):
    id_dict[b] = a

In [109]:
### DICT FOR CONVERTING LABELS TO INTEGERS ###
label_dict = {'45MHZLGHTMDLTN':1,'LGHTMDLTN':1,'50HZ':2,'RCMPRSSR50HZ':2,'BLP':3,'CHRP':4,'XTRMLLD':5,'HLX':6,'KFSH':7,
              'LWFRQNCBRST':8,'LWFRQNCLN':9,'NGLTCH':10,'DNTSGLTCH':10,'NNFTHBV':11,'PRDDVS':12,'60HZPWRLN':13,'60HZPWRMNS':13,
              'PWRLN60HZ':13,'RPTNGBLPS':14,'SCTTRDLGHT':15,'SCRTCH':16,'TMT':17,'VLNHRMNC500HZ':18,'VLNMDHRMNC500HZ':18,
              'HRMNCS':18,'WNDRNGLN':19,'WHSTL':20}

In [30]:
### APPEND ALL PERTINENT DATA TO LIST ###

tmp_user= []
tmp_user_id = []
tmp_workflow = []
tmp_task = []
tmp_choice = []
tmp_retired = []
tmp_unique_id = []
tmp_zoo_id = []

for i in range(len(data)):
    # create list to hold to output information of each classification
    output = []
    # Check that there was only 1 choice made...
    annotations = json.loads(data[i][11])
    idcheck = data[i][2]
    if str(annotations).count('choice') == 1 and not np.isnan(idcheck):
        user = data[i,1]
        user_id = data[i,2]
        workflow = data[i,5]
        
        # annotations
        task = annotations[0]["task"]
        choice = annotations[0]["value"][0]["choice"]
        
        # subject data
        subject_data = json.loads(data[i][12])
        for key in subject_data:
            zoo_id = key
            retired = subject_data[key]['retired']
            unique_id = subject_data[key]['subject_id']
        
        # Append this information into a temporary output file
        tmp_user_id.append(user_id)
        tmp_workflow.append(workflow)
        tmp_task.append(task)
        tmp_choice.append(choice)
        tmp_retired.append(retired)
        tmp_unique_id.append(unique_id)
        tmp_zoo_id.append(zoo_id)
        
# Store each of the classification data
classifications = pd.DataFrame({'imageID':tmp_unique_id,'userID':tmp_user_id,'workflow':tmp_workflow,
                                'task':tmp_task,'label':tmp_choice,'type':tmp_retired, 'zooID':tmp_zoo_id})

In [127]:
images

Unnamed: 0,ML_posterior,imageID,labels,truelabel,type,userIDs,zooID
0,[],HauRDnEd8q,"[10, 10, 10, 10, 10, 10, 9, 10]",-1,T,"[835158, 1498433, 637439, 239792, 132, 82, 227...",2210216
1,[],qOOEUMNyGi,[7],-1,T,[239792],2207683
2,[],GGfpUmYtJD,"[9, 9, 9, 9, 9]",-1,T,"[1498519, 1498701, 530281, 239792, 243841]",2209323
3,[],N70PSqtshJ,[11],-1,T,[322252],2200834
4,[],j99whZS7hL,"[9, 15, 15, 15]",-1,T,"[1047240, 239792, 82, 497320]",2211647
5,[],tRBVjF55Ah,"[11, 8, 8, 11, 8]",-1,T,"[1498519, 239792, 82, 322252, 2274]",2219760
6,[],NCCpMZbVi0,[9],-1,T,[102001],2215367
7,[],EPTjErvJJi,"[15, 13, 13, 11]",-1,T,"[835158, 239792, 2909, 82]",2209896
8,[],FBQL7ephxf,"[9, 9, 9, 9, 9]",-1,T,"[1498519, 1498433, 239792, 82, 322252]",2196270
9,[],RFhohs0FxA,"[13, 13, 13, 11]",-1,T,"[82, 239792, 2274, 322252]",2215595


In [85]:
uniques = set(np.unique(classifications['imageID'])) #create set of unique imageIDs
keys = set(id_dict.keys()) #create set of new imageIDs from id_dict
uniques = list(uniques.intersection(keys)) #find intersection of sets, convert to list

In [24]:
#function to create lists of empty lists
def emptylist(x):
    elist = []
    for i in range(x):
        elist.append([])
    return elist

In [120]:
### READ CLASSIFICATIONS FROM GRAVSPY BETA ###

#turn off unnecessary warning about setting values to slice of dataframe
pd.options.mode.chained_assignment = None  # default='warn'

#create dataframe, length of uniques, without labels or userIDs
images = pd.DataFrame({'type':['T']*len(uniques),'labels':emptylist(len(uniques)),
                        'userIDs':emptylist(len(uniques)),'ML_posterior':emptylist(len(uniques)),
                        'truelabel':[-1]*len(uniques),'imageID':uniques,'zooID':emptylist(len(uniques))})

for i in range(len(uniques)): #iterate over unique imageIDs
    
    classifications_idx = np.where((uniques[i] == classifications['imageID']))[0][0]
    
    images['zooID'][i] = int(classifications.loc[[classifications_idx], 'zooID'])
    
    for locations in np.where(uniques[i] == classifications['imageID']): #iterate over arrays of where unique imageID appears
        
        images_idx = np.where(uniques[i] == images['imageID'])[0][0] #find index of line in images where unique imageID appears
        
        for location in locations: #iterate over elements in array of locations in classifications where unique imageID appears
        
            images['labels'][images_idx].append(label_dict[classifications['label'][location]]) #append numeric label
            images['userIDs'][images_idx].append(int(classifications['userID'][location])) #append userID
            
for imageID in images['imageID']:

    imageID = id_dict[imageID]

In [133]:
### READ CLASSIFICATIONS OF GOLDEN IMAGES ###

goldendata = pd.read_csv('GLabel.csv')

for i in range(len(goldendata)): #iterate over data
    
    try:
        images_idx = np.where(int(goldendata['zooID'][i]) == images['zooID'])[0][0] #find location in images dataframe
        images['truelabel'][images_idx] = int(goldendata['Classification'][i]) #change true label to golden classification
        images['type'][images_idx] = 'G' #change image type to golden
        
    except:
        pass #to catch errors caused by images in goldendata not being in images dataframe

In [134]:
images

Unnamed: 0,ML_posterior,imageID,labels,truelabel,type,userIDs,zooID
0,[],HauRDnEd8q,"[10, 10, 10, 10, 10, 10, 9, 10]",-1,T,"[835158, 1498433, 637439, 239792, 132, 82, 227...",2210216
1,[],qOOEUMNyGi,[7],-1,T,[239792],2207683
2,[],GGfpUmYtJD,"[9, 9, 9, 9, 9]",-1,T,"[1498519, 1498701, 530281, 239792, 243841]",2209323
3,[],N70PSqtshJ,[11],-1,T,[322252],2200834
4,[],j99whZS7hL,"[9, 15, 15, 15]",-1,T,"[1047240, 239792, 82, 497320]",2211647
5,[],tRBVjF55Ah,"[11, 8, 8, 11, 8]",-1,T,"[1498519, 239792, 82, 322252, 2274]",2219760
6,[],NCCpMZbVi0,[9],-1,T,[102001],2215367
7,[],EPTjErvJJi,"[15, 13, 13, 11]",-1,T,"[835158, 239792, 2909, 82]",2209896
8,[],FBQL7ephxf,"[9, 9, 9, 9, 9]",-1,T,"[1498519, 1498433, 239792, 82, 322252]",2196270
9,[],RFhohs0FxA,"[13, 13, 13, 11]",-1,T,"[82, 239792, 2274, 322252]",2215595
