# Preprocess human classification data downloaded from Zooniverse

In [364]:
%matplotlib inline
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

## The human classification data

Two of the more important columns in the exported data are in JSON format: the subject and the annotations. So, some simple scripts to extract the relevant data. Unfortunately, the JSON seems to have changed over time, so the functions have to be flexible.

In [367]:
def ID_from_subject_data(subject_data):
    # given a string of JSON representing the subject data, extract and return the ID
    sd = json.loads(subject_data)
    sd1 = list(sd.values())[0]
    sd1k = sd1.keys()
    if 'ID' in sd1k:
        # easiest case: there's a subject ID in the JSON
        v = sd1['ID']
    elif 'Filename1' in sd1k:
        # otherwise, we can get it from the filename (I hope)
        v = sd1['Filename1'][3:13]
    else:
        # about 28 case where the subject data are in some weird format that this doesn't catch, which we ignore
        v = ''
        
    return v

In [368]:
# an example
ID_from_subject_data(classification_data['subject_data'][40000])

'BMOSh2lK88'

_need to check that this is really working reliably given the number of observations that don't connect_

In [369]:
def user_choice_from_annotations(annotations):
    # given a string of JSON representing the annotation, extract and return the user's annotation
    a = json.loads(annotations)

    # not all users actually recorded a choice
    try:
        v = a[0]['value'][0]['choice']
    except (IndexError, KeyError): 
        # volunteer apparently didn't pick anything 
        v = ''
    
    return v

In [370]:
# example
user_choice_from_annotations(classification_data['annotations'][2])

'60HZPWRMNS'

## Read data downloaded from Zooniverse

In [365]:
classification_data = pd.read_csv('160626 gravity-spy-classifications.csv',parse_dates=[7,],infer_datetime_format=True)

In [366]:
len(classification_data)

51363

## Add new columns to the data frame with the extracted data. 

In [371]:
classification_data['subject_ID'] = classification_data['subject_data'].map(ID_from_subject_data)

In [372]:
classification_data['annotation'] = classification_data['annotations'].map(user_choice_from_annotations)

The uncaught cases. 

In [373]:
classification_data[classification_data.subject_ID == '']

Unnamed: 0,classification_id,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,created_at,gold_standard,expert,metadata,annotations,subject_data,subject_ids,subject_ID,annotation
9,10295716,mzevin1,796717.0,eb63336142b3fbe405a2,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-04 16:40:18,,,"{""session"":""65e4559affff1d4e2f0592b12c50a37aaf...","[{""task"":""T1"",""value"":[{""choice"":""NNFTHBV"",""an...","{""1688528"":{""retired"":null,""1"":34,""20160302"":2...",1688528;1688528;1688528,,NNFTHBV
10,10296637,mzevin1,796717.0,eb63336142b3fbe405a2,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-04 17:06:20,,,"{""session"":""65e4559affff1d4e2f0592b12c50a37aaf...","[{""task"":""T1"",""value"":[{""choice"":""XTRMLLD"",""an...","{""1688148"":{""retired"":null,""1"":20,""20160302"":2...",1688148;1688148;1688148,,XTRMLLD
11,10296695,mzevin1,796717.0,eb63336142b3fbe405a2,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-04 17:07:57,,,"{""session"":""d7610273636c2f3b10e72e803f719e11b3...","[{""task"":""T1"",""value"":[{""choice"":""NNFTHBV"",""an...","{""1688086"":{""retired"":null,""1"":136,""20160302"":...",1688086;1688086;1688086,,NNFTHBV
12,10297006,sbc538,386563.0,7c5f17b05d4804969561,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-04 17:13:56,,,"{""session"":""bc89d79827eaa6352a0d07afd2d0d8f52c...","[{""task"":""T1"",""value"":[{""choice"":""BLP"",""answer...","{""1688085"":{""retired"":null,""1"":135,""20160302"":...",1688085;1688085;1688085,,BLP
13,10300327,crowston,336603.0,6084cdabaff2399fa800,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-04 19:08:00,,,"{""session"":""8f4f780411ec563e0f42a271272c8b1209...","[{""task"":""T1"",""value"":[{""choice"":""KFSH"",""answe...","{""1687896"":{""retired"":null,""1"":3,""20160302"":20...",1687896;1687896;1687896,,KFSH
14,10300333,crowston,336603.0,6084cdabaff2399fa800,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-04 19:08:23,,,"{""session"":""8f4f780411ec563e0f42a271272c8b1209...","[{""task"":""T1"",""value"":[{""choice"":""NNFTHBV"",""an...","{""1688376"":{""retired"":null,""1"":13,""20160302"":2...",1688376;1688376;1688376,,NNFTHBV
15,10300339,crowston,336603.0,6084cdabaff2399fa800,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-04 19:08:45,,,"{""session"":""8f4f780411ec563e0f42a271272c8b1209...","[{""task"":""T1"",""value"":[{""choice"":""60HZPWRMNS"",...","{""1689006"":{""retired"":null,""1"":239,""20160302"":...",1689006;1689006;1689006,,60HZPWRMNS
16,10342622,mzevin1,796717.0,6c2a9dc19b43f157ea04,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-06 18:56:05,,,"{""session"":""9c44968912498ed3665e738ffc40467159...","[{""task"":""T1"",""value"":[{""choice"":""60HZPWRMNS"",...","{""1689037"":{""retired"":null,""1"":270,""20160302"":...",1689037;1689037;1689037,,60HZPWRMNS
17,10342630,mzevin1,796717.0,6c2a9dc19b43f157ea04,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-06 18:56:25,,,"{""session"":""9c44968912498ed3665e738ffc40467159...","[{""task"":""T1"",""value"":[]}]","{""1688289"":{""retired"":null,""1"":227,""20160302"":...",1688289;1688289;1688289,,
18,10342635,mzevin1,796717.0,6c2a9dc19b43f157ea04,1479,Apprentice - Jump in here for advanced Glitch ...,77.36,2016-03-06 18:56:42,,,"{""session"":""9c44968912498ed3665e738ffc40467159...","[{""task"":""T1"",""value"":[{""choice"":""SCTTRDLGHT"",...","{""1688981"":{""retired"":null,""1"":135,""20160302"":...",1688981;1688981;1688981,,SCTTRDLGHT


## A few sample statistics...

In [374]:
classification_by_user = classification_data.groupby('user_name').count()

In [375]:
classification_by_user['user_ip'].median()

5.0

In [376]:
classification_data.groupby('annotation').count()['classification_id']

annotation
                     103
45MHZLGHTMDLTN       458
50HZ                 195
60HZPWRLN              1
60HZPWRMNS             3
BLP                12715
CHRP                 655
CLBRTNLN300HZ        386
DNTSGLTCH           9897
HLX                   54
HRMNCS                 1
KFSH                2856
LGHTMDLTN             24
LWFRQNCBRST         6154
LWFRQNCLN           2300
NGLTCH              1670
NNFTHBV             4658
PRDDVS               139
PWRLN60HZ           3593
RCMPRSSR50HZ          21
RPTNGBLPS            214
SCRTCH              1356
SCTTRDLGHT          1810
TMT                  318
VLNHRMNC500HZ        192
VLNMDHRMNC500HZ       36
WHSTL                731
WNDRNGLN             363
XTRMLLD              460
Name: classification_id, dtype: int64

## Glitch classes
The coding for the classes is different in the ML and Human classification, so a file to translate between them. Also, there are multiple codes for the same glitch class in the human file, so some lines have repeats. I marked a preferred code in case we ever want to move from the ML data to the code in Gravity Spy. Note that pandas can't handle an NA in an integer column, which is why Model_number is a float.

In [377]:
glitch_classes = pd.read_csv('glitch-classes.csv')

In [378]:
glitch_classes

Unnamed: 0,MLID,Name,HCID,Preferred,Model_number
0,1,45Mhz_Light_Modulation,45MHZLGHTMDLTN,1,6.0
1,1,45Mhz_Light_Modulation,LGHTMDLTN,0,6.0
2,2,50_Hz,50HZ,1,17.0
3,2,50_Hz,RCMPRSSR50HZ,0,17.0
4,3,Blip,BLP,1,2.0
5,4,Blob,,1,
6,5,Chirp,CHRP,1,
7,6,Extremely_Loud,XTRMLLD,1,10.0
8,7,Helix,HLX,1,14.0
9,8,Koi_Fish,KFSH,1,4.0


Example use: make the mean confidences more readable

In [379]:
mean_scores = ML_data.loc[:,['label','confidence']].groupby('label').agg([np.mean,len])
mean_scores['MLID'] = mean_scores.index
mean_scores = pd.merge(mean_scores, glitch_classes[glitch_classes.Preferred==1], on='MLID')
mean_scores

Unnamed: 0,MLID,"(confidence, mean)","(confidence, len)","(MLID, )",Name,HCID,Preferred,Model_number
0,1,0.46423,39.0,1,45Mhz_Light_Modulation,45MHZLGHTMDLTN,1,6.0
1,2,0.355556,14.0,2,50_Hz,50HZ,1,17.0
2,3,0.659265,11098.0,3,Blip,BLP,1,2.0
3,6,0.873134,2630.0,6,Extremely_Loud,XTRMLLD,1,10.0
4,7,0.317086,689.0,7,Helix,HLX,1,14.0
5,8,0.413845,75.0,8,Koi_Fish,KFSH,1,4.0
6,9,0.508804,58.0,9,Low_Frequency_Burst,LWFRQNCBRST,1,7.0
7,10,0.294796,25.0,10,Low_Frequency_Lines,LWFRQNCLN,1,3.0
8,10,0.294796,25.0,10,Low_Frequency_Noise_Fluctuations,LWFRQNCLN,1,3.0
9,11,0.571973,33206.0,11,No_Glitch,DNTSGLTCH,1,16.0


## Add ML labels to classification data
Merge the human data with the translation to the ML coding system and drop most of the columns. I chose to keep user_id instead of user_name in an attempt to make the data more private. It occurs to me that we should filter out internal users who've been debugging or doing demos, but I was assured that they all were serious. Still, the learning parameters are probably different. There are 51224 classifications in total but 27 without a subject_ID due to the problem mentioned above. 

In [380]:
human_data_cols = ['user_id', 'subject_ID', 'created_at', 'annotation', 'workflow_id', 'workflow_version']
human_data = pd.merge(classification_data.loc[classification_data.annotation!='', human_data_cols], \
                      glitch_classes, left_on='annotation', right_on='HCID')
human_data.drop(['annotation','HCID','Preferred'], inplace=True, axis=1)

In [388]:
human_data.head()

Unnamed: 0,user_id,subject_ID,created_at,workflow_id,workflow_version,MLID,Name,Model_number
0,336603.0,HTlgwFjWZn,2016-02-02 16:10:39,1479,44.6,3,Blip,2.0
1,336603.0,6rkFgYSBKl,2016-02-02 16:21:35,1479,44.6,3,Blip,2.0
2,336603.0,IbPP81GsPI,2016-02-02 16:26:48,1479,44.6,3,Blip,2.0
3,386563.0,,2016-03-04 17:13:56,1479,77.36,3,Blip,2.0
4,2156.0,,2016-03-16 22:11:40,1479,186.49,3,Blip,2.0


In [382]:
len(human_data)

53524

## Save data for reuse

In [389]:
classifications_store = pd.HDFStore('160626 data.h5')
# classifications_store['classification_data'] = classification_data
del classifications_store['human_data']
classifications_store['human_data'] = human_data
classifications_store.close()