Skip to content

Commit

Permalink
Transitioned promotion to new database, it is astronomically faster t…
Browse files Browse the repository at this point in the history
…he way it is coded now and now that it is on the new database
  • Loading branch information
scottcoughlin2014 committed Apr 3, 2020
1 parent b430a20 commit 99e473c
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 47 deletions.
11 changes: 7 additions & 4 deletions bin/parse_classifications
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ answers_dict_level4['SCATTERED'] = 16
answers_dict_level4['KOI FISH'] = 7
answers_dict_level4['VIOLIN MODE HARMONIC'] = 19
answers_dict_level4['POWER LINE (60 HZ)'] = 14
answers_dict_level4['BLP'] = answers_dict_level4['BLIP']
answers_dict_level4['WHSTL'] = answers_dict_level4['WHISTLE']
answers_dict_level4['NNFTHBV'] = answers_dict_level4['NONEOFTHEABOVE']


answers_virgo = gspyproject.get_answers(workflow=7501)
Expand All @@ -108,10 +111,10 @@ level_workflow_dict = dict(enumerate(workflow_order))

# Load last_id that was parsed
#last_id = "16822410"
engine = create_engine('postgresql://{0}:{1}@gravityspy.ciera.northwestern.edu'
':5432/gravityspy'.format(os.environ['GRAVITYSPY_DATABASE_USER'],
os.environ['GRAVITYSPY_DATABASE_PASSWD']))
if os.path.isfile(args.id_file):
engine = create_engine('postgresql://{0}:{1}@gravityspyplus.ciera.northwestern.edu'
':5432/gravityspy'.format(os.environ['GRAVITYSPYPLUS_DATABASE_USER'],
os.environ['GRAVITYSPYPLUS_DATABASE_PASSWD']))
if args.id_file is not None and os.path.isfile(args.id_file):
last_id = pd.read_csv('{0}'.format(args.id_file)).iloc[0].iloc[0]
elif args.last_id is None:
last_id = pd.read_sql("select max(id) from classificationsdev",engine).iloc[0].iloc[0]
Expand Down
4 changes: 2 additions & 2 deletions bin/promote_users
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ level_workflow_dict[2117] = 7
# Obtain what level a user should be on based on the highest level
# they have done a classification in before
init_user_levels = EventTable.fetch('gravityspy', 'classificationsdev GROUP BY links_user, links_workflow',
columns = ['links_user', 'links_workflow']).to_pandas()
columns = ['links_user', 'links_workflow'], host='gravityspyplus.ciera.northwestern.edu').to_pandas()
init_user_levels['init_level'] = init_user_levels['links_workflow'].apply(lambda x: level_workflow_dict[x])
init_levels = init_user_levels.groupby('links_user').max().init_level
user_status_init = pandas.DataFrame({'userID' : init_levels.index.tolist(),
Expand All @@ -60,7 +60,7 @@ user_confusion_matrices = gspyproject.calculate_confusion_matrices()

user_levels = gspyproject.determine_level()

engine = create_engine('postgresql://{0}:{1}@gravityspy.ciera.northwestern.edu:5432/gravityspy'.format(os.environ['GRAVITYSPY_DATABASE_USER'],os.environ['GRAVITYSPY_DATABASE_PASSWD']))
engine = create_engine('postgresql://{0}:{1}@gravityspyplus.ciera.northwestern.edu:5432/gravityspy'.format(os.environ['GRAVITYSPY_DATABASE_USER'],os.environ['GRAVITYSPY_DATABASE_PASSWD']))
query = 'WITH foo as (SELECT id, links_user, links_workflow FROM classificationsdev WHERE links_workflow IN (2360, 7766)) SELECT links_user, count(id) FROM foo GROUP BY links_user HAVING count(id) > 25'
virgo_promotion = pandas.read_sql(query, engine)
# Special virgo promotion
Expand Down
77 changes: 36 additions & 41 deletions gravityspy/api/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
from panoptes_client import SubjectSet, Project, Workflow
from scipy.sparse import coo_matrix
from gwpy.table import EventTable
from sqlalchemy.engine import create_engine

import re, pickle
import pandas as pd
import os
import pandas
import numpy as np

__all__ = ['ZooProject', 'flatten', 'GravitySpyProject',
Expand Down Expand Up @@ -305,60 +307,53 @@ def calculate_confusion_matrices(self):
# Ignore NONEOFTHEABOVE classifications when constructing the confusion
# matrix
# Make sure the subject classified was a golden image
query = 'classificationsdev WHERE \"annotations_value_choiceINT\" != \
-1 AND \"links_user\" != 0 AND \
\"annotations_value_choiceINT\" != 12 AND \
CAST(links_subjects AS FLOAT) IN \
(SELECT \"links_subjects\" FROM goldenimages)'

columns = ['id', 'links_user', 'links_subjects', 'links_workflow',
'annotations_value_choiceINT']
classifications = EventTable.fetch('gravityspy', query,
columns = columns)

classifications = classifications.to_pandas()
classifications = classifications.sort_values('id')
golden_images = EventTable.fetch('gravityspy', 'goldenimages')
golden_images_df = golden_images.to_pandas()
query = ("SELECT classificationsdev.id, classificationsdev.links_user, "
"classificationsdev.links_subjects, classificationsdev.links_workflow, "
"classificationsdev.\"annotations_value_choiceINT\", goldenimages.goldlabel "
"FROM classificationsdev INNER JOIN goldenimages ON classificationsdev.links_subjects = goldenimages.links_subjects "
"WHERE classificationsdev.\"annotations_value_choiceINT\" != -1 AND "
"classificationsdev.\"annotations_value_choiceINT\" != 12 AND "
"classificationsdev.links_user != 0")

# From answers Dict determine number of classes
numClasses = len(self.get_answers(workflow=7766).values()[0])
engine = create_engine(
'postgresql://{0}:{1}@gravityspyplus.ciera.northwestern.edu:5432/gravityspy'.format(
os.environ['GRAVITYSPYPLUS_DATABASE_USER'],
os.environ['GRAVITYSPYPLUS_DATABASE_PASSWD']))

# merge the golden image DF with the classifications (this merge is on
# links_subjects, i.e. the zooID of the image classified)
image_and_classification = classifications.merge(golden_images_df,
on=['links_subjects'])
classifications = pandas.read_sql(query, engine)
classifications = classifications.sort_values('id')

# This is where the power of pandas comes in...on the fly in very quick
# order we can fill all users' confusion matrices
# by a smartly chosen groupby
test = image_and_classification.groupby(['links_user',
'annotations_value_choiceINT',
'GoldLabel'])
test = classifications.groupby(['links_user',
'annotations_value_choiceINT',
'goldlabel'])
test = test.count().links_subjects.to_frame().reset_index()

# From answers Dict determine number of classes
numClasses = len(self.get_answers(workflow=7766).values()[0])

# Create "Sparse Matrices" and perform a normalization task on them.
# Afterwards determine if the user's diagonal
# is above the threshold set above
confusion_matrices = pd.DataFrame()
conf_dict = {'userID' : [], 'conf_matrix' : [], 'alpha' : []}
for iUser in test.groupby('links_user'):
columns = iUser[1].annotations_value_choiceINT
rows = iUser[1]['GoldLabel']
rows = iUser[1]['goldlabel']
entry = iUser[1]['links_subjects']
tmp = coo_matrix((entry, (rows,columns)), shape=(numClasses,
numClasses))
conf_divided, a1, a2, a3 = \
np.linalg.lstsq(np.diagflat(tmp.sum(axis=1)),
tmp.todense())

conf_dict = {'userID' : iUser[0], 'conf_matrix' : [conf_divided],
'alpha' : [np.diag(conf_divided)]}

confusion_matrices = \
confusion_matrices.append(pd.DataFrame(
conf_dict,
index=[iUser[0]]))
conf_dict['userID'].append(iUser[0])
conf_dict['conf_matrix'].append([conf_divided])
conf_dict['alpha'].append([np.diag(conf_divided)])

confusion_matrices = pandas.DataFrame(conf_dict).set_index('userID', drop=False)
self.confusion_matrices = confusion_matrices
return confusion_matrices

Expand Down Expand Up @@ -387,11 +382,11 @@ def calculate_confusion_matrices_per_classification(self):
columns = ['id', 'links_user', 'links_subjects', 'links_workflow',
'annotations_value_choiceINT']
classifications = EventTable.fetch('gravityspy', query,
columns = columns)
columns = columns, host='gravityspyplus.ciera.northwestern.edu')

classifications = classifications.to_pandas()
classifications = classifications.sort_values('id')
golden_images = EventTable.fetch('gravityspy', 'goldenimages')
golden_images = EventTable.fetch('gravityspy', 'goldenimages', host='gravityspyplus.ciera.northwestern.edu')
golden_images_df = golden_images.to_pandas()

# From answers Dict determine number of classes
Expand All @@ -403,7 +398,7 @@ def calculate_confusion_matrices_per_classification(self):
on=['links_subjects'])

# groupby users to get their gold classifications
tmp = image_and_classification.groupby('links_user')[['annotations_value_choiceINT','GoldLabel', 'id']]
tmp = image_and_classification.groupby('links_user')[['annotations_value_choiceINT','goldlabel', 'id']]
user_confusion_matrices = {}
for key, item in tmp:
user_confusion_matrices[key] = {}
Expand Down Expand Up @@ -441,7 +436,7 @@ def determine_level(self, alpha=None):
promotion_Level4 = set([answers_dict[answer] for answer in answers['7765'].keys() if answer not in ['NONEOFTHEABOVE']])
promotion_Level5 = set([answers_dict[answer] for answer in answers['7766'].keys() if answer not in ['NONEOFTHEABOVE']])


level_dict = dict(enumerate(self.workflow_order))
workflow_level_dict = dict((v, k + 1) for k, v in
level_dict.items())
Expand All @@ -463,7 +458,7 @@ def determine_level(self, alpha=None):
for (iuser, ialpha) in zip(self.confusion_matrices.userID,
self.confusion_matrices.alpha):

proficiencyidx = set(np.where(ialpha > alpha)[0])
proficiencyidx = set(np.where(ialpha > alpha)[1])
# determine whether a user is proficient at >= number
# of answers on a level. If yes, then check the next level
# until < at which point you know which level the user
Expand All @@ -485,7 +480,7 @@ def determine_level(self, alpha=None):
level.append([curr_workflow, curr_level, iuser])

columns = ['curr_workflow', 'curr_level', 'userID']
return pd.DataFrame(level, columns = columns)
return pandas.DataFrame(level, columns = columns)


def check_level_by_classification(self):
Expand All @@ -497,7 +492,7 @@ def check_level_by_classification(self):

query = 'classificationsdev GROUP BY links_user, links_workflow'
userlevels = EventTable.fetch('gravityspy', query,
columns = ['links_user', 'links_workflow'])
columns = ['links_user', 'links_workflow'], host='gravityspyplus.ciera.northwestern.edu')

userlevels = userlevels.to_pandas()
userlevels['Level'] = userlevels.links_workflow.apply(
Expand All @@ -508,6 +503,6 @@ def check_level_by_classification(self):
init_user_levels_dict = {'userID' : init_user_levels.index.tolist(),
'workflowInit' : init_user_levels.tolist()}

userStatusInit = pd.DataFrame(init_user_levels_dict)
userStatusInit = pandas.DataFrame(init_user_levels_dict)
self.userStatusInit = userStatusInit
return userStatusInit

0 comments on commit 99e473c

Please sign in to comment.