From 99e473caed596374684fa8c5a55ef3b200ca2c95 Mon Sep 17 00:00:00 2001
From: scottcoughlin2014
Date: Fri, 3 Apr 2020 13:31:14 -0700
Subject: [PATCH] Transition promotion to the new database; the reworked queries and the new database make it astronomically faster

---
 bin/parse_classifications | 11 ++++--
 bin/promote_users         |  4 +-
 gravityspy/api/project.py | 77 ++++++++++++++++++---------------------
 3 files changed, 45 insertions(+), 47 deletions(-)

diff --git a/bin/parse_classifications b/bin/parse_classifications
index 3988318..d424335 100755
--- a/bin/parse_classifications
+++ b/bin/parse_classifications
@@ -92,6 +92,9 @@ answers_dict_level4['SCATTERED'] = 16
 answers_dict_level4['KOI FISH'] = 7
 answers_dict_level4['VIOLIN MODE HARMONIC'] = 19
 answers_dict_level4['POWER LINE (60 HZ)'] = 14
+answers_dict_level4['BLP'] = answers_dict_level4['BLIP']
+answers_dict_level4['WHSTL'] = answers_dict_level4['WHISTLE']
+answers_dict_level4['NNFTHBV'] = answers_dict_level4['NONEOFTHEABOVE']
 
 answers_virgo = gspyproject.get_answers(workflow=7501)
 
@@ -108,10 +111,10 @@ level_workflow_dict = dict(enumerate(workflow_order))
 # Load last_id that was parsed
 #last_id = "16822410"
 
-engine = create_engine('postgresql://{0}:{1}@gravityspy.ciera.northwestern.edu'
-                       ':5432/gravityspy'.format(os.environ['GRAVITYSPY_DATABASE_USER'],
-                                                 os.environ['GRAVITYSPY_DATABASE_PASSWD']))
-if os.path.isfile(args.id_file):
+engine = create_engine('postgresql://{0}:{1}@gravityspyplus.ciera.northwestern.edu'
+                       ':5432/gravityspy'.format(os.environ['GRAVITYSPYPLUS_DATABASE_USER'],
+                                                 os.environ['GRAVITYSPYPLUS_DATABASE_PASSWD']))
+if args.id_file is not None and os.path.isfile(args.id_file):
     last_id = pd.read_csv('{0}'.format(args.id_file)).iloc[0].iloc[0]
 elif args.last_id is None:
     last_id = pd.read_sql("select max(id) from classificationsdev",engine).iloc[0].iloc[0]
diff --git a/bin/promote_users b/bin/promote_users
index 65d322f..ec326a9 100755
--- a/bin/promote_users
+++ b/bin/promote_users
@@ -47,7 +47,7 @@ level_workflow_dict[2117] = 7
 # Obtain what level a user should be on based ont he highest level
 # they have done a classificaiton in before
 init_user_levels = EventTable.fetch('gravityspy', 'classificationsdev GROUP BY links_user, links_workflow',
-                                    columns = ['links_user', 'links_workflow']).to_pandas()
+                                    columns = ['links_user', 'links_workflow'], host='gravityspyplus.ciera.northwestern.edu').to_pandas()
 init_user_levels['init_level'] = init_user_levels['links_workflow'].apply(lambda x: level_workflow_dict[x])
 init_levels = init_user_levels.groupby('links_user').max().init_level
 user_status_init = pandas.DataFrame({'userID' : init_levels.index.tolist(),
@@ -60,7 +60,7 @@ user_confusion_matrices = gspyproject.calculate_confusion_matrices()
 
 user_levels = gspyproject.determine_level()
 
-engine = create_engine('postgresql://{0}:{1}@gravityspy.ciera.northwestern.edu:5432/gravityspy'.format(os.environ['GRAVITYSPY_DATABASE_USER'],os.environ['GRAVITYSPY_DATABASE_PASSWD']))
+engine = create_engine('postgresql://{0}:{1}@gravityspyplus.ciera.northwestern.edu:5432/gravityspy'.format(os.environ['GRAVITYSPY_DATABASE_USER'],os.environ['GRAVITYSPY_DATABASE_PASSWD']))
 query = 'WITH foo as (SELECT id, links_user, links_workflow FROM classificationsdev WHERE links_workflow IN (2360, 7766)) SELECT links_user, count(id) FROM foo GROUP BY links_user HAVING count(id) > 25'
 virgo_promotion = pandas.read_sql(query, engine)
 # Special virgo promotion
diff --git a/gravityspy/api/project.py b/gravityspy/api/project.py
index 98afc5d..ed1047a 100644
--- a/gravityspy/api/project.py
+++ b/gravityspy/api/project.py
@@ -5,9 +5,11 @@
 from panoptes_client import SubjectSet, Project, Workflow
 from scipy.sparse import coo_matrix
 from gwpy.table import EventTable
+from sqlalchemy.engine import create_engine
 
 import re, pickle
-import pandas as pd
+import os
+import pandas
 import numpy as np
 
 __all__ = ['ZooProject', 'flatten', 'GravitySpyProject',
@@ -305,45 +307,41 @@ def calculate_confusion_matrices(self):
 
         # Ignore NONEOFTHEABOVE classificatios when constructing confusion
         # matrix
         # Make sure to the subject classified was a golden image
-        query = 'classificationsdev WHERE \"annotations_value_choiceINT\" != \
-                -1 AND \"links_user\" != 0 AND \
-                \"annotations_value_choiceINT\" != 12 AND \
-                CAST(links_subjects AS FLOAT) IN \
-                (SELECT \"links_subjects\" FROM goldenimages)'
-
-        columns = ['id', 'links_user', 'links_subjects', 'links_workflow',
-                   'annotations_value_choiceINT']
-        classifications = EventTable.fetch('gravityspy', query,
-                                           columns = columns)
-        classifications = classifications.to_pandas()
-        classifications = classifications.sort_values('id')
-        golden_images = EventTable.fetch('gravityspy', 'goldenimages')
-        golden_images_df = golden_images.to_pandas()
+        query = ("SELECT classificationsdev.id, classificationsdev.links_user, "
+                 "classificationsdev.links_subjects, classificationsdev.links_workflow, "
+                 "classificationsdev.\"annotations_value_choiceINT\", goldenimages.goldlabel "
+                 "FROM classificationsdev INNER JOIN goldenimages ON classificationsdev.links_subjects = goldenimages.links_subjects "
+                 "WHERE classificationsdev.\"annotations_value_choiceINT\" != -1 AND "
+                 "classificationsdev.\"annotations_value_choiceINT\" != 12 AND "
+                 "classificationsdev.links_user != 0")
 
-        # From answers Dict determine number of classes
-        numClasses = len(self.get_answers(workflow=7766).values()[0])
+        engine = create_engine(
+            'postgresql://{0}:{1}@gravityspyplus.ciera.northwestern.edu:5432/gravityspy'.format(
+                os.environ['GRAVITYSPYPLUS_DATABASE_USER'],
+                os.environ['GRAVITYSPYPLUS_DATABASE_PASSWD']))
 
-        # merge the golden image DF with th classification (this merge is on
-        # links_subject (i.e. the zooID of the image classified)
-        image_and_classification = classifications.merge(golden_images_df,
-                                                          on=['links_subjects'])
+        classifications = pandas.read_sql(query, engine)
+        classifications = classifications.sort_values('id')
 
         # This is where the power of pandas comes in...on the fly in very quick
         # order we can fill all users confusion matrices
         # by smartly chosen groupby
-        test = image_and_classification.groupby(['links_user',
-                                                 'annotations_value_choiceINT',
-                                                 'GoldLabel'])
+        test = classifications.groupby(['links_user',
+                                        'annotations_value_choiceINT',
+                                        'goldlabel'])
         test = test.count().links_subjects.to_frame().reset_index()
 
+        # From answers Dict determine number of classes
+        numClasses = len(self.get_answers(workflow=7766).values()[0])
+
         # Create "Sparse Matrices" and perform a normalization task on them.
         # Afterwards determine if the users diagonal
         # is above the threshold set above
-        confusion_matrices = pd.DataFrame()
+        conf_dict = {'userID' : [], 'conf_matrix' : [], 'alpha' : []}
         for iUser in test.groupby('links_user'):
             columns = iUser[1].annotations_value_choiceINT
-            rows = iUser[1]['GoldLabel']
+            rows = iUser[1]['goldlabel']
             entry = iUser[1]['links_subjects']
             tmp = coo_matrix((entry, (rows,columns)),
                              shape=(numClasses, numClasses))
@@ -351,14 +349,11 @@
                 np.linalg.lstsq(np.diagflat(tmp.sum(axis=1)),
                                 tmp.todense())
 
-            conf_dict = {'userID' : iUser[0], 'conf_matrix' : [conf_divided],
-                         'alpha' : [np.diag(conf_divided)]}
-
-            confusion_matrices = \
-                confusion_matrices.append(pd.DataFrame(
-                    conf_dict,
-                    index=[iUser[0]]))
+            conf_dict['userID'].append(iUser[0])
+            conf_dict['conf_matrix'].append([conf_divided])
+            conf_dict['alpha'].append([np.diag(conf_divided)])
 
+        confusion_matrices = pandas.DataFrame(conf_dict).set_index('userID', drop=False)
         self.confusion_matrices = confusion_matrices
         return confusion_matrices
 
@@ -387,11 +382,11 @@ def calculate_confusion_matrices_per_classification(self):
 
         columns = ['id', 'links_user', 'links_subjects', 'links_workflow',
                    'annotations_value_choiceINT']
        classifications = EventTable.fetch('gravityspy', query,
-                                           columns = columns)
+                                           columns = columns, host='gravityspyplus.ciera.northwestern.edu')
         classifications = classifications.to_pandas()
         classifications = classifications.sort_values('id')
-        golden_images = EventTable.fetch('gravityspy', 'goldenimages')
+        golden_images = EventTable.fetch('gravityspy', 'goldenimages', host='gravityspyplus.ciera.northwestern.edu')
         golden_images_df = golden_images.to_pandas()
 
         # From answers Dict determine number of classes
@@ -403,7 +398,7 @@
         image_and_classification = classifications.merge(golden_images_df,
                                                           on=['links_subjects'])
         # groupby users to get there gold classifications
-        tmp = image_and_classification.groupby('links_user')[['annotations_value_choiceINT','GoldLabel', 'id']]
+        tmp = image_and_classification.groupby('links_user')[['annotations_value_choiceINT','goldlabel', 'id']]
         user_confusion_matrices = {}
         for key, item in tmp:
             user_confusion_matrices[key] = {}
@@ -441,7 +436,7 @@ def determine_level(self, alpha=None):
 
         promotion_Level4 = set([answers_dict[answer] for answer in answers['7765'].keys() if answer not in ['NONEOFTHEABOVE']])
         promotion_Level5 = set([answers_dict[answer] for answer in answers['7766'].keys() if answer not in ['NONEOFTHEABOVE']])
-        
+
         level_dict = dict(enumerate(self.workflow_order))
         workflow_level_dict = dict((v, k + 1) for k, v in level_dict.items())
 
@@ -463,7 +458,7 @@
         for (iuser, ialpha) in zip(self.confusion_matrices.userID,
                                    self.confusion_matrices.alpha):
-            proficiencyidx = set(np.where(ialpha > alpha)[0])
+            proficiencyidx = set(np.where(ialpha > alpha)[1])
 
             # determine whether a user is proficient at >= number
             # of answers on a level. If yes, the check next level
             # until < at which point you know which level the user
@@ -485,7 +480,7 @@
                 level.append([curr_workflow, curr_level, iuser])
 
         columns = ['curr_workflow', 'curr_level', 'userID']
-        return pd.DataFrame(level, columns = columns)
 
+        return pandas.DataFrame(level, columns = columns)
 
     def check_level_by_classification(self):
@@ -497,7 +492,7 @@
         query = 'classificationsdev GROUP BY links_user, links_workflow'
 
         userlevels = EventTable.fetch('gravityspy', query,
-                                      columns = ['links_user', 'links_workflow'])
+                                      columns = ['links_user', 'links_workflow'], host='gravityspyplus.ciera.northwestern.edu')
         userlevels = userlevels.to_pandas()
 
         userlevels['Level'] = userlevels.links_workflow.apply(
@@ -508,6 +503,6 @@
         init_user_levels_dict = {'userID' : init_user_levels.index.tolist(),
                                  'workflowInit' : init_user_levels.tolist()}
 
-        userStatusInit = pd.DataFrame(init_user_levels_dict)
+        userStatusInit = pandas.DataFrame(init_user_levels_dict)
         self.userStatusInit = userStatusInit
         return userStatusInit
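Note on the speed-up (illustration only, not part of the patch): in calculate_confusion_matrices the golden-image join now happens inside PostgreSQL via the INNER JOIN query, and per-user results are collected in a plain dict of lists that is turned into a DataFrame once, instead of calling pandas.DataFrame.append inside the loop. A minimal standalone sketch of that accumulation pattern, with made-up toy data and a hypothetical user count:

    import numpy as np
    import pandas

    rng = np.random.default_rng(0)
    conf_dict = {'userID': [], 'conf_matrix': [], 'alpha': []}
    for user_id in range(1000):
        # stand-in for the normalized per-user confusion matrix
        conf_matrix = rng.random((5, 5))
        conf_dict['userID'].append(user_id)
        conf_dict['conf_matrix'].append([conf_matrix])
        conf_dict['alpha'].append([np.diag(conf_matrix)])

    # one DataFrame construction at the end avoids re-copying the frame on
    # every iteration, which is what repeated DataFrame.append would do
    confusion_matrices = pandas.DataFrame(conf_dict).set_index('userID', drop=False)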