Transitioned promotion to the new database; it is astronomically faster the way it is coded now and now that it is on the new database
Gravity-Spy · Apr 3, 2020 · 99e473c · 99e473c
1 parent b430a20
commit 99e473c
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 47 deletions.
diff --git a/bin/parse_classifications b/bin/parse_classifications
@@ -92,6 +92,9 @@ answers_dict_level4['SCATTERED'] = 16
 answers_dict_level4['KOI FISH'] = 7
 answers_dict_level4['VIOLIN MODE HARMONIC'] = 19
 answers_dict_level4['POWER LINE (60 HZ)'] = 14
+answers_dict_level4['BLP'] = answers_dict_level4['BLIP']
+answers_dict_level4['WHSTL'] = answers_dict_level4['WHISTLE']
+answers_dict_level4['NNFTHBV'] = answers_dict_level4['NONEOFTHEABOVE']
 
 
 answers_virgo = gspyproject.get_answers(workflow=7501)
@@ -108,10 +111,10 @@ level_workflow_dict = dict(enumerate(workflow_order))
 
 # Load last_id that was parsed
 #last_id = "16822410"
-engine = create_engine('postgresql://{0}:{1}@gravityspy.ciera.northwestern.edu'
-                       ':5432/gravityspy'.format(os.environ['GRAVITYSPY_DATABASE_USER'],
-                                                 os.environ['GRAVITYSPY_DATABASE_PASSWD']))
-if os.path.isfile(args.id_file):
+engine = create_engine('postgresql://{0}:{1}@gravityspyplus.ciera.northwestern.edu'
+                       ':5432/gravityspy'.format(os.environ['GRAVITYSPYPLUS_DATABASE_USER'],
+                                                 os.environ['GRAVITYSPYPLUS_DATABASE_PASSWD']))
+if args.id_file is not None and os.path.isfile(args.id_file):
     last_id = pd.read_csv('{0}'.format(args.id_file)).iloc[0].iloc[0]
 elif args.last_id is None:
     last_id = pd.read_sql("select max(id) from classificationsdev",engine).iloc[0].iloc[0]

diff --git a/bin/promote_users b/bin/promote_users
@@ -47,7 +47,7 @@ level_workflow_dict[2117] = 7
 # Obtain what level a user should be on based on the highest level
 # they have done a classification in before
 init_user_levels = EventTable.fetch('gravityspy', 'classificationsdev GROUP BY links_user, links_workflow',
-                                     columns = ['links_user', 'links_workflow']).to_pandas()
+                                     columns = ['links_user', 'links_workflow'], host='gravityspyplus.ciera.northwestern.edu').to_pandas()
 init_user_levels['init_level'] = init_user_levels['links_workflow'].apply(lambda x: level_workflow_dict[x])
 init_levels = init_user_levels.groupby('links_user').max().init_level
 user_status_init = pandas.DataFrame({'userID' : init_levels.index.tolist(),
@@ -60,7 +60,7 @@ user_confusion_matrices = gspyproject.calculate_confusion_matrices()
 
 user_levels = gspyproject.determine_level()
 
-engine = create_engine('postgresql://{0}:{1}@gravityspy.ciera.northwestern.edu:5432/gravityspy'.format(os.environ['GRAVITYSPY_DATABASE_USER'],os.environ['GRAVITYSPY_DATABASE_PASSWD']))
+engine = create_engine('postgresql://{0}:{1}@gravityspyplus.ciera.northwestern.edu:5432/gravityspy'.format(os.environ['GRAVITYSPY_DATABASE_USER'],os.environ['GRAVITYSPY_DATABASE_PASSWD']))
 query = 'WITH foo as (SELECT id, links_user, links_workflow FROM classificationsdev WHERE links_workflow IN (2360, 7766)) SELECT links_user, count(id) FROM foo GROUP BY links_user HAVING count(id) > 25'
 virgo_promotion = pandas.read_sql(query, engine)
 # Special virgo promotion

diff --git a/gravityspy/api/project.py b/gravityspy/api/project.py
@@ -5,9 +5,11 @@
 from panoptes_client import SubjectSet, Project, Workflow
 from scipy.sparse import coo_matrix
 from gwpy.table import EventTable
+from sqlalchemy.engine import create_engine
 
 import re, pickle
-import pandas as pd
+import os
+import pandas
 import numpy as np
 
 __all__ = ['ZooProject', 'flatten', 'GravitySpyProject',
@@ -305,60 +307,53 @@ def calculate_confusion_matrices(self):
         # Ignore NONEOFTHEABOVE classifications when constructing confusion
         # matrix
         # Make sure the subject classified was a golden image
-        query = 'classificationsdev WHERE \"annotations_value_choiceINT\" != \
-            -1 AND \"links_user\" != 0 AND \
-            \"annotations_value_choiceINT\" != 12 AND \
-            CAST(links_subjects AS FLOAT) IN \
-            (SELECT \"links_subjects\" FROM goldenimages)'
-
-        columns = ['id', 'links_user', 'links_subjects', 'links_workflow',
-                   'annotations_value_choiceINT']
-        classifications = EventTable.fetch('gravityspy', query,
-                                           columns = columns)
 
-        classifications = classifications.to_pandas()
-        classifications = classifications.sort_values('id')
-        golden_images = EventTable.fetch('gravityspy', 'goldenimages')
-        golden_images_df = golden_images.to_pandas()
+        query = ("SELECT classificationsdev.id, classificationsdev.links_user, "
+                "classificationsdev.links_subjects, classificationsdev.links_workflow, "
+                "classificationsdev.\"annotations_value_choiceINT\", goldenimages.goldlabel "
+                "FROM classificationsdev INNER JOIN goldenimages ON classificationsdev.links_subjects = goldenimages.links_subjects "
+                "WHERE classificationsdev.\"annotations_value_choiceINT\" != -1 AND "
+                "classificationsdev.\"annotations_value_choiceINT\" != 12 AND "
+                "classificationsdev.links_user != 0")
 
-        # From answers Dict determine number of classes
-        numClasses = len(self.get_answers(workflow=7766).values()[0])
+        engine = create_engine(
+                               'postgresql://{0}:{1}@gravityspyplus.ciera.northwestern.edu:5432/gravityspy'.format(
+                                                os.environ['GRAVITYSPYPLUS_DATABASE_USER'],
+                                                os.environ['GRAVITYSPYPLUS_DATABASE_PASSWD']))
 
-        # merge the golden image DF with th classification (this merge is on
-        # links_subject (i.e. the zooID of the image classified)
-        image_and_classification = classifications.merge(golden_images_df,
-                                                         on=['links_subjects'])
+        classifications = pandas.read_sql(query, engine)
+        classifications = classifications.sort_values('id')
 
         # This is where the power of pandas comes in...on the fly in very quick
         # order we can fill all users confusion matrices
         # by smartly chosen groupby
-        test = image_and_classification.groupby(['links_user',
-                                                 'annotations_value_choiceINT',
-                                                 'GoldLabel'])
+        test = classifications.groupby(['links_user',
+                                         'annotations_value_choiceINT',
+                                         'goldlabel'])
         test = test.count().links_subjects.to_frame().reset_index()
 
+        # From answers Dict determine number of classes
+        numClasses = len(self.get_answers(workflow=7766).values()[0])
+
         # Create "Sparse Matrices" and perform a normalization task on them.
         # Afterwards determine if the users diagonal
         # is above the threshold set above
-        confusion_matrices = pd.DataFrame()
+        conf_dict = {'userID' : [], 'conf_matrix' : [], 'alpha' : []}
         for iUser in test.groupby('links_user'):
             columns = iUser[1].annotations_value_choiceINT
-            rows = iUser[1]['GoldLabel']
+            rows = iUser[1]['goldlabel']
             entry = iUser[1]['links_subjects']
             tmp = coo_matrix((entry, (rows,columns)), shape=(numClasses,
                                                              numClasses))
             conf_divided, a1, a2, a3 = \
                 np.linalg.lstsq(np.diagflat(tmp.sum(axis=1)),
                                             tmp.todense())
 
-            conf_dict = {'userID' : iUser[0], 'conf_matrix' : [conf_divided],
-                  'alpha' : [np.diag(conf_divided)]}
-
-            confusion_matrices = \
-                confusion_matrices.append(pd.DataFrame(
-                                                       conf_dict,
-                                                       index=[iUser[0]]))
+            conf_dict['userID'].append(iUser[0])
+            conf_dict['conf_matrix'].append([conf_divided])
+            conf_dict['alpha'].append([np.diag(conf_divided)])
 
+        confusion_matrices = pandas.DataFrame(conf_dict).set_index('userID', drop=False)
         self.confusion_matrices = confusion_matrices
         return confusion_matrices
 
@@ -387,11 +382,11 @@ def calculate_confusion_matrices_per_classification(self):
         columns = ['id', 'links_user', 'links_subjects', 'links_workflow',
                    'annotations_value_choiceINT']
         classifications = EventTable.fetch('gravityspy', query,
-                                           columns = columns)
+                                           columns = columns, host='gravityspyplus.ciera.northwestern.edu')
 
         classifications = classifications.to_pandas()
         classifications = classifications.sort_values('id')
-        golden_images = EventTable.fetch('gravityspy', 'goldenimages')
+        golden_images = EventTable.fetch('gravityspy', 'goldenimages', host='gravityspyplus.ciera.northwestern.edu')
         golden_images_df = golden_images.to_pandas()
 
         # From answers Dict determine number of classes
@@ -403,7 +398,7 @@ def calculate_confusion_matrices_per_classification(self):
                                                          on=['links_subjects'])
 
         # groupby users to get their gold classifications
-        tmp = image_and_classification.groupby('links_user')[['annotations_value_choiceINT','GoldLabel', 'id']]
+        tmp = image_and_classification.groupby('links_user')[['annotations_value_choiceINT','goldlabel', 'id']]
         user_confusion_matrices = {}
         for key, item in tmp:
             user_confusion_matrices[key] = {}
@@ -441,7 +436,7 @@ def determine_level(self, alpha=None):
         promotion_Level4 = set([answers_dict[answer] for answer in answers['7765'].keys() if answer not in ['NONEOFTHEABOVE']])
         promotion_Level5 = set([answers_dict[answer] for answer in answers['7766'].keys() if answer not in ['NONEOFTHEABOVE']])
 
-        
+
         level_dict = dict(enumerate(self.workflow_order))
         workflow_level_dict = dict((v, k + 1) for k, v in
                                    level_dict.items())
@@ -463,7 +458,7 @@ def determine_level(self, alpha=None):
         for (iuser, ialpha) in zip(self.confusion_matrices.userID,
                                    self.confusion_matrices.alpha):
 
-            proficiencyidx = set(np.where(ialpha > alpha)[0])
+            proficiencyidx = set(np.where(ialpha > alpha)[1])
             # determine whether a user is proficient at >= number
             # of answers on a level. If yes, then check next level
             # until < at which point you know which level the user
@@ -485,7 +480,7 @@ def determine_level(self, alpha=None):
             level.append([curr_workflow, curr_level, iuser])
 
         columns = ['curr_workflow', 'curr_level', 'userID']
-        return pd.DataFrame(level, columns = columns)
+        return pandas.DataFrame(level, columns = columns)
 
 
     def check_level_by_classification(self):
@@ -497,7 +492,7 @@ def check_level_by_classification(self):
 
         query = 'classificationsdev GROUP BY links_user, links_workflow'
         userlevels = EventTable.fetch('gravityspy', query,
-                         columns = ['links_user', 'links_workflow'])
+                         columns = ['links_user', 'links_workflow'], host='gravityspyplus.ciera.northwestern.edu')
 
         userlevels = userlevels.to_pandas()
         userlevels['Level'] = userlevels.links_workflow.apply(
@@ -508,6 +503,6 @@ def check_level_by_classification(self):
         init_user_levels_dict = {'userID' : init_user_levels.index.tolist(),
                                 'workflowInit' : init_user_levels.tolist()}
 
-        userStatusInit = pd.DataFrame(init_user_levels_dict)
+        userStatusInit = pandas.DataFrame(init_user_levels_dict)
         self.userStatusInit = userStatusInit
         return userStatusInit