# CREATING THE SOAM

In [1]:
# THIS is the approach used in building up a final SOAM data structure using 
# the Soam Class

#Homemade methods and classes (includes astropy tools, not written by me)
import classes_and_methods.soam_class as soam
from classes_and_methods.simbad_alias_search import online_alias_search

from sklearn.feature_extraction.text import CountVectorizer 

import pandas as pd
import re
import json

In [2]:
#My new SOAM
my_soam = soam.Soam() #initiating my SOAM

#STARTING with seed associations scraped from the web (wikipedia lists and the like...)
file_path = "AstroCatelogues.xlsm" #the file containing all the "seed" associations

SOAM Started ---------------- 
SOAM Cleaning Method Test -- 
original test string: "   #][!,@ ^&*NGc224-.99+9abc. ...   "
cleaned test string: "ngc 224 99 9 abc"


## LOADING SEED ASSOCIATIONS INTO THE SOAM

In [3]:
#Start Loading in various Catalogs with some associations
sheet_name = "Messier"
df = pd.read_excel(file_path,sheet_name)
select_cols = ["Messier number","NGC/IC number","Common name"]
listed_values = df[select_cols].values.tolist()
associations = [[item.replace('\xa0', '') for item in sublist if item != '–'] for sublist in listed_values]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

282 names / aliases mapped across 109 objects.


In [4]:
import math
sheet_name = "Caldwell"
df = pd.read_excel(file_path,sheet_name)
select_cols = ["Caldwell number","NGC number"]
listed_values = df[select_cols].values.tolist()
associations = [[item.replace('\xa0', '') for item in sublist if item != '-' ] for sublist in listed_values]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

500 names / aliases mapped across 218 objects.


In [5]:
sheet_name = "Planets_and_DwarfPlanets"
df = pd.read_excel(file_path,sheet_name)
# we need to change "(45) Eugenia I (Petit-Prince)" into ['', '45',' Eugenia I ', 'Petit-Prince', '' ]
associations = list(df["Body"].str.split(r'[\(\)]'))
#now we need to change ['', '45',' Eugenia I ', 'Petit-Prince', '' ] into ['Eugenia I', 'Petit-Prince']
associations = [[word.replace('\xa0', ' ').rstrip().lstrip() for word in sublist if (word != '') and not (word.isdigit())] for sublist in associations]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

693 names / aliases mapped across 411 objects.


In [6]:
sheet_name = "Galactic Wolf Rayet"
df = pd.read_excel(file_path,sheet_name)
df['WR'] = "WR " + (df['WR#'])
select_cols = ["WR","HD","Alias1","Alias2","Alias3"]
listed_values = df[select_cols].astype(str).values.tolist()
#cleaning out the bad characters, removing 'nan's, and stripping out right hand spaces
listed_values = [[word.replace('\xa0', ' ').rstrip() for word in sublist if word != 'nan'] for sublist in listed_values]
#getting rid of the last value (not a real association)
associations = listed_values[:-1]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

2349 names / aliases mapped across 1077 objects.


In [7]:
sheet_name = "Sharpless"
df = pd.read_excel(file_path,sheet_name)

#part 1 of the sharpless associations
c = "Names and designations"
designations_list = df[c].str.split(r',').values.tolist()
associations_1 = [[word.replace('\xa0', '').lstrip() for word in sublist] for sublist in designations_list]    

#part 2 of the sharpless associations
df["Sh2"] = "Sh2 " + df['Sh2 No.'].astype(str)
select_cols = ["Sh2", "A common name"]
associations_2 = df[select_cols].astype(str).values.tolist()

associations = [set(associations_1[i] + associations_2[i]) for i in range(len(associations_1))]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

2479 names / aliases mapped across 1101 objects.


In [8]:
sheet_name = "Gum"
df = pd.read_excel(file_path,sheet_name)

#part 1 of the gum associations
c = "Names & Designations"
designations_list = df[c].str.split(r',').values.tolist()
associations_1 = [[word.replace('\xa0', '').lstrip() for word in sublist] for sublist in designations_list]    

#part 2 of the gum associations
select_cols = ["Gum", "A common name"]
associations_2 = df[select_cols].astype(str).values.tolist()

associations = [set(associations_1[i] + associations_2[i]) for i in range(len(associations_1))]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

2490 names / aliases mapped across 1106 objects.


## LOAD SELECT CATALOG IDS FROM ASTROBIN TITLES 

In [9]:
# Comets and other minor bodies could be added to this alias map if desired...
#Check out https://sbpy.readthedocs.io/en/latest/api/sbpy.data.Names.html#sbpy.data.Names
#from sbpy.data import Names
#... Not really sure how useful it would be

In [10]:
print("--- START LOADING---")
# Data Load (provided by Salvatore from AstroBin site)
df = pd.read_csv("data/astrobin_titles_to_subject_types.csv") #the main starting data frame
print("--- END LOADING---")
print("--- START CLEANING---")
#HOUSE KEEPING -- Casting 'subject_type' and 'solar_system_main_subject' as categories
cats = ['subject_type','solar_system_main_subject']
df[cats] = df[cats].astype('category')

#CLEANING -- remove all null rows
before = df.shape[0] #row count before
df = df.dropna(how='all') #droping only the rows with 'na' across ALL columns
after = df.shape[0] #row count after

print(str(before-after)+" Null Rows Removed")

#CLEANING -- remove all null rows
before = df.shape[0] #row count before
df = df.dropna(subset=['title']) #droping only the rows with 'na' ONLY in title column
df = df[~df['title'].str.isspace()] #droping only the rows with '  ' (white space) as title
df = df[df['title']!=''] #droping only the rows with '' (dead space) as title
after = df.shape[0] #row count after

print(str(before-after)+" Null Title Rows Removed")

#CLEANING the unwanted words out of the string column (use all lower case)
unwanted_words = ['the', 'and', "in", "of"] #there are other words like " on " 
# ...but those may be more indicative of the category they are found in... 
#... (like " on " being a very common word in GEAR)...
#... (or like " at " being a very common word in STAR_TRAILS)

#CLEANING some other language words out
other_unwanted_words = [
    'du', 'la', 'de', 'le'
]
#If you want to add this extra cleaning step for other languages, leave this line
unwanted_words = unwanted_words+other_unwanted_words

# Regular expression pattern to match any unwanted word
pattern = r'\b(?:{})\b'.format('|'.join(unwanted_words))
# Remove unwanted words using the pattern with str.replace() and put cleaned strings their own col
df['cleaned_title'] = df['title'].str.lower().replace(pattern, '', regex=True)
print("---CLEANING DONE---")

--- START LOADING---
--- END LOADING---
--- START CLEANING---
1837 Null Rows Removed
2394 Null Title Rows Removed
---CLEANING DONE---


In [11]:
#make a vectorizer with ngram min/max set of uni and bi ngrams
vectorizer = CountVectorizer(ngram_range = (1,2)) 

ngrams = vectorizer.fit_transform(df.cleaned_title) #vectorize that string column

#collect frequencies of counts from vectorized values
ngram_freq = pd.DataFrame(ngrams.sum(axis=0), columns=vectorizer.get_feature_names_out())

all_ngrams = ngram_freq.T.sort_values(by=0, ascending=False) #return all n ngrams

 

In [12]:
#EXPLORATION -- review cleaned title data for most popular catalog IDs

## -- first, a dict to help recognize catlog IDs
catalog_tags = {
    'messier':['messier','m'],
    'ic':['ic','index'],
    'ngc':['ngc','new general'],
    'hd':['hd','henry draper'],
    'caldwell':['caldwell','c'],
    'gum':['gum'],
    'sh2':['sharpless','sh2'],
    'rcw':['rcw']
}

## -- then, lets merge all tags into a check list
checklist = [x for every_x in catalog_tags.values() for x in every_x]

In [13]:
## -- then, we understand the following and apply it to a regex search of each 1 and 2 ngram
#  -- -- each tag should be followed by a series of number characters [0-9]
#  -- -- each series of number characters of an objects ID will end with a non-numeric character.                 
#  -- -- this will not capture multiple IDs that were 'cleverly' written like:
#        "Messier 24,16,17 and m23" (our regex will only catch "Messier 24" and "m23")

#HELPER METHOD -- regex for catalog patterns in titles, default values are only for testing purposes
def id_search(tags = ['messier','m'], s = "This string 'messier 24,16, 17 and m23, m-05' will not return 16 nor 17"):      
    #ASSUMES tags and s are cleaned accordingly (cap sensitive)
    # Create a regular expression pattern with capturing groups for the keyword and numeric characters
    pattern = r'(' + '|'.join(re.escape(keyword) for keyword in tags) + r')\D*(\d+)'
    # Use re.findall to find all matching substrings
    matches = re.findall(pattern, s)
    # return the matched keyword and numeric characters
    return([' '.join(match) for match in matches])

In [14]:
#WORK -- pull IDs from title data
title_ids = [id_search(tags = checklist, s = x) for x in df.cleaned_title] #collect id_search results, includes empty '[]' and nested '[A, B]'      
title_ids = [x for every_x in title_ids for x in every_x] #remove empties  and un-nest values, still contains duplicates
title_ids = set(title_ids) #removes duplicates and makes title ids a set
#title_ids #we now have a set of IDS listed from titles data (some ids are still nonsence, like 'm 20190716')    

In [15]:
#TITLE ASSISTED SELECTIONS
ngc_selections = [x for x in title_ids if 'ngc' in x]+[x for x in title_ids if 'new' in x]
ic_selections = [x for x in title_ids if 'ic' in x]+[x for x in title_ids if 'index' in x]
hd_selections = [x for x in title_ids if 'hd' in x]+[x for x in title_ids if 'draper' in x]
#... these selections still need to be cleaned", but the rules of these cleanings may differ between the catalog selection types 
#simple cleaning -- filtering out the values that are obviously too long to be real ids
ngc_selections = [x for x in ngc_selections if len(x)<=8] 
ic_selections = [x for x in ic_selections if len(x)<=7]
hd_selections = [x for x in hd_selections if len(x)<=9]

In [16]:
astrobin_select_ids = ngc_selections+ic_selections+hd_selections

## DO NOT UNCOMMENT unless you need to... will run for hours... --------
#search_return = online_alias_search(astrobin_select_ids)
#bulk_associations = [x["found_associations"] for x in search_return["associations"]]
## ---------------------------------------------------------------------

#INSTEAD...
# laod a json file that I already took the time to pull from 
file_name = 'data/ngc_ic_hd_associations_data.json'
# Read the JSON file and convert it back to a list of lists
bulk_associations = []
with open(file_name, 'r') as file:
    bulk_associations = json.load(file)

In [17]:
cleaned_bulk = []

#clean out certain two-character leads in associations
bad_leads = [
    "M ", #we already have Messier IDs in the SOAM from above steps
    "Z " #not a common ID
]
for association in bulk_associations:
    cleaned_bulk.append([name for name in association if not name[:2] in bad_leads])  

In [18]:
#remove all ids that are just numerical characters
cleaned_bulk = [[item for item in inner_list if not item.isnumeric()] for inner_list in cleaned_bulk]

In [19]:
#remove all ids that are shorter than 3 characters long
cleaned_bulk = [[item for item in inner_list if len(item)>=3] for inner_list in cleaned_bulk]

In [20]:
#remove all ids that start with a numeric character
cleaned_bulk = [[item for item in inner_list if not item[0].isnumeric()] for inner_list in cleaned_bulk]

In [21]:
# Remove the excessivly long "Gaia " ids
cleaned_bulk = [[item for item in inner_list if item[:5] != 'Gaia '] for inner_list in cleaned_bulk]

In [22]:
#drop the "HIDDEN " tag from any id that has this...
cleaned_bulk = [[item.replace('HIDDEN ', '') for item in inner_list] for inner_list in cleaned_bulk]

In [23]:
cleaned_bulk

[['LEDA 69401', 'MAPS-PP O-778-826778', 'NGC 7345', 'UGC 12130'],
 ['C 0746-211', 'NGC 2455', 'MWSC 1350'],
 ['GLEAM J202507-244839',
  'HIPASS J2025-24',
  'SINGG HIPASS J2025-24',
  'AGC 33399',
  'PMN J2025-2448',
  'NVSS J202507-244832',
  'ESO 528-3',
  'ESO-LV 528-0030',
  'IRAS 20221-2458',
  'IRAS F20221-2458',
  'LEDA 64650',
  'MCG-04-48-006',
  'NGC 6907',
  'UGCA 418',
  'PSCz Q20221-2458'],
 ['Ark 582', 'GIN 675', 'LEDA 71011', 'NGC 7598', 'ACO 2572 B'],
 ['Ka 111', 'LEDA 59344', 'NGC 6285', 'PRC C-51', 'APG 293A', 'PN VV 527'],
 ['LEDA 53933', 'NGC 5866', 'UGC 9723'],
 ['TIC 150971494', 'UCAC2 39602310', 'UCAC3 225-1775', 'NGC 91'],
 ['Ark 1', 'LEDA 565', 'NGC 3', 'UGC 58'],
 ['CGMW 5-6149', 'LEDA 62178', 'NGC 6674', 'TC 868', 'UGC 11308'],
 ['LH 80', 'NGC 2028', 'OGLE-CL LMC 594', 'ESO 56-152'],
 ['NGC 1917', 'OGLE-CL LMC 343', 'ESO 56-100'],
 ['LEDA 1187',
  'MAPS-PP O-1257-202235C',
  'NGC 68',
  'UGC 170',
  'VV 166',
  'VV 166b'],
 ['KPG 529b', 'LEDA 61404', 'NGC 654

## ADDING BULK ASSOCIATIONS TO THE SOAM

In [24]:
#Adding the bulk of NGC, IC, and HD along with all of SIMBADs associated ids to my SOAM
# ... This will take a minute...
print(my_soam)
my_soam.add_associations(cleaned_bulk)
print(my_soam)
#uncomment only if you really want to redo your cleaned export
#my_soam.export_soam(file_name="soam_cleaned_bulk_export")

print(my_soam)
my_soam.add_associations(bulk_associations)
print(my_soam)
my_soam.export_soam(file_name="soam_bulk_export")

2490 names / aliases mapped across 1106 objects.
25631 names / aliases mapped across 4264 objects.
25631 names / aliases mapped across 4264 objects.
32172 names / aliases mapped across 4243 objects.


### COMPARING my_soam AND cleaned_soam

In [25]:
cleaned_soam = soam.Soam()
cleaned_soam.import_soam(file_name="soam_cleaned_bulk_export")
print(cleaned_soam)
#print(my_soam)

SOAM Started ---------------- 
SOAM Cleaning Method Test -- 
original test string: "   #][!,@ ^&*NGc224-.99+9abc. ...   "
cleaned test string: "ngc 224 99 9 abc"
25631 names / aliases mapped across 4264 objects.


In [26]:
#my_soam.get_aliases("horseshoe nebula")
#cleaned_soam.get_aliases("horseshoe nebula")
#online_alias_search(["m 17"])

In [27]:
#for name in cleaned_soam.all_names():
#    key1 = int(my_soam.all_names()[name])
#    key2 = int(cleaned_soam.all_names()[name])
#    if key1 != key2:
#        print(f'{name} -- my_soam: {key1} -- cleaned_soam: {key2}')
#        #print(my_soam.all_aliases()[key1])

In [28]:
for name in my_soam.all_names():
    try:
        key1 = int(my_soam.all_names()[name])
        key2 = int(cleaned_soam.all_names()[name])
    except:
        key1 = int(my_soam.all_names()[name])
        key2 = -1
    if key1 != key2:
        print(f'{name} -- my_soam: {key1} -- cleaned_soam: {key2}')

3 cr 144 -- my_soam: 0 -- cleaned_soam: -1
2 c 481 -- my_soam: 0 -- cleaned_soam: -1
3 c 144 -- my_soam: 0 -- cleaned_soam: -1
2 e 1309 -- my_soam: 0 -- cleaned_soam: -1
theia 874 -- my_soam: 5 -- cleaned_soam: 37
m 43 -- my_soam: 5 -- cleaned_soam: 42
starfish cluster -- my_soam: 5 -- cleaned_soam: 37
mwsc 0557 -- my_soam: 5 -- cleaned_soam: 37
mairan's nebula -- my_soam: 5 -- cleaned_soam: 42
m 38 -- my_soam: 5 -- cleaned_soam: 37
ngc 1982 -- my_soam: 5 -- cleaned_soam: 42
ngc 1912 -- my_soam: 5 -- cleaned_soam: 37
rafgl 779 -- my_soam: 5 -- cleaned_soam: 42
de mairan's nebula -- my_soam: 5 -- cleaned_soam: 42
m 548 -- my_soam: 17 -- cleaned_soam: -1
m 215 -- my_soam: 19 -- cleaned_soam: -1
m 208 -- my_soam: 19 -- cleaned_soam: -1
m 248 -- my_soam: 20 -- cleaned_soam: -1
m 662 -- my_soam: 24 -- cleaned_soam: -1
m 667 -- my_soam: 24 -- cleaned_soam: -1
gaia dr 2 1827256624493300096 -- my_soam: 26 -- cleaned_soam: -1
2 e 4308 -- my_soam: 26 -- cleaned_soam: -1
gaia dr 3 182725662449330

gaia dr 2 3669118238019164288 -- my_soam: 2757 -- cleaned_soam: -1
gaia dr 3 3669118238019403008 -- my_soam: 2757 -- cleaned_soam: -1
z 387 63 -- my_soam: 2758 -- cleaned_soam: -1
gaia dr 3 2494894966934528512 -- my_soam: 2758 -- cleaned_soam: -1
2 masx j 02155850 0042486 -- my_soam: 2758 -- cleaned_soam: -1
gaia dr 2 2494894966934406144 -- my_soam: 2758 -- cleaned_soam: -1
gaia dr 3 2630285839480983680 -- my_soam: 2759 -- cleaned_soam: -1
2 masx j 23190480 0829065 -- my_soam: 2759 -- cleaned_soam: -1
gaia dr 3 6823992787487206016 -- my_soam: 2760 -- cleaned_soam: -1
2 masx j 22022161 1854596 -- my_soam: 2760 -- cleaned_soam: -1
2 dfgrs tgs 813 z 199 -- my_soam: 2760 -- cleaned_soam: -1
m 472 -- my_soam: 2762 -- cleaned_soam: -1
gaia dr 3 1462528864998742784 -- my_soam: 2762 -- cleaned_soam: -1
z 161 43 -- my_soam: 2762 -- cleaned_soam: -1
2 mfgc 10735 -- my_soam: 2762 -- cleaned_soam: -1
z 47 22 -- my_soam: 2764 -- cleaned_soam: -1
z 99 113 -- my_soam: 2765 -- cleaned_soam: -1
gaia dr

# A Word of Warning...

In [29]:
#Unfortunately there are some pockets where the associations fed into the SOAM were incorrect:
#EXAMPLE:
my_soam.get_aliases("m38")
#returns
#  {'butterfly cluster',
#   ...
#   'm 38',
#   'm 43',
#   'm 6',
#   "mairan's nebula",
#   ...
#   'starfish cluster',
#   ...}

# we know that those are three distinct objects...

#HOW DOES THIS HAPPEN?
# -- These types of "over-agressive" association between distinctly different
#    objects appears to stem from SIMBAD
# -- Recall there was a SOAM building step where SIMBAD was used to 
#    query all associated ids and names given either an ngc, ic, or hd catalog id
# -- It appears that SIMBAD has some bad data here...

#EXAMPLE:
#an online simbad search (as performed through my old method)
#online_alias_search(["NGC 6405"])
#returns
#'associations': [{'searched': 'NGC 6405',
#   'found_associations': [
#     ...
#    'M 6',
#    'NGC 6405',
#    'M 38', #this should not be here
#    'M 43', #this should not be here
#     ...
#    'Butterfly Cluster']}]

{'butterfly cluster',
 'c 1736 321',
 'cl vdbh 242',
 "de mairan's nebula",
 'gc 128',
 'gc 130',
 'gc 137',
 'm 38',
 'm 43',
 'm 6',
 "mairan's nebula",
 'mis 588',
 'mis 589',
 'mis 607',
 'mwsc 0557',
 'mwsc 2661',
 'ngc 1912',
 'ngc 1982',
 'ngc 6405',
 'rafgl 779',
 'starfish cluster',
 'theia 122',
 'theia 874'}

In [30]:
#I fixed the above issue with possible mis-associated Messier objects by 
#  by cleaning the JSON file from the initial SIMBAD query
cleaned_bulk = []
for association in bulk_associations:
    cleaned_bulk.append([name for name in association if not name.startswith("M ")])
    
#... But there could be other issues similar to the Messier object error... 

# Instead of re-running the SIMBAD query, I chose to filter out all "M " words from the 
# JSON file instead (as these were already collected in the Seed portion of building the SOAM)

## Continues Cleaning of the SOAM

In [31]:
print(cleaned_soam)

#pulling names that contain "and" or "or" (we will likely need to address the conjunction "or")
[x for x in cleaned_soam.all_names() if (len(x.split()) > 3) and (("or" in x) or ("and" in x))]

conjuncted_names = ['omega swan horseshoe lobster or checkmark nebula',
 'beehive cluster or praesepe',
 'pleiades seven sisters or subaru',
 'cetus a or squid galaxy',
 'virgo a or smoking gun galaxy',
 "crocodile eye or cat's eye galaxy"]

[x.split(" or ") for x in conjuncted_names]

associations = split_names = [
    ['omega swan horseshoe lobster or checkmark nebula','omega nebula','swan nebula','horseshoe nebula','lobster nebula', 'checkmark nebula'],
    [ 'beehive cluster or praesepe','beehive cluster', 'praesepe'],
    ['pleiades seven sisters or subaru','pleiades','seven sisters', 'subaru'],
    ['cetus a or squid galaxy', 'cetus a galaxy', 'squid galaxy'],
    ['virgo a or smoking gun galaxy', 'virgo a galaxy', 'smoking gun galaxy'],
    [ "crocodile eye or cat's eye galaxy",'crocodile eye galaxy', "cat's eye galaxy"]]

cleaned_soam.add_associations(associations)
print(cleaned_soam)

#uncomment only if you really want to redo your cleaned export
#cleaned_soam.export_soam(file_name="soam_cleaned_bulk_export") 

25631 names / aliases mapped across 4264 objects.
25642 names / aliases mapped across 4264 objects.


# TESTING BLOCKS

In [35]:
cleaned_soam.get_aliases("m 1")

{'ajg 1',
 'crab',
 'crab neb',
 'crab nebula',
 'cta 36',
 'ctb 18',
 'da 179',
 'db 38',
 'lbn 833',
 'm 1',
 'messier 1',
 'ngc 1952',
 'nrao 214',
 'nrl 2',
 'sh 2 244',
 'tau a',
 'taurus a',
 'w 9',
 'x tau x 1',
 'x tau xr 1'}