# CREATING THE SOAM
This notebook outlines the approach I used in building up a final SOAM data structure using the Soam Class

In [37]:
#Imports
#Homemade methods and classes (includes astropy tools, not written by me)
import methods_and_classes.soam_class as soam
from methods_and_classes.simbad_alias_search import online_alias_search

from sklearn.feature_extraction.text import CountVectorizer 

import pandas as pd
import re
import json

In [38]:
#My new SOAM
my_soam = soam.Soam() #initiating my SOAM

#STARTING with seed associations scraped from the web (wikipedia lists and the like...)
file_path = ".\data\AstroCatelogues.xlsm" #the file containing all the "seed" associations

SOAM Started ---------------- 
SOAM Cleaning Method Test -- 
original test string: "   #][!,@ ^&*NGc224-.99+9abc. ...   "
cleaned test string: "ngc 224 99 9 abc"


## LOADING SEED ASSOCIATIONS INTO THE SOAM

In [39]:
#Start Loading in various Catalogs with some associations
sheet_name = "Messier"
df = pd.read_excel(file_path,sheet_name)
select_cols = ["Messier number","NGC/IC number","Common name"]
listed_values = df[select_cols].values.tolist()
associations = [[item.replace('\xa0', '') for item in sublist if item != '–'] for sublist in listed_values]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

282 names / aliases mapped across 109 objects.


In [40]:
import math
sheet_name = "Caldwell"
df = pd.read_excel(file_path,sheet_name)
select_cols = ["Caldwell number","NGC number"]
listed_values = df[select_cols].values.tolist()
associations = [[item.replace('\xa0', '') for item in sublist if item != '-' ] for sublist in listed_values]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

500 names / aliases mapped across 218 objects.


In [41]:
sheet_name = "Planets_and_DwarfPlanets"
df = pd.read_excel(file_path,sheet_name)
# we need to change "(45) Eugenia I (Petit-Prince)" into ['', '45',' Eugenia I ', 'Petit-Prince', '' ]
associations = list(df["Body"].str.split(r'[\(\)]'))
#now we need to change ['', '45',' Eugenia I ', 'Petit-Prince', '' ] into ['Eugenia I', 'Petit-Prince']
associations = [[word.replace('\xa0', ' ').rstrip().lstrip() for word in sublist if (word != '') and not (word.isdigit())] for sublist in associations]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

693 names / aliases mapped across 411 objects.


In [42]:
sheet_name = "Galactic Wolf Rayet"
df = pd.read_excel(file_path,sheet_name)
df['WR'] = "WR " + (df['WR#'])
select_cols = ["WR","HD","Alias1","Alias2","Alias3"]
listed_values = df[select_cols].astype(str).values.tolist()
#cleaning out the bad characters, removing 'nan's, and stripping out right hand spaces
listed_values = [[word.replace('\xa0', ' ').rstrip() for word in sublist if word != 'nan'] for sublist in listed_values]
#getting rid of the last value (not a real association)
associations = listed_values[:-1]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

2349 names / aliases mapped across 1077 objects.


In [43]:
sheet_name = "Sharpless"
df = pd.read_excel(file_path,sheet_name)

#part 1 of the sharpless associations
c = "Names and designations"
designations_list = df[c].str.split(r',').values.tolist()
associations_1 = [[word.replace('\xa0', '').lstrip() for word in sublist] for sublist in designations_list]    

#part 2 of the sharpless associations
df["Sh2"] = "Sh2 " + df['Sh2 No.'].astype(str)
select_cols = ["Sh2", "A common name"]
associations_2 = df[select_cols].astype(str).values.tolist()

associations = [set(associations_1[i] + associations_2[i]) for i in range(len(associations_1))]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

2479 names / aliases mapped across 1101 objects.


In [44]:
sheet_name = "Gum"
df = pd.read_excel(file_path,sheet_name)

#part 1 of the gum associations
c = "Names & Designations"
designations_list = df[c].str.split(r',').values.tolist()
associations_1 = [[word.replace('\xa0', '').lstrip() for word in sublist] for sublist in designations_list]    

#part 2 of the gum associations
select_cols = ["Gum", "A common name"]
associations_2 = df[select_cols].astype(str).values.tolist()

associations = [set(associations_1[i] + associations_2[i]) for i in range(len(associations_1))]

#Add the associations to the SOAM
my_soam.add_associations(associations)
print(my_soam)

2490 names / aliases mapped across 1106 objects.


## LOAD SELECT CATALOG IDS FROM ASTROBIN TITLES 

In [45]:
# Comets and other minor bodies could be added to this alias map if desired...
#Check out https://sbpy.readthedocs.io/en/latest/api/sbpy.data.Names.html#sbpy.data.Names
#from sbpy.data import Names
#... Not really sure how useful it would be

In [46]:
print("--- START LOADING---")
# Data Load (provided by Salvatore from AstroBin site)
df = pd.read_csv(".\\data\\astrobin_titles_to_subject_types.csv") #the main starting data frame
print("--- END LOADING---")
print("--- START CLEANING---")
#HOUSE KEEPING -- Casting 'subject_type' and 'solar_system_main_subject' as categories
cats = ['subject_type','solar_system_main_subject']
df[cats] = df[cats].astype('category')

#CLEANING -- remove all null rows
before = df.shape[0] #row count before
df = df.dropna(how='all') #droping only the rows with 'na' across ALL columns
after = df.shape[0] #row count after

print(str(before-after)+" Null Rows Removed")

#CLEANING -- remove all null rows
before = df.shape[0] #row count before
df = df.dropna(subset=['title']) #droping only the rows with 'na' ONLY in title column
df = df[~df['title'].str.isspace()] #droping only the rows with '  ' (white space) as title
df = df[df['title']!=''] #droping only the rows with '' (dead space) as title
after = df.shape[0] #row count after

print(str(before-after)+" Null Title Rows Removed")

#CLEANING the unwanted words out of the string column (use all lower case)
unwanted_words = ['the', 'and', "in", "of"] #there are other words like " on " 
# ...but those may be more indicative of the category they are found in... 
#... (like " on " being a very common word in GEAR)...
#... (or like " at " being a very common word in STAR_TRAILS)

#CLEANING some other language words out
other_unwanted_words = [
    'du', 'la', 'de', 'le'
]
#If you want to add this extra cleaning step for other languages, leave this line
unwanted_words = unwanted_words+other_unwanted_words

# Regular expression pattern to match any unwanted word
pattern = r'\b(?:{})\b'.format('|'.join(unwanted_words))
# Remove unwanted words using the pattern with str.replace() and put cleaned strings their own col
df['cleaned_title'] = df['title'].str.lower().replace(pattern, '', regex=True)
print("---CLEANING DONE---")

--- START LOADING---
--- END LOADING---
--- START CLEANING---
1837 Null Rows Removed
2394 Null Title Rows Removed
---CLEANING DONE---


In [47]:
#make a vectorizer with ngram min/max set of uni and bi ngrams
vectorizer = CountVectorizer(ngram_range = (1,2)) 

ngrams = vectorizer.fit_transform(df.cleaned_title) #vectorize that string column

#collect frequencies of counts from vectorized values
ngram_freq = pd.DataFrame(ngrams.sum(axis=0), columns=vectorizer.get_feature_names_out())

all_ngrams = ngram_freq.T.sort_values(by=0, ascending=False) #return all n ngrams

 

In [48]:
#EXPLORATION -- review cleaned title data for most popular catalog IDs

## -- first, a dict to help recognize catlog IDs
catalog_tags = {
    'messier':['messier','m'],
    'ic':['ic','index'],
    'ngc':['ngc','new general'],
    'hd':['hd','henry draper'],
    'caldwell':['caldwell','c'],
    'gum':['gum'],
    'sh2':['sharpless','sh2'],
    'rcw':['rcw']
}

## -- then, lets merge all tags into a check list
checklist = [x for every_x in catalog_tags.values() for x in every_x]

In [49]:
## -- then, we understand the following and apply it to a regex search of each 1 and 2 ngram
#  -- -- each tag should be followed by a series of number characters [0-9]
#  -- -- each series of number characters of an objects ID will end with a non-numeric character.                 
#  -- -- this will not capture multiple IDs that were 'cleverly' written like:
#        "Messier 24,16,17 and m23" (our regex will only catch "Messier 24" and "m23")

#HELPER METHOD -- regex for catalog patterns in titles, default values are only for testing purposes
def id_search(tags = ['messier','m'], s = "This string 'messier 24,16, 17 and m23, m-05' will not return 16 nor 17"):      
    #ASSUMES tags and s are cleaned accordingly (cap sensitive)
    # Create a regular expression pattern with capturing groups for the keyword and numeric characters
    pattern = r'(' + '|'.join(re.escape(keyword) for keyword in tags) + r')\D*(\d+)'
    # Use re.findall to find all matching substrings
    matches = re.findall(pattern, s)
    # return the matched keyword and numeric characters
    return([' '.join(match) for match in matches])

In [50]:
#WORK -- pull IDs from title data
title_ids = [id_search(tags = checklist, s = x) for x in df.cleaned_title] #collect id_search results, includes empty '[]' and nested '[A, B]'      
title_ids = [x for every_x in title_ids for x in every_x] #remove empties  and un-nest values, still contains duplicates
title_ids = set(title_ids) #removes duplicates and makes title ids a set
#title_ids #we now have a set of IDS listed from titles data (some ids are still nonsence, like 'm 20190716')    

In [51]:
#TITLE ASSISTED SELECTIONS
ngc_selections = [x for x in title_ids if 'ngc' in x]+[x for x in title_ids if 'new' in x]
ic_selections = [x for x in title_ids if 'ic' in x]+[x for x in title_ids if 'index' in x]
hd_selections = [x for x in title_ids if 'hd' in x]+[x for x in title_ids if 'draper' in x]
#... these selections still need to be cleaned", but the rules of these cleanings may differ between the catalog selection types 
#simple cleaning -- filtering out the values that are obviously too long to be real ids
ngc_selections = [x for x in ngc_selections if len(x)<=8] 
ic_selections = [x for x in ic_selections if len(x)<=7]
hd_selections = [x for x in hd_selections if len(x)<=9]

In [52]:
astrobin_select_ids = ngc_selections+ic_selections+hd_selections

## DO NOT UNCOMMENT unless you need to... will run for hours... --------
#search_return = online_alias_search(astrobin_select_ids)
#bulk_associations = [x["found_associations"] for x in search_return["associations"]]
## ---------------------------------------------------------------------

#INSTEAD...
# laod a json file that I already took the time to pull from 
file_name = '.\\data\\ngc_ic_hd_associations_data.json'
# Read the JSON file and convert it back to a list of lists
bulk_associations = []
with open(file_name, 'r') as file:
    bulk_associations = json.load(file)

In [53]:
cleaned_bulk = []

#clean out certain two-character leads in associations
bad_leads = [
    "M ", #we already have Messier IDs in the SOAM from above steps
    "Z " #not a common ID
]
for association in bulk_associations:
    cleaned_bulk.append([name for name in association if not name[:2] in bad_leads])  

In [54]:
#remove all ids that are just numerical characters
cleaned_bulk = [[item for item in inner_list if not item.isnumeric()] for inner_list in cleaned_bulk]

In [55]:
#remove all ids that are shorter than 3 characters long
cleaned_bulk = [[item for item in inner_list if len(item)>=3] for inner_list in cleaned_bulk]

In [56]:
#remove all ids that start with a numeric character
cleaned_bulk = [[item for item in inner_list if not item[0].isnumeric()] for inner_list in cleaned_bulk]

In [57]:
# Remove the excessivly long "Gaia " ids
cleaned_bulk = [[item for item in inner_list if item[:5] != 'Gaia '] for inner_list in cleaned_bulk]

In [58]:
#drop the "HIDDEN " tag from any id that has this...
cleaned_bulk = [[item.replace('HIDDEN ', '') for item in inner_list] for inner_list in cleaned_bulk]

In [59]:
cleaned_bulk[0:10] #example

[['LEDA 69401', 'MAPS-PP O-778-826778', 'NGC 7345', 'UGC 12130'],
 ['C 0746-211', 'NGC 2455', 'MWSC 1350'],
 ['GLEAM J202507-244839',
  'HIPASS J2025-24',
  'SINGG HIPASS J2025-24',
  'AGC 33399',
  'PMN J2025-2448',
  'NVSS J202507-244832',
  'ESO 528-3',
  'ESO-LV 528-0030',
  'IRAS 20221-2458',
  'IRAS F20221-2458',
  'LEDA 64650',
  'MCG-04-48-006',
  'NGC 6907',
  'UGCA 418',
  'PSCz Q20221-2458'],
 ['Ark 582', 'GIN 675', 'LEDA 71011', 'NGC 7598', 'ACO 2572 B'],
 ['Ka 111', 'LEDA 59344', 'NGC 6285', 'PRC C-51', 'APG 293A', 'PN VV 527'],
 ['LEDA 53933', 'NGC 5866', 'UGC 9723'],
 ['TIC 150971494', 'UCAC2 39602310', 'UCAC3 225-1775', 'NGC 91'],
 ['Ark 1', 'LEDA 565', 'NGC 3', 'UGC 58'],
 ['CGMW 5-6149', 'LEDA 62178', 'NGC 6674', 'TC 868', 'UGC 11308'],
 ['LH 80', 'NGC 2028', 'OGLE-CL LMC 594', 'ESO 56-152']]

## ADDING BULK ASSOCIATIONS TO THE SOAM

In [60]:
""" 
As a brief reminder of where we are at, see the below example:

EXAMPLE:
        t = Soam()
        cleaned_bulk = [["a","b","c"],["c","d"],["e","f"] # snippits of associations (not all fully associated yet)
        t.add_associations(cleaned_bulk)
        print(t.all_names())
        print(t.all_aliases())
        print(f"'c' is also known as {t.get_aliases('c')}")
    RETURNS:
        {'d': 0, 'b': 0, 'a': 0, 'c': 0, 'f': 1, 'e': 1}
        {0: {'d', 'b', 'a', 'c'}, 1: {'f', 'e'}}
        'c' is also known as {'d', 'b', 'a', 'c'}
"""
print(my_soam)
my_soam.add_associations(cleaned_bulk)
print(my_soam)
my_soam.export_soam(file_name="soam_cleaned_bulk_export")

2490 names / aliases mapped across 1106 objects.
25631 names / aliases mapped across 4264 objects.


# A Word of Warning...

In [61]:
#Unfortunately there were some pockets where the associations fed into the SOAM were incorrect. I've correct what I could, but other pockets may still be in here, unknown to me:
#EXAMPLE:
print("Fixed Example:")
my_soam.get_aliases("m38")
# used to return:
#  {'butterfly cluster',
#   ...
#   'm 38',
#   'm 43',
#   'm 6',
#   "mairan's nebula",
#   ...
#   'starfish cluster',
#   ...}

# we know that those are three distinct objects, because there were 3 Messier catalogue IDs in the same space object alias bin (M38, M43, and M6) 
# And those ID-ed space objects are not even close in proximaty to eachother in the sky...

#HOW DOES THIS HAPPEN?
# -- These types of "over-agressive" association between distinctly different
#    objects appears to stem from SIMBAD
# -- Recall there was a SOAM building step where SIMBAD was used to 
#    query all associated ids and names given either an ngc, ic, or hd catalog id
# -- It appears that SIMBAD has some bad data here...

#EXAMPLE:
#an online simbad search (as performed through my old method)
#online_alias_search(["NGC 6405"])
#returns
#'associations': [{'searched': 'NGC 6405',
#   'found_associations': [
#     ...
#    'M 6',
#    'NGC 6405',
#    'M 38', #this should not be here
#    'M 43', #this should not be here
#     ...
#    'Butterfly Cluster']}]

Fixed Example:


{'m 38', 'mwsc 0557', 'ngc 1912', 'starfish cluster', 'theia 874'}

In [62]:
#I fixed the above issue of having possible mis-associated Messier objects by 
#  cleaning the JSON file from the initial SIMBAD query
cleaned_bulk = []
for association in bulk_associations:
    cleaned_bulk.append([name for name in association if not name.startswith("M ")])
    
#... But there could be other issues similar to the Messier object error... 

# Instead of re-running the SIMBAD query, I chose to filter out all "M " words from the 
# JSON file instead (as these were already collected in the Seed portion of building the SOAM)

## Other Cleaning Tasks for the SOAM

In [63]:
"""
print(cleaned_soam)

#pulling names that contain "and" or "or" (we will likely need to address the conjunction "or")
[x for x in cleaned_soam.all_names() if (len(x.split()) > 3) and (("or" in x) or ("and" in x))]

conjuncted_names = ['omega swan horseshoe lobster or checkmark nebula',
 'beehive cluster or praesepe',
 'pleiades seven sisters or subaru',
 'cetus a or squid galaxy',
 'virgo a or smoking gun galaxy',
 "crocodile eye or cat's eye galaxy"]

[x.split(" or ") for x in conjuncted_names]

associations = split_names = [
    ['omega swan horseshoe lobster or checkmark nebula','omega nebula','swan nebula','horseshoe nebula','lobster nebula', 'checkmark nebula'],
    [ 'beehive cluster or praesepe','beehive cluster', 'praesepe'],
    ['pleiades seven sisters or subaru','pleiades','seven sisters', 'subaru'],
    ['cetus a or squid galaxy', 'cetus a galaxy', 'squid galaxy'],
    ['virgo a or smoking gun galaxy', 'virgo a galaxy', 'smoking gun galaxy'],
    [ "crocodile eye or cat's eye galaxy",'crocodile eye galaxy', "cat's eye galaxy"]]

cleaned_soam.add_associations(associations)
print(cleaned_soam)

#uncomment only if you really want to redo your cleaned export
#cleaned_soam.export_soam(file_name="soam_cleaned_bulk_export") 
"""
print()


