# Classification Methods Notebook
*Zach Jacobson -- started 10/12/2023*

## IMPORTS

In [1]:
# THIS approach (standardizing space object names found in titles) was 
# not incorporated into the final workflow, as it does not appear to improve results...

#Homemade methods and classes (includes use of astropy tools, not written by me)
import classes_and_methods.soam_class as soam
from classes_and_methods.simbad_alias_search import online_alias_search

#Libraries for data manipulation
import pandas as pd
import re
import json

#... for modeling
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

## DATA LOADING and CLEANING

In [2]:
print("---START LOAD---")
# Data load (provided by Salvatore from the AstroBin site) -- the main starting data frame
df = pd.read_csv("data/astrobin_titles_to_subject_types.csv")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()
print("---LOAD COMPLETE---")

print("---START CLEANING---")
# HOUSE KEEPING -- cast 'subject_type' and 'solar_system_main_subject' to categories
cats = ['subject_type', 'solar_system_main_subject']
df[cats] = df[cats].astype('category')

# CLEANING (1/3) -- drop rows that are null across ALL columns
before = df.shape[0]  # row count before
df = df.dropna(how='all')
after = df.shape[0]  # row count after
print(f"{before - after} Null Rows Removed")

# CLEANING (2/3) -- drop rows without a usable title:
# null titles, empty titles ('') and whitespace-only titles ('   ')
before = df.shape[0]  # row count before
df = df.dropna(subset=['title'])
# strip() == '' covers both the empty and the whitespace-only case in one mask
blank_title = df['title'].str.strip() == ''
df = df[~blank_title]
after = df.shape[0]  # row count after
print(f"{before - after} Null Title Rows Removed")

# CLEANING (3/3) -- drop rows with a null subject_type (the target label)
before = df.shape[0]  # row count before
df = df.dropna(subset=['subject_type'])
after = df.shape[0]  # row count after
print(f"{before - after} Null subject_type Rows Removed")

# CLEANING the unwanted words out of the title column (use all lower case).
# Other short words such as " on " or " at " are kept on purpose: they may be
# indicative of the category they are found in (" on " is very common in GEAR,
# " at " is very common in STAR_TRAILS).
unwanted_words = ['the', 'and', "in", "of"]

# CLEANING some other-language words out (... limited, could be better...)
other_unwanted_words = [
    'du', 'la', 'de', 'le'
]
# If you want to add this extra cleaning step for other languages, leave this line
unwanted_words = unwanted_words + other_unwanted_words

# Regular expression matching any unwanted word as a whole word.
# re.escape keeps the pattern valid if a word containing regex metacharacters is ever added.
pattern = r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in unwanted_words))
# Lower-case the titles, blank out the unwanted words (regex substitution via
# Series.replace) and store the result in its own column. The spaces that
# surrounded a removed word are left in place.
df['cleaned_title'] = df['title'].str.lower().replace(pattern, '', regex=True)

df.info()
print("---CLEANING DONE---")

---START LOAD---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600268 entries, 0 to 600267
Data columns (total 3 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   title                      596041 non-null  object
 1   subject_type               593711 non-null  object
 2   solar_system_main_subject  120306 non-null  object
dtypes: object(3)
memory usage: 13.7+ MB
None
---LOAD COMPLETE---
---START CLEANING---
1837 Null Rows Removed
2394 Null Title Rows Removed
4720 Null subject_type Rows Removed
<class 'pandas.core.frame.DataFrame'>
Int64Index: 591317 entries, 1 to 600267
Data columns (total 4 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   title                      591317 non-null  object  
 1   subject_type               591317 non-null  category
 2   solar_system_main_subject  120306 non-null  category
 3   cleaned_title

In [16]:
#TESTING BLOCK
import math


In [17]:
#TESTING BLOCK
# Number of whitespace-separated words in every cleaned title. Computed once and
# reused below instead of re-running the same per-row apply three times.
word_counts = df['cleaned_title'].apply(lambda x: len(str(x).split()))
max_title = word_counts.max()
mean_title = word_counts.mean()

# Original titles whose cleaned word count equals the mean rounded up
text = df[word_counts == math.ceil(mean_title)].title

print(list(text))

['Sun and the giant spot 3363', 'Nebulosa del Águila M16', 'NGC 6729 Corona Australis', 'Western Veil Blowing in the Winds', 'The Western Veil Torn Asunder', 'Solar Prom III 7.8.23', 'The spooky Rigel region in 135mm', 'M56 Globular Cluster in Lyra', 'The Crescent Nebula - 2023', "Barnard 87 and the parrot's head", 'The War and Peace Nebula in LRGB+HOO', 'NGC 45 in the constellation of Cetus', 'IC5070 Pelican Nebula 2', 'NGC 7318, Stefans Quintett', 'Six Panels of the Virgo Supercluster', 'NGC 7023 Iris Nebula', 'NGC7380: The Wizard Nebula in SHO', 'Wizard Nebula - HaRGB', "NGC6979 - Pickering's Triangle", 'M27 - Dumbbell Nebula', 'M92 Globular Cluster RGB', 'Orion and Running Man Nebula', 'NGC 6888 The Crescent Nebula', 'M42 - The Orion Nebula', 'The Heart Nebula (IC 1805)', 'IC 5146, the Cocoon Nebula', 'SH2 155 with Stella', 'Partial Disk and Proms of 6/4/2023', 'SH2 162 Bubble Nebula', 'M109 - Spiral Galaxy', 'NGC 1499: The California Nebula', 'Melotte 15 in Heart Nebula', 'Orion N

In [18]:
# Display the mean number of whitespace-separated words per cleaned title
mean_title

3.603339663835134

#### Incorporating the Space Object Alias Map (SOAM)
See 'soam_exploration_and_methods.ipynb' notebook for more details on the SOAM
See 'soam_methods.ipynb' notebook for export details of the SOAM imported below

In [2]:
#LOADING the cleaned Space Object Alias Map (SOAM)
# Build an empty Soam object, then populate it from the previously exported map.
# NOTE(review): where import_soam looks for "soam_cleaned_bulk_export" (directory,
# file extension) is decided inside soam_class.py -- confirm there.
cleaned_soam = soam.Soam()
cleaned_soam.import_soam(file_name="soam_cleaned_bulk_export")
# Printing the object reports how many names / aliases are mapped across how many objects
print(cleaned_soam)

SOAM Started ---------------- 
SOAM Cleaning Method Test -- 
original test string: "   #][!,@ ^&*NGc224-.99+9abc. ...   "
cleaned test string: "ngc 224 99 9 abc"
32159 names / aliases mapped across 4245 objects.


In [10]:
#TEST -------------------------------
# Show every alias held in the SOAM. Per the output below this is a dict keyed by
# an integer object id, each value being the set of cleaned alias strings for that object.
cleaned_soam.all_aliases()
# -----------------------------------

{0: {'2 c 481',
  '2 e 1309',
  '3 c 144',
  '3 cr 144',
  'ajg 1',
  'crab',
  'crab neb',
  'crab nebula',
  'cta 36',
  'ctb 18',
  'da 179',
  'db 38',
  'lbn 833',
  'm 1',
  'messier 1',
  'ngc 1952',
  'nrao 214',
  'nrl 2',
  'sh 2 244',
  'tau a',
  'taurus a',
  'w 9',
  'x tau x 1',
  'x tau xr 1'},
 1: {'bd 01 4175',
  'c 2130 010',
  'gcl 121',
  'gcrv 13546',
  'hd 205146',
  'm 2',
  'mwsc 3526',
  'ngc 7089'},
 2: {'gcl 25',
  'gcrv 8101',
  'hd 119333',
  'm 3',
  'mwsc 2152',
  'ngc 5272',
  'ubv m 19645'},
 3: {'c 1620 264',
  'cd 26 11314',
  'gcl 41',
  'gcrv 5569 e',
  'hd 147552',
  'm 4',
  'mwsc 2396',
  'ngc 6121',
  'spider globular'},
 4: {'gcl 34', 'gcrv 5244 e', 'm 5', 'mwsc 2286', 'ngc 5904', 'rose cluster'},
 5: {'butterfly cluster',
  'c 1736 321',
  'cl vdbh 242',
  'gc 128',
  'gc 130',
  'gc 137',
  'm 6',
  'mis 588',
  'mis 589',
  'mis 607',
  'mwsc 2661',
  'ngc 6405',
  'theia 122'},
 6: {'c 1750 348',
  'cl vdbh 254',
  'gc 228',
  'gc 243',
  

In [4]:
# Note: a text cleaning method is stored in the soam module ('soam_class.py').
# It can be used as the standard cleaning method from here on out, or you can use your own.
# What it does (see the code in 'soam_class.py'):
# -- standardizes on lowercase
# -- removes all non-alpha, non-numeric characters except for the apostrophe
#      (maintaining 1 space between segments)
# -- splits all alpha chunks from numeric chunks with 1 space
# -- removes all excess spaces

# Demonstration on a deliberately messy string; `s` is rebound to the cleaned result
s = " #$Cat^&^88 Dog99 M51     Mouse-M31 101 your's-and-mine  "
print(f'Original (uncleaned) string = "{s}"')
s = soam.provided_cleaning_method(s)
print(f'Cleaned string = "{s}"')

Original (uncleaned) string = " #$Cat^&^88 Dog99 M51     Mouse-M31 101 your's-and-mine  "
Cleaned string = "cat 88 dog 99 m 51 mouse m 31 101 your's and mine"


## Standardizing Celestial Object Names Found in Titles
With the help of the SOAM and a few additional helper methods inside the Soam class, we will now standardize the names of celestial (space) objects found in the title data from AstroBin. This creates a flavor of training data that puts a bit more classification weight on the actual object typed into the title (rather than just the words). This way 'm 101', 'ngc 5457', and 'pinwheel galaxy' all get transformed into 'pinwheel galaxy' throughout all titles. Generally speaking, this should be good, as the word 'galaxy' is indicative of the 'DEEP_SKY' category... We will try out and compare various flavors of training data to see what works best here (simply cleaned vs standardized)...

#### Assumptions
I would assume that the standardized training set may actually be too "domesticated": the model will lose out on recognizing certain id conventions that are no longer present in the standardized training set. For example, a 'clever' title such as "2024 on Fire!" may be recognized as DEEP_SKY by a model trained on the simple / non-domesticated data, because NGC 2024 is the Flame Nebula, whereas the standardized / domesticated training data would have replaced every "ngc 2024" pattern with "flame nebula".

In [5]:
# Look up the standard name the SOAM maps the alias "ngc 2024" to
cleaned_soam.get_standard_name("ngc 2024")

'flame nebula'

In [6]:
# DISREGARD --- ADDED TO THE SOAM CLASS
#def generate_all_ngrams(sentence,max_word_count):
#    """
#    Returns all ngrams generated from the given string, up to the max word count provided 
#    (max word count will be equal to the total number of words in the string if no max_word_count is provided)
#    EXAMPLE:
#    generate_all_ngrams("Can you jump?",2)
#    returns:
#    ['Can', 'you', 'jump?', 'Can you', 'you jump?']
#    """
#    words = sentence.split()
#    ngrams = []
#    for n in range(1,max_word_count+1):
#        for i in range(len(words) - n + 1):
#            ngram = " ".join(words[i:i + n])
#            ngrams.append(ngram)
#    return ngrams

In [7]:
# DISREGARD --- ADDED TO THE SOAM CLASS
#def switch_in_standard_names(s,max_word_count,soam):
#    """
#    Takes in a string 's', 'max_word_count', and a SOAM 'soam'
#    generates all ngrams with 's' and 'max_word_count'
#    look through all ngrams and switch in the "standard names" of an name found in both the ngram and in the soam
#    return the string with standard names switched in.
#    EXAMPLE:
#    switch_in_standard_names("Jump to m31",2,cleaned_soam)
#    returns:
#    'Jump to andromeda galaxy'
#    """
#    all_ngrams = generate_all_ngrams(s,max_word_count)
#    standard_names = [cleaned_soam.get_standard_name(ngram) for ngram in all_ngrams]
#    switches = dict(zip(all_ngrams,standard_names))  
#    cleaned_switches = {key:value for key,value in switches.items() if value != None}
#    
#    for key, value in cleaned_switches.items():
#        s = s.replace(key, value)
#    return s    

In [8]:
#EXAMPLE OF translating (switching in) standard names into a sentence containing a space object name
# Per the output below, the result is also lower-cased and stripped of the leading spaces.
cleaned_soam.switch_in_standard_names("   Jump to m31")

'jump to andromeda galaxy'

### MAKING SOME TRAINING DATA SETS

In [9]:
#THIS TAKES A MINUTE TO RUN --- Uncomment only if needed (importing saved df further down)
#training_titles = [soam.provided_cleaning_method(title) for title in df.cleaned_title]
#training_titles_standardized = [cleaned_soam.switch_in_standard_names(t) for t in training_titles]
#df['training_titles'] = training_titles 
#df['training_titles_standardized'] = training_titles_standardized  
#df

In [10]:
#THIS TAKES A MINUTE TO RUN --- Uncomment only if needed (importing saved df further down)
#Store the final df in the data file as "cleaned_titles_df"
#df.to_csv('data/cleaned_titles_df.csv', index=False)

Unnamed: 0,title,subject_type,solar_system_main_subject,cleaned_title,training_titles,training_titles_standardized
1,Zigarren Galaxie,DEEP_SKY,,zigarren galaxie,zigarren galaxie,zigarren galaxie
2,M 16,DEEP_SKY,,m 16,m 16,eagle nebula
3,M71 - Ammasso globulare - Costellazione della ...,DEEP_SKY,,m71 - ammasso globulare - costellazione della ...,m 71 ammasso globulare costellazione della fre...,angelfish cluster ammasso globulare costellazi...
4,NGC2903 A bright Galaxy in Leo,DEEP_SKY,,ngc2903 a bright galaxy leo,ngc 2903 a bright galaxy leo,ngc 2903 a bright galaxy leo
5,Omicron1 Cygni Region,DEEP_SKY,,omicron1 cygni region,omicron 1 cygni region,omicron 1 cygni region
...,...,...,...,...,...,...
600263,"IC 1318, take 2",DEEP_SKY,,"ic 1318, take 2",ic 1318 take 2,gam cyg nebula take 2
600264,M33,DEEP_SKY,,m33,m 33,triangulum pinwheel galaxy
600265,M109 - 100% crop,DEEP_SKY,,m109 - 100% crop,m 109 100 crop,vacuum cleaner galaxy 100 crop
600266,NGC663 and NGC457 in Cassiopeia,DEEP_SKY,,ngc663 ngc457 cassiopeia,ngc 663 ngc 457 cassiopeia,ngc 663 owl cluster cassiopeia


In [21]:
# Reload the prepared frame written by the (commented-out) export cell above.
# This rebinds `df`, replacing the frame built in the cleaning section; per the
# output below the reloaded frame has a fresh 0-based index.
df = pd.read_csv("data/cleaned_titles_df.csv")
df

Unnamed: 0,title,subject_type,solar_system_main_subject,cleaned_title,training_titles,training_titles_standardized
0,Zigarren Galaxie,DEEP_SKY,,zigarren galaxie,zigarren galaxie,zigarren galaxie
1,M 16,DEEP_SKY,,m 16,m 16,eagle nebula
2,M71 - Ammasso globulare - Costellazione della ...,DEEP_SKY,,m71 - ammasso globulare - costellazione della ...,m 71 ammasso globulare costellazione della fre...,angelfish cluster ammasso globulare costellazi...
3,NGC2903 A bright Galaxy in Leo,DEEP_SKY,,ngc2903 a bright galaxy leo,ngc 2903 a bright galaxy leo,ngc 2903 a bright galaxy leo
4,Omicron1 Cygni Region,DEEP_SKY,,omicron1 cygni region,omicron 1 cygni region,omicron 1 cygni region
...,...,...,...,...,...,...
591312,"IC 1318, take 2",DEEP_SKY,,"ic 1318, take 2",ic 1318 take 2,gam cyg nebula take 2
591313,M33,DEEP_SKY,,m33,m 33,triangulum pinwheel galaxy
591314,M109 - 100% crop,DEEP_SKY,,m109 - 100% crop,m 109 100 crop,vacuum cleaner galaxy 100 crop
591315,NGC663 and NGC457 in Cassiopeia,DEEP_SKY,,ngc663 ngc457 cassiopeia,ngc 663 ngc 457 cassiopeia,ngc 663 owl cluster cassiopeia


In [22]:
# Class balance of the target label.
# NOTE(review): the output lists a '600' label, which does not look like a
# subject type -- possibly malformed source rows; confirm before modeling on it.
df['subject_type'].value_counts()

DEEP_SKY              444255
SOLAR_SYSTEM          111233
WIDE_FIELD             20108
OTHER                   8496
GEAR                    3587
STAR_TRAILS             1628
600                     1174
NORTHERN_LIGHTS          674
NOCTILUCENT_CLOUDS       162
Name: subject_type, dtype: int64

## MODELS
A quick refresher on what is meant by what:

Precision: Precision measures the accuracy of the positive predictions made by the model, specifically the ratio of true positive predictions to the total number of positive predictions (both true positives and false positives).

Recall: Recall, also known as sensitivity or true positive rate, measures the model's ability to identify all relevant instances within a dataset, particularly positive instances. It is the ratio of true positive predictions to the total number of actual positive instances (both true positives and false negatives).

In [14]:
# Rough Method Outline -- TF-IDF + Multinomial Naive Bayes on the simply cleaned titles

# Split the data into training and testing sets.
# When the frame is read back from CSV, empty fields are parsed as NaN and a
# value may be parsed as a number; TfidfVectorizer only accepts strings, so the
# text column is normalised to str first (a no-op when it is already all strings).
X = df['training_titles'].fillna('').astype(str)  # Input text
y = df['subject_type']  # Target labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a TfidfVectorizer to convert text data to numerical features.
# It is fitted on the training split only, so no test vocabulary leaks into the model.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Create and train a categorization model (e.g., Multinomial Naive Bayes)
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / f1. Some rare classes are never predicted;
# zero_division=0 reports their precision as 0.0 (the same value as before)
# without emitting an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

# Now, you can use the trained model to categorize new text data.
# New text goes through the same SOAM cleaning method that produced
# 'training_titles' (the unwanted-word removal done earlier is not repeated here).
new_text = ["This is a new text to categorize"]
new_text_cleaned = [soam.provided_cleaning_method(t) for t in new_text]
new_text_tfidf = vectorizer.transform(new_text_cleaned)
predicted_category = model.predict(new_text_tfidf)
print("Predicted category:", predicted_category[0])


Accuracy: 0.9268416424271122


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

               600       0.00      0.00      0.00       220
          DEEP_SKY       0.94      0.98      0.96     88854
              GEAR       0.96      0.08      0.14       710
NOCTILUCENT_CLOUDS       0.00      0.00      0.00        30
   NORTHERN_LIGHTS       1.00      0.03      0.06       135
             OTHER       0.84      0.03      0.06      1715
      SOLAR_SYSTEM       0.89      0.96      0.92     22234
       STAR_TRAILS       0.83      0.02      0.03       332
        WIDE_FIELD       0.65      0.32      0.43      4034

          accuracy                           0.93    118264
         macro avg       0.68      0.27      0.29    118264
      weighted avg       0.92      0.93      0.91    118264

Predicted category: DEEP_SKY


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Rough Method Outline -- TF-IDF + Multinomial Naive Bayes on the SOAM-standardized titles

# Split the data into training and testing sets.
# When the frame is read back from CSV, empty fields are parsed as NaN and a
# value may be parsed as a number; TfidfVectorizer only accepts strings, so the
# text column is normalised to str first (a no-op when it is already all strings).
X = df['training_titles_standardized'].fillna('').astype(str)  # Input text
y = df['subject_type']  # Target labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a TfidfVectorizer to convert text data to numerical features.
# It is fitted on the training split only, so no test vocabulary leaks into the model.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Create and train a categorization model (e.g., Multinomial Naive Bayes)
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / f1. Some rare classes are never predicted;
# zero_division=0 reports their precision as 0.0 (the same value as before)
# without emitting an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

# Now, you can use the trained model to categorize new text data.
# This model was trained on standardized object names, so new text is prepared
# the same way the 'training_titles_standardized' column was built: SOAM cleaning
# followed by switching in the standard names (the unwanted-word removal done
# earlier is not repeated here).
new_text = ["This is a new text to categorize"]
new_text_standardized = [
    cleaned_soam.switch_in_standard_names(soam.provided_cleaning_method(t))
    for t in new_text
]
new_text_tfidf = vectorizer.transform(new_text_standardized)
predicted_category = model.predict(new_text_tfidf)
print("Predicted category:", predicted_category[0])


Accuracy: 0.9289809240343638


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

               600       0.00      0.00      0.00       220
          DEEP_SKY       0.94      0.98      0.96     88854
              GEAR       0.97      0.08      0.15       710
NOCTILUCENT_CLOUDS       0.00      0.00      0.00        30
   NORTHERN_LIGHTS       1.00      0.03      0.06       135
             OTHER       0.83      0.03      0.06      1715
      SOLAR_SYSTEM       0.89      0.96      0.92     22234
       STAR_TRAILS       0.83      0.02      0.03       332
        WIDE_FIELD       0.64      0.32      0.43      4034

          accuracy                           0.93    118264
         macro avg       0.68      0.27      0.29    118264
      weighted avg       0.92      0.93      0.91    118264

Predicted category: DEEP_SKY


  _warn_prf(average, modifier, msg_start, len(result))
