In [28]:
import random

# Names of celestial objects used as NER training entities: named stars first,
# then planets (and Pluto) at the end of the list.
# Every name should appear exactly once; a repeated name would be
# over-represented in the generated training data.
stars = [
    "Sirius", "Canopus", "Rigil Kentaurus", "Arcturus", "Vega", "Capella", "Rigel",
    "Procyon", "Achernar", "Betelgeuse", "Hadar", "Altair", "Aldebaran", "Spica",
    "Antares", "Pollux", "Fomalhaut", "Deneb", "Mimosa", "Regulus", "Adhara",
    "Shaula", "Castor", "Gacrux", "Bellatrix", "Alnilam", "Alnair", "Alnitak",
    "Eta Carinae", "Alphard", "Algol", "Dubhe", "Wezen", "Sargas", "Alhena",
    "Kaus Australis", "Peacock", "Atria", "Markab", "Alcor", "Menkalinan", "Mirfak",
    "Gienah", "Acrux", "Alcyone", "Pherkad", "Maia", "Menkent", "Merak", "Dschubba",
    # NOTE(review): "Arctans" does not look like a recognised star name — confirm spelling.
    "Zeta Reticuli", "Izar", "Zubenelgenubi", "Denebola", "Mintaka", "Caph", "Arctans",
    "Enif", "Ruchbah", "Shedar", "Scheat", "Sualocin", "Tarazed", "Ankaa", "Nunki",
    "Mirzam", "Saiph", "Hamal", "Acamar", "Phecda", "Baten Kaitos", "Almach", "Kochab",
    "Navi", "Unukalhai", "Sabik", "Tiaki", "Furud", "Aldhibah", "Mira", "Castula",
    # "Maia" was previously repeated on this row; it is already listed above.
    "Tureis", "Merope", "Sterope", "Atlas", "Pleione", "Electra", "Celaeno",
    "Taygeta", "Alpheratz", "Rasalhague", "Rastaban", "Muphrid", "Kornephoros",
    "Sadalmelik", "Venus", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune", "Pluto"
]

# Sentence frames for the synthetic training data, one frame per line.
# The "{}" in each frame marks where a celestial object's name is inserted.
sentence_templates = """
Point the telescope at {}.
Show me {} in the sky.
Where is {} located?
Find the position of {}.
What are the coordinates of {}?
{} is one of the brightest stars.
Can you locate {}?
Is {} visible tonight?
We need to find {}.
The star {} is quite famous.
Astronomers often observe {}.
""".strip().splitlines()

# Function to generate training data with lowercase augmentation
def generate_training_data(star_list, template_list, rng=None):
    """Build NER training examples by inserting names into sentence templates.

    For every name in ``star_list`` two examples are produced, in this order:
    one with the name as given and one with its lowercase form. Each example
    uses a template drawn at random from ``template_list``.

    Args:
        star_list: Iterable of entity names (strings).
        template_list: Non-empty sequence of template strings, each containing
            a single ``{}`` placeholder where the name is inserted.
        rng: Optional ``random.Random`` instance used to pick templates, for
            reproducible output. When omitted, the module-level ``random``
            functions are used.

    Returns:
        A list of ``(sentence, {"entities": [(start, end, "CELESTIAL_OBJECT")]})``
        tuples, where ``start``/``end`` are the character offsets of the
        inserted name within ``sentence``.

    Raises:
        ValueError: If a chosen template has no ``{}`` placeholder, or the
            inserted name does not end up at the placeholder position.
        IndexError: If ``template_list`` is empty.
    """
    chooser = random if rng is None else rng
    train_data = []
    for star in star_list:
        # Create both the original-case and lowercase versions of the sentences
        for name in (star, star.lower()):
            template = chooser.choice(template_list)
            prefix, placeholder, _ = template.partition("{}")
            if not placeholder:
                raise ValueError(f"Template has no '{{}}' placeholder: {template!r}")
            sentence = template.format(name)
            # Take the offset from the placeholder position rather than
            # searching the finished sentence: a search returns the FIRST match,
            # which is wrong when the name also occurs in the template's own
            # text ahead of the placeholder.
            start_idx = len(prefix)
            end_idx = start_idx + len(name)
            if sentence[start_idx:end_idx] != name:
                raise ValueError(
                    f"Could not locate {name!r} at the placeholder of template {template!r}"
                )
            train_data.append(
                (sentence, {"entities": [(start_idx, end_idx, "CELESTIAL_OBJECT")]})
            )
    return train_data

# Seed Python's RNG so the randomly chosen templates, and therefore the
# generated training set, are the same on every Restart & Run All.
random.seed(42)

# Generate the training data with lowercase augmentation
train_data = generate_training_data(stars, sentence_templates)

# Check the first 5 examples
for entry in train_data[:5]:
    print(entry)


('Is Sirius visible tonight?', {'entities': [(3, 9, 'CELESTIAL_OBJECT')]})
('Find the position of sirius.', {'entities': [(21, 27, 'CELESTIAL_OBJECT')]})
('What are the coordinates of Canopus?', {'entities': [(28, 35, 'CELESTIAL_OBJECT')]})
('Astronomers often observe canopus.', {'entities': [(26, 33, 'CELESTIAL_OBJECT')]})
('Show me Rigil Kentaurus in the sky.', {'entities': [(8, 23, 'CELESTIAL_OBJECT')]})


In [29]:
import pickle

# Serialize train_data to "train_data.pkl" in the current working directory
with open("train_data.pkl", mode="wb") as pickle_file:
    pickle.dump(train_data, pickle_file)


In [30]:
import spacy
from spacy.training import Example

# Load the "en_core_web_sm" English pipeline as the starting point
nlp = spacy.load(name="en_core_web_sm")

# Fetch the pipeline's "ner" component so a new label can be added to it
ner = nlp.get_pipe(name="ner")


In [31]:
# Register the custom entity label on the NER component. This is the cell's
# last expression, so its return value is what the cell displays.
ner.add_label("CELESTIAL_OBJECT")


1

In [32]:
# Components listed here are excluded from `unaffected_pipes`
pipe_exceptions = ["ner"]
# Names of every other component in the pipeline, in pipeline order
unaffected_pipes = [
    name for name in nlp.pipe_names if name not in pipe_exceptions
]


In [33]:
# Convert each (text, annotations) pair into a spaCy Example, pairing a Doc
# created with nlp.make_doc with its entity annotations.
examples = [
    Example.from_dict(nlp.make_doc(sentence), gold)
    for sentence, gold in train_data
]


In [34]:
import random
from spacy.util import minibatch, compounding

# Training hyperparameters
N_ITERATIONS = 20  # number of passes over the full example set
DROPOUT = 0.5      # dropout rate passed to nlp.update

# Start training. select_pipes(disable=...) is the current spelling of the
# deprecated disable_pipes(...); used as a context manager it re-enables the
# listed components when the block exits.
with nlp.select_pipes(disable=unaffected_pipes):  # Only train NER
    optimizer = nlp.resume_training()
    for itn in range(N_ITERATIONS):
        random.shuffle(examples)
        losses = {}
        batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Hand the optimizer from resume_training() to update() explicitly
            # instead of leaving it as an unused local.
            nlp.update(batch, sgd=optimizer, drop=DROPOUT, losses=losses)
        print(f"Iteration {itn}, Losses: {losses}")


Iteration 0, Losses: {'ner': np.float32(315.90884)}
Iteration 1, Losses: {'ner': np.float32(113.98766)}
Iteration 2, Losses: {'ner': np.float32(57.23144)}
Iteration 3, Losses: {'ner': np.float32(40.25379)}
Iteration 4, Losses: {'ner': np.float32(28.831171)}
Iteration 5, Losses: {'ner': np.float32(31.551624)}
Iteration 6, Losses: {'ner': np.float32(22.636852)}
Iteration 7, Losses: {'ner': np.float32(13.427785)}
Iteration 8, Losses: {'ner': np.float32(6.9583025)}
Iteration 9, Losses: {'ner': np.float32(3.2288153)}
Iteration 10, Losses: {'ner': np.float32(8.250778)}
Iteration 11, Losses: {'ner': np.float32(0.01729602)}
Iteration 12, Losses: {'ner': np.float32(4.8855643)}
Iteration 13, Losses: {'ner': np.float32(0.76181114)}
Iteration 14, Losses: {'ner': np.float32(1.6894152)}
Iteration 15, Losses: {'ner': np.float32(0.06056088)}
Iteration 16, Losses: {'ner': np.float32(0.8324091)}
Iteration 17, Losses: {'ner': np.float32(0.09921158)}
Iteration 18, Losses: {'ner': np.float32(6.382583)}
Ite

In [35]:
# Run the updated pipeline on one sentence and list every entity it finds
doc = nlp("Betelgeuse is a red supergiant star.")
for entity in doc.ents:
    print(entity.text, entity.label_)


Betelgeuse CELESTIAL_OBJECT


In [36]:
# Directory the pipeline is written to and then read back from
model_dir = "custom_celestial_ner"

# Save the model
nlp.to_disk(model_dir)

# Load the saved model later
nlp_custom = spacy.load(model_dir)
