In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification,AutoModelForCausalLM
from transformers import pipeline
import torch
from spacy.training import Example
import spacy
from collections import Counter
from nltk.util import ngrams

## Fine-tuning data

In [189]:
import random

class AstroNERDataset:
    """Synthetic NER training data built from fixed sentence templates.

    Every example substitutes one lower-cased location and one lower-cased
    astronomical object into a randomly chosen template and records the
    character span each value occupies in the resulting sentence.
    """

    def __init__(self, seed=None):
        """Set up the templates and entity vocabularies.

        Parameters:
        - seed (optional): When given, examples are drawn from a private
          ``random.Random(seed)`` so the generated data is reproducible.
          When omitted, the shared module-level ``random`` generator is used.
        """
        self._rng = random.Random(seed) if seed is not None else random

        self.templates = [
            "Astrophotographers often visit {location} to capture images of {astro_object}.",
            "The {astro_object} appears exceptionally bright when viewed from {location}.",
            "During certain seasons, {astro_object} is best observed from {location}.",
            "Many stargazers prefer {location} for its clear views of {astro_object}.",
            "{location} provides an unobstructed horizon for photographing {astro_object}.",
            "The altitude of {location} makes it ideal for observing {astro_object}.",
            "Photographers set up telescopes at {location} to capture {astro_object}.",
            "The lack of light pollution at {location} enhances views of {astro_object}.",
            "Guided tours at {location} offer night sky observations of {astro_object}.",
            "The {astro_object} was clearly visible above the horizon at {location}.",
            "Astro enthusiasts recommend {location} for its dark skies and view of {astro_object}.",
            "The remote location of {location} makes it perfect for {astro_object} photography.",
            "Star parties at {location} often focus on observing {astro_object}.",
            "The clear atmosphere at {location} provides stunning views of {astro_object}.",
            "From {location}, the {astro_object} can be seen with the naked eye.",
            "The observatory at {location} offers public viewing nights for {astro_object}.",
            "Camping at {location} allows for overnight observation of {astro_object}.",
            "The {astro_object} rises early in the morning as seen from {location}.",
            "Astrophotography workshops at {location} teach how to capture {astro_object}.",
            '{location} is the number 1 spot for {astro_object} viewing.',
        ]

        # Entity values are stored lower-cased; template text keeps its casing.
        self.locations = [
            name.lower()
            for name in (
                "Mauna Kea", "Atacama", "Atacama Desert", "Mauna Loa", "Death Valley",
                "Mont Mégantic", "Pic du Midi", "Lake Tekapo", "Okavango Delta",
                "Himalayas", "Cerro Paranal", "Uluru", "Big Bend National Park",
                "Lick Observatory", "Lowell Observatory", "Meteora in Greece",
                "Arcetri Astrophysical Observatory", "Kitt Peak",
            )
        ]

        self.astro_objects = [
            name.lower()
            for name in (
                "Milky Way", "Andromeda Galaxy", "Orion Nebula", "Pleiades Star Cluster",
                "Saturn's Rings", "Halley's Comet", "Perseid Meteor Shower", "Aurora Borealis",
                "Venus", "Mars", "Supernova Remnant", "Black Hole",
                "Exoplanet Kepler-22b", "Horsehead Nebula", "Crab Nebula", "Betelgeuse",
                "Sirius", "Alpha Centauri",
            )
        ]

    @staticmethod
    def _fill_template(template, values):
        """Render ``template`` and report where each substituted value landed.

        Returns ``(sentence, spans)`` where ``spans`` maps a placeholder name
        to the ``(start, end)`` character offsets of its first substitution.
        Format specs and conversions in the template are ignored.
        """
        from string import Formatter  # local import: only needed here

        pieces = []
        spans = {}
        length = 0  # characters emitted so far
        for literal, field, _spec, _conversion in Formatter().parse(template):
            pieces.append(literal)
            length += len(literal)
            if field is None:
                continue
            value = values[field]
            # Offsets are recorded while building the sentence, so they cannot
            # be confused with an identical substring elsewhere in the text.
            spans.setdefault(field, (length, length + len(value)))
            pieces.append(value)
            length += len(value)
        return "".join(pieces), spans

    def generate_example(self):
        """Generates a single labeled training example.

        Returns:
        - (sentence, annotations): ``annotations`` is
          ``{"entities": [(start, end, "LOC"), (start, end, "Astro_Obj")]}``.
          An entity is omitted only if the chosen template lacks its
          placeholder.
        """
        # Fall back to the module generator for instances built without __init__.
        rng = getattr(self, "_rng", random)
        template = rng.choice(self.templates)
        location = rng.choice(self.locations)
        astro_object = rng.choice(self.astro_objects)

        sentence, spans = self._fill_template(
            template, {"location": location, "astro_object": astro_object}
        )
        entities = [
            spans[field] + (label,)
            for field, label in (("location", "LOC"), ("astro_object", "Astro_Obj"))
            if field in spans
        ]
        return (sentence, {"entities": entities})

    def generate_dataset(self, size=500):
        """Generates a dataset with the specified number of examples."""
        return [self.generate_example() for _ in range(size)]

# Create the dataset. The examples are drawn with the `random` module, so the
# generator is seeded first to make a re-run produce the same training data.
random.seed(42)
astro_dataset = AstroNERDataset()
train_data = astro_dataset.generate_dataset(size=500)

# Preview a few examples
for example in train_data[:5]:
    print(example)


('During certain seasons, venus is best observed from big bend national park.', {'entities': [(52, 74, 'LOC'), (24, 29, 'Astro_Obj')]})
('Photographers set up telescopes at uluru to capture venus.', {'entities': [(35, 40, 'LOC'), (52, 57, 'Astro_Obj')]})
('mauna loa provides an unobstructed horizon for photographing supernova remnant.', {'entities': [(0, 9, 'LOC'), (61, 78, 'Astro_Obj')]})
('Many stargazers prefer uluru for its clear views of supernova remnant.', {'entities': [(23, 28, 'LOC'), (52, 69, 'Astro_Obj')]})
('The observatory at lake tekapo offers public viewing nights for pleiades star cluster.', {'entities': [(19, 30, 'LOC'), (64, 85, 'Astro_Obj')]})


## spaCy

In [239]:
from spacy.pipeline import EntityRuler
# Start from the small pretrained English pipeline.
model = spacy.load("en_core_web_sm")

# Register the custom entity type on the statistical NER component
# (add the label if not already present), then show every label it knows.
ner = model.get_pipe("ner")
ner.add_label("Astro_Obj")
print(ner.labels)

# Rule-based entity ruler inserted ahead of the statistical NER component,
# seeded with one location phrase and three astronomical-object phrases.
ruler = model.add_pipe("entity_ruler", before="ner")
patterns = [{"label": "LOC", "pattern": "atacama"}]
patterns += [
    {"label": "Astro_Obj", "pattern": phrase}
    for phrase in ("the moon", "jupiter", "earth")
]
ruler.add_patterns(patterns)
print(ruler.labels)

('Astro_Obj', 'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')
('Astro_Obj', 'LOC')


In [240]:
def fine_tune(model, train_data, n_epochs=7, drop=0.3, batch_size=None):
    """
    Continue training a loaded spaCy pipeline on annotated examples.

    Parameters:
    - model: A spaCy pipeline; it is updated in place.
    - train_data: Iterable of (text, annotations) pairs, where annotations is
      a dict such as {"entities": [(start, end, label), ...]}. Pairs whose
      annotations are None are skipped instead of failing example creation.
    - n_epochs (int, optional): Number of passes over the examples.
    - drop (float, optional): Dropout rate handed to model.update.
    - batch_size (int, optional): Examples per update call. None (default)
      sends all examples in a single update per epoch.

    Returns:
    - loss_history (list): One losses dict per epoch, as filled in by
      model.update.
    """
    # Convert to spaCy's Example objects
    examples = [
        Example.from_dict(model.make_doc(text), annotations)
        for text, annotations in train_data
        if annotations is not None
    ]

    # Fine-tune the model, passing the optimizer explicitly to every update.
    optimizer = model.resume_training()
    step = batch_size if batch_size else max(len(examples), 1)
    loss_history = []
    for _ in range(n_epochs):
        losses = {}
        for start in range(0, len(examples), step):
            model.update(
                examples[start:start + step],
                sgd=optimizer,
                drop=drop,
                losses=losses,
            )
        loss_history.append(losses)
    return loss_history
# Run the fine-tuning pass on the generated examples; `model` is updated in
# place through its own update() method.
fine_tune(model,train_data)

In [241]:
# Save the fine-tuned model under "fine_tuned_astro_ner" (relative to the
# current working directory) so later cells can reload it by that path.
model.to_disk("fine_tuned_astro_ner")

In [242]:
# Hand-tokenized sample passage: eight sentences about stargazing sites, with
# punctuation and possessives split into their own tokens.
# NOTE(review): no later cell visible here reads this module-level `tokens`
# list — confirm whether it is still needed.
tokens = [
    "The", "Mauna", "Kea", "Observatory", "in", "Hawaii", "offers", "unparalleled", "views", "of", "the", "Milky", "Way", "due", "to", "its", "high", "altitude", "and", "clear", "skies", ".",
    "The", "Atacama", "Desert", "in", "Chile", "is", "one", "of", "the", "best", "places", "on", "Earth", "for", "stargazing", ",", "thanks", "to", "its", "dry", "climate", "and", "minimal", "light", "pollution", ".",
    "Photographers", "love", "capturing", "the", "Aurora", "Borealis", "from", "Tromsø", ",", "Norway", ",", "especially", "during", "the", "winter", "months", "when", "the", "lights", "are", "most", "vivid", ".",
    "The", "Namib", "Desert", "in", "Namibia", "provides", "a", "spectacular", "view", "of", "the", "Southern", "Hemisphere", "'s", "stars", ",", "with", "almost", "no", "artificial", "light", "interference", ".",
    "The", "Canary", "Islands", "are", "popular", "for", "astrophotography", ",", "especially", "at", "the", "Roque", "de", "los", "Muchachos", "Observatory", "on", "La", "Palma", ".",
    "Utah", "'s", "Bryce", "Canyon", "National", "Park", "is", "a", "favorite", "for", "night", "photography", ",", "offering", "some", "of", "the", "darkest", "skies", "in", "the", "United", "States", ".",
    "The", "Dark", "Sky", "Park", "in", "Cherry", "Springs", "State", "Park", ",", "Pennsylvania", ",", "is", "renowned", "for", "its", "pristine", "night", "skies", "and", "excellent", "stargazing", "opportunities", ".",
    "Mount", "Cook", "National", "Park", "in", "New", "Zealand", "is", "part", "of", "the", "Aoraki", "Mackenzie", "International", "Dark", "Sky", "Reserve", ",", "making", "it", "a", "perfect", "spot", "for", "astrophotography", "."
]


In [243]:
# Reload the fine-tuned pipeline from the "fine_tuned_astro_ner" path written
# by the earlier to_disk() call; the extraction cells below use `model_t`.
model_t = spacy.load("fine_tuned_astro_ner")

In [244]:
# NOTE(review): this rebinds `model` to a freshly loaded base pipeline, so the
# name no longer refers to the object that was fine-tuned and saved above.
# The fine-tuned pipeline remains reachable as `model_t`.
model = spacy.load("en_core_web_sm")
ner2 = model.get_pipe("ner")
# "LOC" already appears in the NER labels printed for the base pipeline
# earlier, which presumably explains the 0 echoed by this call — TODO confirm.
ner2.add_label("LOC")  # Add the label if not already present

0

In [198]:
# Sample of lower-cased, punctuation-free comment text, one comment per line,
# used below as a small smoke-test input for location extraction.
text = """
thank you
these posts inevitably have a line in them that says i know what thing looks like but most people actually do not so here is a quick guide to what they usually are very bright star not moving planet typically jupiter or venus very quick flash that may or may not leave a momentary trail meteor brighter quick flash that leaves a trail that may last several seconds bolide faint fuzzy ball with or without tail not moving appears every night for days or weeks comet anything with blinking bits aircraft smoothly moving star satellite brighter smoothly moving star iss star that brightens for a moment then dims again satellite flare line of smoothly moving stars satellite train starlink launch something going upwards with or without tail rocket launch crazy spiral or other pattern failed rocket launch bunch of fiery dots with trails all moving together rocket debris reentry bunch of white lights in formation just hanging in sky sometimes blinking drone test fiery burst that appears and disappears randomly hot air balloon glowing saucer with cow hovering beneath it aliens
south carolina
atacama is a great location for photography
uluru is a great place to do this
saw 7 an hour
also only watched one hour
another beautiful image of cosmic proportions
i see the skull face too endless wonders all around us
this is why space and everything in it will forever have endless beauty and sights that draws everyone to seek what else lies beyond what we see
tnx for that guide but sadly the object i saw does not match anything of listed above
wow
skysafari app search tonights best
i use skyguide but i will check it out
andromeda can be seen from florida
naked eye on a good night
do you have a sky app on your phone
"""

# Convert the text into a flat list of words. Splitting on any whitespace
# also discards the line breaks, so comment boundaries are lost here.
word_list = text.split()

# Print the list
print(word_list)


['thank', 'you', 'these', 'posts', 'inevitably', 'have', 'a', 'line', 'in', 'them', 'that', 'says', 'i', 'know', 'what', 'thing', 'looks', 'like', 'but', 'most', 'people', 'actually', 'do', 'not', 'so', 'here', 'is', 'a', 'quick', 'guide', 'to', 'what', 'they', 'usually', 'are', 'very', 'bright', 'star', 'not', 'moving', 'planet', 'typically', 'jupiter', 'or', 'venus', 'very', 'quick', 'flash', 'that', 'may', 'or', 'may', 'not', 'leave', 'a', 'momentary', 'trail', 'meteor', 'brighter', 'quick', 'flash', 'that', 'leaves', 'a', 'trail', 'that', 'may', 'last', 'several', 'seconds', 'bolide', 'faint', 'fuzzy', 'ball', 'with', 'or', 'without', 'tail', 'not', 'moving', 'appears', 'every', 'night', 'for', 'days', 'or', 'weeks', 'comet', 'anything', 'with', 'blinking', 'bits', 'aircraft', 'smoothly', 'moving', 'star', 'satellite', 'brighter', 'smoothly', 'moving', 'star', 'iss', 'star', 'that', 'brightens', 'for', 'a', 'moment', 'then', 'dims', 'again', 'satellite', 'flare', 'line', 'of', 'smo

In [232]:

def get_locations(model, tokens, max_tokens=2):
    """
    Collect the location entities a spaCy pipeline finds in a token list.

    The tokens are joined with single spaces into one string, which is then
    run through the pipeline as a single document.

    Parameters:
    - model: A spaCy language model (called on the joined text).
    - tokens (list): Words making up the input text.
    - max_tokens (int, optional): Longest entity, in tokens, to keep.
      Pass None to keep entities of any length.

    Returns:
    - list: Text of each entity labelled "LOC" or "GPE" that satisfies the
      max_tokens limit, in document order.
    """
    parsed = model(" ".join(tokens))
    wanted_labels = ("LOC", "GPE")

    found = []
    for span in parsed.ents:
        # Only location-like labels are of interest.
        if span.label_ not in wanted_labels:
            continue
        # Drop entities longer than the allowed token count, if one is set.
        if max_tokens is not None and len(span) > max_tokens:
            continue
        found.append(span.text)
    return found


# First pass: extract location entities from the sample words.
l1 = get_locations(model_t,word_list)
# Second pass: re-run the extractor on the first pass's own output.
# NOTE(review): get_locations joins its input with spaces, so neighbouring
# entries of l1 end up in one string and can be tagged together as a single
# entity — possibly the source of pairs like 'uluru skysafari' below.
l2 = get_locations(model_t,l1)
print(l2)

['trail meteor', 'air balloon', 'south carolina', 'atacama', 'uluru skysafari', 'florida']


In [245]:
def process_file_s(file_path, model):
    """
    Read sentences from a file and process them using spaCy's NER.

    The file is decoded as UTF-8 and read one line at a time. Each non-blank
    line is split on whitespace and passed to get_locations.

    Parameters:
    - file_path (str): Path to the input text file (one sentence per line).
    - model: The spaCy NLP model.

    Returns:
    - recommended_locs (list): Extracted locations, in file order.
    """
    recommended_locs = []
    # Explicit encoding keeps non-ASCII place names intact on every platform.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # split() ignores surrounding whitespace, so blank lines yield [].
            tokens = line.split()
            if tokens:
                recommended_locs.extend(get_locations(model, tokens))
    return recommended_locs


# Extract locations sentence by sentence from the preprocessed comments file
# (change the path to point at a different input file), then run the combined
# results through get_locations once more as a second filtering pass.
rec_loc_s0 = process_file_s("preprocessed_comments_sentences",model_t)
rec_loc_s = get_locations(model_t,rec_loc_s0)

In [255]:
# Repeatedly feed the extractor its own output; each pass keeps only what the
# model still labels as a location.
rec_loc_s2 = get_locations(model_t,rec_loc_s)
rec_loc_s3 = get_locations(model_t,rec_loc_s2)
rec_loc_s4 = get_locations(model_t,rec_loc_s3)
# NOTE(review): this prints the third pass, while the following cell writes
# rec_loc_s4 to disk — confirm which pass is meant to be inspected.
print(rec_loc_s3)

['air balloon', 'south carolina', 'florida', 'jacksonville', 'georgia', 'arizona', 'cold dark', 'london', 'mass extinction', 'california', 'alaska', 'mars venera', 'mars asteroids', 'mars mass', 'mo titan', 'nasa keeps', 'mount moon', 'phd2', 'mexico', 'new wallpaper', 'faint halo', 'taba black', 'italy magellan', 'contradict themselves', 'no moon', 'new zealand', 'new epoch', 'baby beach', 'iraq', 'vermana complexes', 'sao 99805', 'previouslyunknown andor', 'systems bird', 'rgb antero', 'colorado springs', 'moon bird', 'youeiferundehre hydra', 'google sun', 'veeegaaa vega', 'google drive', 'space rock', 'lrgb cluster', 'sun xray', 'telescope knowledge', 'big dipper', 'alcor which', 'mount longer', 'arkansas', 'quantum scales', 'neil degrasse', 'lightroom mobile', 'this moon', 'france observe', 'make ursa', 'nova astrophotog', 'mars', 'india 15000', 'vortex 0249', 'astral pervs', 'astroberry canon', 'latin mode', 'google searches', 'rome', 'lrgb spain', 'heavier load', 'italy', 'egypt 

In [256]:
from HelperFunctions import write_to_text_file

# Persist the last filtering pass (rec_loc_s4); the ranking cell below reads
# this same file name. Output format is whatever
# HelperFunctions.write_to_text_file produces — presumably one entry per line.
write_to_text_file(rec_loc_s4, "NER_loc_tuned_comments_v2_sentences")
#write_to_text_file(rec_loc_s, "NER_loc__submissions_sentences")


In [259]:
from collections import Counter

def rank_loc(file_path, top_n=5):
    """
    Rank the lines of a text file by how often they occur.

    Each line is stripped of surrounding whitespace and lower-cased before
    counting. Blank lines are ignored so an empty string is never ranked.

    Parameters:
    - file_path (str): Path to a UTF-8 text file with one entry per line.
    - top_n (int, optional): How many of the most frequent entries to return.

    Returns:
    - list: (entry, count) tuples for the top_n entries, most frequent first.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        normalized = (line.strip().lower() for line in file)
        # Counter consumes the generator here, while the file is still open.
        counter = Counter(entry for entry in normalized if entry)
    return counter.most_common(top_n)
# Show the most frequent entries in the file of sentence-level locations
# written by the previous cell.
rank_loc('NER_loc_tuned_comments_v2_sentences')

[('arizona', 25),
 ('atacama', 20),
 ('new zealand', 18),
 ('toronto', 18),
 ('france', 15)]

In [None]:
def process_file_w(file_path, model):
    """
    Read words from a file and process them using spaCy's NER.

    The file is decoded as UTF-8. Each non-blank line is stripped, run
    through the model on its own, and its location entities are collected
    with get_locations_from_doc.

    Parameters:
    - file_path (str): Path to the input text file (one word per line).
    - model: The spaCy NLP model.

    Returns:
    - recommended_locs (list): List of extracted locations, in file order.
    """
    recommended_locs = []
    # Explicit encoding keeps non-ASCII words intact on every platform.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            word = line.strip()  # Remove leading/trailing whitespace
            if not word:  # Skip empty lines
                continue
            recommended_locs.extend(get_locations_from_doc(model(word)))
    return recommended_locs

def get_locations_from_doc(doc):
    """
    Pull the place-like entities out of an already processed document.

    Parameters:
    - doc: A spaCy Doc object whose entities have been set.

    Returns:
    - list: Text of every entity labelled "LOC", "GPE" or "FAC", in
      document order.
    """
    place_labels = ("LOC", "GPE", "FAC")
    names = []
    for entity in doc.ents:
        if entity.label_ in place_labels:
            names.append(entity.text)
    return names

# Word-level extraction: tag every word of the preprocessed file on its own.
rec_loc_w1 = process_file_w("preprocessed_comments_words",model_t)
# Second pass over the joined results.
# NOTE(review): process_file_w keeps "FAC" entities, but get_locations only
# keeps "LOC"/"GPE" with at most 2 tokens, so facilities found in the first
# pass can only survive if re-labelled here — confirm this is intended.
rec_loc_w = get_locations(model_t,rec_loc_w1)


In [151]:
# Persist the word-level locations. write_to_text_file is imported from
# HelperFunctions in an earlier cell, so that cell must run first.
write_to_text_file(rec_loc_w, "NER_loc_T_v2_comments_words")
#write_to_text_file(rec_loc_w, "NER_loc_submissions_words")


In [34]:
# Dump the full list of word-level locations for manual inspection.
print(rec_loc_w)

['florida', 'georgia', 'arizona', 'london', 'scotland', 'florida', 'arizona', 'northeast', 'strip', 'asi120mm', 'coma', 'at60ed', 'at60ed', 'canada', 'california', 'alaska', 'california', 'triton', 'michigan', 'michigan', 'michigan', 'newton', 'newton', 'alabama', 'massachusetts', 'mexico', 'pulsar']
