In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification,AutoModelForCausalLM
from transformers import pipeline
import torch
from spacy.training import Example
import spacy
from collections import Counter
from nltk.util import ngrams

## Fine tuning data

In [55]:
import random

class AstroNERDataset:
    def __init__(self):
        self.templates = [
            "{location} is a popular spot for photographing {astro_object}.",
            "astrophotographers recommend {location} for capturing {astro_object}.",
            "{location} offers some of the clearest skies for observing {astro_object}.",
            "{astro_object} can often be seen clearly from {location}.",
            "the skies over {location} are ideal for photographing {astro_object}.",
            "many photographers visit {location} to capture {astro_object}.",
            "The observatories in {location} provide excellent views of {astro_object}.",
            "{astro_object} enthusiasts often gather at {location}.",
            "photographing {astro_object} from {location} is an unforgettable experience.",
            '{astro_object} can be seen from {location}',
            'the {location} is the 1 place on the planet for {astro_object}',
            'the {astro_object} was super obvious in the {location}'
            '{location} offers a clear view of {astro_object}'
            'great {location}'
        ]

        self.locations = [
            "Mauna Kea","Atacama" "Atacama Desert","mauna loa" "Death Valley", "Mont Mégantic", "Pic du Midi",
            "Lake Tekapo", "Okavango Delta", "Himalayas", "Cerro Paranal", "Uluru",
            "Big Bend National Park", "Lick Observatory", "Lowell Observatory",
             "VLT in Chile", "Meteora in Greece", "Arcetri Astrophysical Observatory" "Kitt Peak",'florida','south carolina'
        ]
        self.locations = [location.lower() for location in self.locations]

        self.astro_objects = [
            "Milky Way", "Moon", "Andromeda Galaxy", "Stars", "Nebulae", 
            "Planets", "Auroras", "Comets", "Meteors", "Exoplanets",
            "Deep-sky Objects", "Satellites", "The Sun", "Earth from Space",
            "Jupiter and its Moons", "The Orion Nebula", 'titan'
        ]
        self.astro_objects = [astro_object.lower() for astro_object in self.astro_objects]

    
    def generate_example(self):
        """Generates a single labeled training example."""
        template = random.choice(self.templates)
        location = random.choice(self.locations)
        astro_object = random.choice(self.astro_objects)
        sentence = template.format(location=location, astro_object=astro_object)
        # Define entities for the example
        location_start = sentence.index(location)
        location_end = location_start + len(location)
        return (sentence, {"entities": [(location_start, location_end, "LOC")]})
    
    def generate_dataset(self, size=500):
        """Generates a dataset with the specified number of examples."""
        dataset = [self.generate_example() for _ in range(size)]
        return dataset

# Create the dataset
astro_dataset = AstroNERDataset()
train_data = astro_dataset.generate_dataset(size=500)

# Preview a few examples
for example in train_data[:5]:
    print(example)


('florida is a popular spot for photographing meteors.', {'entities': [(0, 7, 'LOC')]})
('milky way can be seen from lake tekapo', {'entities': [(27, 38, 'LOC')]})
('the sun can be seen from okavango delta', {'entities': [(25, 39, 'LOC')]})
('the satellites was super obvious in the mauna keamauna kea offers a clear view of satellitesgreat mauna kea', {'entities': [(40, 49, 'LOC')]})
('the skies over cerro paranal are ideal for photographing the orion nebula.', {'entities': [(15, 28, 'LOC')]})


## Scibert

In [5]:
model_name = "allenai/scibert_scivocab_uncased"  # SciBERT model

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


In [22]:
def get_locations(ner_pipeline, text, location_labels=None):
    """
    Extract location-related entities from text using a Hugging Face NER pipeline.

    Parameters:
    - ner_pipeline: A Hugging Face NER pipeline.
    - text (str): Input text.
    - location_labels (set, optional): Set of entity labels considered as locations.

    Returns:
    - locations (list): List of location names identified in the text.
    """
    if location_labels is None:
        location_labels = {"LOC", "GPE", "FAC"}

    # Get NER results from the pipeline
    ner_results = ner_pipeline(text)

    # Extract entities that match the specified location labels
    locations = [entity['word'] for entity in ner_results if entity['entity'] in location_labels]

    return locations

def process_file_s(file_path, ner_pipeline, location_labels=None, max_lines=1000):
    """
    Read sentences from a file and process them using a Hugging Face NER pipeline.

    Parameters:
    - file_path (str): Path to the input text file.
    - ner_pipeline: A Hugging Face NER pipeline.
    - location_labels (set, optional): Set of entity labels considered as locations.
    - max_lines (int, optional): Maximum number of lines to process.

    Returns:
    - recommended_locs (list): List of recommended location names identified in the text.
    """
    recommended_locs = []

    with open(file_path, "r") as file:
        for i, line in enumerate(file):
            if i >= max_lines:
                break

            sentence = line.strip()
            if sentence:
                # Extract locations
                locs = get_locations(ner_pipeline, sentence, location_labels)
                recommended_locs.extend(locs)

    return recommended_locs

rec_loc_sci_s = process_file_s("preprocessed_comments_sentences",ner_pipeline)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [7]:
# # Save model and tokenizer locally
# model.save_pretrained("./local_scibert_scivocab_uncased_model")
# tokenizer.save_pretrained("./local_scibert_scivocab_uncased_model")

## Spacy

In [24]:
# Load the base model
model = spacy.load("en_core_web_sm")
ner = model.get_pipe("ner")
print(ner.labels)
ner.add_label("LOC")  # Add the label if not already present

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


0

In [56]:
def fine_tune(model,train_data):# Convert to spaCy's Example objects
    examples = [Example.from_dict(model.make_doc(text), ann) for text, ann in train_data]

    # Fine-tune the model
    optimizer = model.resume_training()
    for epoch in range(10):
        losses = {}
        model.update(examples, drop=0.5, losses=losses)
fine_tune(model,train_data)



In [57]:
# Save the fine-tuned model
model.to_disk("fine_tuned_astro_ner")

In [45]:
tokens = [
    "The", "Mauna", "Kea", "Observatory", "in", "Hawaii", "offers", "unparalleled", "views", "of", "the", "Milky", "Way", "due", "to", "its", "high", "altitude", "and", "clear", "skies", ".",
    "The", "Atacama", "Desert", "in", "Chile", "is", "one", "of", "the", "best", "places", "on", "Earth", "for", "stargazing", ",", "thanks", "to", "its", "dry", "climate", "and", "minimal", "light", "pollution", ".",
    "Photographers", "love", "capturing", "the", "Aurora", "Borealis", "from", "Tromsø", ",", "Norway", ",", "especially", "during", "the", "winter", "months", "when", "the", "lights", "are", "most", "vivid", ".",
    "The", "Namib", "Desert", "in", "Namibia", "provides", "a", "spectacular", "view", "of", "the", "Southern", "Hemisphere", "'s", "stars", ",", "with", "almost", "no", "artificial", "light", "interference", ".",
    "The", "Canary", "Islands", "are", "popular", "for", "astrophotography", ",", "especially", "at", "the", "Roque", "de", "los", "Muchachos", "Observatory", "on", "La", "Palma", ".",
    "Utah", "'s", "Bryce", "Canyon", "National", "Park", "is", "a", "favorite", "for", "night", "photography", ",", "offering", "some", "of", "the", "darkest", "skies", "in", "the", "United", "States", ".",
    "The", "Dark", "Sky", "Park", "in", "Cherry", "Springs", "State", "Park", ",", "Pennsylvania", ",", "is", "renowned", "for", "its", "pristine", "night", "skies", "and", "excellent", "stargazing", "opportunities", ".",
    "Mount", "Cook", "National", "Park", "in", "New", "Zealand", "is", "part", "of", "the", "Aoraki", "Mackenzie", "International", "Dark", "Sky", "Reserve", ",", "making", "it", "a", "perfect", "spot", "for", "astrophotography", "."
]


In [58]:
model_t = spacy.load("fine_tuned_astro_ner")

In [59]:
model = spacy.load("en_core_web_sm")
ner2 = model.get_pipe("ner")
ner2.add_label("LOC")  # Add the label if not already present

0

In [64]:
text = """
thank you
these posts inevitably have a line in them that says i know what thing looks like but most people actually do not so here is a quick guide to what they usually are very bright star not moving planet typically jupiter or venus very quick flash that may or may not leave a momentary trail meteor brighter quick flash that leaves a trail that may last several seconds bolide faint fuzzy ball with or without tail not moving appears every night for days or weeks comet anything with blinking bits aircraft smoothly moving star satellite brighter smoothly moving star iss star that brightens for a moment then dims again satellite flare line of smoothly moving stars satellite train starlink launch something going upwards with or without tail rocket launch crazy spiral or other pattern failed rocket launch bunch of fiery dots with trails all moving together rocket debris reentry bunch of white lights in formation just hanging in sky sometimes blinking drone test fiery burst that appears and disappears randomly hot air balloon glowing saucer with cow hovering beneath it aliens
south carolina
atacama is a great location for photography
saw 7 an hour
also only watched one hour
another beautiful image of cosmic proportions
i see the skull face too endless wonders all around us
this is why space and everything in it will forever have endless beauty and sights that draws everyone to seek what else lies beyond what we see
tnx for that guide but sadly the object i saw does not match anything of listed above
wow
skysafari app search tonights best
i use skyguide but i will check it out
andromeda can be seen from florida
naked eye on a good night
do you have a sky app on your phone
"""

# Convert the text into a list of words
word_list = text.split()

# Print the list
print(word_list)


['thank', 'you', 'these', 'posts', 'inevitably', 'have', 'a', 'line', 'in', 'them', 'that', 'says', 'i', 'know', 'what', 'thing', 'looks', 'like', 'but', 'most', 'people', 'actually', 'do', 'not', 'so', 'here', 'is', 'a', 'quick', 'guide', 'to', 'what', 'they', 'usually', 'are', 'very', 'bright', 'star', 'not', 'moving', 'planet', 'typically', 'jupiter', 'or', 'venus', 'very', 'quick', 'flash', 'that', 'may', 'or', 'may', 'not', 'leave', 'a', 'momentary', 'trail', 'meteor', 'brighter', 'quick', 'flash', 'that', 'leaves', 'a', 'trail', 'that', 'may', 'last', 'several', 'seconds', 'bolide', 'faint', 'fuzzy', 'ball', 'with', 'or', 'without', 'tail', 'not', 'moving', 'appears', 'every', 'night', 'for', 'days', 'or', 'weeks', 'comet', 'anything', 'with', 'blinking', 'bits', 'aircraft', 'smoothly', 'moving', 'star', 'satellite', 'brighter', 'smoothly', 'moving', 'star', 'iss', 'star', 'that', 'brightens', 'for', 'a', 'moment', 'then', 'dims', 'again', 'satellite', 'flare', 'line', 'of', 'smo

In [65]:

def get_locations(model, tokens, max_tokens=2):
    """
    Extract location-related entities from a list of tokens using a spaCy model,
    with an optional limit on the number of tokens in each entity.
    
    Parameters:
    - model: A spaCy language model.
    - tokens (list): List of tokens (words) from the input text.
    - max_tokens (int, optional): Maximum number of tokens allowed in each returned entity.
    
    Returns:
    - locations (list): List of location names identified in the text, 
      each with a token count less than or equal to max_tokens.
    """
    doc = model(" ".join(tokens))
    location_labels = {"LOC", "GPE",}
    locations = [
        ent.text for ent in doc.ents 
        if ent.label_ in location_labels and (max_tokens is None or len(ent) <= max_tokens)
    ]
    return locations


get_locations(model_t,word_list)

['rocket debris', 'south carolina']

In [50]:
def process_file_s(file_path, model):
    """
    Read sentences from a file and process them using spaCy's NER.
    
    Parameters:
    - file_path (str): Path to the input text file.
    - model: The spaCy NLP model.
    
    Returns:
    - recomended_locs (list): List of extracted locations.
    """
    i = 0
    recomended_locs = []
    with open(file_path, "r") as file:
        lines = file.readlines()
    
    for line in lines:
        sentence = line.strip()  # Remove leading/trailing whitespace
        if sentence:  # Skip empty lines
            
            
            # Tokenize the sentence into words
            tokens = sentence.split()  # Split the sentence into tokens
            # Extract locations
            loc = get_locations(model, tokens)
            if loc:  # Check if loc is not empty
                recomended_locs.extend(loc)
            i += 1  
        if i == 9000:
            break
    return recomended_locs


# Example Usage: Replace 'input.txt' with your text file
rec_loc_s = process_file_s("preprocessed_comments_sentences",model)

In [51]:
rec_loc_s[]

['south carolina', 'florida', 'jacksonville', 'georgia', 'arizona', 'new zealand', 'london', 'prague', 'uk', 'florida', 'harden', 'nina', 'phd2', 'north america', 'mount mount', 'california', 'alaska', 'michigan', 'michigan', 'earth', 'new moon', 'alabama', 'massachusetts', 'new mexico', 'earth', 'india', 'youironmaiden', 'ad10', 'italy', 'pulsar', 'us', 'us', 'netflix', 'new zealand', 'new zealand', 'india', 'south island', 'm3', 'aruba', 'uk', 'randolph county', 'arkansas', 'iraq', 'india', 'mexico', 'answy', 'chicago', 'accidentaly', 'midwest', 'colorado', 'mm', 'youeiferundehre', 'youffefryn', 'dang', 'dmcas', 'the moon', 'coma', 'hahaha', 'arkansas', 'mexico', 'egypt', 'mpcc', 'houston', 'france', 'paris', 'pulsar', 'nova', 'suffix', 'chile', 'daytona beach', 'india', 'astro', 'atlantic', 'titan', 'south america', 'titan', 'india', 'titan', 'canada', 'titan', 'titan', 'uk', 'mount mount', 'uk', 'uk', 'fermi', 'north america', 'ngc6888', 'north america', 'rome', 'india', 'southeast

In [52]:
from HelperFunctions import write_to_text_file

write_to_text_file(rec_loc_s, "NER_loc_comments_sentences")
#write_to_text_file(rec_loc_s, "NER_loc__submissions_sentences")


In [53]:
def process_file_w(file_path, model):
    """
    Read words from a file and process them using spaCy's NER.
    
    Parameters:
    - file_path (str): Path to the input text file.
    - model: The spaCy NLP model.
    
    Returns:
    - recommended_locs (list): List of extracted locations.
    """
    i = 0
    recommended_locs = []
    with open(file_path, "r") as file:
        for line in file:
            word = line.strip()  # Remove leading/trailing whitespace
            if word:  # Skip empty lines
                i += 1
                # Process the word with spaCy
                doc = model(word)
                # Extract locations
                loc = get_locations_from_doc(doc)
                if loc:
                    recommended_locs.extend(loc)
                if i == 60000:
                    break
    return recommended_locs

def get_locations_from_doc(doc):
    """
    Extract location-related entities from a spaCy Doc object.
    
    Parameters:
    - doc: A spaCy Doc object.
    
    Returns:
    - locations (list): List of location names identified in the text.
    """
    location_labels = {"LOC", "GPE", "FAC"}
    return [ent.text for ent in doc.ents if ent.label_ in location_labels]

rec_loc_w = process_file_w("preprocessed_comments_words",model_t)


In [54]:
write_to_text_file(rec_loc_w, "NER_loc_comments_words")
#write_to_text_file(rec_loc_w, "NER_loc_submissions_words")


In [34]:
print(rec_loc_w)

['florida', 'georgia', 'arizona', 'london', 'scotland', 'florida', 'arizona', 'northeast', 'strip', 'asi120mm', 'coma', 'at60ed', 'at60ed', 'canada', 'california', 'alaska', 'california', 'triton', 'michigan', 'michigan', 'michigan', 'newton', 'newton', 'alabama', 'massachusetts', 'mexico', 'pulsar']
