# Mountain Sentences Dataset Generation
This notebook generates a dataset of sentences about mountains, labeling the mountain names 
within the sentences using the BIO (Begin, Inside, Outside) format.

In [4]:
import random

# Mountain names generated by ChatGPT (98 unique names).
# Duplicate entries ("K2", "Matterhorn", "Mount Olympus") were removed so that
# random.choice samples every name with equal probability, and the misspelled
# "Mount Rainer" was corrected to "Mount Rainier".
# Alternative names for the same peak (e.g. "Denali" / "Mount McKinley") are
# distinct strings and are kept as separate surface forms.
mountains = [
    "Mount Everest", "K2", "Mount Kilimanjaro", "Mont Blanc", "Mount Elbrus", "Denali", "Mount McKinley", "Matterhorn",
    "Mount Fuji", "Aconcagua", "Vinson Massif", "Mount Kosciuszko", "Kangchenjunga", "Lhotse", "Makalu", "Cho Oyu",
    "Dhaulagiri", "Manaslu", "Nanga Parbat", "Annapurna", "Shishapangma", "Gasherbrum I", "Gasherbrum II",
    "Broad Peak", "Mount Whitney", "Mount Rainier", "Mount St. Helens", "Pico de Orizaba", "Popocatépetl", "Iztaccihuatl",
    "Mount Hood", "Mount Shasta", "Mount Adams", "Mount Baker", "Mount Washington", "Mount Olympus", "Ben Nevis",
    "Snowdon", "Scafell Pike", "Mount Toubkal", "Jebel Musa", "Atlas Mountains", "Mulhacen", "Pico Aneto", "Mount Etna",
    "Stromboli", "Teide", "Mount Ararat", "Zugspitze", "Grossglockner", "Mount Aso", "Mount Tate", "Mount Haku",
    "Mount Daisetsu", "Mount Tanigawa", "Mount Hotaka", "Mount Asama", "Pico Bolivar", "Chimborazo", "Cotopaxi",
    "Mount Wilhelm", "Mount Giluwe", "Mount Aoraki", "Mount Tasman", "Mount Aspiring", "Mount Ruapehu", "Mount Taranaki",
    "Gasherbrum IV", "Masherbrum", "Saltoro Kangri", "Saser Kangri", "Trango Towers", "Mont Perdu", "Pyrenees",
    "Pico de Europa", "Monte Rosa", "Weisshorn", "Dufourspitze", "Finsteraarhorn", "Liskamm",
    "Mont Collon", "Mont Maudit", "Punta Gnifetti", "Mount Sniezka", "Mount Kasprowy", "Mount Rysy", "Mount Triglav",
    "Mount Durmitor", "Jebel Toubkal", "Mythen", "Pilatus", "Eiger", "Jungfrau", "Mönch", "Piz Bernina",
    "Monte Generoso", "Gran Paradiso", "Mount Galdhøpiggen"
]

# Sentence templates. Each template carries exactly one "{}" placeholder,
# which is filled in with a mountain name through str.format().
sentences = [
    "{} is the highest mountain in the world",
    "The summit of {} attracts many climbers each year",
    "{} is considered the tallest mountain in its continent",
    "Climbing {} is a challenge even for experienced mountaineers",
    "{} offers breathtaking views from its peak",
    "The base of {} stretches across several regions",
    "The rugged terrain of {} makes it difficult to ascend",
    "{} is a major destination for adventurers",
    "{} has a unique ecosystem due to its altitude",
    "Many expeditions have been made to scale {}",
    "{} is often covered in snow year-round",
    "{} is a popular spot for trekking and hiking",
    "Reaching the summit of {} can take several days",
    "The steep slopes of {} are difficult to navigate",
    "The glaciers around {} are slowly retreating",
    "Local legends surround the history of {}",
    "{} has long been a symbol of adventure and exploration",
    "{} challenges climbers with its unpredictable weather",
    "Standing tall, {} is admired for its natural beauty",
    "The dramatic landscape of {} draws visitors from around the world",
]

In [5]:
def label_sentence(sentence, mountain, sentence_id):
    """
    Label every token of a sentence with a BIO tag for the given mountain name.

    Both the sentence and the mountain name are split on whitespace. Every
    non-overlapping occurrence of the mountain's token sequence is tagged:
    its first token gets "B-MOUNTAIN" and its remaining tokens get
    "I-MOUNTAIN". All other tokens get "O". Matching is exact and
    case-sensitive, so a name with punctuation attached to it in the sentence
    (e.g. "Everest,") is not matched.

    Parameters:
    - sentence (str): The sentence containing the mountain name.
    - mountain (str): The mountain name to label. An empty or whitespace-only
      name matches nothing, so every token is labeled "O".
    - sentence_id (int): The ID of the sentence.

    Returns:
    - list: A list of dictionaries containing sentence_id, word, and label for
      each token, in sentence order. Empty if the sentence has no tokens.
    """
    tokens = sentence.split()
    labels = ["O"] * len(tokens)

    # Split the mountain name into tokens for comparison
    mountain_tokens = mountain.split()
    mountain_length = len(mountain_tokens)

    # Guard: an empty name would compare equal to the empty slice at every
    # position and wrongly tag the first token (or index into an empty list).
    if mountain_length:
        last_start = len(tokens) - mountain_length
        i = 0
        while i <= last_start:
            if tokens[i:i + mountain_length] == mountain_tokens:
                labels[i] = "B-MOUNTAIN"
                labels[i + 1:i + mountain_length] = ["I-MOUNTAIN"] * (mountain_length - 1)
                # Jump past the match so occurrences never overlap
                i += mountain_length
            else:
                i += 1

    return [{"sentence_id": sentence_id, "word": token, "label": label} for token, label in zip(tokens, labels)]

In [6]:
# Generate the dataset and save sentences
NUM_SENTENCES = 300  # number of synthetic sentences to generate
SEED = 42            # fixed seed so a fresh kernel run reproduces the same dataset

# Dedicated generator: keeps this cell deterministic regardless of any other
# use of the global `random` state elsewhere in the notebook
rng = random.Random(SEED)

dataset = []

# Open the file to save sentences. UTF-8 is requested explicitly because some
# mountain names contain non-ASCII characters and the platform default
# encoding is not guaranteed to be UTF-8.
with open('mountain_sentences.txt', 'w', encoding='utf-8') as sentences_file:
    # sentence_id runs from 1 to NUM_SENTENCES inclusive
    for sentence_id in range(1, NUM_SENTENCES + 1):
        mountain = rng.choice(mountains)
        sentence_template = rng.choice(sentences)

        # Generate the sentence by formatting the template with the mountain name
        sentence = sentence_template.format(mountain)
        sentences_file.write(f"{sentence}\n")

        # Label the sentence with BIO format and include sentence_id
        sentence_data = label_sentence(sentence, mountain, sentence_id)

        # Append each token with its respective label and sentence_id
        dataset.extend(sentence_data)

        # Empty entry marks the boundary between consecutive sentences
        dataset.append({"sentence_id": "", "word": "", "label": ""})

In [7]:
# Filter out empty entries from dataset (the blank sentence-boundary markers)
filtered_dataset = [entry for entry in dataset if entry['sentence_id'] and entry['word']]

# Write the filtered dataset to disk, one token per line.
# NOTE: despite the .csv extension, the columns (sentence_id, word, label) are
# tab-separated and there is no header row.
# UTF-8 is requested explicitly because some tokens contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open('mountain_label_dataset.csv', 'w', encoding='utf-8') as f:
    f.writelines(
        f"{entry['sentence_id']}\t{entry['word']}\t{entry['label']}\n"
        for entry in filtered_dataset
    )

print("Dataset successfully generated and saved as 'mountain_label_dataset.csv' and 'mountain_sentences.txt'.")

Dataset successfully generated and saved as 'mountain_label_dataset.csv' and 'mountain_sentences.txt'.
