In [None]:
!pip install -q openai

In [None]:
import os
import re

import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
from openai import OpenAI

In [None]:
# sentence-tokenizer data: older NLTK releases load 'punkt', newer ones load 'punkt_tab';
# downloading both keeps sent_tokenize working regardless of the installed version
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
class ChatGPTLabeler():
    '''Tokenize a sentence and tag mountain names by prompting an OpenAI chat model.

    Calling an instance with a sentence sends one chat-completion request and
    returns the model's raw text answer. The prompt asks the model to answer
    only with (token, label) tuples, where the label is 'B-MOUNT', 'I-MOUNT' or O;
    parsing that text is left to the caller.
    '''

    def __init__(self, api_key, model='gpt-4', temperature=1, max_tokens=1000):
        '''Create the OpenAI client and remember the request settings.

        Args:
            api_key: OpenAI API key passed to the client.
            model: name of the chat model to query.
            temperature: sampling temperature sent with every request.
            max_tokens: limit on the length of the generated answer.
        '''
        self.client = OpenAI(api_key=api_key)  # create an OpenAI client
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

    def __call__(self, sentence):
        '''Make an API call to ChatGPT to tokenize and label the sentence.

        Args:
            sentence: the text to tokenize and label.

        Returns:
            The text of the first completion choice, or an empty string
            when the response carries no text content.
        '''

        prompt = f''' Every word or punctuation sign, such as: . , ? ! - is a separate token.

                    Tokenize the sentence inside the quotes: "{sentence}"

                    For each token generate a label. If a token is
                    at the beginning of a mountain name, then label it as 'B-MOUNT'.
                    If a token is inside of a mountain name, then label it as 'I-MOUNT'.
                    If a token is not a part of a mountain name, then label it O.

                    Important: only names of mountains should be labeled as 'B-MOUNT' or 'I-MOUNT'.
                    If the token contains words such as mountain, mountains, etc.
                    and this word is not a part of a mountain name, then it should be labeled O.

                    Very important: Output should be only the the tuples (token, label),
                    do not print anything other than these tuples.'''

        response = self.client.chat.completions.create(  # make an API call
            model=self.model,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            messages=[
                {"role": "user", "content": prompt}
            ],
        )
        # The API documents message content as nullable; fall back to '' so that
        # callers can always treat the result as a string.
        return response.choices[0].message.content or ''

Main steps in creating the 'mountain_names.csv' dataset:
1. Gather the text data into the 'data.txt' file.
2. Read all the texts from 'data.txt', then tokenize and label them via the ChatGPT API.
3. Parse the results into a Python dictionary.
4. Build a Pandas DataFrame from the Python dictionary.

In [None]:
# read the whole 'data.txt' file into a single string
# (explicit UTF-8 so the result does not depend on the platform's default encoding)
with open('data.txt', 'r', encoding='utf-8') as f:
    sentences = f.read()

In [None]:
# split the raw text held in 'sentences' into a list with one entry per sentence
sentences = sent_tokenize(sentences, language='english')

In [None]:
# instantiate the ChatGPT labeler; the key is taken from the OPENAI_API_KEY environment
# variable so it does not have to be written into the notebook
# (the 'YOUR_API_KEY' placeholder is used only when the variable is not set)
labeler = ChatGPTLabeler(api_key=os.environ.get('OPENAI_API_KEY', 'YOUR_API_KEY'))

In [None]:
# empty DataFrame that will hold one row per labeled token
mount_df = pd.DataFrame()

In [None]:
pattern = r'\(.+?\)'  # regex matching one '(token, label)' group; raw string avoids invalid escapes
wrapper_chars = '()\\"\''  # characters stripped from both ends of a token/label: ( ) \ " '


def parse_token_label_pairs(response_text):
    '''Turn the labeler's raw answer into parallel lists of tokens and labels.

    Args:
        response_text: text expected to contain groups such as "('Everest', 'B-MOUNT')".

    Returns:
        (tokens, labels): two lists of equal length; both empty if no group is found.

    Raises:
        ValueError: if a group does not split on ', ' into exactly a token and a label.

    NOTE: a token that is itself '(' or ')' does not survive this parsing: the
    pattern stops at the first ')' and stripping removes a lone parenthesis.
    '''
    tokens, labels = [], []
    for pair in re.findall(pattern, response_text):
        token, label = [item.strip(wrapper_chars) for item in pair.split(', ')]
        tokens.append(token)
        labels.append(label)
    return tokens, labels


sentence_frames = []  # one DataFrame per successfully parsed sentence
try:
    for i, sentence in enumerate(sentences):
        sentence = sentence.strip()
        print(f'Processing sentence #{i}')
        print(f'Sentence: {sentence}')
        response_text = labeler(sentence)  # raw answer: the string '(token, label), ...'

        try:
            tokens, labels = parse_token_label_pairs(response_text)
        except ValueError:  # malformed group: skip the whole sentence
            print('Unable to process the sentence.')
            continue

        if not tokens:  # the answer contained no '(token, label)' group at all
            print('Unable to process the sentence.')
            continue

        sentence_frames.append(
            pd.DataFrame({'tokens': tokens, 'ner_tags': labels, 'sentence_id': i})
        )
finally:
    # Build the result once, from scratch, so re-running the cell does not duplicate rows.
    # Doing it in 'finally' keeps the sentences labeled so far if the loop is interrupted.
    if sentence_frames:
        mount_df = pd.concat(sentence_frames)
    else:
        mount_df = pd.DataFrame(columns=['tokens', 'ner_tags', 'sentence_id'])

In [None]:
# sanity check: (number of rows, number of columns) of the assembled dataset
mount_df.shape

(4113, 3)

In [None]:
# replace the index with a fresh 0..n-1 range; drop=True discards the old index
# instead of keeping it as a column
mount_df = mount_df.reset_index(drop=True)

In [None]:
# display the dataset (the last expression of a cell is rendered as its output)
mount_df

Unnamed: 0,tokens,ner_tags,sentence_id
0,As,O,0
1,Ukraine,O,0
2,continues,O,0
3,to,O,0
4,navigate,O,0
...,...,...,...
4108,that,O,150
4109,cradle,O,150
4110,its,O,150
4111,slopes,O,150


In [None]:
# save the dataset; the row index is written too, as the first (unnamed) CSV column
mount_df.to_csv('mountain_names.csv', index=True)