## Data Sampling
The test data provided by the MultiCoNER II shared task contains a large number of instances. Because of the cost of running GPT-3, we cannot apply the model to the entire data set. In this notebook we therefore randomly sample instances from the test data to serve as the test set, and from the validation file to serve as few-shot sets.

In [None]:
import pandas as pd

In [None]:
# Load the test data file.
# It must parse into a DataFrame with the columns "id" and "domain":
# each row holds a token in "id" and the corresponding BIO scheme tag
# in "domain".

# pd.read_csv already returns a DataFrame, so no extra wrapping is needed.
test_full = pd.read_csv("...")

print(test_full.head())

In [None]:
# Load the validation (dev) data file that the few-shot sets are drawn from.
# It must parse into a DataFrame with the columns "id" and "domain":
# each row holds a token in "id" and the corresponding BIO scheme tag
# in "domain".

# pd.read_csv already returns a DataFrame, so no extra wrapping is needed.
dev = pd.read_csv("...")

print(dev.head())

In [None]:
def sentence_gpt_output(token_df):
    """Group token-level rows into sentences with prefix-free entity tags.

    A row whose ``id`` value starts with ``"# id"`` is a sentence boundary.
    Every other row contributes one token (column ``id``) and its BIO tag
    (column ``domain``) to the sentence currently being collected.

    Args:
        token_df: Frame (or any mapping) whose ``'id'`` and ``'domain'``
            entries are equal-length iterables of tokens and tags.

    Returns:
        A DataFrame with one row per sentence and three columns:
        ``sentence`` (tokens joined by single spaces), ``tags`` (one tag per
        token with the ``B-``/``I-`` prefix removed) and ``fewshot_gpt``
        (entity strings formatted as ``"TYPE (tok1 tok2)"``).

    Note:
        Every boundary row closes the sentence collected so far, even when
        it is empty, so input that starts with a boundary row produces a
        leading empty sentence. This keeps the row count a function of the
        boundary rows only.
    """
    sentences = []
    tags = []
    ne_list = []

    # State of the sentence / entity currently being collected.
    words = []
    sent_tags = []
    entities = []
    entity_type = None
    entity_tokens = []

    def close_entity():
        # Store the entity being collected (if any) and start afresh.
        nonlocal entity_type, entity_tokens
        if entity_tokens:
            entities.append(f"{entity_type} ({' '.join(entity_tokens)})")
        entity_type = None
        entity_tokens = []

    def close_sentence():
        # Store the sentence being collected and reset the per-sentence state.
        nonlocal words, sent_tags, entities
        # An entity that runs up to the end of the sentence is still open
        # here; it must be stored before the entity list is recorded.
        close_entity()
        sentences.append(' '.join(words).strip())
        tags.append(sent_tags)
        ne_list.append(entities)
        words, sent_tags, entities = [], [], []

    for word, tag in zip(token_df['id'], token_df['domain']):
        word = str(word)
        if word.startswith("# id"):
            close_sentence()
            continue

        words.append(word)
        tag_text = str(tag)
        if tag_text.startswith('B-'):
            close_entity()
            entity_type = tag_text[2:]
            entity_tokens = [word]
            sent_tags.append(entity_type)
        elif tag_text.startswith('I-'):
            if not entity_tokens:
                # Continuation tag without an open entity: start one with
                # the tag's own type instead of reusing a stale (or missing)
                # type from an earlier entity.
                entity_type = tag_text[2:]
            entity_tokens.append(word)
            sent_tags.append(tag_text[2:])
        else:
            close_entity()
            sent_tags.append(tag)

    # The input has no trailing boundary row, so close the last sentence
    # (and any entity still open in it) explicitly.
    close_sentence()

    return pd.DataFrame({'sentence': sentences, 'tags': tags, 'fewshot_gpt': ne_list})
                
            

In [None]:
def sentence_xlm_output(token_df):
    """Group token-level rows into sentences, keeping the full BIO tags.

    A row whose ``id`` value starts with ``"# id"`` is a sentence boundary.
    Every other row contributes one token (column ``id``) and its BIO tag
    (column ``domain``) to the sentence currently being collected.

    Args:
        token_df: Frame (or any mapping) whose ``'id'`` and ``'domain'``
            entries are equal-length iterables of tokens and tags.

    Returns:
        A DataFrame with one row per sentence and three columns:
        ``sentence`` (tokens joined by single spaces), ``tags`` (one
        unmodified tag per token, ``B-``/``I-`` prefix included) and
        ``fewshot_gpt`` (entity strings formatted as ``"TYPE (tok1 tok2)"``).

    Note:
        Every boundary row closes the sentence collected so far, even when
        it is empty, so input that starts with a boundary row produces a
        leading empty sentence. This keeps the row count a function of the
        boundary rows only.
    """
    sentences = []
    tags = []
    ne_list = []

    # State of the sentence / entity currently being collected.
    words = []
    sent_tags = []
    entities = []
    entity_type = None
    entity_tokens = []

    def close_entity():
        # Store the entity being collected (if any) and start afresh.
        nonlocal entity_type, entity_tokens
        if entity_tokens:
            entities.append(f"{entity_type} ({' '.join(entity_tokens)})")
        entity_type = None
        entity_tokens = []

    def close_sentence():
        # Store the sentence being collected and reset the per-sentence state.
        nonlocal words, sent_tags, entities
        # An entity that runs up to the end of the sentence is still open
        # here; it must be stored before the entity list is recorded.
        close_entity()
        sentences.append(' '.join(words).strip())
        tags.append(sent_tags)
        ne_list.append(entities)
        words, sent_tags, entities = [], [], []

    for word, tag in zip(token_df['id'], token_df['domain']):
        word = str(word)
        if word.startswith("# id"):
            close_sentence()
            continue

        words.append(word)
        tag_text = str(tag)
        if tag_text.startswith('B-'):
            close_entity()
            entity_type = tag_text[2:]
            entity_tokens = [word]
        elif tag_text.startswith('I-'):
            if not entity_tokens:
                # Continuation tag without an open entity: start one with
                # the tag's own type instead of reusing a stale (or missing)
                # type from an earlier entity.
                entity_type = tag_text[2:]
            entity_tokens.append(word)
        else:
            close_entity()
        # Unlike the GPT variant, the tag is recorded exactly as given.
        sent_tags.append(tag)

    # The input has no trailing boundary row, so close the last sentence
    # (and any entity still open in it) explicitly.
    close_sentence()

    return pd.DataFrame({'sentence': sentences, 'tags': tags, 'fewshot_gpt': ne_list})
                
            

In [None]:
# Collapse the token-level test data into one row per sentence
# (prefix-free tags) and write the result to disk.
test_gpt = sentence_gpt_output(token_df=test_full)
test_gpt.to_csv(path_or_buf='...', index=False)

In [None]:
# Draw 4000 rows without replacement as the test set. The fixed seed makes
# the draw reproducible when the notebook is re-run.
test_gpt_4000_1 = test_gpt.sample(n=4000, replace=False, random_state=42)
test_gpt_4000_1.to_csv('...', index=False)

In [None]:
# Collapse the token-level validation data into one row per sentence
# (prefix-free tags) and write the result to disk.
dev_gpt = sentence_gpt_output(token_df=dev)
dev_gpt.to_csv(path_or_buf='...', index=False)

In [None]:
# Draw 10 validation rows without replacement as a few-shot set. The fixed
# seed makes the draw reproducible when the notebook is re-run.
dev_gpt_10 = dev_gpt.sample(n=10, replace=False, random_state=42)
dev_gpt_10.to_csv('...', index=False)

In [None]:
# Draw a second set of 10 validation rows without replacement. The fixed
# seed makes re-running the notebook reproduce this second few-shot set.
dev_gpt_10_2 = dev_gpt.sample(n=10, replace=False, random_state=43)
dev_gpt_10_2.to_csv('...', index=False)

In [None]:
# Collapse the token-level test data into one row per sentence, this time
# keeping the full BIO tags, and preview the first rows.
test_xlm = sentence_xlm_output(token_df=test_full)
test_xlm.head(n=5)

In [None]:
# Pick the rows of the BIO-tagged frame whose sentence text occurs in the
# sampled GPT test set, keep one row per distinct sentence, and save them.
# NOTE(review): rows are matched on sentence text, so the result follows the
# order of test_xlm and can hold fewer rows than the sample when the sample
# contains repeated sentences.
sampled_sentences = test_gpt_4000_1['sentence']
in_sample = test_xlm['sentence'].isin(sampled_sentences)
test_xlm_4000_1 = test_xlm[in_sample].drop_duplicates(subset=['sentence'])
test_xlm_4000_1.to_csv(path_or_buf='...', index=False)