In [None]:
%pip install textstat
%pip install transformers

In [None]:
# Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from textstat import flesch_reading_ease
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [None]:
def sentence_gpt_output(token_df):
    """Group a token-per-row BIO frame into one row per sentence.

    Rows whose ``id`` value starts with ``"# id"`` are sentence separators;
    every other row contributes one token (column ``id``) and its tag
    (column ``domain``).

    Parameters
    ----------
    token_df : pandas.DataFrame
        Frame with the columns ``id`` (token text or separator row) and
        ``domain`` (tag such as ``B-Artist``, ``I-Artist`` or ``O``).

    Returns
    -------
    pandas.DataFrame
        One row per sentence with the columns

        * ``sentence``    - the tokens joined by single spaces,
        * ``tags``        - one tag per token, ``B-``/``I-`` prefix removed,
        * ``fewshot_gpt`` - entity strings ``"<tag> (<tokens>)"`` in order of
          appearance.

    Notes
    -----
    * A separator in the very first row does not emit an empty sentence.
    * An entity that is still open when a sentence (or the data) ends is
      attached to that sentence.
    * An ``I-`` tag with no open entity starts a new entity instead of
      failing or inheriting the tag of an earlier, already closed entity.
    """
    sentences = []
    tags = []
    ne_list = []

    # State of the sentence that is currently being assembled.
    words = []
    sent_tags = []
    tag_tokens = []
    curr_tag = None
    curr_tokens = []

    def finish_sentence():
        # Close the entity that is still open, then store the sentence.
        if curr_tokens:
            tag_tokens.append(curr_tag + ' (' + ' '.join(curr_tokens) + ')')
        sentences.append(' '.join(words).strip())
        tags.append(sent_tags)
        ne_list.append(tag_tokens)

    for i, (word, tag) in enumerate(zip(token_df['id'], token_df['domain'])):
        token = str(word)

        if token.startswith("# id"):
            # A separator in the first row has no previous sentence to store.
            if i > 0:
                finish_sentence()
            words, sent_tags, tag_tokens = [], [], []
            curr_tag, curr_tokens = None, []
            continue

        words.append(token)
        label = str(tag)

        if label.startswith('B-'):
            # A new entity begins: store the one that was open, if any.
            if curr_tokens:
                tag_tokens.append(curr_tag + ' (' + ' '.join(curr_tokens) + ')')
            curr_tag = label[2:]
            curr_tokens = [token]
            sent_tags.append(curr_tag)

        elif label.startswith('I-'):
            if not curr_tokens:
                # No entity is open, so this token starts one itself.
                curr_tag = label[2:]
            curr_tokens.append(token)
            sent_tags.append(label[2:])

        else:
            # Any other tag ends the open entity and is recorded unchanged.
            if curr_tokens:
                tag_tokens.append(curr_tag + ' (' + ' '.join(curr_tokens) + ')')
            curr_tag, curr_tokens = None, []
            sent_tags.append(tag)

    # The data does not end with a separator, so store the last sentence here.
    finish_sentence()

    sentence_df = pd.DataFrame({'sentence': sentences, 'tags': tags, 'fewshot_gpt': ne_list})
    return sentence_df



In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Load the pre-trained BERT tokenizer and encoder once; the checkpoint name is
# kept in a single constant so both always come from the same checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Define a function to tokenize and encode a sentence
def encode_sentence(sentence, max_length=512):
    """Turn a sentence into a batch of BERT input IDs.

    Parameters
    ----------
    sentence : str
        Text to tokenize with the module-level ``tokenizer``.
    max_length : int, default 512
        Upper bound on the number of IDs, including ``[CLS]`` and ``[SEP]``.
        Word pieces beyond the limit are dropped. The default matches the
        512 positions that ``bert-base-uncased`` supports; longer inputs
        would fail inside the model.

    Returns
    -------
    torch.Tensor
        Integer tensor of shape ``(1, n_ids)`` holding the IDs of
        ``[CLS]``, the word pieces and ``[SEP]``.
    """
    # Reserve two slots for the special tokens added below.
    piece_budget = max(max_length - 2, 0)
    tokens = tokenizer.tokenize(sentence)[:piece_budget]
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    # Convert the tokens to their corresponding IDs in the BERT vocabulary
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # unsqueeze(0) adds the leading batch dimension
    return torch.tensor(input_ids).unsqueeze(0)


def sentence_tag_vector(dataframe, verbose=False):
    """Build one feature vector per sentence from BERT and the entity tags.

    Each vector is the ``[CLS]`` embedding taken from the last hidden state of
    the module-level ``model``, followed by the sentence's tag IDs padded on
    the right with 0 up to the longest tag sequence in ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``sentence`` (text) and ``tags`` (one list of
        tag names per sentence; every name must be a key of ``tag_dict``).
    verbose : bool, default False
        When True, print the tag vectors, the padding length, the padded
        tensor and the last 30 values of every feature vector. Off by default
        because the output grows with the number of sentences.

    Returns
    -------
    numpy.ndarray
        2-D float array with one row per sentence: the ``[CLS]`` embedding
        columns followed by the padded tag-ID columns.

    Raises
    ------
    ValueError
        If ``dataframe`` has no rows.
    KeyError
        If a tag is not present in ``tag_dict``.

    Notes
    -----
    The padding value 0 is also the ID of the ``O`` tag, and the number of
    tag columns depends on the longest sentence in ``dataframe``, so vectors
    from separate calls can have different widths.
    """
    # Define a dictionary to map named entity tags to integers
    tag_dict = {'O': 0, 'Facility': 1, 'OtherLOC': 2, 'HumanSettlement': 3, 'Station': 4, 'VisualWork': 5, 'MusicalWork': 6,
                'WrittenWork': 7, 'ArtWork': 8, 'Software': 9, 'MusicalGRP': 10, 'PublicCorp': 11, 'PrivateCorp': 12,
                'AerospaceManufacturer': 13, 'SportsGRP': 14, 'CarManufacturer': 15, 'ORG': 16, 'Scientist': 17, 'Artist': 18,
                'Athlete': 19, 'Politician': 20, 'Cleric': 21, 'SportsManager': 22, 'OtherPER': 23, 'Clothing': 24, 'Vehicle': 25,
                'Food': 26, 'Drink': 27, 'OtherPROD': 28, 'Medication/Vaccine': 29, 'MedicalProcedure': 30, 'AnatomicalStructure': 31,
                'Symptom': 32, 'Disease': 33}

    sentences = dataframe['sentence'].tolist()
    sentence_tags = dataframe['tags'].tolist()
    if not sentences:
        raise ValueError('sentence_tag_vector needs at least one sentence')

    # Run every sentence through BERT and keep the embedding at position 0,
    # i.e. the [CLS] token that encode_sentence puts first.
    cls_rows = []
    with torch.no_grad():
        for sentence in sentences:
            output = model(encode_sentence(sentence))
            last_hidden_state = output[0]
            cls_rows.append(last_hidden_state[:, 0, :])
    # Each entry has a leading batch dimension of 1, so concatenating along
    # dim 0 gives one row per sentence.
    cls_embeddings = torch.cat(cls_rows, dim=0)

    # Convert the named entity tags to vectors of integers using the tag dictionary
    tag_vectors = [[tag_dict[tag] for tag in tags] for tags in sentence_tags]

    # Pad every tag vector with 0 to the length of the longest one
    max_length = max(len(vec) for vec in tag_vectors)
    padded_vectors = torch.tensor(
        [vec + [0] * (max_length - len(vec)) for vec in tag_vectors],
        # Same dtype as the embeddings so the concatenation needs no implicit promotion.
        dtype=cls_embeddings.dtype,
    )

    # Combine the BERT embeddings and named entity tag vectors to create the final feature vectors
    feature_vectors = torch.cat((cls_embeddings, padded_vectors), dim=1).numpy()

    if verbose:
        print('tag_vectors:', tag_vectors)
        print('max length:', max_length)
        print('padded_vectors:', padded_vectors)
        for emb in feature_vectors:
            print('last digits vector:', emb[-30:])

    return feature_vectors


In [None]:
# Load the training data file.
# It must parse into a DataFrame with the columns "id" and "domain", where each
# row holds one token in "id" and the corresponding BIO-scheme tag in "domain".

train = pd.read_csv("...")

# Fail early if the file does not have the layout sentence_gpt_output() reads.
assert {'id', 'domain'} <= set(train.columns), 'expected the columns "id" and "domain"'

train.head()

In [None]:
# Load the dev data file.
# It must parse into a DataFrame with the columns "id" and "domain", where each
# row holds one token in "id" and the corresponding BIO-scheme tag in "domain".

dev = pd.read_csv("...")

# Fail early if the file does not have the layout sentence_gpt_output() reads.
assert {'id', 'domain'} <= set(dev.columns), 'expected the columns "id" and "domain"'

dev.head()

In [None]:
# Collapse the token-per-row train and dev frames into one row per sentence.
train_gpt = sentence_gpt_output(train)
dev_gpt = sentence_gpt_output(dev)

In [None]:
# Stack train on top of dev. ignore_index=True renumbers the rows from 0, so
# the train sentences occupy the first len(train_gpt) positions.
train_dev_df = pd.concat([train_gpt, dev_gpt], ignore_index = True)
train_dev_df

In [None]:
# Build the feature vectors for train and dev in a single call, so every row
# is padded to the same tag-vector length.
train_dev_vector = sentence_tag_vector(train_dev_df)

In [None]:
# Turn the array into nested Python lists, one list per sentence.
train_dev_vector_list = train_dev_vector.tolist()

In [None]:
# Pair every sentence with its feature vector.
train_dev_embeddings = pd.DataFrame({'sentence': train_dev_df['sentence'], 'embeddings': train_dev_vector_list})

In [None]:
# Each embedding list ends up in the CSV as text; convert_str_to_list() parses
# it back after reloading.
train_dev_embeddings.to_csv('...', index=False)

In [None]:
# Build the feature vectors for the dev sentences only.
# NOTE(review): the tag part is padded to the longest sentence of this call,
# so these vectors can be narrower than the train+dev vectors - confirm that
# this is intended before comparing the two.
dev_vector = sentence_tag_vector(dev_gpt)

In [None]:
# Turn the array into nested Python lists, one list per sentence.
dev_vector_list = dev_vector.tolist()

In [None]:
# Pair every dev sentence with its feature vector.
dev_embeddings = pd.DataFrame({'sentence': dev_gpt['sentence'], 'embeddings': dev_vector_list})

In [None]:
# Each embedding list ends up in the CSV as text.
dev_embeddings.to_csv('...', index=False)

In [None]:
def convert_str_to_list(df):
    """Parse the stringified ``embeddings`` column back into Python lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``sentence`` and ``embeddings``, where every
        ``embeddings`` value is the text form of a list of numbers
        (e.g. ``"[0.1, -2.0]"``), as produced by writing lists to CSV.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``sentence`` (taken from ``df``, index
        preserved) and ``embeddings`` (the parsed lists).

    Raises
    ------
    ValueError, SyntaxError
        If a value is not a plain Python literal.
    """
    # literal_eval only accepts literals, so text from the file can never run
    # code the way eval() would.
    import ast

    embeddings = [ast.literal_eval(text) for text in df['embeddings']]

    embed_df = pd.DataFrame({'sentence': df['sentence'], 'embeddings': embeddings})

    return embed_df

In [None]:
# Load the train_dev_embeddings
# (presumably the file written from train_dev_embeddings above - verify the path)
df_all = pd.read_csv('train_dev_embeddings.csv')

# train_dev_df is concat([train_gpt, dev_gpt]), so the first len(train_gpt)
# rows are train and the following len(dev_gpt) rows are dev. Deriving the
# split from the frames keeps it correct when the data changes.
n_train = len(train_gpt)
n_dev = len(dev_gpt)
assert len(df_all) == n_train + n_dev, (
    f'expected {n_train + n_dev} embedding rows, found {len(df_all)}'
)

df_train_emb = convert_str_to_list(df_all.iloc[:n_train])
df_dev_emb = convert_str_to_list(df_all.iloc[n_train:n_train + n_dev])

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Stack the per-sentence embedding lists into 2-D float arrays (one row per
# sentence). The lists are already numeric, so no detour through
# comma-separated strings is needed.
embeddings1 = np.array(df_dev_emb['embeddings'].tolist(), dtype=float)
embeddings2 = np.array(df_train_emb['embeddings'].tolist(), dtype=float)

# Cosine similarity between every dev sentence (rows) and every train sentence (columns)
cos_sim_matrix = cosine_similarity(embeddings1, embeddings2)

# For each dev sentence, the position of the most similar train sentence
most_similar_index = np.argmax(cos_sim_matrix, axis=1)

# Look the positions up in the train frame to get the sentence text
most_similar_sentences = df_train_emb.iloc[most_similar_index]['sentence'].values

# Store the best-matching train sentence next to each dev sentence
df_dev_emb['most_similar_sentences'] = most_similar_sentences


In [None]:
top_sentences_1 = df_dev_emb['most_similar_sentences'].value_counts()[0:1].index.tolist()

In [None]:
vs_train_1 = train_gpt[train_gpt['sentence'].isin(top_sentences_1)]

In [None]:
vs_train_1.to_csv('...', index=False)

In [None]:
top_sentences_5 = df_dev_emb['most_similar_sentences'].value_counts()[0:5].index.tolist()

In [None]:
vs_train_5 = train_gpt[train_gpt['sentence'].isin(top_sentences_5)]

In [None]:
vs_train_5.to_csv('...', index=False)

In [None]:
top_sentences_10 = df_dev_emb['most_similar_sentences'].value_counts()[0:10].index.tolist()

In [None]:
vs_train_10 = train_gpt[train_gpt['sentence'].isin(top_sentences_10)]

In [None]:
vs_train_10.to_csv('...', index=False)