# Merges consecutive sentences to permit longer contexts

In [1]:
import pandas as pd
from transformers import LlamaTokenizer
import os

file_dir = "/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/definite_horizon/augmented_standard/calendar_LLAMA"
new_dir = os.path.join(file_dir, "merged")

# Create the output directory. makedirs(exist_ok=True) avoids the
# check-then-create race and also creates any missing parent directories.
os.makedirs(new_dir, exist_ok=True)

print(new_dir)

cache_dir = "/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/toolformer/cache"

# Read the Hugging Face access token from the environment rather than keeping
# a credential in the source. When HF_TOKEN is unset this passes None, which
# is the library default.
tokenizer = LlamaTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    cache_dir=cache_dir,
    token=os.environ.get("HF_TOKEN"),
)
# Encoded fragments are concatenated below, so no BOS token should be
# prepended to each individual encode() call.
tokenizer.add_bos_token = False

# Token ids for a lone period; merge_sentences inserts them between two
# sentences when the earlier one does not already end with ".".
dot = tokenizer.encode(".")

# Upper bound (exclusive) on the number of tokens in one merged row.
max_context_length = 150

# NOTE(review): not referenced by the code visible in this notebook —
# presumably consumed elsewhere; confirm before removing.
search_files = {
    "Calculator": ["9.csv"],
    "Calendar": ["8.csv", "6.csv", "1.csv", "9.csv",],
    "WikiSearch": ["5.csv"]
}

def merge_sentences(input_df, *, text_tokenizer=None, max_tokens=None, separator_tokens=None):
    """Merge consecutive rows that share a ``url`` into longer text rows.

    Rows are visited in order. Token sequences of neighbouring rows with the
    same ``url`` are concatenated for as long as the combined length stays
    strictly below the token limit. A separator is inserted between two rows
    when the earlier row's text does not end with ".". A change of ``url`` or
    an overflow closes the current group and starts a new one. The final group
    is emitted after the last row. A single row that already exceeds the limit
    is emitted on its own.

    Args:
        input_df: DataFrame with at least ``url`` and ``text`` columns.
        text_tokenizer: Object providing ``encode(str)`` and ``decode(tokens)``.
            Defaults to the module-level ``tokenizer``.
        max_tokens: Exclusive upper bound on the token count of a merged row.
            Defaults to the module-level ``max_context_length``.
        separator_tokens: Tokens inserted between two merged rows when the
            earlier one lacks a trailing ".". Defaults to the module-level
            ``dot`` (or ``text_tokenizer.encode(".")`` when a custom tokenizer
            is supplied).

    Returns:
        A new DataFrame with the same columns as ``input_df``. Each output row
        carries the merged text plus the remaining column values of the last
        input row that went into it. An empty input yields an empty frame.
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer
    limit = max_context_length if max_tokens is None else max_tokens
    if separator_tokens is None:
        separator_tokens = dot if text_tokenizer is None else tok.encode(".")
    separator = list(separator_tokens)

    # Collect plain records and build the frame once at the end; repeated
    # pd.concat inside the loop is quadratic in the number of output rows.
    merged_records = []

    def _emit(source_row, tokens):
        # Store one merged row: metadata of source_row plus the decoded text.
        record = source_row.to_dict()
        record["text"] = tok.decode(tokens)
        merged_records.append(record)

    current_url = None
    buffer = []
    last_row = None

    for _, row in input_df.iterrows():
        tokens = list(tok.encode(row.text))

        if last_row is None:
            # First row: it always opens the first group.
            current_url, buffer = row.url, tokens
        elif row.url != current_url:
            # New document: close the running group and start afresh.
            _emit(last_row, buffer)
            current_url, buffer = row.url, tokens
        else:
            joiner = [] if last_row.text.endswith(".") else separator
            # Count the separator too so the merged row honours the limit.
            if len(buffer) + len(joiner) + len(tokens) < limit:
                buffer = buffer + joiner + tokens
            else:
                _emit(last_row, buffer)
                buffer = tokens

        last_row = row

    # Flush the trailing group; without this the last merged row is lost.
    if last_row is not None:
        _emit(last_row, buffer)

    return pd.DataFrame(merged_records, columns=input_df.columns)

# Merge every CSV in file_dir (skipping files whose name contains "stat")
# and write the result under the same name into new_dir.
csv_names = [
    name
    for name in os.listdir(file_dir)
    if name.endswith(".csv") and "stat" not in name
]
for file in csv_names:
    print(f"Merging {file}...")
    source_path = os.path.join(file_dir, file)
    target_path = os.path.join(new_dir, file)
    df = merge_sentences(pd.read_csv(source_path))
    df.to_csv(target_path, index=False)

  from .autonotebook import tqdm as notebook_tqdm


/vol/bitbucket/jg2619/augmenting_llms/augmented_data_pipeline/data/definite_horizon/augmented_prompttrick/wikiSearch_LLAMA/merged
Merging 1.csv...
Merging 0.csv...
Merging 2.csv...


In [2]:
# Sanity check: decode the concatenation of two separately encoded sentences.
first_ids = tokenizer.encode("Hello my name.")
second_ids = tokenizer.encode("My name is John.")
tokenizer.decode(first_ids + second_ids)

'Hello my name. My name is John.'

In [13]:
# Inspect the token ids the tokenizer produces for a period surrounded by spaces.
tokenizer.encode(" . ")

[29871, 869, 29871]