In [67]:
import json, re, time, uuid
import huggingface_hub
import transformers
from datasets import load_dataset

# Reformat the AdaptLLM "Headline" few-shot prompts into one JSON object per
# question/answer pair, written as JSON Lines (one compact object per line).

# Matches a question that starts with "Does" and runs up to the next "?".
questionPattern = re.compile(r'\bDoes[^?]*\?', re.IGNORECASE)

# Output files: pairs whose answer is present in the prompt vs. pairs without.
WITH_ANSWER_PATH = 'reformated_with_answer.jsonl'
NO_ANSWER_PATH = 'reformated_no_answer.jsonl'


def extract_qa_triples(text, pattern=questionPattern):
    """Split one prompt into ``(question, answer, context)`` triples.

    The prompt is read as blank-line-separated blocks. For each block:

    * the context is the first line of the block, cut just before the first
      ``'Does'`` when that line already contains the question;
    * the answer is the last three characters (stripped) of the line that
      precedes a blank line; the final line of the prompt yields ``''``
      because the last question is left unanswered.

    Questions are collected with ``pattern`` over the whole text and paired
    with the (answer, context) slots by position; surplus items on either
    side are dropped, exactly like ``zip``.

    Args:
        text: Raw prompt text of one dataset entry.
        pattern: Compiled regex used to find the questions.

    Returns:
        A list of ``(question, answer, context)`` string tuples.
    """
    records = text.strip().split('\n')
    questions = [match.group().strip() for match in pattern.finditer(text)]

    slots = []  # one (answer, context) pair per answer position
    context = ''
    at_block_start = True
    last = len(records) - 1
    for pos, record in enumerate(records):
        if at_block_start:
            # The headline is whatever precedes the question on the first
            # line of the block (or the whole line if no question is on it).
            cut = record.find('Does')
            context = (record if cut == -1 else record[:cut]).strip()
            at_block_start = False
        if record == '':
            at_block_start = True

        if pos < last and records[pos + 1] == '':
            # Line closing a block: the answer ("Yes"/"No") sits at its end.
            # Keep the context of *this* block with it, so earlier questions
            # are not paired with the headline of a later block.
            slots.append((record[-3:].strip(), context))
        elif pos == last:
            # Final line of the prompt: the question has no answer.
            slots.append(('', context))

    return [(question, answer, ctx)
            for question, (answer, ctx) in zip(questions, slots)]


def build_record(pair_id, question, answer, context, gold_index, class_id):
    """Return the output dictionary for a single question/answer pair."""
    # NOTE(review): gold_index / class_id are entry-level fields copied onto
    # every pair of the entry; presumably they describe only the final,
    # unanswered question - confirm against the dataset card.
    return {'id': pair_id,
            'Question': question,
            'Answer': answer,
            'Context': context,
            'Answer Options': ['Yes', 'No'],
            'Gold_index': gold_index,
            'Class_id': class_id}


def to_jsonl_line(record):
    """Serialize ``record`` as exactly one line of JSON Lines text.

    No ``indent`` is used: a pretty-printed object spans several lines and
    breaks line-oriented JSONL readers.
    """
    return json.dumps(record) + '\n'


if __name__ == "__main__":
    # Load the dataset
    ds = load_dataset("AdaptLLM/finance-tasks", "Headline")

    # To inspect the schema: print(ds['test'][0].keys()) and
    # print(ds['test'][0]['input'])

    start = time.time()
    QAPairNum = 0

    # Append mode keeps previously written data; both files are opened once
    # for the whole run instead of once per QA pair.
    with open(WITH_ANSWER_PATH, 'a', encoding='utf-8') as answered_file, \
            open(NO_ANSWER_PATH, 'a', encoding='utf-8') as unanswered_file:
        for entry in ds['test']:
            goldIndex = entry['gold_index']
            classId = entry['class_id']

            for question, answer, context in extract_qa_triples(entry['input']):
                data = build_record(QAPairNum, question, answer, context,
                                    goldIndex, classId)
                target = answered_file if answer != '' else unanswered_file
                target.write(to_jsonl_line(data))
                QAPairNum += 1

    print(f'Total number of QA pairs: {QAPairNum}')
    print(f'Time of Reformatting: {time.time() - start:.2f} seconds')


Total number of QA pairs: 123282
Time of Reformatting: 23.78 seconds
