# Notebook to clean data

In [2]:
import json

# Directory holding the answer files; they are read from and written back to
# this location by the processing loop further down in this cell.
source_path = "../data/vicuna_generations/rsis/answers"
# Answer files to process, one per model configuration.
source_filenames = [
    "rsis_vicuna-13b-v1_vicuna-7b-v1.1_temp1_answers.json",
    "rsis_vicuna-13b-v1_vicuna-13b-v1.1_temp1_answers.json",
    "rsis_vicuna-13b-v1_gpt-3.5-turbo_temp1_answers.json"
]
# Question file whose "output" and "context" fields are matched against the
# answer entries.
target_path = "../data/vicuna_generations/rsis/questions"
target_filename = "rsis_vicuna-13b-v1.1_100_questions.json"

# First answer file; not referenced by the rest of the code visible here.
source_filename = source_filenames[0]

# Getting json from a file
def open_file(path:str, filename:str) -> list[dict]:
    """Load and return the JSON content of ``path/filename``.

    Args:
        path: Directory containing the file.
        filename: Name of the JSON file inside ``path``.

    Returns:
        The object produced by ``json.load`` (annotated as a list of dicts).
    """
    # JSON text is UTF-8 by specification; passing the encoding explicitly
    # avoids depending on the platform's default (e.g. cp1252 on Windows).
    with open(f"{path}/{filename}", "r", encoding="utf-8") as f:
        return json.load(f)

def save_file(path:str, filename:str, data:list[dict]) -> None:
    """Serialize ``data`` as JSON (2-space indent) to ``path/filename``.

    An existing file at that location is overwritten.

    Args:
        path: Directory to write into.
        filename: Name of the file inside ``path``.
        data: JSON-serializable content to store.
    """
    # Explicit UTF-8 keeps the written file independent of the platform's
    # default encoding.
    with open(f"{path}/{filename}", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

In [3]:

def match_target_data(question:str, target_data:list[dict]) -> int:
    """Return the index of the first entry whose "output" contains ``question``.

    Returns -1 when no entry matches.
    """
    hits = (
        index
        for index, entry in enumerate(target_data)
        if question in entry["output"]
    )
    return next(hits, -1)

def split_context(context:str) -> list[str]:
    """Split ``context`` into sentences on the ". " separator.

    The period consumed by the split is restored on every fragment except the
    last one: the last fragment was not cut at a separator, so it still carries
    whatever punctuation it ended with. Appending a period there would turn
    "B." into "B..".

    Args:
        context: Text to split.

    Returns:
        The list of sentences; an empty list when ``context`` is empty.
    """
    *leading, last = context.split(". ")
    sentences = [sentence + "." for sentence in leading]
    # Skip an empty tail (empty input or text ending in ". ") so that no
    # blank pseudo-sentence is produced.
    if last:
        sentences.append(last)
    return sentences

def match_source_data(source_data:list[dict], target_data:list[dict]) -> list[dict]:
    """Attach the matching target context to each source entry.

    For every entry of ``source_data`` whose "question" is found in
    ``target_data`` (via ``match_target_data``), the entry's "context" is set
    to the sentence-split, stripped context of the matching target entry, and
    blank strings are removed from its "output" list. Entries without a match
    are left untouched. Entries are modified in place and the same list is
    returned.
    """
    for entry in source_data:
        index = match_target_data(entry["question"], target_data)

        # -1 means no target entry contains this question.
        if index == -1:
            continue

        raw_context = target_data[index]["context"]
        entry["context"] = split_context(raw_context.strip())
        entry["output"] = [item for item in entry["output"] if item.strip() != ""]

    return source_data

# Load the question file once; it is matched against every answer file.
target_data = open_file(target_path, target_filename)

# Enrich each answer file and write the result back over the same file.
for filename in source_filenames:
    source_data = open_file(source_path, filename)
    source_data = match_source_data(source_data, target_data)
    save_file(source_path, filename, source_data)

#### Remove non-questions, blank entries, and flagged answers

In [None]:
from typing import List, Dict
import glob
import json
from QaGeneration import ensure_string

def is_question(keyword:str, data:Dict) -> bool:
    """Return True when ``data`` has ``keyword`` and its value contains "?"."""
    return keyword in data and "?" in data[keyword]

def is_list_match(keyword:str, target_list:List[str], data:Dict) -> bool:
    """Return True when the value under ``keyword`` contains any target string.

    The value is first passed through ``ensure_string``. Returns False when
    ``keyword`` is not present in ``data`` or when no target occurs in the text.
    """
    if keyword not in data:
        return False

    text = ensure_string(data[keyword])
    return any(target in text for target in target_list)

def contains_blank(data:Dict) -> bool:
    """Return True when any value in ``data`` is considered blank.

    A value is blank when it is an empty ``str``, an ``int`` equal to 0, a
    ``float`` equal to 0.0, or a list that contains 0. Scalar checks use the
    exact type, so ``bool`` values are never treated as blank on their own.
    """
    blank_by_type = {str: "", int: 0, float: 0.0}

    for value in data.values():
        exact_type = type(value)
        if exact_type in blank_by_type:
            if value == blank_by_type[exact_type]:
                return True
        elif isinstance(value, list) and 0 in value:
            return True

    return False

def get_file_path(file_directories: List[str]) -> List[str]:
    """Collect the paths of all ``*.json`` files directly inside each directory.

    Results are concatenated in the order the directories are given.
    """
    return [
        json_path
        for directory in file_directories
        for json_path in glob.glob(f"{directory}/*.json")
    ]

# Directories scanned for *.json files; only the uncommented entries are used.
file_directories = [
    # "../data/generations/nyt",
    "../data/generations/rsis",
    # "../data/generations/straitstimes"
]

# Every JSON file found directly inside the directories above.
file_paths = get_file_path(file_directories)

# Substrings that cause an entry to be dropped when found in one of the
# fields named in keyword_list.
flagged_words = [
    "I'm sorry",
    "as an AI language model"
]

# Entry fields that are checked for flagged_words.
keyword_list = [
    "point_form_close_book_answer",
    "close_book_answer"
]


# Clean every file in place: keep only entries that are questions, contain no
# blank values, and have no flagged phrase in any of the checked fields.
for file in file_paths:
    print(file)
    with open(file, "r", encoding="utf-8") as f:
        dataset = json.load(f)

    # Each step filters the result of the previous one. Filtering the original
    # `dataset` every time would discard all earlier filters and leave only the
    # last one in effect.
    new_dataset = [data for data in dataset if is_question(keyword="question", data=data)]
    new_dataset = [data for data in new_dataset if not contains_blank(data)]
    for keyword in keyword_list:
        new_dataset = [data for data in new_dataset if not is_list_match(keyword=keyword, target_list=flagged_words, data=data)]

    # Overwrite the source file with the filtered entries.
    with open(file, "w", encoding="utf-8") as f:
        json.dump(new_dataset, f, indent=2)
