In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def find_abbreviation_full_form(sentence):
    """Map each parenthesised abbreviation in ``sentence`` to a guessed full form.

    Every ``(...)`` group is treated as a candidate abbreviation. For each one,
    the words in front of the opening bracket are collected right-to-left until
    ``is_complete_match`` accepts the abbreviation's letters against the letters
    of the collected words; that shortest accepted span becomes the full form.

    Args:
        sentence: Text to scan.

    Returns:
        dict mapping the raw text inside the brackets to the matched span of
        preceding words (joined with single spaces). Parentheticals for which
        no span is accepted are omitted; if the same bracket text occurs more
        than once, the last accepted occurrence wins.
    """
    non_letters = re.compile(r"[^A-Za-z]")
    abbr_dict = {}

    for match in re.finditer(r"\(([^)]+)\)", sentence):
        abbr = match.group(1)
        abbr_letters = non_letters.sub("", abbr).lower()
        if not abbr_letters:
            # e.g. "(2019)": is_complete_match rejects an empty abbreviation for
            # every span, so skip the backward scan entirely.
            continue

        # split() already ignores leading/trailing whitespace.
        words = sentence[:match.start()].split()

        # Grow the candidate span one word at a time towards the start of the
        # text, normalising only the newly added word on each step.
        candidate_letters = ""
        for start in range(len(words) - 1, -1, -1):
            candidate_letters = non_letters.sub("", words[start]).lower() + candidate_letters
            if is_complete_match(abbr_letters, candidate_letters):
                abbr_dict[abbr] = " ".join(words[start:])
                break

    return abbr_dict


def is_complete_match(sub, full):
    """Return True when ``sub`` can abbreviate ``full``.

    Two conditions must hold: the characters of ``sub`` occur in ``full`` in the
    same order (not necessarily adjacent), and ``full`` begins with the first
    character of ``sub``. An empty ``sub`` never matches.
    """
    if not sub:
        return False

    remaining = iter(full)
    for wanted in sub:
        # A membership test on an iterator consumes it up to and including the
        # first hit, so later characters must appear after earlier ones.
        if wanted not in remaining:
            return False

    return full.startswith(sub[0])

def process_q_row(qa_list):
    """Expand abbreviations in every entry of one row's Q&A list.

    The stringified list is scanned with ``find_abbreviation_full_form``; each
    abbreviation found is then replaced, wherever it occurs between word
    boundaries (including inside its own parentheses), by its full form in
    every entry.

    Args:
        qa_list: List of Q&A entries for one row.

    Returns:
        A new list; ``qa_list`` itself is left untouched. When no abbreviation
        is detected the entries are returned as they are; otherwise each entry
        is ``str(entry)`` with the abbreviations expanded.
    """
    abbr = find_abbreviation_full_form(str(qa_list))
    if not abbr:
        return list(qa_list)

    # Compile each whole-word pattern once instead of once per entry.
    replacements = [
        (re.compile(r'\b' + re.escape(abbr_key) + r'\b'), full_form)
        for abbr_key, full_form in abbr.items()
    ]

    expanded = []
    for entry in qa_list:
        text = str(entry)
        for pattern, full_form in replacements:
            # A callable replacement inserts full_form verbatim. Passing
            # re.escape(full_form) as the template leaked literal backslashes
            # (before spaces, hyphens, dots, ...) into the output text.
            text = pattern.sub(lambda _match, _full=full_form: _full, text)
        expanded.append(text)
    return expanded

In [3]:
# Read the processed Q&A JSON file into a DataFrame.
df_all = pd.read_json(path_or_buf='../QA_data/inf_processed_all.json')

In [4]:
# Run process_q_row on each row's "Q&A" value and keep the result in a new column.
df_all['New Q&A'] = df_all["Q&A"].apply(process_q_row)

In [5]:
# Spot-check: display the processed value stored under key 1 of the new column.
df_all['New Q&A'][1]

["{'Q': 'What is the purpose of the study presented in the paper?', 'A': 'The purpose of the study is to evaluate the potential of a novel alcohol sensor based on nanosized SrCO3 for the detection of ethanol vapor.'}",
 "{'Q': 'What is the mechanism of chemiluminescence (chemiluminescence) from the catalytic oxidation of ethanol on SrCO3?', 'A': 'The chemiluminescence mechanism involves the oxidation of ethanol to produce acetaldehyde and ethylene, which are then converted to carbon dioxide and water. The chemiluminescence is attributed to the oxidation of these intermediates by SrCO3.'}",
 "{'Q': 'What are the advantages of using nanosized SrCO3 as a sensing material?', 'A': 'The advantages of using nanosized SrCO3 include its high activity, good selectivity, and small size, which make it suitable for detecting small concentrations of gas molecules.'}",
 "{'Q': 'How does the temperature dependence of the chemiluminescence intensity affect the performance of the sensor?', 'A': 'The tem

In [6]:
# Write the full DataFrame to CSV, leaving out the index column.
df_all.to_csv(path_or_buf='abbr_complement.csv', index=False)

In [None]:
# Export the DataFrame as JSON next to the input file.
df_all.to_json(path_or_buf='../QA_data/inf_processed_all_new.json')

In [49]:
# Keep the first 10 rows as a small sample and write it to test.csv without the index.
df_test = df_all.head(n=10)
df_test.to_csv(path_or_buf='test.csv', index=False)