## Prepare a dataframe from JSON files

In [23]:
# Build one record per article JSON file, then assemble them into a DataFrame.

# Path to the folder containing JSON documents
folder_path = "./json/articles/"


def _join_sentences(items):
    """Flatten an ``abstract`` / ``body_text`` field into a single string.

    A list is treated as a sequence of mappings whose "sentence" values are
    joined with single spaces; a string is returned unchanged. Any other
    type, or a list whose items lack a usable "sentence" entry, yields "".
    """
    if isinstance(items, str):
        return items
    if isinstance(items, list):
        try:
            return " ".join(item["sentence"] for item in items)
        except (KeyError, TypeError):
            # Malformed entry (missing key, non-mapping item, or non-string
            # sentence): fall back to an empty field rather than aborting
            # the whole load.
            return ""
    return ""


# Define a list to store the extracted data
data = []

# sorted() makes the row order reproducible; os.listdir() returns entries in
# arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue

    # JSON is UTF-8 by specification; do not depend on the platform's
    # default locale encoding.
    with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as json_file:
        document = json.load(json_file)

    # A missing or null "metadata" object yields an empty title instead of
    # raising KeyError.
    metadata = document.get("metadata") or {}

    # Abstract and body text go through the same tolerant extraction, so a
    # single malformed document cannot abort the run.
    data.append({
        "DocID": document.get("docId", ""),
        "Title": metadata.get("title", ""),
        "Abstract": _join_sentences(document.get("abstract", [])),
        "BodyText": _join_sentences(document.get("body_text", [])),
    })

# Create a DataFrame from the extracted data
df = pd.DataFrame(data)


In [24]:
# Show the assembled DataFrame as the cell's output (the rendering elides the middle rows).
df

Unnamed: 0,DocID,Title,Abstract,BodyText
0,S0001457513002972,A system of safety management practices and wo...,Objective The overall research objective was t...,"In particular, the individual worker interface..."
1,S0001457513004806,Network-level accident-mapping: Distance based...,The objective of an accident-mapping algorithm...,The location coordinates of the accident is de...
2,S0001457514003091,Measuring errors and violations on the road: A...,The Driver Behavior Questionnaire (DBQ) is a s...,"Briefly, the Cohort II study randomly sampled ..."
3,S0001457515001098,Operating under the influence: Three year reci...,Operating a motor vehicle under the influence ...,"In addition to loss of life, the economic cost..."
4,S000145751500127X,Real-time driver drowsiness feedback improves ...,Driver drowsiness has been implicated as a maj...,"However, the effects of feedback on other appr..."
...,...,...,...,...
40086,S8756328219302856,PYY is a negative regulator of bone mass and s...,Objective: Bone loss in anorexia nervosa and f...,"Scans were performed at 50 kV, 200 μA, 0.5 mm ..."
40087,S8756328219304004,Development of protocols for the first serial ...,There is an unmet need for a high-resolution t...,We have generated and compared SBF SEM data fr...
40088,S8756328219304715,Overexpression of Pitx1 attenuates the senesce...,To explore the role of low expression of Pitx1...,"However, this process needs to be confirmed. C..."
40089,S8756328219304739,"The effect of pubertal timing, as reflected by...",Objective: To examine the relationship between...,"In terms of limitations, whereas timing of the..."


### Save the DataFrame as CSV for easier access later

In [25]:
# Write the DataFrame to a CSV file in the working directory, leaving out the row index.
df.to_csv(path_or_buf="elsevier_corpus.csv", index=False)