In [108]:
import json
import os
from typing import Dict, List, Tuple

import pandas as pd
import polars as pl

# Keys every PubMed record is expected to expose, in this exact order:
# validate_json_pubmed_schema compares this list against the record's key
# list, so both the names and their ordering matter.
EXPECTED_COL_SCHEMA = ['pmid', 'year', 'abstract_text', 'abstract_title', 'abstract_authors_list']

# Folder that is scanned for raw PubMed ``.json`` files by the loading call below.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
directory = '/Users/luis.morales/personal-dev/SciFinder/data/raw/pubmed/'

def validate_json_pubmed_schema(single_pubmed_json: Dict[str, str]) -> bool:
    """Check one PubMed record against the expected column schema.

    The record's keys, taken in the dictionary's own iteration order, are
    compared as a list with ``EXPECTED_COL_SCHEMA``; both the key names and
    their order therefore have to match.

    Args:
        single_pubmed_json: A single record dictionary.

    Returns:
        True when the key list equals ``EXPECTED_COL_SCHEMA``, False otherwise.
    """
    observed_keys = [*single_pubmed_json.keys()]
    return observed_keys == EXPECTED_COL_SCHEMA


def validate_pubmed_structure(json_files: List[Dict[str, str]]) -> List[int]:
    """Return the positions of the records whose schema is invalid.

    Each element of ``json_files`` is checked with
    ``validate_json_pubmed_schema``.

    Args:
        json_files: Record dictionaries to check.

    Returns:
        The zero-based indices of the records that failed validation, in
        ascending order. The list is empty when every record passes or when
        ``json_files`` itself is empty.
    """
    # Scan the whole list so that failures after the first record are
    # reported too, and so that an empty input still yields a list.
    return [
        idx
        for idx, json_file in enumerate(json_files)
        if not validate_json_pubmed_schema(json_file)
    ]


def load_json(file_path):
    """Read and parse a JSON file, returning ``None`` when it cannot be loaded.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, or ``None`` if the file does not exist or
        does not contain valid JSON. In both failure cases a message is
        printed instead of raising.
    """
    try:
        # Decode explicitly as UTF-8 so that non-ASCII text is read the same
        # way regardless of the platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: The file at {file_path} is not a valid JSON file.")
        return None



def load_pubmed_json_files(directory: str) -> Tuple[List[Dict], List[Dict]]:
    """Load every ``.json`` file in a directory and validate its records.

    Args:
        directory: Folder to scan. Only its direct entries whose name ends
            with ``.json`` are read.

    Returns:
        A ``(data, data_error_logs)`` tuple. ``data`` is one flat list with
        the contents of all loaded files. ``data_error_logs`` holds one
        ``{filepath: [indices]}`` dictionary per file in which
        ``validate_pubmed_structure`` reported failing records.
    """
    data = []
    data_error_logs = []

    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of the loaded records reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(directory, filename)
        file_data = load_json(filepath)
        if file_data is None:
            # load_json already printed why the file could not be loaded;
            # extending the list with None would raise TypeError.
            continue

        data.extend(file_data)

        error_logs = validate_pubmed_structure(file_data)
        # Only a non-empty list of failing indices is worth logging.
        if error_logs:
            data_error_logs.append({filepath: error_logs})

    return data, data_error_logs

# Load every JSON file found in `directory`: the first result is the flat
# list of loaded records, the second the per-file validation error logs.
data_parsed, error_logs = load_pubmed_json_files(directory)

# Report what was loaded.
# NOTE(review): len(data_parsed) counts individual records (each file's
# contents are appended to one flat list), not files, even though the message
# below says "JSON files".
print(f"Loaded {len(data_parsed)} JSON files.")
# One log entry is added per file with reported validation problems.
print(f"Errors catched: {len(error_logs)}")

Loaded 28836 JSON files.
Errors catched: 0


In [107]:
# Build a polars DataFrame from the parsed records and display it as the
# cell output.
# NOTE(review): this cell's execution count (107) is lower than the loading
# cell's (108) — re-run the notebook top to bottom to confirm the output.
pl.DataFrame(data_parsed,orient='row')

pmid,year,abstract_text,abstract_title,abstract_authors_list
i64,i64,str,str,list[str]
35931,1979,"""1. Of the scarce Calyptra minu…","""Skin-piercing blood-sucking mo…","[""Bänziger""]"
35932,1979,"""The formation of the peritroph…","""Peritrophic membrane formation…","[""Houk"", ""Obie"", ""Hardy""]"
35933,1979,"""The source of blood meals from…","""The feeding habits and ecology…","[""Snow"", ""Boreham""]"
35934,1979,"""Activation of complement by pa…","""Complement activation by paras…","[""Santoro"", ""Bernal"", ""Capron""]"
35935,1979,"""This paper presents evidence t…","""Tsetse movement in wind fields…","[""Molyneux"", ""Baldry"", ""Fairhurst""]"
…,…,…,…,…
45951,1975,"""A general mathematical formula…","""Detection of neuroelectric sig…","[""Nahvi"", ""Woody"", … ""Sharafat""]"
45953,1975,"""Direct recording from the stri…","""Direct depth recording of the …","[""Salzarule"", ""Liary"", … ""Stenal""]"
45954,1975,"""This study concerns a case of …","""Disorganized relations of toni…","[""De Barros-Ferreira"", ""Chodkiewicz"", … ""Salzarulo""]"
45955,1975,"""The effect of ochratoxin alone…","""Effect of ochratoxin and aflat…","[""Richard"", ""Thurston"", … ""Booth""]"
