In [6]:
import os
import glob
import json
import csv
from tqdm import tqdm

root_path_folder =  '/hps/nobackup/literature/text-mining/'
dates = ["01_10_2023", "02_10_2023", "03_10_2023", "04_10_2023",
         "27_09_2023", "28_09_2023", "29_09_2023", "30_09_2023"]

# Define the path for the original and comparison folder
original_path = "daily_pipeline_api/{}/fulltext/json_api"
comparison_path = "mlfp_prod/daily_pipeline_api/{}/fulltext/json_api"

def get_entities_and_count(json_path):
    count_json_lines = 0
    entities = {"disease": set(), "gene_protein": set(), "organism": set()}
    pmcid_data = {}
    with open(json_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            json_line = json.loads(line)
            count_json_lines += 1

            try:
                pmcid = json_line['pmcid']
            except:
                pmcid = '--'

            for each_anno in json_line.get('anns', []):
                entity_type = each_anno['type']
                if entity_type in entities:
                    if pmcid not in pmcid_data:
                        pmcid_data[pmcid] = {"disease": set(), "gene_protein": set(), "organism": set()}
                    pmcid_data[pmcid][entity_type].add(each_anno['exact'])
                    
    return count_json_lines, pmcid_data

with open('MLFP_output1.csv', 'w', newline='') as output1, open('MLFP_output2.csv', 'w', newline='') as output2:
    output1_writer = csv.writer(output1)
    output1_writer.writerow(["file_name", "count_original", "count_comparison"])
    output2_writer = csv.writer(output2, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    output2_writer.writerow(["File_name", "PMCID", "Entity_Type", "Original_set", "Comparison_set", "Filtered_set"])
    
    for date in tqdm(dates):
        orig_folder = os.path.join(root_path_folder, original_path.format(date))
        comp_folder = os.path.join(root_path_folder, comparison_path.format(date))

        for orig_file in glob.glob(os.path.join(orig_folder, "patch-*.json")):
            comp_file = orig_file.replace(orig_folder, comp_folder)

            orig_count, orig_pmcid_data = get_entities_and_count(orig_file)
            comp_count, comp_pmcid_data = get_entities_and_count(comp_file)

            # Write counts to output1
            file_name = os.path.basename(orig_file)
            output1_writer.writerow([file_name, orig_count, comp_count])

            # Write entities to output2
            for pmcid, orig_entities in orig_pmcid_data.items():
                for entity_type, orig_set in orig_entities.items():
                    comp_set = comp_pmcid_data.get(pmcid, {}).get(entity_type, set())
                    filtered_set = orig_set - comp_set
                    output2_writer.writerow([file_name, pmcid, entity_type, 
                                             ";".join(orig_set), ";".join(comp_set), ";".join(filtered_set)])


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [05:12<00:00, 39.09s/it]


In [4]:
import pandas as pd

# Read the output2.csv
df = pd.read_csv('MLFP_output2.csv')

# Group by file_name and Entity_Type
grouped = df.groupby(['File_name', 'Entity_Type'])

# Sample 2 examples from each group
sampled_data = grouped.apply(lambda x: x.sample(n=2, replace=True)).reset_index(drop=True)

# Display the sampled data
print(sampled_data)

# Optionally, if you want to save the sampled data to a new CSV
sampled_data.to_csv('sampled_output.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

                       File_name     PMCID   Entity_Type  \
0    patch-01-10-2023-0.api.json   6178632       disease   
1    patch-01-10-2023-0.api.json   2518493       disease   
2    patch-01-10-2023-0.api.json   6181134  gene_protein   
3    patch-01-10-2023-0.api.json   6196371  gene_protein   
4    patch-01-10-2023-0.api.json   2927527      organism   
..                           ...       ...           ...   
895  patch-30-09-2023-9.api.json  10523357       disease   
896  patch-30-09-2023-9.api.json  10529583  gene_protein   
897  patch-30-09-2023-9.api.json  10526053  gene_protein   
898  patch-30-09-2023-9.api.json  10532341      organism   
899  patch-30-09-2023-9.api.json  10526998      organism   

                                          Original_set  \
0    cancer;defect;heart failure;Pulmonary embolism...   
1    neonatal diabetes;glucose intolerance;pancreat...   
2                                         US2;SES;US34   
3    PRN;fimbrial proteins;Th1;IL-5;IL-17;inter

In [5]:
# Filter out rows where both Original_set and Comparison_set are empty
df = df[df['Original_set'].notna() | df['Comparison_set'].notna()]

# Group by file_name and Entity_Type
grouped = df.groupby(['File_name', 'Entity_Type'])

# Sample 2 examples from each group
sampled_data = grouped.apply(lambda x: x.sample(n=2, replace=True) if len(x) > 1 else x).reset_index(drop=True)

# Display the sampled data
print(sampled_data)

# Optionally, if you want to save the sampled data to a new CSV
sampled_data.to_csv('sampled_output_filtered.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

                       File_name     PMCID   Entity_Type  \
0    patch-01-10-2023-0.api.json   6188581       disease   
1    patch-01-10-2023-0.api.json   6182251       disease   
2    patch-01-10-2023-0.api.json   6199132  gene_protein   
3    patch-01-10-2023-0.api.json   3186338  gene_protein   
4    patch-01-10-2023-0.api.json   6191844      organism   
..                           ...       ...           ...   
895  patch-30-09-2023-9.api.json  10539101       disease   
896  patch-30-09-2023-9.api.json  10536908  gene_protein   
897  patch-30-09-2023-9.api.json  10523421  gene_protein   
898  patch-30-09-2023-9.api.json  10532140      organism   
899  patch-30-09-2023-9.api.json  10527303      organism   

                                          Original_set  \
0    multiple sclerosis;neurological disease;Shakin...   
1                                               reflux   
2                                    LCB2;LCB1;LC3;stg   
3    P-selectin;Integrin αIIb;thrombin;endothel