Analysis of the preprocessed MR PubMed data set `data/intermediate/mr-pubmed-data/mr-pubmed-data.json`.

In [2]:
%load_ext autoreload
%autoreload 2

import json

import altair as alt
import pandas as pd

from yiutils.project_utils import find_project_root


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
# Resolve the project root (searched for via its "justfile" anchor) and check
# that the data directory is present before going any further.
# NOTE: find_project_root comes from the imports cell; run that cell first.
proj_root = find_project_root(anchor_file="justfile")
data_dir = proj_root / "data"
assert data_dir.exists(), f"Data directory {data_dir} does not exist."

# Location of the preprocessed MR PubMed JSON inside the data directory.
# NOTE(review): joinpath assumes data_dir is a pathlib.Path — confirm.
path_to_processed_mr_pubmed_data = data_dir.joinpath(
    "intermediate", "mr-pubmed-data", "mr-pubmed-data.json"
)
assert path_to_processed_mr_pubmed_data.exists(), (
    f"Processed MR PubMed data file {path_to_processed_mr_pubmed_data} does not exist."
)


NameError: name 'find_project_root' is not defined

In [5]:
# Read the preprocessed MR PubMed records from disk. The encoding is pinned so
# that decoding does not depend on the platform's locale default.
# NOTE(review): assumes the preprocessing step wrote the file as UTF-8 — confirm.
with path_to_processed_mr_pubmed_data.open("r", encoding="utf-8") as f:
    mr_pubmed_json = json.load(f)

# Build the frame once the file handle is closed; nothing below needs it open.
mr_pubmed_df = pd.DataFrame(mr_pubmed_json)


# Glimpse

In [7]:
# Print the column names, non-null counts and dtypes of the loaded frame.
mr_pubmed_df.info()
# Bare last expression: the notebook renders the frame itself as the cell output.
mr_pubmed_df


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15635 entries, 0 to 15634
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   pmid          15635 non-null  object
 1   ab            15635 non-null  object
 2   pub_date      15635 non-null  object
 3   title         15635 non-null  object
 4   journal_issn  15434 non-null  object
 5   journal       15635 non-null  object
 6   author_affil  15473 non-null  object
dtypes: object(7)
memory usage: 855.2+ KB


Unnamed: 0,pmid,ab,pub_date,title,journal_issn,journal,author_affil
0,38794754,Alcohol consumption significantly impacts dise...,2024-05-25,Alcohol Exposure and Disease Associations: A M...,2072-6643,Nutrients,Key Laboratory of Environmental Stress and Chr...
1,38794655,The aim of this study was to assess the causal...,2024-05-25,Exploring the Causal Effects of Mineral Metabo...,2072-6643,Nutrients,"Department of Radiation Medicine, Guangdong Pr..."
2,38793035,The causal effect and pathways of gut microbio...,2024-05-25,The Causal Effect of Gut Microbiota and Plasma...,2075-4426,Journal of personalized medicine,"Department of Thoracic Surgery, Xiangya Hospit..."
3,38790305,Sensorineural hearing loss (SNHL) is a multifa...,2024-05-25,Mendelian Randomization Reveals: Triglycerides...,2306-5354,"Bioengineering (Basel, Switzerland)",ENT Institute and Department of Otorhinolaryng...
4,38789873,Pre-pregnancy obesity was associated with gest...,2024-02-08,Phenome-Wide Investigation of the Causal Assoc...,1933-7205,"Reproductive sciences (Thousand Oaks, Calif.)","Xuanwu Hospital, Capital Medical University, B..."
...,...,...,...,...,...,...,...
15630,39110833,We aimed to examine the association between hy...,2024-04-01,Hypnotic use and the risk of cardiovascular di...,2047-4881,European journal of preventive cardiology,"Department of Cardiology, State Key Laboratory..."
15631,39087877,Numerous observational studies link obstructiv...,2024-04-16,Novel susceptibility genes and biomarkers for ...,1550-9109,Sleep,"Department of Geriatric Neurology, Nanjing Bra..."
15632,39084694,Ovarian cancer is characterized by late-stage ...,2024-08-01,Large-scale analysis to identify risk factors ...,1525-1438,International journal of gynecological cancer ...,"Australian Centre for Precision Health, Unit o..."
15633,39004505,Vitamin D (VitD) affects the risk of multiple ...,2024-04-16,Vitamin D affects the risk of disease activity...,1468-330X,"Journal of neurology, neurosurgery, and psychi...","Division of Neuroscience, IRCCS San Raffaele S..."


# descriptive stats

## Unique IDs

In [8]:
# Collect the distinct PMIDs into a set and report how many there are.
distinct_pmids = mr_pubmed_df["pmid"].unique()
uniq_id = {*distinct_pmids}

print(len(uniq_id))


15635


In [9]:
# Flag every row whose PMID occurs more than once (keep=False marks all
# occurrences, not just the repeats), then list those rows ordered by PMID.
is_repeated_pmid = mr_pubmed_df["pmid"].duplicated(keep=False)
duplicated_pmids = mr_pubmed_df.loc[is_repeated_pmid].sort_values("pmid")
duplicated_pmids


Unnamed: 0,pmid,ab,pub_date,title,journal_issn,journal,author_affil


# Number of missing values

In [None]:
# For each of the selected columns, print its name followed by the number of
# missing entries on the next line.
cols = ["pmid", "title", "ab", "pub_date"]
for col in cols:
    n_missing = mr_pubmed_df[col].isna().sum()
    print(col, n_missing, sep="\n")


pmid
0
title
0
ab
0
pub_date
0


# Distribution of the number of abstracts per year

In [None]:
# Parse 'pub_date' into datetimes. Values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
mr_pubmed_df["pub_date"] = pd.to_datetime(mr_pubmed_df["pub_date"], errors="coerce")

# Rows without a usable date get no year and are dropped by the groupby below.
# Report how many, so the chart is not silently undercounting.
n_undated = int(mr_pubmed_df["pub_date"].isna().sum())
if n_undated:
    print(
        f"{n_undated} records have no usable pub_date "
        "and are excluded from the yearly counts"
    )

# Extract the publication year from the parsed date.
mr_pubmed_df["year"] = mr_pubmed_df["pub_date"].dt.year

# Count records per year. The year column is float when NaT rows exist; the
# group keys themselves never contain NaN, so casting them to int is safe and
# keeps the axis labels as whole years.
yearly_counts = (
    mr_pubmed_df.groupby("year")
    .size()
    .reset_index(name="count")
    .astype({"year": int})
)

# Bar chart: one bar per year (ordinal axis), bar height = number of records.
chart = (
    alt.Chart(yearly_counts)
    .mark_bar()
    .encode(
        x=alt.X("year:O", title="Year"),
        y=alt.Y("count:Q", title="Number of Records"),
        tooltip=["year", "count"],
    )
    .properties(
        title="Yearly Sum of Total Number of Records for mr_pubmed_df",
        width=600,
        height=400,
    )
)

chart
