## Import

In [2]:
from sentence_transformers import SentenceTransformer
import polars as pl
import seaborn as sns
from tqdm.autonotebook import tqdm, trange

import numpy as np


# Instantiate the embedding model; every `model.encode` call in the cells below uses this global.
model = SentenceTransformer("neuml/pubmedbert-base-embeddings")

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Sanity check: encode two toy sentences and print the raw embedding matrix.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]
embeddings = model.encode(sentences)
print(embeddings)

[[-0.54902214 -0.0099182  -0.26375914 ... -0.1578922  -1.2998055
   0.80934745]
 [-1.0420779   0.7897051   0.5180282  ... -0.59063584 -1.0819347
   0.5042986 ]]


In [31]:
type(embeddings.tolist()[0][0])

float

## Select Most important papers

In [2]:
# Eagerly read the `language=eng` directory of the processed PubMed data into memory.
complete_df = pl.read_parquet("../../data/processed/pubmed/language=eng")

In [3]:
# Summary statistics of the `number_of_referenced` column.
# NOTE(review): the max reported below (44,444,441) and the very large std look like a
# data artifact rather than a real count — verify upstream.
complete_df.select("number_of_referenced").describe()

statistic,number_of_referenced
str,f64
"""count""",10355354.0
"""null_count""",0.0
"""mean""",10.381413
"""std""",13811.348459
"""min""",0.0
"""25%""",0.0
"""50%""",0.0
"""75%""",0.0
"""max""",44444441.0


In [4]:
# Keep only rows whose `number_of_referenced` exceeds 10 (strict `>`, so the smallest kept value is 11).
most_referenced_papers = complete_df.filter(pl.col("number_of_referenced") > 10)

In [6]:
# Persist the filtered rows through pyarrow, partitioned on the `year` column
# (the loader further down reads the resulting `year=...` sub-directories).
most_referenced_papers.write_parquet(
    "../../data/features/pubmed/most_cited_papers/",
    use_pyarrow=True,
    pyarrow_options=dict(partition_cols=["year"]),
)

In [5]:
most_referenced_papers.select("number_of_referenced").describe()

statistic,number_of_referenced
str,f64
"""count""",900972.0
"""null_count""",0.0
"""mean""",119.058182
"""std""",46823.281718
"""min""",11.0
"""25%""",33.0
"""50%""",54.0
"""75%""",88.0
"""max""",44444441.0


In [6]:
# Convert to pandas for the seaborn plot attempted in the next cell.
# NOTE(review): the output displayed below lists only `number_of_referenced`, whereas this
# call converts every column — confirm which version of the cell produced that output.
most_referenced_papers_pd = most_referenced_papers.to_pandas()
most_referenced_papers_pd


Unnamed: 0,number_of_referenced
0,24
1,21
2,17
3,21
4,13
...,...
900967,17
900968,27
900969,74
900970,56


In [7]:
# sns.histplot(data=most_referenced_papers_pd)

KeyboardInterrupt: 

## Playground


In [3]:
# Draw a fixed-size random subset for quick experimentation. The explicit seed keeps
# the sample (and everything derived from it below) reproducible across kernel restarts.
df = pl.read_parquet("../../data/processed/pubmed/language=eng").sample(10000, seed=42)


In [9]:
df


pmid,date,number_of_referenced,date_revised,abstract_text,abstract_title,abstract_authors_list,medline_journal_info,pubmed_data,year
i64,struct[3],i64,struct[3],str,str,struct[1],struct[1],struct[4],i64
19284661,"{2009,6,25}",0,"{2021,10,20}","""Statistical learning is a cand…","""Statistical language learning …","{{[""Teinonen"", ""Fellman"", … ""Huotilainen""],[""Tuomas"", ""Vineta"", … ""Minna""],[""T"", ""V"", … ""M""],["""", """", … """"]}}","{""England""}","{{[[], [""19284661"", ""1471-2202-10-21"", … ""PMC2670827""]]},""epublish"",{{[2008, 2009, … 2009],[11, 3, … 6],[7, 13, … 26]}},{[],[]}}",2009
6109297,"{1981,3,17}",0,"{2018,12,12}","""The gene A protein of bacterio…","""The role of gene A protein and…","{{[""Eisenberg""],[""S""],[""S""],[""""]}}","{""England""}","{{[[], [""6109297"", ""10.1098/rspb.1980.0138""]]},""ppublish"",{{[1980, 1980, 1980],[11, 11, 11],[19, 19, 19]}},{[],[]}}",1981
19204935,"{2009,7,7}",72,"{2021,10,20}","""MFG-E8 was initially identifie…","""SED1/MFG-E8: a bi-motif protei…","{{[""Raymond"", ""Ensslin"", ""Shur""],[""Adam"", ""Michael A"", ""Barry D""],[""A"", ""MA"", ""BD""],["""", """", """"]}}","{""United States""}","{{[[], [""19204935"", ""10.1002/jcb.22076"", … ""NIHMS136771""]]},""ppublish"",{{[2009, 2009, 2009],[2, 2, 7],[11, 11, 8]}},{[],[]}}",2009
16887943,"{2007,3,19}",0,"{2018,11,13}","""To assess the effects of under…","""Effects of undercover police s…","{{[""Webster"", ""Bulzacchelli"", … ""Vernick""],[""D W"", ""M T"", … ""J S""],[""DW"", ""MT"", … ""JS""],["""", """", … """"]}}","{""England""}","{{[[], [""16887943"", ""12/4/225"", … ""PMC2586780""]]},""ppublish"",{{[2006, 2007, 2006],[8, 3, 8],[5, 21, 5]}},{[],[]}}",2007
2176790,"{1991,2,28}",0,"{2013,11,21}","""Plasmid pUC18rspL is a 3.788-k…","""pUC18rspL: a plasmid vector fo…","{{[""Vockley"", ""Pène""],[""J G"", ""J J""],[""JG"", ""JJ""],["""", """"]}}","{""England""}","{{[[], [""2176790""]]},""ppublish"",{{[1990, 1990, 1990],[12, 12, 12],[1, 1, 1]}},{[],[]}}",1991
…,…,…,…,…,…,…,…,…,…
15371212,"{2004,10,21}",0,"{2010,11,18}","""A bench-scale continuous-flow …","""Comparative performance studie…","{{[""Awuah"", ""Oppong-Peprah"", … ""Gijzen""],[""Esi"", ""M"", … ""H J""],[""E"", ""M"", … ""HJ""],["""", """", … """"]}}","{""England""}","{{[[], [""15371212"", ""10.1080/15287390490493466"", ""9N18A18VL8L275UB""]]},""ppublish"",{{[2004, 2004, 2004],[9, 10, 9],[17, 22, 17]}},{[],[]}}",2004
3378978,"{1988,7,15}",0,"{2017,12,13}","""Contractile failure during var…","""Motor drive and metabolic resp…","{{[""Vøllestad"", ""Sejersted"", … ""Bigland-Ritchie""],[""N K"", ""O M"", … ""B""],[""NK"", ""OM"", … ""B""],["""", """", … """"]}}","{""United States""}","{{[[], [""3378978"", ""10.1152/jappl.1988.64.4.1421""]]},""ppublish"",{{[1988, 1988, 1988],[4, 4, 4],[1, 1, 1]}},{[],[]}}",1988
14574242,"{2004,6,15}",0,"{2007,11,14}","""This article addresses the des…","""Methodological and statistical…","{{[""Stout""],[""Robert L""],[""RL""],[""""]}}","{""England""}","{{[[], [""14574242"", ""10.1097/01.ALC.0000091225.43296.8A""]]},""ppublish"",{{[2003, 2004, 2003],[10, 6, 10],[24, 16, 24]}},{[],[]}}",2004
3987632,"{1985,5,30}",0,"{2015,11,19}","""The control of adenylate cycla…","""Regulation by calmodulin of ad…","{{[""Peake"", ""Smoake""],[""G T"", ""J A""],[""GT"", ""JA""],["""", """"]}}","{""United States""}","{{[[], [""3987632"", ""10.1210/endo-116-5-2098""]]},""ppublish"",{{[1985, 1985, 1985],[5, 5, 5],[1, 1, 1]}},{[],[]}}",1985


In [10]:
# Narrow the sample down to the identifier, date, title, abstract and author columns.
subset_df = df.select(
    [
        "pmid",
        "date",
        "abstract_title",
        "abstract_text",
        "abstract_authors_list",
    ]
)

In [5]:
def encode_sentence(sentence: "str | list[str]") -> np.ndarray:
    """Embed text with the notebook-level ``model``.

    Parameters
    ----------
    sentence
        A single string or a list of strings; the next cell passes the
        two-element ``sentences`` list.

    Returns
    -------
    np.ndarray
        The value returned by ``model.encode``. For the list input used in
        this notebook the displayed result is a float32 array with one row
        per input sentence.
    """
    return model.encode(sentence)

In [50]:
encode_sentence(sentences)

array([[-0.54902214, -0.0099182 , -0.26375914, ..., -0.1578922 ,
        -1.2998055 ,  0.80934745],
       [-1.0420779 ,  0.7897051 ,  0.5180282 , ..., -0.59063584,
        -1.0819347 ,  0.5042986 ]], dtype=float32)

In [14]:
def encode_sentence(sentence: str) -> list:
    """Embed ``sentence`` with the global ``model`` and return it as a plain list.

    This redefinition replaces the earlier ``encode_sentence``: the array
    returned by ``model.encode`` is converted with ``.tolist()`` so that the
    ``map_elements`` call below receives native Python values.
    """
    vector = model.encode(sentence)
    return vector.tolist()

# Embed each title row by row and attach the vectors as a new `embeddings` column.
subset_df = subset_df.with_columns(
    embeddings=pl.col("abstract_title").map_elements(
        encode_sentence,
        return_dtype=pl.List(pl.Float64),
    )
)


In [15]:
subset_df

pmid,date,abstract_title,abstract_text,abstract_authors_list,embeddings
i64,struct[3],str,str,struct[1],list[f64]
19284661,"{2009,6,25}","""Statistical language learning …","""Statistical learning is a cand…","{{[""Teinonen"", ""Fellman"", … ""Huotilainen""],[""Tuomas"", ""Vineta"", … ""Minna""],[""T"", ""V"", … ""M""],["""", """", … """"]}}","[0.80617, 0.284487, … -0.319046]"
6109297,"{1981,3,17}","""The role of gene A protein and…","""The gene A protein of bacterio…","{{[""Eisenberg""],[""S""],[""S""],[""""]}}","[-0.121103, -0.264458, … -0.379017]"
19204935,"{2009,7,7}","""SED1/MFG-E8: a bi-motif protei…","""MFG-E8 was initially identifie…","{{[""Raymond"", ""Ensslin"", ""Shur""],[""Adam"", ""Michael A"", ""Barry D""],[""A"", ""MA"", ""BD""],["""", """", """"]}}","[-0.234351, 0.638279, … 0.353558]"
16887943,"{2007,3,19}","""Effects of undercover police s…","""To assess the effects of under…","{{[""Webster"", ""Bulzacchelli"", … ""Vernick""],[""D W"", ""M T"", … ""J S""],[""DW"", ""MT"", … ""JS""],["""", """", … """"]}}","[-0.190866, 1.16512, … -0.66012]"
2176790,"{1991,2,28}","""pUC18rspL: a plasmid vector fo…","""Plasmid pUC18rspL is a 3.788-k…","{{[""Vockley"", ""Pène""],[""J G"", ""J J""],[""JG"", ""JJ""],["""", """"]}}","[-0.291644, 0.163299, … -0.522049]"
…,…,…,…,…,…
15371212,"{2004,10,21}","""Comparative performance studie…","""A bench-scale continuous-flow …","{{[""Awuah"", ""Oppong-Peprah"", … ""Gijzen""],[""Esi"", ""M"", … ""H J""],[""E"", ""M"", … ""HJ""],["""", """", … """"]}}","[-0.294186, 0.183503, … 0.063347]"
3378978,"{1988,7,15}","""Motor drive and metabolic resp…","""Contractile failure during var…","{{[""Vøllestad"", ""Sejersted"", … ""Bigland-Ritchie""],[""N K"", ""O M"", … ""B""],[""NK"", ""OM"", … ""B""],["""", """", … """"]}}","[0.092667, 0.137016, … 1.167617]"
14574242,"{2004,6,15}","""Methodological and statistical…","""This article addresses the des…","{{[""Stout""],[""Robert L""],[""RL""],[""""]}}","[0.63467, 0.283401, … 0.322849]"
3987632,"{1985,5,30}","""Regulation by calmodulin of ad…","""The control of adenylate cycla…","{{[""Peake"", ""Smoake""],[""G T"", ""J A""],[""GT"", ""JA""],["""", """"]}}","[-0.691206, -0.478099, … 0.328019]"


## Read most important papers

In [7]:
import polars as pl
import os

# Define the base directory where your Parquet files are stored
base_dir = "../../data/features/pubmed/most_cited_papers/"

import polars as pl
import os

def load_parquet_files_filtering_by_year(base_dir: str, year_threshold: int) -> pl.DataFrame:
    """Load every ``year=<YYYY>`` partition under ``base_dir`` newer than a threshold.

    Parameters
    ----------
    base_dir
        Directory whose entries named ``year=<YYYY>`` each hold ``*.parquet`` files.
    year_threshold
        Exclusive lower bound: only partitions with ``year > year_threshold`` are read.

    Returns
    -------
    pl.DataFrame
        The concatenation of all selected partitions, in ascending year order.

    Raises
    ------
    ValueError
        If no ``year=`` entry in ``base_dir`` satisfies the threshold.
    """
    selected = []
    for entry in os.listdir(base_dir):
        if not entry.startswith("year="):
            continue
        try:
            year = int(entry.split("=", 1)[1])
        except ValueError:
            # Non-numeric partition values (e.g. a placeholder directory for null
            # years) cannot be compared with the threshold; skip them instead of crashing.
            continue
        if year > year_threshold:
            selected.append((year, entry))

    if not selected:
        # Fail with an actionable message rather than letting an empty concat raise.
        raise ValueError(
            f"No 'year=' partition newer than {year_threshold} found in {base_dir!r}"
        )

    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible.
    selected.sort()

    # Scan lazily per partition and materialise once at the end.
    lazy_frames = [
        pl.scan_parquet(os.path.join(base_dir, entry, "*.parquet"))
        for _, entry in selected
    ]
    return pl.concat(lazy_frames).collect()


df_1998 = load_parquet_files_filtering_by_year(base_dir, 1997)

df_1998


pmid,date,number_of_referenced,date_revised,abstract_text,abstract_title,abstract_authors_list,medline_journal_info,pubmed_data
i64,struct[3],i64,struct[3],str,str,struct[1],struct[1],struct[4]
9760878,"{1998,10,14}",59,"{2013,11,21}","""There are many skeptics who co…","""Criteria for standardizing and…","{{[""Stanley""],[""H R""],[""HR""],[""""]}}","{""United States""}","{{[[], [""9760878""]]},""ppublish"",{{[1998, 1998, 1998],[10, 10, 10],[7, 7, 7]}},{[],[]}}"
9760925,"{1998,10,9}",35,"{2022,3,18}","""Implant success is as difficul…","""The implant quality scale: a c…","{{[""Misch""],[""C E""],[""CE""],[""""]}}","{""Canada""}","{{[[], [""9760925""]]},""ppublish"",{{[1998, 1998, 1998],[10, 10, 10],[7, 7, 7]}},{[],[]}}"
9760923,"{1998,10,9}",25,"{2013,11,21}","""Until recently, there was no p…","""Intraoral repair of the fractu…","{{[""Robbins""],[""J W""],[""JW""],[""""]}}","{""United States""}","{{[[], [""9760923""]]},""ppublish"",{{[1998, 1998, 1998],[10, 10, 10],[7, 7, 7]}},{[],[]}}"
9760977,"{1998,10,15}",25,"{2019,8,22}","""A case of an epidermoid cyst i…","""Intracranial epidermoid mimick…","{{[""Hasegawa"", ""Bitoh"", … ""Yasuda""],[""H"", ""S"", … ""H""],[""H"", ""S"", … ""H""],["""", """", … """"]}}","{""United States""}","{{[[], [""9760977"", ""0090-3019(81)90174-9"", ""10.1016/0090-3019(81)90174-9""]]},""ppublish"",{{[1981, 1998, 1981],[5, 10, 5],[1, 7, 1]}},{[],[]}}"
9760981,"{1998,10,15}",98,"{2022,4,19}","""The postoperative progress of …","""Preoperative neurological stat…","{{[""Foo"", ""Rossier""],[""D"", ""A B""],[""D"", ""AB""],["""", """"]}}","{""United States""}","{{[[], [""9760981"", ""0090-3019(81)90178-6"", ""10.1016/0090-3019(81)90178-6""]]},""ppublish"",{{[1981, 1998, 1981],[5, 10, 5],[1, 7, 1]}},{[],[]}}"
…,…,…,…,…,…,…,…,…
18802413,"{2022,11,21}",152,"{2022,11,21}","""Division of labour--individual…","""Genetic and genomic analyses o…","{{[""Smith"", ""Toth"", … ""Robinson""],[""Chris R"", ""Amy L"", … ""Gene E""],[""CR"", ""AL"", … ""GE""],["""", """", … """"]}}","{""England""}","{{[[], [""18802413"", ""nrg2429"", ""10.1038/nrg2429""]]},""ppublish"",{{[2008, 2008, 2008],[9, 10, 9],[20, 9, 20]}},{[],[]}}"
16332224,"{2022,11,21}",147,"{2022,11,21}","""Although best known for cooper…","""Conflict resolution in insect …","{{[""Ratnieks"", ""Foster"", ""Wenseleers""],[""Francis L W"", ""Kevin R"", ""Tom""],[""FL"", ""KR"", ""T""],["""", """", """"]}}","{""United States""}","{{[[], [""16332224"", ""10.1146/annurev.ento.51.110104.151003""]]},""ppublish"",{{[2005, 2006, 2005],[12, 8, 12],[8, 4, 8]}},{[],[]}}"
17115184,"{2022,11,1}",23,"{2022,11,1}","""Recurrent acute respiratory tr…","""Oral purified bacterial extrac…","{{[""Steurer-Stey"", ""Lagler"", … ""Bachmann""],[""Claudia"", ""Leonie"", … ""Lucas M""],[""C"", ""L"", … ""LM""],["""", """", … """"]}}","{""Germany""}","{{[[], [""17115184"", ""10.1007/s00431-006-0248-3""]]},""ppublish"",{{[2005, 2006, … 2006],[12, 7, … 11],[28, 5, … 23]}},{[],[]}}"
11497072,"{2022,12,30}",36,"{2022,12,30}","""More than one in five communit…","""Assessing functional ability i…","{{[""Davis""],[""L L""],[""LL""],[""""]}}","{""United States""}","{{[[], [""11497072"", ""10.1097/01376517-200108000-00005""]]},""ppublish"",{{[2001, 2002, 2001],[8, 1, 8],[11, 5, 11]}},{[],[]}}"


### Embed data


In [15]:
def encode_sentence(sentence: str) -> list:
    """Embed ``sentence`` with the global ``model`` and return the result of ``.tolist()``.

    The return annotation is ``list`` (not ``np.ndarray``) because the array
    produced by ``model.encode`` is converted with ``.tolist()`` before returning.
    """
    return model.encode(sentence).tolist()

In [8]:
# 100-row random sample of `df_1998`; no seed is passed, so it differs between runs.
# NOTE(review): the embedding cell below operates on the full `df_1998`, not on this
# sample — confirm which one was intended.
df_1998_sample = df_1998.sample(100)

In [29]:
# Apply the model to the 'abstract_title' column and store the embeddings in a new column.
# The explicit return_dtype (the same one the playground cell uses) spares polars from
# having to infer the output type of the Python callback.
df_1998 = df_1998.with_columns(
    pl.col("abstract_title")
        .map_elements(encode_sentence, return_dtype=pl.List(pl.Float64))
        .alias("embeddings")
)



In [31]:
df_1998.head()

pmid,date,number_of_referenced,date_revised,abstract_text,abstract_title,abstract_authors_list,medline_journal_info,pubmed_data,embeddings
i64,struct[3],i64,struct[3],str,str,struct[1],struct[1],struct[4],list[f64]
9760878,"{1998,10,14}",59,"{2013,11,21}","""There are many skeptics who co…","""Criteria for standardizing and…","{{[""Stanley""],[""H R""],[""HR""],[""""]}}","{""United States""}","{{[[], [""9760878""]]},""ppublish"",{{[1998, 1998, 1998],[10, 10, 10],[7, 7, 7]}},{[],[]}}","[-0.110846, 0.635127, … 0.339552]"
9760925,"{1998,10,9}",35,"{2022,3,18}","""Implant success is as difficul…","""The implant quality scale: a c…","{{[""Misch""],[""C E""],[""CE""],[""""]}}","{""Canada""}","{{[[], [""9760925""]]},""ppublish"",{{[1998, 1998, 1998],[10, 10, 10],[7, 7, 7]}},{[],[]}}","[0.047, 0.179832, … 0.453903]"
9760923,"{1998,10,9}",25,"{2013,11,21}","""Until recently, there was no p…","""Intraoral repair of the fractu…","{{[""Robbins""],[""J W""],[""JW""],[""""]}}","{""United States""}","{{[[], [""9760923""]]},""ppublish"",{{[1998, 1998, 1998],[10, 10, 10],[7, 7, 7]}},{[],[]}}","[-0.560129, 0.195439, … 0.143268]"
9760977,"{1998,10,15}",25,"{2019,8,22}","""A case of an epidermoid cyst i…","""Intracranial epidermoid mimick…","{{[""Hasegawa"", ""Bitoh"", … ""Yasuda""],[""H"", ""S"", … ""H""],[""H"", ""S"", … ""H""],["""", """", … """"]}}","{""United States""}","{{[[], [""9760977"", ""0090-3019(81)90174-9"", ""10.1016/0090-3019(81)90174-9""]]},""ppublish"",{{[1981, 1998, 1981],[5, 10, 5],[1, 7, 1]}},{[],[]}}","[0.372623, 0.002539, … -0.229112]"
9760981,"{1998,10,15}",98,"{2022,4,19}","""The postoperative progress of …","""Preoperative neurological stat…","{{[""Foo"", ""Rossier""],[""D"", ""A B""],[""D"", ""AB""],["""", """"]}}","{""United States""}","{{[[], [""9760981"", ""0090-3019(81)90178-6"", ""10.1016/0090-3019(81)90178-6""]]},""ppublish"",{{[1981, 1998, 1981],[5, 10, 5],[1, 7, 1]}},{[],[]}}","[1.024776, 1.150899, … -0.348803]"


In [32]:
df_1998.schema

Schema([('pmid', Int64),
        ('date', Struct({'Year': Int64, 'Month': Int64, 'Day': Int64})),
        ('number_of_referenced', Int64),
        ('date_revised',
         Struct({'Year': Int64, 'Month': Int64, 'Day': Int64})),
        ('abstract_text', String),
        ('abstract_title', String),
        ('abstract_authors_list',
         Struct({'Author': Struct({'LastName': List(String), 'ForeName': List(String), 'Initials': List(String), 'CollectiveName': List(String)})})),
        ('medline_journal_info', Struct({'Country': String})),
        ('pubmed_data',
         Struct({'ArticleIdList': Struct({'ArticleId': List(List(String))}), 'PublicationStatus': String, 'History': Struct({'PubMedPubDate': Struct({'Year': List(Int64), 'Month': List(Int64), 'Day': List(Int64)})}), 'ReferenceList': Struct({'Citation': List(Null), 'CitationId': List(Null)})})),
        ('embeddings', List(Float64))])

In [41]:
pmid_embeddings_df = df_1998.select("pmid", "embeddings")

# Columns to be bundled into the 'metadata' struct (note: 'pubmed_data' is not included)
metadata_columns = [
    "date",
    "number_of_referenced",
    "date_revised",
    "abstract_text",
    "abstract_title",
    "abstract_authors_list",
    "medline_journal_info",
]


In [43]:
# Keep the metadata columns and add a 'metadata' struct column bundling all of them per row.
metadata_df = df_1998.select(metadata_columns).with_columns(
    metadata=pl.struct(metadata_columns)
)

In [46]:
metadata_df.schema

Schema([('date', Struct({'Year': Int64, 'Month': Int64, 'Day': Int64})),
        ('number_of_referenced', Int64),
        ('date_revised',
         Struct({'Year': Int64, 'Month': Int64, 'Day': Int64})),
        ('abstract_text', String),
        ('abstract_title', String),
        ('abstract_authors_list',
         Struct({'Author': Struct({'LastName': List(String), 'ForeName': List(String), 'Initials': List(String), 'CollectiveName': List(String)})})),
        ('medline_journal_info', Struct({'Country': String})),
        ('metadata',
         Struct({'date': Struct({'Year': Int64, 'Month': Int64, 'Day': Int64}), 'number_of_referenced': Int64, 'date_revised': Struct({'Year': Int64, 'Month': Int64, 'Day': Int64}), 'abstract_text': String, 'abstract_title': String, 'abstract_authors_list': Struct({'Author': Struct({'LastName': List(String), 'ForeName': List(String), 'Initials': List(String), 'CollectiveName': List(String)})}), 'medline_journal_info': Struct({'Country': String})}))])

In [49]:
# Assemble the export frame from the 'pmid', 'embeddings' and 'metadata' series
result_df = pl.DataFrame(
    [
        df_1998.get_column("pmid"),
        df_1998.get_column("embeddings"),
        metadata_df.get_column("metadata"),
    ]
)

# Show the schema of the assembled frame
result_df.schema

Schema([('pmid', Int64),
        ('embeddings', List(Float64)),
        ('metadata',
         Struct({'date': Struct({'Year': Int64, 'Month': Int64, 'Day': Int64}), 'number_of_referenced': Int64, 'date_revised': Struct({'Year': Int64, 'Month': Int64, 'Day': Int64}), 'abstract_text': String, 'abstract_title': String, 'abstract_authors_list': Struct({'Author': Struct({'LastName': List(String), 'ForeName': List(String), 'Initials': List(String), 'CollectiveName': List(String)})}), 'medline_journal_info': Struct({'Country': String})}))])

In [55]:
! ls ../../data/features/pubmed/pinecone/formated/most_cited_papers_1998/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [60]:

# Write the assembled frame to a single parquet file through pyarrow.
result_df.write_parquet(
    "../../data/features/pubmed/pinecone/formated/most_cited_papers_1998/most_cited_papers_1998.parquet",
    use_pyarrow=True,
)

In [9]:

# Reload the exported parquet file.
# NOTE(review): this reuses the name `df`, which earlier held the 10,000-row playground sample.
df = pl.read_parquet("../../data/features/pubmed/pinecone/formated/most_cited_papers_1998/most_cited_papers_1998.parquet")

In [13]:
# Cast the identifier column to string. The frame written from `result_df` names it
# 'pmid'; it is only called 'id' once the rename further down has been applied and the
# file rewritten, so handle whichever of the two names is present.
id_column = "id" if "id" in df.columns else "pmid"
df = df.with_columns(pl.col(id_column).cast(pl.Utf8))

In [16]:
df

id,values,metadata
str,list[f64],struct[7]
"""9760878""","[-0.110846, 0.635127, … 0.339552]","{{1998,10,14},59,{2013,11,21},""There are many skeptics who condemn pulp capping but like to keep an eye on the research progress being made. Considerable literature emphasizes the negative aspects of vital pulp therapy and discourages its practice. Some clinicians and investigators continue to condemn pulp capping therapy for the same reasons reported in the literature 80 years ago despite the advances made in pulp biology. Clinicians are well aware of the immediate and long-term success rates after root canal therapy, but are less certain of the success of pulp capping. A number of nagging questions plague clinicians, when confronted with the choice of treatment. The research data on pulp capping is at times inadequate, confusing, misleading or even incorrect and diminishes the confidence of the practitioner in performing pulp capping."",""Criteria for standardizing and increasing credibility of direct pulp capping studies."",{{[""Stanley""],[""H R""],[""HR""],[""""]}},{""United States""}}"
"""9760925""","[0.047, 0.179832, … 0.453903]","{{1998,10,9},35,{2022,3,18},""Implant success is as difficult to describe as the success criteria required for a tooth. A range from health to disease exists in both conditions. The primary criteria for assessing implant quality are pain and mobility. The presence of either one greatly compromises the implant, and removal is usually indicated. Probing depths may be related to the presence of local disease or pre-existing tissue thickness before the implant was inserted. An increasing probing depth is more diagnostic and signifies bone loss, gingival hyperplasia or hypertrophy. Bone loss is usually evaluated best with probing rather than with radiographs. The most common cause of bone loss during the first few years of function are exaggerated factors of stress. The bleeding index is easily observed and indicates inflammation of the gingiva. However, implant health status is not as related to sulcular inflammation as would be the case for a natural tooth. Implant failure is easier to describe and may consist of a variety of factors. Any pain, vertical mobility, uncontrolled progressive bone loss, and/or generalized periradiolucency warrant implant removal. Implant quality factors were established by James and modified by Misch into an implant quality scale which not only assesses the implant health-disease continuum, but relates treatment and prognosis to the existing conditions."",""The implant quality scale: a clinical assessment of the health--disease continuum."",{{[""Misch""],[""C E""],[""CE""],[""""]}},{""Canada""}}"
"""9760923""","[-0.560129, 0.195439, … 0.143268]","{{1998,10,9},25,{2013,11,21},""Until recently, there was no predictable technique for repairing the fractured porcelain restoration. However, with the advent of many new products related to bonding porcelain, there are techniques available today to repair fractured porcelain with moderate expectations of success."",""Intraoral repair of the fractured porcelain restoration."",{{[""Robbins""],[""J W""],[""JW""],[""""]}},{""United States""}}"
"""9760977""","[0.372623, 0.002539, … -0.229112]","{{1998,10,15},25,{2019,8,22},""A case of an epidermoid cyst in the frontal base which showed homogeneous high density in noncontrast computed tomography, simulating a meningioma with calcification, is reported. Operative findings and histological examination suggested that this high density was caused by spontaneous hemorrhage into the cyst."",""Intracranial epidermoid mimicking meningioma."",{{[""Hasegawa"", ""Bitoh"", … ""Yasuda""],[""H"", ""S"", … ""H""],[""H"", ""S"", … ""H""],["""", """", … """"]}},{""United States""}}"
"""9760981""","[1.024776, 1.150899, … -0.348803]","{{1998,10,15},98,{2022,4,19},""The postoperative progress of 3 patients with spinal epidural hemorrhage, but without spinal fracture or dislocation, is presented. From the literature, 158 cases were collected of spontaneous spinal epidural hematoma treated surgically. Postoperative return of motor function was noted in 95.3%, 87%, and 45.3% of the patients with incomplete sensorimotor, incomplete sensory but complete motor, and complete sensorimotor lesions, respectively. Complete sensorimotor recovery occurred in 41.9%, 26.1%, and 11.3% of these 3 groups of patients, respectively. Recovery following surgical treatment depends on the severity of neurological deficits before treatment. However, the absence of motor or sensorimotor functions preoperatively does not necessarily indicate a poor prognosis."",""Preoperative neurological status in predicting surgical outcome of spinal epidural hematomas."",{{[""Foo"", ""Rossier""],[""D"", ""A B""],[""D"", ""AB""],["""", """"]}},{""United States""}}"
…,…,…
"""18802413""","[0.742783, 0.402037, … 0.266104]","{{2022,11,21},152,{2022,11,21},""Division of labour--individuals specializing in different activities--features prominently in the spectacular success of the social insects. Until recently, genetic and genomic analyses of division of labour were limited to just a few species. However, research on an ever-increasing number of species has provided new insight, from which we highlight two results. First, heritable influences on division of labour are more pervasive than previously imagined. Second, different forms of division of labour, in lineages in which eusociality has arisen independently, have evolved through changes in the regulation of highly conserved molecular pathways associated with several basic life-history traits, including nutrition, metabolism and reproduction."",""Genetic and genomic analyses of the division of labour in insect societies."",{{[""Smith"", ""Toth"", … ""Robinson""],[""Chris R"", ""Amy L"", … ""Gene E""],[""CR"", ""AL"", … ""GE""],["""", """", … """"]}},{""England""}}"
"""16332224""","[0.188841, 0.726058, … 0.375117]","{{2022,11,21},147,{2022,11,21},""Although best known for cooperation, insect societies also manifest many potential conflicts among individuals. These conflicts involve both direct reproduction by individuals and manipulation of the reproduction of colony members. Here we review five major areas of reproductive conflict in insect societies: (a) sex allocation, (b) queen rearing, (c) male rearing, (d) queen-worker caste fate, and (e) breeding conflicts among totipotent adults. For each area we discuss the basis for conflict (potential conflict), whether conflict is expressed (actual conflict), whose interests prevail (conflict outcome), and the factors that reduce colony-level costs of conflict (conflict resolution), such as factors that cause workers to work rather than to lay eggs. Reproductive conflicts are widespread, sometimes having dramatic effects on the colony. However, three key factors (kinship, coercion, and constraint) typically combine to limit the effects of reproductive conflict and often lead to complete resolution."",""Conflict resolution in insect societies."",{{[""Ratnieks"", ""Foster"", ""Wenseleers""],[""Francis L W"", ""Kevin R"", ""Tom""],[""FL"", ""KR"", ""T""],["""", """", """"]}},{""United States""}}"
"""17115184""","[0.290454, -0.171456, … 0.101436]","{{2022,11,1},23,{2022,11,1},""Recurrent acute respiratory tract infections (ARTI) are a common problem in childhood. Some evidence suggests a benefit regarding the prevention of ARTI in children treated with the immunomodulator OM-85 BV (Bronchovaxom)."",""Oral purified bacterial extracts in acute respiratory tract infections in childhood: a systematic quantitative review."",{{[""Steurer-Stey"", ""Lagler"", … ""Bachmann""],[""Claudia"", ""Leonie"", … ""Lucas M""],[""C"", ""L"", … ""LM""],["""", """", … """"]}},{""Germany""}}"
"""11497072""","[-0.361052, -0.046856, … 0.215672]","{{2022,12,30},36,{2022,12,30},""More than one in five community-dwelling older individuals is unwilling or unable to provide information on functional abilities. In such situations the standard procedure is to augment self-reports with those of family members or other close informants. However, when these reports differ, it often is difficult to determine whether the older individual is overly optimistic about his or her functional abilities or the family informant is unduly pessimistic. This article explores factors that influence family caregiver assessments of functional abilities in older individuals with some degree of cognitive loss or impairment and presents suggestions for enhancing the accuracy and dependability of functional assessments by family informants."",""Assessing functional ability in persons with dementia: using family caregivers as informants."",{{[""Davis""],[""L L""],[""LL""],[""""]}},{""United States""}}"


In [6]:
# Rename to the 'id' / 'values' column names. Only columns that still carry the old
# name are renamed, so re-running this cell — or running it on a file that has already
# been rewritten with the new names — does not raise.
column_renames = {"embeddings": "values", "pmid": "id"}
df = df.rename(
    {old: new for old, new in column_renames.items() if old in df.columns}
)

In [17]:
# Overwrite the parquet file this frame was read from with the current `df`.
df.write_parquet(
    "../../data/features/pubmed/pinecone/formated/most_cited_papers_1998/most_cited_papers_1998.parquet",
    use_pyarrow=True,
)
