In [1]:
import os
import json
from pymongo import MongoClient
from pymongo.server_api import ServerApi
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
import torch

In [2]:
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA device:", torch.cuda.get_device_name(0))

PyTorch version: 2.5.1+cu118
CUDA available: True
CUDA device: NVIDIA GeForce RTX 4060 Ti


In [3]:
from gliner import GLiNER

model = GLiNER.from_pretrained("gliner-community/gliner_small-v2.5", load_tokenizer=True)

text = """
Hypertrophic pachymeningitis (HP) is a rare chronic inflammatory disorder characterized by marked fibrous thickening of the cerebral and/or spinal dura mater. This condition is caused by infection, inflammation, autoimmune disorder, neoplasms, or idiopathic. Magnetic resonance imaging (MRI) may play an important role in differentiating idiopathic HP from secondary HP, may avoid unnecessarily invasive dural biopsy, and prompt specific treatment. To determine the specific MRI findings for differentiation between idiopathic HP and secondary HP. A total of 34 patients underwent MRI of the brain and cervical spine from January 2003 to December 2015. In all, 23 patients were diagnosed idiopathic HP and 11 patients were secondary HP. Demographic data and imaging findings reveal the following: Configuration, thickness, signal intensity on T1-weighted image (T1WI), T2-weighted image (T2WI), and enhancement pattern of the lesions. The data were analyzed by T-test and Fisher's exact test. Secondary HP were significantly located at anterior and middle cranial fossa (P = 0.033). There is no significant difference of lesions in configurations, T1 and T2 signal intensity and patterns of enhancement. There was significant and exclusive difference in T2 hypointense/dark intensity and homogeneous enhancement in idiopathic HP (75%, P = 0.044). MRI may play a complimentarily important role in distinguishing idiopathic HP from secondary HP. Idiopathic HP is probably preferred diagnosis in the lesions with T2-rim pattern and T2 hypointense/dark intensity with homogeneous enhancement
"""

labels = {
    "Agricultural and Biological Sciences": 0,
    "Arts and Humanities": 0,
    "Biochemistry, Genetics and Molecular Biology": 0,
    "Business, Management and Accounting": 0,
    "Chemical Engineering": 0,
    "Chemistry": 0,
    "Computer Science": 0,
    "Decision Sciences": 0,
    "Dentistry": 0,
    "Earth and Planetary Sciences": 0,
    "Economics, Econometrics and Finance": 0,
    "Energy": 0,
    "Engineering": 0,
    "Environmental Science": 0,
    "Health Professions": 0,
    "Immunology and Microbiology": 0,
    "Materials Science": 0,
    "Mathematics": 0,
    "Medicine": 0,
    "Neuroscience": 0,
    "Nursing": 0,
    "Pharmacology, Toxicology and Pharmaceutics": 0,
    "Physics and Astronomy": 0,
    "Psychology": 0,
    "Social Sciences": 0,
    "Veterinary": 0,
    "Multidisciplinary": 0
}

labels2 = {
    "Sciences": 0,
    "Health and Medicine":0,
    "Engineering and Technology":0,
    "Social Sciences and Humanities":0,
    "Mathematics and Multidisciplinary":0
}


entities = model.predict_entities(text, labels2.keys())



for entity in entities:
    labels2[entity['label']] +=1
    print(entity["text"], "=>", entity["label"])


  from .autonotebook import tqdm as notebook_tqdm
Fetching 10 files: 100%|██████████| 10/10 [00:00<?, ?it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Hypertrophic pachymeningitis => Health and Medicine
HP => Health and Medicine
inflammatory disorder => Health and Medicine
infection => Health and Medicine
inflammation => Health and Medicine
autoimmune disorder => Health and Medicine
neoplasms => Health and Medicine
idiopathic => Health and Medicine
MRI => Health and Medicine
idiopathic => Health and Medicine
HP => Health and Medicine
HP => Health and Medicine
dural biopsy => Health and Medicine
idiopathic HP => Health and Medicine
HP => Health and Medicine
idiopathic HP => Health and Medicine
HP => Health and Medicine
lesions => Health and Medicine
HP => Health and Medicine
lesions => Health and Medicine
idiopathic => Health and Medicine
HP => Health and Medicine
idiopathic => Health and Medicine
HP => Health and Medicine
HP => Health and Medicine
Idiopathic => Health and Medicine
HP => Health and Medicine


In [None]:
for k,v in labels2.items():
    if v == max(labels2.values()):
        print(k,v)

Health and Medicine 27


In [4]:
uri = "mongodb+srv://KTAP8:JhpxOn0CFlXE5mty@dsdedata.hv1co.mongodb.net/?retryWrites=true&w=majority&appName=DsdeData"
# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
db = client['DsdeData']  # Replace with your database name

papers = db['papers']

df_papers = pd.DataFrame(list(papers.find()))


In [4]:
df_papers.shape

(20216, 11)

In [5]:
df_papers.isnull().sum()

_id                  0
reference            0
abstracts          665
correspondence    2363
affiliation          0
publishedDate        0
coreData             0
language           120
authorKeywords    3762
subjectArea          0
author               0
dtype: int64

In [5]:
# Map abbreviations to categories and store them in a new column
generalized_fields = {
    "Sciences": [
        "AGRI",  # Agricultural and Biological Sciences
        "BIOC",  # Biochemistry, Genetics and Molecular Biology
        "EART",  # Earth and Planetary Sciences
        "ENVI",  # Environmental Science
        "MATE",  # Materials Science
        "PHYS"   # Physics and Astronomy
    ],
    "Health and Medicine": [
        "DENT",  # Dentistry
        "HEAL",  # Health Professions
        "IMMU",  # Immunology and Microbiology
        "MEDI",  # Medicine
        "NEUR",  # Neuroscience
        "NURS",  # Nursing
        "PHAR",  # Pharmacology, Toxicology and Pharmaceutics
        "VETE"   # Veterinary
    ],
    "Engineering and Technology": [
        "CENG",  # Chemical Engineering
        "COMP",  # Computer Science
        "ENER",  # Energy
        "ENGI"   # Engineering
    ],
    "Social Sciences and Humanities": [
        "ARTS",  # Arts and Humanities
        "BUSI",  # Business, Management and Accounting
        "DECI",  # Decision Sciences
        "ECON",  # Economics, Econometrics and Finance
        "PSYC",  # Psychology
        "SOCI"   # Social Sciences
    ],
    "Mathematics and Multidisciplinary": [
        "MATH",  # Mathematics
        "MULT"   # Multidisciplinary
    ]
}


def map_subject_area(subject_dict):
    ans = set()
    for key, value in subject_dict.items():
        for name, sub in generalized_fields.items():
            if key in sub:
                ans.add(name)
    return list(ans)

df_papers["mapped_subjectArea"] = df_papers["subjectArea"].apply(map_subject_area)

In [7]:
df_papers[0:5]

Unnamed: 0,_id,reference,abstracts,correspondence,affiliation,publishedDate,coreData,language,authorKeywords,subjectArea,author,mapped_subjectArea
0,673c36ebe2e18c4ad60c5074,"{'ref_count': '46', 'ref_publishYear_titleText...",© 2018The microstructure and corrosion behavio...,{'Zhang X.': None},{'60091507': {'name': 'Metallurgy and Material...,2018-10-01,{'title': 'Effects of iron content on the micr...,eng,"[EIS, Microstructure, Pitting corrosion, Polar...","{'MATE': ['Materials Science (all)'], 'PHYS': ...","{'57203845459': {'name': 'Gao X.', 'afid': '60...",[Sciences]
1,673c36ebe2e18c4ad60c5075,"{'ref_count': '19', 'ref_publishYear_titleText...",Copyright © 2018 Inderscience Enterprises Ltd....,{'Chandrachai A.': None},{'60028190': {'name': 'Chulalongkorn Universit...,2018-01-01,{'title': 'The critical factors of research an...,eng,"[Critical factors, Innovation creation, Public...",{'BUSI': ['Business and International Manageme...,"{'57202719581': {'name': 'Supapawawisit B.', '...",[Social Sciences and Humanities]
2,673c36ebe2e18c4ad60c5076,"{'ref_count': '27', 'ref_publishYear_titleText...",© 2018Background: Hyperkyphosis may be frequen...,{'Amatachaya S.': None},"{'60017165': {'name': 'Khon Kaen University', ...",2018-12-01,{'title': 'Is the occiput-wall distance valid ...,eng,"[Cobb angle, Dowager's hump, Round back, Spine]","{'HEAL': ['Physical Therapy, Sports Therapy an...","{'57194518787': {'name': 'Wiyanad A.', 'afid':...",[Health and Medicine]
3,673c36ebe2e18c4ad60c5077,"{'ref_count': '15', 'ref_publishYear_titleText...",© 2018 Society for Innovative Research. All ri...,{'Pruksakorn S.': 'Interdisciplinary Program o...,"{'60110518': {'name': 'Rajabhat University', '...",2018-01-01,{'title': 'Comparison of soil composition betw...,eng,"[Agriculture land management, Conserved area, ...","{'CHEM': ['Analytical Chemistry', 'Spectroscop...","{'57201333216': {'name': 'Pruksakorn S.', 'afi...",[Health and Medicine]
4,673c36ebe2e18c4ad60c5078,"{'ref_count': '18', 'ref_publishYear_titleText...",© 2018Background and Aims: Wire-guided biliary...,{'Bourke M.J.': None},{'60073385': {'name': 'National Taiwan Univers...,2018-06-01,{'title': 'The impact of wire caliber on ERCP ...,eng,,"{'MEDI': ['Radiology, Nuclear Medicine and Ima...","{'57191723984': {'name': 'Bassan M.S.', 'afid'...",[Health and Medicine]


In [6]:
# Detect GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Move model to GPU
model = model.to(device)

Using device: cuda


In [7]:
def get_prediction(df_papers, model):
    # Initialize columns
    df_papers['Prediction'] = 0  # Default to 0 for all rows
    df_papers['Predictions_area'] = None  # Default to None for all rows

    for index, each in df_papers.iterrows():
        # Ensure 'coredata' and 'title' exist
        if 'abstracts' in each:
            if each['abstracts'] == None:
                continue
            text = each['abstracts']
        else:
            continue  # Skip rows without 'coredata' or 'title'
        
        # Initialize labels dictionary
        labels2 = {
            "Sciences": 0,
            "Health and Medicine": 0,
            "Engineering and Technology": 0,
            "Social Sciences and Humanities": 0,
            "Mathematics and Multidisciplinary": 0
        }

        # Get predictions from the model
        entities = model.predict_entities(text, labels2.keys())

        # Update label counts
        for entity in entities:
            labels2[entity['label']] += 1
        # Find the label with the maximum count
        max_label = max(labels2, key=labels2.get)
        
        # Update Predictions_area with max_label
        df_papers.at[index, 'Predictions_area'] = max_label

        # Update Prediction if the max label is in mapped_subjectArea
        if max_label in each['mapped_subjectArea']:
            df_papers.at[index, 'Prediction'] = 1

    print('done predicting!!')

df_temp = df_papers.copy()

get_prediction(df_temp,model)



done predicting!!


In [8]:
df_temp

Unnamed: 0,_id,reference,abstracts,correspondence,affiliation,publishedDate,coreData,language,authorKeywords,subjectArea,author,mapped_subjectArea,Prediction,Predictions_area
0,673c36ebe2e18c4ad60c5074,"{'ref_count': '46', 'ref_publishYear_titleText...",© 2018The microstructure and corrosion behavio...,{'Zhang X.': None},{'60091507': {'name': 'Metallurgy and Material...,2018-10-01,{'title': 'Effects of iron content on the micr...,eng,"[EIS, Microstructure, Pitting corrosion, Polar...","{'MATE': ['Materials Science (all)'], 'PHYS': ...","{'57203845459': {'name': 'Gao X.', 'afid': '60...",[Sciences],0,Health and Medicine
1,673c36ebe2e18c4ad60c5075,"{'ref_count': '19', 'ref_publishYear_titleText...",Copyright © 2018 Inderscience Enterprises Ltd....,{'Chandrachai A.': None},{'60028190': {'name': 'Chulalongkorn Universit...,2018-01-01,{'title': 'The critical factors of research an...,eng,"[Critical factors, Innovation creation, Public...",{'BUSI': ['Business and International Manageme...,"{'57202719581': {'name': 'Supapawawisit B.', '...",[Social Sciences and Humanities],0,Sciences
2,673c36ebe2e18c4ad60c5076,"{'ref_count': '27', 'ref_publishYear_titleText...",© 2018Background: Hyperkyphosis may be frequen...,{'Amatachaya S.': None},"{'60017165': {'name': 'Khon Kaen University', ...",2018-12-01,{'title': 'Is the occiput-wall distance valid ...,eng,"[Cobb angle, Dowager's hump, Round back, Spine]","{'HEAL': ['Physical Therapy, Sports Therapy an...","{'57194518787': {'name': 'Wiyanad A.', 'afid':...",[Health and Medicine],1,Health and Medicine
3,673c36ebe2e18c4ad60c5077,"{'ref_count': '15', 'ref_publishYear_titleText...",© 2018 Society for Innovative Research. All ri...,{'Pruksakorn S.': 'Interdisciplinary Program o...,"{'60110518': {'name': 'Rajabhat University', '...",2018-01-01,{'title': 'Comparison of soil composition betw...,eng,"[Agriculture land management, Conserved area, ...","{'CHEM': ['Analytical Chemistry', 'Spectroscop...","{'57201333216': {'name': 'Pruksakorn S.', 'afi...",[Health and Medicine],0,Sciences
4,673c36ebe2e18c4ad60c5078,"{'ref_count': '18', 'ref_publishYear_titleText...",© 2018Background and Aims: Wire-guided biliary...,{'Bourke M.J.': None},{'60073385': {'name': 'National Taiwan Univers...,2018-06-01,{'title': 'The impact of wire caliber on ERCP ...,eng,,"{'MEDI': ['Radiology, Nuclear Medicine and Ima...","{'57191723984': {'name': 'Bassan M.S.', 'afid'...",[Health and Medicine],1,Health and Medicine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20211,673c3a8be2e18c4ad60c9f67,"{'ref_count': '43', 'ref_publishYear_titleText...",© 2023 by the authors.This study discusses the...,{'Chaitusaney S.': 'Department of Electrical E...,{'60028190': {'name': 'Chulalongkorn Universit...,2023-04-01,{'title': 'A Techno-Economic Assessment of a S...,eng,"[battery degradation, electric vehicle chargin...","{'COMP': ['Computer Science (miscellaneous)', ...","{'56085590400': {'name': 'Wangsupphaphol A.', ...","[Sciences, Engineering and Technology, Social ...",1,Engineering and Technology
20212,673c3a8be2e18c4ad60c9f68,"{'ref_count': '129', 'ref_publishYear_titleTex...",© 2022 ERP Environment and John Wiley & Sons L...,"{'Anantachart S.': 'Advertising Program, Depar...",{'60028190': {'name': 'Chulalongkorn Universit...,2023-01-01,{'title': 'Encouraging green product purchase:...,eng,"[attitude–behavior gap, environmental knowledg...",{'BUSI': ['Business and International Manageme...,"{'57201432588': {'name': 'Chaihanchanchai P.',...","[Sciences, Social Sciences and Humanities]",0,Health and Medicine
20213,673c3a8be2e18c4ad60c9f69,"{'ref_count': '35', 'ref_publishYear_titleText...",© 2023 The Author(s). Published by Informa UK ...,"{'Owattanapanich W.': 'Division of Hematology,...","{'60199578': {'name': 'Ramathibodi Hospital', ...",2023-01-01,{'title': 'Does leukocytosis remain a predicti...,eng,"[Acute promyelocytic leukemia, APL, Thailand]",{'MEDI': ['Hematology']},{'57203951956': {'name': 'Kungwankiattichai S....,[Health and Medicine],1,Health and Medicine
20214,673c3a8be2e18c4ad60c9f6a,"{'ref_count': '30', 'ref_publishYear_titleText...",© 2023 by Animal Bioscience.Objective: Inflamm...,"{'Tummaruk P.': 'Department of Obstetrics, Gyn...",{'60028190': {'name': 'Chulalongkorn Universit...,2023-08-01,{'title': 'Administration of ketoprofen in pos...,eng,"[Colostrum, Inflammation, Ketoprofen, Lactatio...","{'AGRI': ['Food Science', 'Animal Science and ...","{'58503344900': {'name': 'Jeeraphokhakul S.', ...","[Sciences, Health and Medicine]",1,Health and Medicine


In [9]:
df_temp['Prediction'].sum()/df_temp[df_temp['abstracts'] != None].shape[0]

0.5795409576573012