In [1]:
import os
import json
from pymongo import MongoClient
from pymongo.server_api import ServerApi
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np

In [2]:
from gliner import GLiNER

model = GLiNER.from_pretrained("gliner-community/gliner_small-v2.5", load_tokenizer=True)

text = """
Hypertrophic pachymeningitis (HP) is a rare chronic inflammatory disorder characterized by marked fibrous thickening of the cerebral and/or spinal dura mater. This condition is caused by infection, inflammation, autoimmune disorder, neoplasms, or idiopathic. Magnetic resonance imaging (MRI) may play an important role in differentiating idiopathic HP from secondary HP, may avoid unnecessarily invasive dural biopsy, and prompt specific treatment. To determine the specific MRI findings for differentiation between idiopathic HP and secondary HP. A total of 34 patients underwent MRI of the brain and cervical spine from January 2003 to December 2015. In all, 23 patients were diagnosed idiopathic HP and 11 patients were secondary HP. Demographic data and imaging findings reveal the following: Configuration, thickness, signal intensity on T1-weighted image (T1WI), T2-weighted image (T2WI), and enhancement pattern of the lesions. The data were analyzed by T-test and Fisher's exact test. Secondary HP were significantly located at anterior and middle cranial fossa (P = 0.033). There is no significant difference of lesions in configurations, T1 and T2 signal intensity and patterns of enhancement. There was significant and exclusive difference in T2 hypointense/dark intensity and homogeneous enhancement in idiopathic HP (75%, P = 0.044). MRI may play a complimentarily important role in distinguishing idiopathic HP from secondary HP. Idiopathic HP is probably preferred diagnosis in the lesions with T2-rim pattern and T2 hypointense/dark intensity with homogeneous enhancement
"""

labels = {
    "Agricultural and Biological Sciences": 0,
    "Arts and Humanities": 0,
    "Biochemistry, Genetics and Molecular Biology": 0,
    "Business, Management and Accounting": 0,
    "Chemical Engineering": 0,
    "Chemistry": 0,
    "Computer Science": 0,
    "Decision Sciences": 0,
    "Dentistry": 0,
    "Earth and Planetary Sciences": 0,
    "Economics, Econometrics and Finance": 0,
    "Energy": 0,
    "Engineering": 0,
    "Environmental Science": 0,
    "Health Professions": 0,
    "Immunology and Microbiology": 0,
    "Materials Science": 0,
    "Mathematics": 0,
    "Medicine": 0,
    "Neuroscience": 0,
    "Nursing": 0,
    "Pharmacology, Toxicology and Pharmaceutics": 0,
    "Physics and Astronomy": 0,
    "Psychology": 0,
    "Social Sciences": 0,
    "Veterinary": 0,
    "Multidisciplinary": 0
}

labels2 = {
    "Sciences": 0,
    "Health and Medicine":0,
    "Engineering and Technology":0,
    "Social Sciences and Humanities":0,
    "Mathematics and Multidisciplinary":0
}


entities = model.predict_entities(text, labels2.keys())



for entity in entities:
    labels2[entity['label']] +=1
    print(entity["text"], "=>", entity["label"])


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 10 files: 100%|██████████| 10/10 [00:18<00:00,  1.86s/it]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Hypertrophic pachymeningitis => Health and Medicine
HP => Health and Medicine
inflammatory disorder => Health and Medicine
infection => Health and Medicine
inflammation => Health and Medicine
autoimmune disorder => Health and Medicine
neoplasms => Health and Medicine
idiopathic => Health and Medicine
MRI => Health and Medicine
idiopathic => Health and Medicine
HP => Health and Medicine
HP => Health and Medicine
dural biopsy => Health and Medicine
idiopathic HP => Health and Medicine
HP => Health and Medicine
idiopathic HP => Health and Medicine
HP => Health and Medicine
lesions => Health and Medicine
HP => Health and Medicine
lesions => Health and Medicine
idiopathic => Health and Medicine
HP => Health and Medicine
idiopathic => Health and Medicine
HP => Health and Medicine
HP => Health and Medicine
Idiopathic => Health and Medicine
HP => Health and Medicine


In [None]:
for k,v in labels2.items():
    if v == max(labels2.values()):
        print(k,v)

Health and Medicine 27


In [3]:
uri = "mongodb+srv://KTAP8:JhpxOn0CFlXE5mty@dsdedata.hv1co.mongodb.net/?retryWrites=true&w=majority&appName=DsdeData"
# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
db = client['DsdeData']  # Replace with your database name

papers = db['papers']

df_papers = pd.DataFrame(list(papers.find()))


In [4]:
df_papers.shape

(20215, 10)

In [30]:
df_papers.isnull().sum()

_id                  0
reference            0
abstracts          665
correspondence    2363
affiliation          0
coreData             0
language           119
authorKeywords    3762
subjectArea          0
author               0
dtype: int64

In [5]:
# Map abbreviations to categories and store them in a new column
generalized_fields = {
    "Sciences": [
        "AGRI",  # Agricultural and Biological Sciences
        "BIOC",  # Biochemistry, Genetics and Molecular Biology
        "EART",  # Earth and Planetary Sciences
        "ENVI",  # Environmental Science
        "MATE",  # Materials Science
        "PHYS"   # Physics and Astronomy
    ],
    "Health and Medicine": [
        "DENT",  # Dentistry
        "HEAL",  # Health Professions
        "IMMU",  # Immunology and Microbiology
        "MEDI",  # Medicine
        "NEUR",  # Neuroscience
        "NURS",  # Nursing
        "PHAR",  # Pharmacology, Toxicology and Pharmaceutics
        "VETE"   # Veterinary
    ],
    "Engineering and Technology": [
        "CENG",  # Chemical Engineering
        "COMP",  # Computer Science
        "ENER",  # Energy
        "ENGI"   # Engineering
    ],
    "Social Sciences and Humanities": [
        "ARTS",  # Arts and Humanities
        "BUSI",  # Business, Management and Accounting
        "DECI",  # Decision Sciences
        "ECON",  # Economics, Econometrics and Finance
        "PSYC",  # Psychology
        "SOCI"   # Social Sciences
    ],
    "Mathematics and Multidisciplinary": [
        "MATH",  # Mathematics
        "MULT"   # Multidisciplinary
    ]
}


def map_subject_area(subject_dict):
    ans = set()
    for key, value in subject_dict.items():
        for name, sub in generalized_fields.items():
            if key in sub:
                ans.add(name)
    return list(ans)

df_papers["mapped_subjectArea"] = df_papers["subjectArea"].apply(map_subject_area)

In [6]:
df_papers[0:5]

Unnamed: 0,_id,reference,abstracts,correspondence,affiliation,coreData,language,authorKeywords,subjectArea,author,mapped_subjectArea
0,67385a92da1b8d3d5cbba44c,"{'ref_count': '41', 'ref_publishYear_titleText...",Copyright © 2022 by Animal BioscienceObjective...,{'Nuengjamnong C.': 'Department of Animal Husb...,{'60028190': {'name': 'Chulalongkorn Universit...,{'title': 'Microencapsulated basil oil (Ocimum...,eng,"[Antioxidant Capacity, Basil Oil, Broiler Chic...","{'AGRI': ['Food Science', 'Animal Science and ...","{'57225954552': {'name': 'Thuekeaw S.', 'afid'...","[Sciences, Health and Medicine]"
1,67385a92da1b8d3d5cbba44d,"{'ref_count': '51', 'ref_publishYear_titleText...",© 2022 Lippincott Williams and Wilkins. All ri...,{'Lele A.V.': 'Departments of Anesthesiology a...,{'60121845': {'name': 'Cleveland Clinic Abu Dh...,{'title': 'Perceptions Regarding the SARS-CoV-...,eng,"[care delivery, COVID-19, neurocritical care, ...","{'MEDI': ['Surgery', 'Neurology (clinical)', '...","{'57159279300': {'name': 'Lele A.V.', 'afid': ...",[Health and Medicine]
2,67385a92da1b8d3d5cbba44e,"{'ref_count': '40', 'ref_publishYear_titleText...",© 2022 The Author(s). Published by Informa UK ...,"{'Maes M.': 'Department of Psychiatry, Faculty...","{'60002620': {'name': 'Faculty of Medicine, Ch...",{'title': 'Construction of a short version of ...,eng,"[Alzheimer’s disease, cognitive disorders, Mil...",{'NEUR': ['Neuroscience (all)']},"{'56021778200': {'name': 'Hemrungrojn S.', 'af...",[Health and Medicine]
3,67385a92da1b8d3d5cbba44f,"{'ref_count': '69', 'ref_publishYear_titleText...",© 2022 Author(s).Different theoretical methodo...,{'Johansson E.': 'Theoretical Physics Division...,{'60009358': {'name': 'Linköpings Universitet'...,{'title': 'The effect of strain and pressure o...,eng,,{'PHYS': ['Physics and Astronomy (all)']},"{'57225126420': {'name': 'Johansson E.', 'afid...",[Sciences]
4,67385a92da1b8d3d5cbba450,"{'ref_count': '46', 'ref_publishYear_titleText...","© 2022, Journal of Exercise Physiology Online....",{'Chaunchaiyakul R.': None},{'60028190': {'name': 'Chulalongkorn Universit...,{'title': 'Dynamic Cardiopulmonary and Metabol...,eng,"[Bra, Cardiorespiratory function, Metabolism]",{'MEDI': ['Physiology (medical)']},"{'57208242465': {'name': 'Masodsai K.', 'afid'...",[Health and Medicine]


In [7]:
def get_prediction(df_papers, model):
    # Initialize columns
    df_papers['Prediction'] = 0  # Default to 0 for all rows
    df_papers['Predictions_area'] = None  # Default to None for all rows

    for index, each in df_papers.iterrows():
        # Ensure 'coredata' and 'title' exist
        if 'coreData' in each and each['coreData'] and 'title' in each['coreData']:
            text = each['coreData']['title']
        else:
            continue  # Skip rows without 'coredata' or 'title'
        
        # Initialize labels dictionary
        labels2 = {
            "Sciences": 0,
            "Health and Medicine": 0,
            "Engineering and Technology": 0,
            "Social Sciences and Humanities": 0,
            "Mathematics and Multidisciplinary": 0
        }

        # Get predictions from the model
        entities = model.predict_entities(text, labels2.keys())

        # Update label counts
        for entity in entities:
            labels2[entity['label']] += 1
        # Find the label with the maximum count
        max_label = max(labels2, key=labels2.get)
        
        # Update Predictions_area with max_label
        df_papers.at[index, 'Predictions_area'] = max_label

        # Update Prediction if the max label is in mapped_subjectArea
        if max_label in each['mapped_subjectArea']:
            df_papers.at[index, 'Prediction'] = 1

    print('done predicting!!')

df_temp = df_papers.copy()

get_prediction(df_temp,model)

done predicting!!


In [8]:
df_temp

Unnamed: 0,_id,reference,abstracts,correspondence,affiliation,coreData,language,authorKeywords,subjectArea,author,mapped_subjectArea,Prediction,Predictions_area
0,67385a92da1b8d3d5cbba44c,"{'ref_count': '41', 'ref_publishYear_titleText...",Copyright © 2022 by Animal BioscienceObjective...,{'Nuengjamnong C.': 'Department of Animal Husb...,{'60028190': {'name': 'Chulalongkorn Universit...,{'title': 'Microencapsulated basil oil (Ocimum...,eng,"[Antioxidant Capacity, Basil Oil, Broiler Chic...","{'AGRI': ['Food Science', 'Animal Science and ...","{'57225954552': {'name': 'Thuekeaw S.', 'afid'...","[Sciences, Health and Medicine]",1,Health and Medicine
1,67385a92da1b8d3d5cbba44d,"{'ref_count': '51', 'ref_publishYear_titleText...",© 2022 Lippincott Williams and Wilkins. All ri...,{'Lele A.V.': 'Departments of Anesthesiology a...,{'60121845': {'name': 'Cleveland Clinic Abu Dh...,{'title': 'Perceptions Regarding the SARS-CoV-...,eng,"[care delivery, COVID-19, neurocritical care, ...","{'MEDI': ['Surgery', 'Neurology (clinical)', '...","{'57159279300': {'name': 'Lele A.V.', 'afid': ...",[Health and Medicine],1,Health and Medicine
2,67385a92da1b8d3d5cbba44e,"{'ref_count': '40', 'ref_publishYear_titleText...",© 2022 The Author(s). Published by Informa UK ...,"{'Maes M.': 'Department of Psychiatry, Faculty...","{'60002620': {'name': 'Faculty of Medicine, Ch...",{'title': 'Construction of a short version of ...,eng,"[Alzheimer’s disease, cognitive disorders, Mil...",{'NEUR': ['Neuroscience (all)']},"{'56021778200': {'name': 'Hemrungrojn S.', 'af...",[Health and Medicine],0,Mathematics and Multidisciplinary
3,67385a92da1b8d3d5cbba44f,"{'ref_count': '69', 'ref_publishYear_titleText...",© 2022 Author(s).Different theoretical methodo...,{'Johansson E.': 'Theoretical Physics Division...,{'60009358': {'name': 'Linköpings Universitet'...,{'title': 'The effect of strain and pressure o...,eng,,{'PHYS': ['Physics and Astronomy (all)']},"{'57225126420': {'name': 'Johansson E.', 'afid...",[Sciences],0,Engineering and Technology
4,67385a92da1b8d3d5cbba450,"{'ref_count': '46', 'ref_publishYear_titleText...","© 2022, Journal of Exercise Physiology Online....",{'Chaunchaiyakul R.': None},{'60028190': {'name': 'Chulalongkorn Universit...,{'title': 'Dynamic Cardiopulmonary and Metabol...,eng,"[Bra, Cardiorespiratory function, Metabolism]",{'MEDI': ['Physiology (medical)']},"{'57208242465': {'name': 'Masodsai K.', 'afid'...",[Health and Medicine],1,Health and Medicine
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20210,67385edfda1b8d3d5cbbf33e,"{'ref_count': '19', 'ref_publishYear_titleText...",© 2018 The authors and IOS Press. All rights r...,{'Kingpaiboon S.': None},"{'60017165': {'name': 'Khon Kaen University', ...",{'title': 'Estimating actual evapotranspiratio...,eng,"[Actual evapotranspiration, Landsat 8, NDVI, R...",{'COMP': ['Artificial Intelligence']},"{'57205326624': {'name': 'Jermthaisong P.', 'a...",[Engineering and Technology],1,Engineering and Technology
20211,67385edfda1b8d3d5cbbf33f,"{'ref_count': '27', 'ref_publishYear_titleText...",© 2018 The Author(s).We have performed a genom...,{'Sawai H.': None},{'60178602': {'name': 'Graduate School of Medi...,{'title': 'Genome-wide association study ident...,eng,,{'MULT': ['Multidisciplinary']},"{'7102263841': {'name': 'Sawai H.', 'afid': '6...",[Mathematics and Multidisciplinary],0,Health and Medicine
20212,67385edfda1b8d3d5cbbf340,"{'ref_count': '21', 'ref_publishYear_titleText...","© 2018, Medical Association of Thailand. All r...",{'Kitisomprayoonkul W.': None},{'60028190': {'name': 'Chulalongkorn Universit...,{'title': 'Effects of transcranial direct curr...,eng,"[Motor cortex, Rehabilitation, Stroke, Transcr...",{'MEDI': ['Medicine (all)']},"{'57200751864': {'name': 'Utarapichat S.', 'af...",[Health and Medicine],1,Health and Medicine
20213,67385edfda1b8d3d5cbbf341,"{'ref_count': '29', 'ref_publishYear_titleText...",© 2018 Chulalongkorn University Printing House...,{'Benjanirut C.': None},{'60028190': {'name': 'Chulalongkorn Universit...,{'title': 'Prevalence and risk factors for can...,eng,"[Canine cognitive dysfunction syndrome, Preval...",{'VETE': ['Veterinary (all)']},"{'7801357061': {'name': 'Benjanirut C.', 'afid...",[Health and Medicine],1,Health and Medicine


In [11]:
df_temp['Prediction'].sum()/df_temp[df_temp['coreData'] != None].shape[0]

0.6088053425674005