In [19]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import re

# Display every column when a DataFrame is rendered (None removes the column cap).
# Called through the documented top-level API rather than the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [20]:
# Load the article collection (one row per record, including its abstract) from the data folder.
# A forward slash is used because "\c" is an invalid escape sequence in a normal string literal
# and a backslash is not a path separator outside Windows; "/" works on every platform.
dataset = pd.read_csv("data/collection_with_abstracts.csv")

In [21]:
# Preview the first five rows to sanity-check the load.
dataset.head()

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Abstract
0,39435445,Editorial: The operationalization of cognitive...,"Winter M, Probst T, Tallon M, Schobel J, Pryss R.",Front Neurosci. 2024 Oct 7;18:1501636. doi: 10...,Winter M,Front Neurosci,2024,2024/10/22,PMC11491427,,10.3389/fnins.2024.1501636,
1,39398866,Characterization of arteriosclerosis based on ...,"Zhou J, Li X, Demeke D, Dinh TA, Yang Y, Janow...",J Med Imaging (Bellingham). 2024 Sep;11(5):057...,Zhou J,J Med Imaging (Bellingham),2024,2024/10/14,PMC11466048,,10.1117/1.JMI.11.5.057501,PURPOSE: Our purpose is to develop a computer ...
2,39390053,Multi-scale input layers and dense decoder agg...,"Lan X, Jin W.",Sci Rep. 2024 Oct 10;14(1):23729. doi: 10.1038...,Lan X,Sci Rep,2024,2024/10/10,PMC11467340,,10.1038/s41598-024-74701-0,Accurate segmentation of COVID-19 lesions from...
3,39367648,An initial game-theoretic assessment of enhanc...,"Fatemi MY, Lu Y, Diallo AB, Srinivasan G, Azhe...",Brief Bioinform. 2024 Sep 23;25(6):bbae476. do...,Fatemi MY,Brief Bioinform,2024,2024/10/05,PMC11452536,,10.1093/bib/bbae476,The application of deep learning to spatial tr...
4,39363262,Truncated M13 phage for smart detection of E. ...,"Yuan J, Zhu H, Li S, Thierry B, Yang CT, Zhang...",J Nanobiotechnology. 2024 Oct 3;22(1):599. doi...,Yuan J,J Nanobiotechnology,2024,2024/10/04,PMC11451008,,10.1186/s12951-024-02881-y,BACKGROUND: The urgent need for affordable and...


In [22]:
# (rows, columns) of the loaded frame.
dataset.shape

(11450, 12)

In [23]:
# Column labels exactly as read from the CSV header (before any renaming).
# Left as the cell's last expression so the notebook displays it; no print() needed.
dataset.columns

Index(['PMID', 'Title', 'Authors', 'Citation', 'First Author', 'Journal/Book',
       'Publication Year', 'Create Date', 'PMCID', 'NIHMS ID', 'DOI',
       'Abstract'],
      dtype='object')


# Data Preprocessing

In [24]:
# Normalise the header labels: trim surrounding whitespace, turn spaces into
# underscores and lower-case everything. Other punctuation is left untouched,
# so "Journal/Book" becomes "journal/book".

dataset.columns = (
    dataset.columns.str.strip()
    .str.replace(" ", "_", regex=False)
    .str.lower()
)

Checking for null values

In [25]:
# Count missing entries per column.
dataset.isna().sum()

pmid                    0
title                   0
authors                 0
citation                0
first_author            0
journal/book            0
publication_year        0
create_date             0
pmcid                5000
nihms_id            10494
doi                   481
abstract              213
dtype: int64

In [27]:
# Fill the missing values in "pmcid", "nihms_id", "doi" and "abstract" with the placeholder "Unavailable".
# NOTE(review): the placeholder is an ordinary string, so any later text processing of "abstract"
# will presumably need to skip these rows explicitly — confirm downstream.

columns_with_gaps = ['pmcid', 'nihms_id', 'doi', 'abstract']
dataset[columns_with_gaps] = dataset[columns_with_gaps].fillna("Unavailable")

In [28]:
# Re-count missing entries per column after the fill.
dataset.isna().sum()

pmid                0
title               0
authors             0
citation            0
first_author        0
journal/book        0
publication_year    0
create_date         0
pmcid               0
nihms_id            0
doi                 0
abstract            0
dtype: int64

In [29]:
# Re-inspect the first five rows with the renamed columns and filled-in values.
dataset.head()

Unnamed: 0,pmid,title,authors,citation,first_author,journal/book,publication_year,create_date,pmcid,nihms_id,doi,abstract
0,39435445,Editorial: The operationalization of cognitive...,"Winter M, Probst T, Tallon M, Schobel J, Pryss R.",Front Neurosci. 2024 Oct 7;18:1501636. doi: 10...,Winter M,Front Neurosci,2024,2024/10/22,PMC11491427,Unavailable,10.3389/fnins.2024.1501636,Unavailable
1,39398866,Characterization of arteriosclerosis based on ...,"Zhou J, Li X, Demeke D, Dinh TA, Yang Y, Janow...",J Med Imaging (Bellingham). 2024 Sep;11(5):057...,Zhou J,J Med Imaging (Bellingham),2024,2024/10/14,PMC11466048,Unavailable,10.1117/1.JMI.11.5.057501,PURPOSE: Our purpose is to develop a computer ...
2,39390053,Multi-scale input layers and dense decoder agg...,"Lan X, Jin W.",Sci Rep. 2024 Oct 10;14(1):23729. doi: 10.1038...,Lan X,Sci Rep,2024,2024/10/10,PMC11467340,Unavailable,10.1038/s41598-024-74701-0,Accurate segmentation of COVID-19 lesions from...
3,39367648,An initial game-theoretic assessment of enhanc...,"Fatemi MY, Lu Y, Diallo AB, Srinivasan G, Azhe...",Brief Bioinform. 2024 Sep 23;25(6):bbae476. do...,Fatemi MY,Brief Bioinform,2024,2024/10/05,PMC11452536,Unavailable,10.1093/bib/bbae476,The application of deep learning to spatial tr...
4,39363262,Truncated M13 phage for smart detection of E. ...,"Yuan J, Zhu H, Li S, Thierry B, Yang CT, Zhang...",J Nanobiotechnology. 2024 Oct 3;22(1):599. doi...,Yuan J,J Nanobiotechnology,2024,2024/10/04,PMC11451008,Unavailable,10.1186/s12951-024-02881-y,BACKGROUND: The urgent need for affordable and...


All null values have been handled: every column now reports zero missing entries.

In [30]:
# Write the cleaned frame to a CSV in the working directory; index=False leaves the row index out of the file.
dataset.to_csv("preprocessed_data.csv", index=False)