In [5]:
import re
from bs4 import BeautifulSoup
import requests

def clean_text(text):
    """Normalize whitespace in a scraped string.

    Every run of whitespace characters is collapsed into a single space and
    leading/trailing whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Collapse first, then trim: any leading/trailing run has become a single
    # space at this point, which strip() removes.
    return re.sub(r'\s+', ' ', text).strip()



def get_paper(url):
    """Scrape the metadata of a single article page.

    The article page at ``url`` is downloaded and searched for the link inside
    the ``li.locale_pt_PT`` element; when that link exists, the page it points
    to is downloaded and used for extraction. When it does not exist, the page
    at ``url`` itself is used instead of failing.

    Args:
        url: Address of the article page.

    Returns:
        A dict that always contains ``"url"`` (the address passed in) and, for
        each element found on the page, any of ``"abstract"``, ``"keywords"``,
        ``"authors"`` (HTML of the authors list), ``"doi"``, ``"pdf"``,
        ``"publish_date"`` and ``"title"``.

    Raises:
        requests.RequestException: If a download fails, times out or returns
            an HTTP error status.
    """
    timeout = 30  # seconds; without a timeout a stalled server blocks forever
    res = {}

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Follow the pt_PT locale link when the page offers one; otherwise fall
    # back to the page already downloaded.
    # NOTE(review): presumably a page without that link is already shown in
    # its only available language - confirm against the site.
    soup_pt = soup
    li = soup.find("li", class_="locale_pt_PT")
    a_pt = li.a if li is not None else None
    if a_pt is not None and a_pt.get("href"):
        response = requests.get(a_pt["href"], timeout=timeout)
        response.raise_for_status()
        soup_pt = BeautifulSoup(response.text, "html.parser")

    section_abstract = soup_pt.find("section", class_="item abstract")
    section_keywords = soup_pt.find("section", class_="item keywords")
    section_authors = soup_pt.find("section", class_="item authors")
    section_doi = soup_pt.find("section", class_="item doi")
    pdf = soup_pt.find("a", class_="obj_galley_link pdf")
    publish_date = soup_pt.find("div", class_="item published")
    title = soup_pt.find("h1", class_="page_title")

    # Every nested lookup is checked before use: a missing child element
    # simply leaves the corresponding key out of the result.
    if section_abstract is not None:
        res["abstract"] = clean_text(section_abstract.text)
    if section_keywords is not None and section_keywords.span is not None:
        res["keywords"] = clean_text(section_keywords.span.text)
    if section_authors is not None and section_authors.ul is not None:
        # Kept as raw HTML of the <ul>, not as plain text.
        res["authors"] = str(section_authors.ul)
    if section_doi is not None and section_doi.a is not None:
        res["doi"] = clean_text(section_doi.a.text)
    if pdf is not None and pdf.get("href"):
        res["pdf"] = pdf["href"]
    if publish_date is not None and publish_date.section is not None:
        date_div = publish_date.section.div
        if date_div is not None:
            res["publish_date"] = clean_text(date_div.text)
    if title is not None:
        res["title"] = clean_text(title.text)
    res["url"] = url

    return res

def get_papers(url):
    """Scrape every article listed on an issue page.

    Each ``div.obj_article_summary`` on the page is expected to hold the
    article link in ``h3 > a``; summaries without such a link are skipped.
    Every article URL is printed before it is scraped with ``get_paper``.

    Args:
        url: Address of the issue page.

    Returns:
        A dict mapping the last path segment of each article URL to the dict
        returned by ``get_paper`` for that article.

    Raises:
        requests.RequestException: If the issue page download fails, times
            out or returns an HTTP error status.
    """
    res = {}
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    for div in soup.find_all("div", class_="obj_article_summary"):
        heading = div.h3
        link = heading.a if heading is not None else None
        if link is None or not link.get("href"):
            # Summary block without a usable article link: nothing to scrape.
            continue
        paper_url = link["href"]
        print(paper_url)
        paper = get_paper(paper_url)
        # Strip a trailing slash first so the id is never the empty string.
        paper_id = paper_url.rstrip("/").split("/")[-1]
        res[paper_id] = paper
    return res

def process_div(div, res):
    """Scrape the issue referenced by one issue-summary element.

    Looks for the ``a.title`` link inside ``div``. When it is present, its
    address is printed, the issue is scraped with ``get_papers`` and the
    resulting papers are merged into ``res``. When it is absent, nothing
    happens.

    Args:
        div: Element to search for the issue title link.
        res: Dict updated in place with the scraped papers.
    """
    title_link = div.find("a", class_="title")
    if not title_link:
        return

    issue_url = title_link["href"]
    print(issue_url)
    res.update(get_papers(issue_url))

def get_chapters(url,res):
    """Scrape every issue listed on an archive page into ``res``.

    The page is searched for ``ul.issues_archive``; each
    ``div.obj_issue_summary`` inside it is handed to ``process_div``. When the
    page has no such list, ``res`` is left untouched.

    Args:
        url: Address of the issue archive page.
        res: Dict updated in place with the scraped papers.

    Raises:
        requests.RequestException: If the archive page download fails, times
            out or returns an HTTP error status.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    ul = soup.find("ul", class_="issues_archive")
    if ul is None:
        # No archive list on this page: nothing to scrape.
        return

    for div in ul.find_all("div", class_="obj_issue_summary"):
        process_div(div, res)

# Scrape the first archive page; the papers found are accumulated in `res`,
# keyed by the last path segment of each article URL.
res = {}
url = "https://casereports.spmi.pt/index.php/cr/issue/archive/1"
get_chapters(url,res)



https://casereports.spmi.pt/index.php/cr/issue/view/2
https://casereports.spmi.pt/index.php/cr/article/view/5
https://casereports.spmi.pt/index.php/cr/article/view/6
https://casereports.spmi.pt/index.php/cr/article/view/7
https://casereports.spmi.pt/index.php/cr/article/view/8
https://casereports.spmi.pt/index.php/cr/article/view/9
https://casereports.spmi.pt/index.php/cr/article/view/10
https://casereports.spmi.pt/index.php/cr/article/view/11
https://casereports.spmi.pt/index.php/cr/article/view/20
https://casereports.spmi.pt/index.php/cr/article/view/23
https://casereports.spmi.pt/index.php/cr/article/view/12
https://casereports.spmi.pt/index.php/cr/article/view/13
https://casereports.spmi.pt/index.php/cr/article/view/14
https://casereports.spmi.pt/index.php/cr/article/view/15
https://casereports.spmi.pt/index.php/cr/article/view/17
https://casereports.spmi.pt/index.php/cr/article/view/21
https://casereports.spmi.pt/index.php/cr/article/view/16
https://casereports.spmi.pt/index.php/c

In [8]:
import json

# Persist the scraped papers. The `with` block closes (and therefore flushes)
# the file even when json.dump raises.
# NOTE(review): open() is called without an explicit encoding, so the platform
# default applies while ensure_ascii=False writes raw non-ASCII characters;
# consider passing the same explicit encoding here and wherever the file is
# read back.
with open("resumos_spmi.json","w") as f_out:
    json.dump(res,f_out,indent=4,ensure_ascii=False)

### Merge the new articles into the existing article list

In [9]:
#load json
import json
# Read the scraped papers back; the `with` block makes sure the file handle is
# closed once the JSON has been parsed.
with open("resumos_spmi.json","r") as f:
    res = json.load(f)


In [12]:
# Append the scraped SPMI case reports to the existing article dataset and
# save the merged table to a new CSV. Dataset columns this cell fills:
# Unnamed: 0.1,Unnamed: 0,link,title,journal,category,authors,affiliations,acceptance Date,publication Date,ISSN,abstract,keywords,articles,language

import pandas as pd

df_old = pd.read_csv("dataset_articles.csv")

# Each new row is built as a dict keyed by column name, so the merge does not
# depend on the number or order of columns in the CSV. Fields a paper does not
# have (get_paper only sets the keys it found) become empty cells.
first_new_row = len(df_old)
new_rows = []
for offset, paper in enumerate(res.values()):
    row_number = first_new_row + offset
    row = {
        "link": paper.get("url"),
        "title": paper.get("title"),
        "journal": "SPMI",
        "category": "Casos Clínicos",
        "authors": paper.get("authors"),
        "affiliations": None,
        "acceptance Date": None,
        "publication Date": paper.get("publish_date"),
        "ISSN": "ISSN 2975-822X",
        "abstract": paper.get("abstract"),
        "keywords": paper.get("keywords"),
        "articles": None,
        "language": "pt",
    }
    # The "Unnamed: ..." columns receive the running row number, however many
    # of them the CSV happens to contain.
    for column in df_old.columns:
        if str(column).startswith("Unnamed:"):
            row[column] = row_number
    new_rows.append(row)

# Concatenate once instead of growing the frame row by row; skip the concat
# entirely when nothing was scraped.
if new_rows:
    df_new = pd.DataFrame(new_rows, columns=df_old.columns)
    df_old = pd.concat([df_old, df_new], ignore_index=True)

# Save the merged table without writing the index as an extra column.
df_old.to_csv("dataset_articles_2.csv",index=False)


df_old

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,link,title,journal,category,authors,affiliations,acceptance Date,publication Date,ISSN,abstract,keywords,articles,language
0,0,0,https://casosclinicosonline.spmi.pt/artigos_co...,VASCULITE SECUNDÁRIA A INFEÇÃO PULMONAR POR EN...,Casos Clínicos,Doença Infecciosa e Parasitárias,"C. Pais, A. R. Lima, A. Morais",\r\n + \r\n\r\n - \r\n\r\n ...,21-01-2018,23-03-2018,2183-7546,As vasculites caracterizam-se por inflamação e...,"Vasculite sistémica, Entamoeba histolytica, am...",Introdução \r\nA Entamoeba histolytica é um pr...,pt
1,1,1,https://casosclinicosonline.spmi.pt/artigos_co...,PROTEINOSE ALVEOLAR PULMONAR: DIAGNÓSTICO DIFE...,Casos Clínicos,Doenças Respiratórias,"Cátia Canelas, Juliana Pinho, João Carvas, Joa...",\r\n + \r\n\r\n - \r\n\r\n ...,27-06-2018,12-09-2018,2183-7546,Proteinose alveolar pulmonar é uma patologia r...,"Proteinose Alveolar Pulmonar, Teleradiografia ...",INTRODUÇÃO\r\nA proteinose alveolar pulmonar (...,pt
2,2,2,https://casosclinicosonline.spmi.pt/artigos_co...,UM CASO DE INTOXICAÇÃO AGUDA POR COCAÍNA,Casos Clínicos,Outros,"Margarida Eulálio1, Arsénio Santos2, Augusta C...",\r\n + \r\n\r\n - \r\n\r\n ...,21-05-2016,05-02-2017,2183-7546,A cocaína é uma droga de uso generalizado e o ...,"cocaína, insuficiência renal aguda, hepatite a...",INTRODUÇÃO\r\nA cocaína é um alcalóide extraíd...,pt
3,3,3,https://casosclinicosonline.spmi.pt/artigos_co...,PENFIGOIDE BOLHOSO INDUZIDO POR INIBIDOR DA DI...,Imagens em Medicina,Doenças Autoimunes e vasculites,"Mylene Costa (0000-0002-5132-0632)1, Francisco...",\r\n + \r\n\r\n - \r\n\r\n ...,09-05-2019,28-06-2019,2183-7546,.,PENFIGOIDE BOLHOSO; INIBIDOR DA DIPEPTIDIL PEP...,O penfigóide bolhoso (PB) é uma dermatose bolh...,pt
4,4,4,https://casosclinicosonline.spmi.pt/artigos_co...,DOENÇA DE KIKUCHI-FUJIMOTO E INFECÇÃO POR EBV,Casos Clínicos,Doenças Autoimunes e vasculites,Filipa Cardoso1 (https://orcid.org/0000-0002-1...,\r\n + \r\n\r\n - \r\n\r\n ...,22-04-2021,28-06-2021,2183-7546,A doença de Kikuchi-Fujimoto (DKF) é uma condi...,Doença de Kikuchi-Fujimoto; Linfadenite histio...,INTRODUCTION Kikuchi-Fujimoto disease (KFD) al...,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,859,859,https://casereports.spmi.pt/index.php/cr/artic...,Fungal Endocarditis: A Successful Case with Me...,SPMI,Casos Clínicos,"<ul class=""authors"">\n<li>\n<span class=""name""...",,,26-09-2024,ISSN 2975-822X,Resumo A endocardite fúngica é uma condição ra...,"Antifúngicos/uso terapêutico, Candida, Endocar...",,pt
860,860,860,https://casereports.spmi.pt/index.php/cr/artic...,Between the Miracle of Life and the Complexity...,SPMI,Casos Clínicos,"<ul class=""authors"">\n<li>\n<span class=""name""...",,,26-09-2024,ISSN 2975-822X,,"Embolia de Líquido Amniótico, Gravidez",,pt
861,861,861,https://casereports.spmi.pt/index.php/cr/artic...,A Rare Triad: Plummer Vinson Syndrome,SPMI,Casos Clínicos,"<ul class=""authors"">\n<li>\n<span class=""name""...",,,26-09-2024,ISSN 2975-822X,,Síndrome de Plummer Vinson/diagnóstico por imagem,,pt
862,862,862,https://casereports.spmi.pt/index.php/cr/artic...,Calciphylaxis: A Rare Cause for Lower Extremit...,SPMI,Casos Clínicos,"<ul class=""authors"">\n<li>\n<span class=""name""...",,,26-09-2024,ISSN 2975-822X,,"Calcifilaxia, Membro Inferior, Pé, Perna, Úlce...",,pt
