In [1]:
import requests, bs4
from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent
import pickle

import time
import random
from progressbar import ProgressBar

import numpy as np
import pandas as pd

# Webscraping the abstracts from manuscripts.

Here I'll load the list of publications, which includes the web address of each publication. Then I'll webscrape the abstracts from these publications and add them to the dataframe. The dataframe then gets saved for later analysis.

## Load the list of publications and clean it up a bit.

In [2]:
# Load each yearly sheet of the metrics workbook and stack them into one dataframe.

yrs = ["2019", "2020"]

# Read every sheet first and concatenate once. Growing the dataframe inside the
# loop re-copies all previously loaded rows on each pass and seeds the result
# with an empty frame.
df = pd.concat(
    [
        pd.read_excel("../data/2019 - 2020 JCRP Metrics -EB.xlsx",
                      sheet_name = sheet)
        for sheet in yrs
    ]
)

In [3]:
# Remove the rows in which every column is missing.

df = df.dropna(axis="index", how="all")

In [4]:
# Many Title entries read "Selected Abstracts..." and belong to the
# "Literature Update" article type. Vol 40 has no article type recorded,
# so the rows are identified by their title text instead.

is_selected_abstracts = df["Title"].str.contains("Selected Abstracts From Recent Publications")

# Keep only rows whose match result is exactly False (a missing result is not kept),
# then renumber the rows from 0.
df = df[is_selected_abstracts.eq(False)].reset_index()

## Webscrape the abstracts.

In [7]:
# Create the column that will receive each article's complete abstract,
# starting every row off as an empty string.
df.loc[:, "Full_Abstract"] = ""

In [9]:
# Use random user agent when scraping (picked once and reused for every request).
ua = UserAgent()
user_agent = {'User-agent': ua.random}

# Upper bound, in seconds, on how long a single request may block.
REQUEST_TIMEOUT_SEC = 30


def _parse_abstract(page):
    """Extract the abstract from the HTML text of one article page.

    Parameters
    ----------
    page : str
        HTML text of the article page.

    Returns
    -------
    tuple of (dict, str) or None
        ``None`` when the page has no ``div`` with id
        ``article-abstract-content1``. Otherwise a dict mapping each ``h3``
        heading (minus its last two characters) to the text of the ``p``
        element at the same position, together with the texts of all ``p``
        elements joined by single spaces.
    """
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    soup = bs(page, "html.parser")
    table = soup.find('div', attrs = {'id': 'article-abstract-content1'})
    if table is None:
        return None

    # The last two characters of every heading are dropped -- presumably a
    # trailing separator such as ": "; TODO confirm against the page markup.
    headings = [header.text[:-2] for header in table.find_all("h3")]
    paragraphs = [paragraph.text for paragraph in table.find_all("p")]

    # zip stops at the shorter list, so a page with more paragraphs than
    # headings does not raise IndexError; the extra paragraphs still end up
    # in the joined full abstract.
    sections = dict(zip(headings, paragraphs))
    return sections, " ".join(paragraphs)


# Initialize a counter to keep track of progress.
pbar = ProgressBar()

# The positional counter i is used as the row label in df.loc, which relies on
# df having been re-indexed from 0 by the earlier reset_index() call.
for i, link in enumerate(pbar(df["Link"])):

    # Ensure that there's a link. NaN never compares equal to anything
    # (not even itself), so the check has to go through pd.isna.
    if pd.isna(link):
        continue

    url = link

    try:
        response = requests.get(url, headers = user_agent, timeout = REQUEST_TIMEOUT_SEC)
    except requests.RequestException as err:
        # One unreachable link should not abort the whole scraping run.
        print(f"Oops! On df row {i} with this link ({url}),\nrequest failed: {err}")
    else:
        status = response.status_code
        if status == 200:
            parsed = _parse_abstract(response.text)

            # If there's no abstract on the page, skip it.
            if parsed is not None:
                sections, full_abstract = parsed

                # Add the individual sections as columns to dataframe.
                # Then add full abstract.
                for section_name, section_text in sections.items():
                    df.loc[i, section_name] = section_text
                df.loc[i, "Full_Abstract"] = full_abstract
        else:
            # Nothing is parsed for a failed request, so the row keeps its
            # empty abstract instead of receiving the previous page's content.
            print(f"Oops! On df row {i} with this link ({url}),\nreceived status code {status}")

    # Add (1.5-4.5sec) delay that makes the web scraping more human-like.
    delay = 1.5*random.randint(1,3)
    time.sleep(delay)

100% |########################################################################|


In [10]:
# Persist the dataframe, now including the scraped abstracts, as a pickle file.
with open('../data/dataframe_with_abstracts.pickle', mode='wb') as to_write:
    pickle.dump(df, file=to_write)

In [None]:
### Test some random selections to make sure the abstracts are correct.

In [23]:
# Spot-check one randomly chosen row by printing its title, link and scraped abstract.
# random.randint includes both endpoints, so randint(0, len(df)) can return
# len(df), which is one past the last row label; randrange excludes the upper bound.
idx = random.randrange(len(df))

print(df.loc[idx,"Title"])
print(df.loc[idx,"Link"])
print(df.loc[idx,"Full_Abstract"])

Using the 6-min Walk Test to Monitor Peak Oxygen Uptake Response to Cardiac Rehabilitation in Patients With Heart Failure
https://journals.lww.com/jcrjournal/Fulltext/2020/11000/Using_the_6_min_Walk_Test_to_Monitor_Peak_Oxygen.6.aspx
We examined the agreement between peak oxygen uptake (V˙o2peak), estimated using prediction equations from the 6-min Walk Test (6MWT), and V˙o2peak measured using a cardiopulmonary exercise test (CPX) to estimate change in V˙o2peak in patients with heart failure (HF) enrolled in cardiac rehabilitation (CR). This was secondary analysis of 54 (including 9 women) patients with HF who completed a clinical CR program. Four previously published equations using 6MWT distance were used to estimate V˙o2peak and were compared with a CPX at baseline, follow-up, and change using the standard and modified Bland-Altman method. Analyses were repeated for quartiles of cardiorespiratory fitness (CRF) based on measured V˙o2peak from the CPX. Bland-Altman plots revealed prop