# Title
[]()

In [2]:

import pandas as pd
import sys
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
from silvhua import *
from datetime import datetime
# sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\portfolio-projects\online-PT-social-media-NLP\src")
# import json
# from pandas import json_normalize  
# from plotly.subplots import make_subplots
# import requests

In [3]:
# set the option to wrap text within cells
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# set up

In [5]:
root_article_dict = dict()
text_dict = dict()
root_display_dict = dict()
partial_article_dict = dict()

# from `crawl.py`

In [None]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            # article_dict[index] = dict()
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        # print('Initiation')
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        text = response.xpath('//h2|//p|//h3|//h4').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function:
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")

iteration_id = 1
article_dict = dict()

# Processing scripts from [Jun 24 notebook](../notebooks/2023-06-24%20process%20scraped%20articles.ipynb)

In [None]:
from IPython import display

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
            # print(f'\tArticle type: {article_type}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


root_display_dict = dict()
text_dict = dict()
iteration = 2.2
text_id = 2

# regex = r'(.*)'
# text_dict[iteration], display_dict = text_dict_from_web(
#     partial_article_dict[text_id], to_display='all', header=(2,4), article_regex_str=regex, abs_regex_str=regex)

text_dict[iteration], display_dict = text_dict_from_web(
    partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

root_display_dict[iteration] = display_dict
display_html(display_dict, type='abstract')



# Iteration 1: Update & Scrape

In [5]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        text = response.xpath('//h2|//p|//h3|//h4').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function:
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")

iteration_id = 1
article_dict = dict()
n_articles = 'all'
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict

article_titles(article_dict)

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 44 articles and 22 URLs for Applied Physiology, Nutrition, and Metabolism
	Corrected number of article titles: 22
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'text']
0.0: Nutrition and immunity: perspectives on key issues and next steps
	Applied Physiology, Nutrition, and Metabolism https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R

0.01: Prevalence of sarcopenia indicators and sub-optimal protein intake among elective total joint replacement patients
	Applied Physiology, Nutrition, and Metabolism https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0125?af=R

0.02: The impact of COVID-19 on pulmonary function and airway reactivity after recovery in college-aged adults
	Applied Physiology, Nutrition, and Metabolism https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0410?af=R

0.03: Perception of exercise-induced dyspnea after experimentally

In [8]:
print(article_dict.keys())
article_dict

dict_keys([0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21])


{0.0: {'journal': 'Applied Physiology, Nutrition, and Metabolism',
  'title': 'Nutrition and immunity: perspectives on key issues and next steps',
  'url': 'https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R',
  'text': '\n<h2>Create a new account</h2>\n<h2>Request Username</h2>\n<p>Can\'t sign in? Forgot your username?</p>\n<p class="sub">Enter your email address below and we will send you your username</p>\n<p>If the address matches an existing account you will receive an email with instructions to retrieve your username</p>\n<h2>Change Password </h2>\n<h2>Password Changed Successfully</h2>\n<p>Your password has been changed</p>\n<p>Can\'t sign in? Forgot your password?</p>\n<p class="sub">Enter your email address below and we will send you the reset instructions</p>\n<p>If the address matches an existing account you will receive an email with instructions to reset your password</p>\n<h2>Verify Phone</h2>\n<h2>Congrats!</h2>\n<p>Your Phone has been verified</p>\n<h2 proper

## Update processing script

In [17]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        text = response.xpath('//h2|//p|//h3|//h4').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function:
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
            # print(f'\tArticle type: {article_type}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration_id = 1
text_id = 1
n_articles = 'all'
####
# article_dict = dict()
# run_RSS_spider(n_articles)
# root_article_dict[iteration_id] = article_dict

# article_titles(article_dict)

# regex = r'(.*)'
# text_dict[iteration], display_dict = text_dict_from_web(
#     partial_article_dict[text_id], to_display='all', header=(2,4), article_regex_str=regex, abs_regex_str=regex)

partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

text_dict[iteration_id], display_dict = text_dict_from_web(
    partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

# root_display_dict[iteration] = display_dict
# display_html(display_dict, type='abstract')

Keys for article_dict: [0.0, 0.01]
Journals:
	Applied Physiology, Nutrition, and Metabolism
header: [234]
Regex patterns: 
	.*<h[234].*?>Abstract</h[234]>.*(?:(?:Introduction|Background).*)?(<h[234].*?>1?.?\s?(?:Introduction|Background)</h[234]>.*References)<.*
	.*(<h[234].*?>Abstract</h[234]>.*(?:(?:Introduction|Background).*)?)<h[234].*?>1?.?\s?(?:Introduction|Background)</h[234]>.*References<.*
Journal: Applied Physiology, Nutrition, and Metabolism 0.0
	An error occurred on line 118 in C:\Users\silvh\AppData\Local\Temp\ipykernel_23620\1185238394.py: 'NoneType' object has no attribute 'group'
		Unable to parse article text
Journal: Applied Physiology, Nutrition, and Metabolism 0.01
	An error occurred on line 118 in C:\Users\silvh\AppData\Local\Temp\ipykernel_23620\1185238394.py: 'NoneType' object has no attribute 'group'
		Unable to parse article text
text_dict keys: [0.0, 0.01]


In [18]:
root_display_dict[iteration_id] = display_dict
display_html(display_dict, type='abstract')


************************************* Start *************************************


************************************* Start *************************************


# iteration 2: update scraper

In [19]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function:
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")

iteration_id = 2
article_dict = dict()
n_articles = 2
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict

article_titles(article_dict)

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 44 articles and 22 URLs for Applied Physiology, Nutrition, and Metabolism
	Corrected number of article titles: 22
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'text']
0.0: Nutrition and immunity: perspectives on key issues and next steps
	Applied Physiology, Nutrition, and Metabolism https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R

0.01: Prevalence of sarcopenia indicators and sub-optimal protein intake among elective total joint replacement patients
	Applied Physiology, Nutrition, and Metabolism https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0125?af=R



In [20]:
article_dict

{0.0: {'journal': 'Applied Physiology, Nutrition, and Metabolism',
  'title': 'Nutrition and immunity: perspectives on key issues and next steps',
  'url': 'https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R',
  'text': '\n<h2>Create a new account</h2>\n<h2>Request Username</h2>\n<p>Can\'t sign in? Forgot your username?</p>\n<p class="sub">Enter your email address below and we will send you your username</p>\n<p>If the address matches an existing account you will receive an email with instructions to retrieve your username</p>\n<h2>Change Password </h2>\n<h2>Password Changed Successfully</h2>\n<p>Your password has been changed</p>\n<p>Can\'t sign in? Forgot your password?</p>\n<p class="sub">Enter your email address below and we will send you the reset instructions</p>\n<p>If the address matches an existing account you will receive an email with instructions to reset your password</p>\n<h2>Verify Phone</h2>\n<h2>Congrats!</h2>\n<p>Your Phone has been verified</p>\n<h2 proper

## 2.1

In [22]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function:
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
            # print(f'\tArticle type: {article_type}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration_id = 2.1
text_id = iteration_id
n_articles = 2
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict

# article_titles(article_dict)


partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

text_dict[iteration_id], display_dict = text_dict_from_web(
    partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

root_display_dict[iteration_id] = display_dict
display_html(display_dict, type='body')

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 44 articles and 22 URLs for Applied Physiology, Nutrition, and Metabolism
	Corrected number of article titles: 22
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'text']
Keys for article_dict: [0.0, 0.01]
Journals:
	Applied Physiology, Nutrition, and Metabolism
header: [234]
Regex patterns: 
	.*<h[234].*?>Abstract</h[234]>.*(?:(?:Introduction|Background).*)?(<h[234].*?>1?.?\s?(?:Introduction|Background)</h[234]>.*References)<.*
	.*(<h[234].*?>Abstract</h[234]>.*(?:(?:Introduction|Background).*)?)<h[234].*?>1?.?\s?(?:Introduction|Background)</h[234]>.*References<.*
Journal: Applied Physiology, Nutrition, and Metabolism 0.0
	An error occurred on line 118 in C:\Users\silvh\AppData\Local\Temp\ipykernel_23620\1952865074.py: 'NoneType' object has no attribute 'group'
		Unable to parse article text
Journal: Applied Physiology, Nutrition, and Metabolism 0.01
	An e

************************************* Start *************************************


In [24]:
article_dict

{0.0: {'journal': 'Applied Physiology, Nutrition, and Metabolism',
  'title': 'Nutrition and immunity: perspectives on key issues and next steps',
  'url': 'https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R',
  'text': '\n<h2>Create a new account</h2>\n<h2>Request Username</h2>\n<p>Can\'t sign in? Forgot your username?</p>\n<p class="sub">Enter your email address below and we will send you your username</p>\n<p>If the address matches an existing account you will receive an email with instructions to retrieve your username</p>\n<h2>Change Password </h2>\n<h2>Password Changed Successfully</h2>\n<p>Your password has been changed</p>\n<p>Can\'t sign in? Forgot your password?</p>\n<p class="sub">Enter your email address below and we will send you the reset instructions</p>\n<p>If the address matches an existing account you will receive an email with instructions to reset your password</p>\n<h2>Verify Phone</h2>\n<h2>Congrats!</h2>\n<p>Your Phone has been verified</p>\n<h2 proper

## 2.2 troubleshoot

In [25]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        text = response.xpath('//div[@role="paragraph"]').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function:
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
            # print(f'\tArticle type: {article_type}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration_id = 2.2
text_id = iteration_id
n_articles = 2
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict

# article_titles(article_dict)


partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

text_dict[iteration_id], display_dict = text_dict_from_web(
    partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

root_display_dict[iteration_id] = display_dict
display_html(display_dict, type='body')

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 44 articles and 22 URLs for Applied Physiology, Nutrition, and Metabolism
	Corrected number of article titles: 22
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'text']
Keys for article_dict: [0.0, 0.01]
Journals:
	Applied Physiology, Nutrition, and Metabolism
header: [234]
Regex patterns: 
	.*<h[234].*?>Abstract</h[234]>.*(?:(?:Introduction|Background).*)?(<h[234].*?>1?.?\s?(?:Introduction|Background)</h[234]>.*References)<.*
	.*(<h[234].*?>Abstract</h[234]>.*(?:(?:Introduction|Background).*)?)<h[234].*?>1?.?\s?(?:Introduction|Background)</h[234]>.*References<.*
Journal: Applied Physiology, Nutrition, and Metabolism 0.0
	An error occurred on line 119 in C:\Users\silvh\AppData\Local\Temp\ipykernel_23620\3686114734.py: 'NoneType' object has no attribute 'group'
		Unable to parse article text
Journal: Applied Physiology, Nutrition, and Metabolism 0.01
	An e

************************************* Start *************************************


## 2.3

In [1]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        text = response.xpath('//div[@role="paragraph"]').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
            # print(f'\tArticle type: {article_type}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration_id = 2.3
text_id = iteration_id
n_articles = 2
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict

# article_titles(article_dict)


partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

text_dict[iteration_id], display_dict = text_dict_from_web(
    partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

root_display_dict[iteration_id] = display_dict
display_html(display_dict, type='body')

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 44 articles and 22 URLs for Applied Physiology, Nutrition, and Metabolism
	Corrected number of article titles: 22
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'text']


NameError: name 'root_article_dict' is not defined

In [3]:
article_dict 

{0.0: {'journal': 'Applied Physiology, Nutrition, and Metabolism',
  'title': 'Nutrition and immunity: perspectives on key issues and next steps',
  'url': 'https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R',
  'text': '\n<div role="paragraph">In January 2022, a group of experts came together to discuss current perspectives and future directions in nutritional immunology as part of a symposium organized by the Canadian Nutrition Society. Objectives included (1) creating an understanding of the complex interplay between diet and the immune system from infants through to older adults, (2) illustrating the role of micronutrients that are vital to the immune system, (3) learning about current research comparing the impact of various dietary patterns and novel approaches to reduce inflammation, autoimmune conditions, allergies, and infections, and (4) discussing select dietary recommendations aimed at improving disease-specific immune function. The aims of this review are to sum

## 2.4  try to parse iframe

In [13]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        iframe = response.xpath('//iframe/@src').extract()
        article_dict[key]['iframe'] = iframe
        # article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        for url in iframe:
            yield scrapy.Request(url=url, callback=self.parse_iframe, cb_kwargs={'key': key, 'article_dict': article_dict})


    def parse_iframe(self, response, key, article_dict):
        print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|]').extract()
        # article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        article_dict['hello'] = 'world'
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration_id = 2.4
text_id = iteration_id
n_articles = 1
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict
article_dict
# article_titles(article_dict)


# partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

# text_dict[iteration_id], display_dict = text_dict_from_web(
#     partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

# root_display_dict[iteration_id] = display_dict
# display_html(display_dict, type='body')

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 1 articles and 1 URLs for Applied Physiology, Nutrition, and Metabolism
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'iframe']
Journal #0.0


{0.0: {'journal': 'Applied Physiology, Nutrition, and Metabolism',
  'title': 'Nutrition and immunity: perspectives on key issues and next steps',
  'url': 'https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R',
  'iframe': ['https://www.googletagmanager.com/ns.html?id=GTM-K8SF9N8']},
 'hello': 'world'}

## 2.5

In [16]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        iframe = response.xpath('//iframe/@src').extract()
        article_dict[key]['iframe'] = iframe
        # article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        for url in iframe:
            yield scrapy.Request(url=url, callback=self.parse_iframe, cb_kwargs={'key': key, 'article_dict': article_dict})


    def parse_iframe(self, response, key, article_dict):
        print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|]').extract()
        # article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        # article_dict['hello'] = 'world'
        article_dict['response'] = response.xpath('//text()').getall()
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration_id = 2.5
text_id = iteration_id
n_articles = 1
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict
article_dict
# article_titles(article_dict)


# partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

# text_dict[iteration_id], display_dict = text_dict_from_web(
#     partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

# root_display_dict[iteration_id] = display_dict
# display_html(display_dict, type='body')

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 1 articles and 1 URLs for Applied Physiology, Nutrition, and Metabolism
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'iframe']
Journal #0.0


{0.0: {'journal': 'Applied Physiology, Nutrition, and Metabolism',
  'title': 'Nutrition and immunity: perspectives on key issues and next steps',
  'url': 'https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R',
  'iframe': ['https://www.googletagmanager.com/ns.html?id=GTM-K8SF9N8']},
 'response': ['\n',
  '\n  ',
  '\n  ',
  'ns',
  '\n',
  '\n',
  '\n  \n\n  \n\n  \n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']}

## 2.51

In [17]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        iframe = response.xpath('//iframe/@src').extract()
        article_dict[key]['iframe'] = iframe
        # article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        for url in iframe:
            yield scrapy.Request(url=url, callback=self.parse_iframe, cb_kwargs={'key': key, 'article_dict': article_dict})


    def parse_iframe(self, response, key, article_dict):
        print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|]').extract()
        # article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        # article_dict['hello'] = 'world'
        article_dict['response'] = response.xpath('//text()|//h1|//h2|//h3|//h4').getall()
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration_id = 2.5
text_id = iteration_id
n_articles = 1
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict
article_dict
# article_titles(article_dict)


# partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

# text_dict[iteration_id], display_dict = text_dict_from_web(
#     partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

# root_display_dict[iteration_id] = display_dict
# display_html(display_dict, type='body')

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 1 articles and 1 URLs for Applied Physiology, Nutrition, and Metabolism
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'iframe']
Journal #0.0


{0.0: {'journal': 'Applied Physiology, Nutrition, and Metabolism',
  'title': 'Nutrition and immunity: perspectives on key issues and next steps',
  'url': 'https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R',
  'iframe': ['https://www.googletagmanager.com/ns.html?id=GTM-K8SF9N8']},
 'response': ['\n',
  '\n  ',
  '\n  ',
  'ns',
  '\n',
  '\n',
  '\n  \n\n  \n\n  \n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']}

## 2.6

In [23]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, key, article_dict):
        # print(f'Journal #{key}')
        # text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        iframe = response.xpath('//iframe/@src').extract()
        article_dict[key]['iframe'] = iframe
        text = response.xpath('//div[@class="core-container"]').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
    #     for url in iframe:
    #         yield scrapy.Request(url=url, callback=self.parse_iframe, cb_kwargs={'key': key, 'article_dict': article_dict})


    # def parse_iframe(self, response, key, article_dict):
    #     print(f'Journal #{key}')
    #     # text = response.xpath('//h2|//p|//h3|//h4|]').extract()
    #     # article_dict[key]['text'] = ''.join(['\n'+line for line in text])
    #     # article_dict['hello'] = 'world'
    #     article_dict['response'] = response.xpath('//text()|//h1|//h2|//h3|//h4').getall()
        
@wait_for(40)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration_id = 2.5
text_id = iteration_id
n_articles = 1
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration_id] = article_dict
article_dict
# article_titles(article_dict)


# partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals='all')

# text_dict[iteration_id], display_dict = text_dict_from_web(
#     partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

# root_display_dict[iteration_id] = display_dict
# display_html(display_dict, type='body')

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 1 articles and 1 URLs for Applied Physiology, Nutrition, and Metabolism
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'iframe', 'text']


{0.0: {'journal': 'Applied Physiology, Nutrition, and Metabolism',
  'title': 'Nutrition and immunity: perspectives on key issues and next steps',
  'url': 'https://cdnsciencepub.com/doi/abs/10.1139/apnm-2022-0276?af=R',
  'iframe': ['https://www.googletagmanager.com/ns.html?id=GTM-K8SF9N8'],
  'text': '\n<div class="core-container"><div data-article-access="free" data-article-access-type="open" class="meta-panel"><div class="meta-panel__left-content"><div class="meta-panel__access meta-panel__access--open"><span>Open access</span></div><div class="meta-panel__editor-award"><i aria-hidden="true" class="icon-editor-choice"></i>Editor\'s Choice</div><div class="meta-panel__type"><a href="/topic/paper-type/review-article">Review</a></div></div><div class="meta-panel__right-content"><div class="meta-panel__share">\n\n\n\n        \n        <!-- Go to https://www.addtoany.com/buttons/customize/ to customize your tools --><script type="text/javascript" defer src="https://static.addtoany.com/m

In [22]:
root_article_dict[iteration_id].keys()

dict_keys([0.0, 'response'])

### display 

In [None]:
partial_article_dict[text_id] = create_partial_article_dict(root_article_dict[iteration_id], n_articles=2, journals=[0])

text_dict[iteration_id], display_dict = text_dict_from_web(
    partial_article_dict[text_id], to_display='all', header=(2,4), verbose=True)

root_display_dict[iteration_id] = display_dict
display_html(display_dict, type='body')

Keys for article_dict: [0.0]
Journals:
	Applied Physiology, Nutrition, and Metabolism


# *End of Page*