# Title
[]()

In [2]:
import pandas as pd
import sys
import os
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\custom_python")
sys.path.append(r"C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\src")
from file_functions import *


In [7]:
root_article_dict = dict()
text_dict = dict()
root_display_dict = dict()
partial_article_dict = dict()
crawler_dict = dict()

# From 2023-07-12 summaries from journal.ipynb

In [22]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            # 'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'journal': journal, 'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, journal, key, article_dict):
        print(f'Journal #{key}: {journal}')
        if journal != 'Applied Physiology, Nutrition, and Metabolism':
            text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        else:
            iframe = response.xpath('//iframe/@src').extract()
            article_dict[key]['iframe'] = iframe
            text = response.xpath('//div[@class="core-container"]').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
    #     for url in iframe:
    #         yield scrapy.Request(url=url, callback=self.parse_iframe, cb_kwargs={'key': key, 'article_dict': article_dict})


    # def parse_iframe(self, response, key, article_dict):
    #     print(f'Journal #{key}')
    #     # text = response.xpath('//h2|//p|//h3|//h4|]').extract()
    #     # article_dict[key]['text'] = ''.join(['\n'+line for line in text])
    #     # article_dict['hello'] = 'world'
    #     article_dict['response'] = response.xpath('//text()|//h1|//h2|//h3|//h4').getall()
        
@wait_for(10)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration = 1
text_id = iteration
n_articles = 2
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration] = article_dict

Found 30 articles and 30 URLs for PLOS One
Journal #0.0: PLOS One
	PLOS One
		Article attributes: ['journal', 'title', 'url', 'text']
Journal #0.01: PLOS One


In [17]:
article_dict.keys()

dict_keys([0.0, 0.01])

In [18]:
article_dict[0.01]

{'journal': 'PLOS One',
 'title': '<i>In vivo</i> Polycystin-1 interactome using a novel <i>Pkd1</i> knock-in mouse model',
 'url': 'https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0289778',
 'text': '\n<h3 class="callout-headline">Submit Your Manuscript</h3>\n<p class="callout-content">\n  Discover a faster, simpler path to publishing in a high-quality journal. <em>PLOS ONE</em> promises fair, rigorous peer review,\n  broad scope, and wide readership – a perfect fit for your research every time.\n  </p>\n<p class="button-contain special">\n    <a class="button button-default" href="/plosone/static/publish">\n     Learn More\n    </a>\n    <a class="button-link" href="https://www.editorialmanager.com/pone/default.asp">\n      Submit Now\n    </a>\n  </p>\n<p>Click through the PLOS taxonomy to find articles in your field.</p>\n<p>For more information about PLOS Subject Areas, click\n          <a href="https://github.com/PLOS/plos-thesaurus/blob/master/README.md" target

In [19]:
next(iter(article_dict.values()))['text']

'\n<h3 class="callout-headline">Submit Your Manuscript</h3>\n<p class="callout-content">\n  Discover a faster, simpler path to publishing in a high-quality journal. <em>PLOS ONE</em> promises fair, rigorous peer review,\n  broad scope, and wide readership – a perfect fit for your research every time.\n  </p>\n<p class="button-contain special">\n    <a class="button button-default" href="/plosone/static/publish">\n     Learn More\n    </a>\n    <a class="button-link" href="https://www.editorialmanager.com/pone/default.asp">\n      Submit Now\n    </a>\n  </p>\n<p>Click through the PLOS taxonomy to find articles in your field.</p>\n<p>For more information about PLOS Subject Areas, click\n          <a href="https://github.com/PLOS/plos-thesaurus/blob/master/README.md" target="_blank" title="Link opens in new window">here</a>.\n        </p>\n<p>Loading metrics</p>\n<p class="license-short" id="licenseShort">Open Access</p>\n<p class="peer-reviewed" id="peerReviewed">Peer-reviewed</p>\n<p c

In [23]:

text_id = 0

# regex = r'(.*)'
# text_dict[iteration], display_dict = text_dict_from_web(
#     partial_article_dict[text_id], to_display='all', header=(2,4), article_regex_str=regex, abs_regex_str=regex)

text_dict[iteration], display_dict = text_dict_from_web(
    article_dict, to_display='all', header=(2,4), verbose=True)

root_display_dict[iteration] = display_dict
display_html(display_dict, type='abstract')



header: [234]
Regex patterns: 
	.*<h[234].*?>Abstract</h[234]>.*(?:(?:Introduction|Background).*)?(<h[234].*?>1?.?\s?(?:Introduction|Background)</h[234]>.*References)<.*
	.*(<h[234].*?>Abstract</h[234]>.*(?:(?:Introduction|Background).*)?)<h[234].*?>1?.?\s?(?:Introduction|Background)</h[234]>.*References<.*
Journal: PLOS One 0.0
Journal: PLOS One 0.01
text_dict keys: [0.0, 0.01]

************************************* Start *************************************


************************************* Start *************************************


In [24]:
article_dict[0]['journal']

'PLOS One'

## Applied Physiology, Nutrition, and Metabolism

In [5]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_RSS1(scrapy.Spider):
    name = "crawler_RSS1"
    
    def __init__(self, n_articles='all'):
        self.n_articles = n_articles
    
    def start_requests(self):
        journals = {
            # 'PLOS One': 'https://journals.plos.org/plosone/feed/atom',
            # 'BMJ Open': 'https://bmjopen.bmj.com/rss/current.xml',
            # 'Journal of Medical Internet Research': 'https://www.jmir.org/feed/atom',
            # 'PLOS Medicine': 'https://journals.plos.org/plosmedicine/feed/atom',
            'Applied Physiology, Nutrition, and Metabolism': 'https://cdnsciencepub.com/action/showFeed?type=etoc&feed=rss&jc=apnm'

            # 'Annual Review of Medicine': 'https://www.annualreviews.org/action/showFeed?ui=45mu4&mi=3fndc3&ai=sm&jc=med&type=etoc&feed=atom' # response code 403
            }
        for index, journal in enumerate(journals):
            yield scrapy.Request(
                url=journals[journal], callback=self.parse_front, 
                cb_kwargs={'journal': journal, 'journal_index': index, 'article_dict': article_dict}
                )
    
    def parse_front(self, response, journal, journal_index, article_dict):
        response.selector.remove_namespaces() # This is needed for any Atom feeds
        try:
            if self.n_articles != 1:
                article_title = response.xpath('//entry/title/text()').getall()
                article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
                if article_url == []:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = response.xpath('//item/title/text()').getall()
                    article_url = response.css('item > link::text').getall()
            else:
                article_title = [response.xpath('//entry/title/text()').get()]
                article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
                if article_url[0] is None:
                    print(f'\tExtracting using method 2 for {journal}')
                    article_title = [response.xpath('//item/title/text()').get()]
                    article_url = [response.css('item > link::text').get()]
        except:
            print('fail')
        print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

        # This is required for BMJ Open, which for some reason repeats each article title.
        if len(article_title) == len(article_url) * 2:
            unique_article_title = []
            [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
            article_title = unique_article_title
            print(f'\tCorrected number of article titles: {len(article_title)}')
        if type(self.n_articles) == int:
            article_url = article_url[:self.n_articles]

        for index, url in enumerate(article_url):
            # print(url)
            key = round(journal_index + index/100, 2)
            article_dict[key] = {
                'journal': journal,
                'title': article_title[index],
                'url': url
            }
            yield response.follow(
                url=url, callback=self.parse_pages, 
                cb_kwargs={'journal': journal, 'key': key, 'article_dict': article_dict})
                
    
    def parse_pages(self, response, journal, key, article_dict):
        print(f'Journal #{key}: {journal}')
        if journal != 'Applied Physiology, Nutrition, and Metabolism':
            text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
        else:
            iframe = response.xpath('//iframe/@src').extract()
            article_dict[key]['iframe'] = iframe
            text = response.xpath('//div[@class="core-container"]').extract()
        article_dict[key]['text'] = ''.join(['\n'+line for line in text])
        if key - int(key) == 0:
            print(f'\t{article_dict[key]["journal"]}')
            print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
    #     for url in iframe:
    #         yield scrapy.Request(url=url, callback=self.parse_iframe, cb_kwargs={'key': key, 'article_dict': article_dict})


    # def parse_iframe(self, response, key, article_dict):
    #     print(f'Journal #{key}')
    #     # text = response.xpath('//h2|//p|//h3|//h4|]').extract()
    #     # article_dict[key]['text'] = ''.join(['\n'+line for line in text])
    #     # article_dict['hello'] = 'world'
    #     article_dict['response'] = response.xpath('//text()|//h1|//h2|//h3|//h4').getall()
        
@wait_for(10)
def run_RSS_spider(n_articles='all'):
    """
    Scrape articles from RSS feeds. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_RSS1, n_articles)
    return d

def article_titles(article_dict):
    """
    Print the titles of the articles in a dictionary of articles.
    """
    for article in sorted(article_dict):
        print(f"{article}: {article_dict[article]['title']}")
        print(f"\t{article_dict[article]['journal']} {article_dict[article]['url']}\n")


from IPython import display
import re
import sys

def trim_text(text, article_regex=None, abs_regex=None):
    if article_regex==None:
        article_regex = '.*<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?(<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References)<.*' 
        abs_regex = '.*(<h2>Abstract</h2>.*(?:(?:Introduction|Background).*)?)<h2.*?>\d?.?\s?(?:Introduction|Background)</h2>.*References<.*' 
    try:
        body = re.search(article_regex, text, re.DOTALL).group(1)
        abstract = re.search(abs_regex, text, re.DOTALL).group(1)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to parse article text')
        body = text 
        abstract = text 
    try:
        article_display = display.HTML(body)
        abs_display = display.HTML(abstract)
    except Exception as error: 
        exc_type, exc_obj, tb = sys.exc_info()
        file = tb.tb_frame
        lineno = tb.tb_lineno
        filename = file.f_code.co_filename
        print(f'\tAn error occurred on line {lineno} in {filename}: {error}')    
        print('\t\tUnable to create HTML display')
        article_display = f'<p>{body}</p>'
        abs_display = f'<p>{abstract}</p>'
    processed_article = {
        'abstract': abstract,
        'body': body,
    }
    display_dict = {
        'article_display': article_display,
        'abs_display': abs_display
    }
    return processed_article, display_dict

def text_dict_from_web(article_dict, header=(2,4), to_display=0.01, verbose=False,
        article_regex_str=r'.*<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?(<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References)<.*',
        abs_regex_str=r'.*(<h\d.*?>Abstract</h\d>.*(?:(?:Introduction|Background).*)?)<h\d.*?>1?.?\s?(?:Introduction|Background)</h\d>.*References<.*'
        ):
    """
    Create a text dictionary from a dictionary containing web-scraped articles.

    Parameters:
        article_dict (dict): Values of each dictionary item are a dictionary representing the data from a 
            single article: 'url', 'text', and 'title'.

    Returns:
        text_dict: Dictionary where each item is a string of the text of an article, starting with the title.
    """
    if type(header) == int:
        header = str(header) 
    else :
        header = rf"[{''.join([str(h) for h in range(header[0], header[-1]+1)])}]"
    print(rf'header: {header}')
    article_regex_str = article_regex_str.replace('\d', header)
    abs_regex_str = abs_regex_str.replace('\d', header)
    article_regex = rf'{article_regex_str}'
    abs_regex = rf'{abs_regex_str}'
    print(f'Regex patterns: \n\t{article_regex}\n\t{abs_regex}')
    text_dict = dict()
    display_dict = dict()
    if (type(to_display) == int) or (type(to_display) == float):
        to_display = [to_display] 
    for article_key in article_dict:
        journal = article_dict[article_key]["journal"]
        text = article_dict[article_key]['text']
        if 'PLOS' in journal:
            article_type_regex = r'id="artType">(.+?)<.*'
            article_type = re.search(article_type_regex, text, re.DOTALL).group(1)
            types_to_exclude = ['Editorial', 'Correction', 'Perspective', 'Retraction']
            types_to_exclude += [type.lower() for type in types_to_exclude]
            types_to_exclude += [type.upper() for type in types_to_exclude]
        else:
            article_type = 'Research Article'
        if (verbose == True) or  ((article_key +1) - (article_key +1) //1 == 0): # if integer
            print(f'Journal: {journal} {article_key}')
        if (article_type == 'Research Article') or (article_type not in types_to_exclude):
            trimmed_text, display = trim_text(text, article_regex, abs_regex)
            text_dict[article_key] = {
                'title': article_dict[article_key]['title'],
                'body': f"{article_dict[article_key]['title']}\n\n{trimmed_text['body']}",
                'abstract': trimmed_text['abstract'],
            }
            if (to_display == 'all') or (to_display == None) or (article_key in to_display):
                display_dict[article_key] = {
                    'abstract': display['abs_display'],
                    'body': display['article_display']
                }
        else:
            print(f'\tArticle type "{article_type}" excluded')
    print(f'text_dict keys: {[key for key in text_dict.keys()]}')
    return text_dict, display_dict

def display_html(display_dict, type='abstract'):
    """
    Display the HTML from the dictionary of HTML displays.
    """
    print()
    for text in display_dict:
        print('************************************* Start *************************************')
        display.display(display_dict[text][type])


def create_partial_article_dict(article_dict, n_articles=2, journals='all'):
    """
    Creates a partial article dictionary from the full article dictionary.
    
    Args:
        article_dict (dict): The full article dictionary.
        n_articles (int, optional): The number of articles per journal to include in the partial dictionary.
            Defaults to 2.
        journals ('all', int, or list, optional): The integers of the journals to include in the partial dictionary.
            Defaults to 'all'.
    
    Returns:
        dict: A partial article dictionary.
    """
    if journals == 'all':
        journals = list(set([key//1 for key in article_dict.keys()]))
    elif (type(journals) == float) or (type(journals) == int):
        journals = [journals]
    article_dict = {
        key: article_dict[key] for key in article_dict.keys() if \
        (key//1 in journals) and (key - int(key) < n_articles/100)
        }
    print(f'Keys for article_dict: {[key for key in sorted(article_dict.keys())]}')
    journals = [journal for journal in set([key["journal"] for key in article_dict.values()])]
    print('Journals:')
    for journal in journals:
        print(f'\t{journal}')
    return article_dict





iteration = 1.1
text_id = iteration
n_articles = 2
####
article_dict = dict()
run_RSS_spider(n_articles)
root_article_dict[iteration] = article_dict

	Extracting using method 2 for Applied Physiology, Nutrition, and Metabolism
Found 32 articles and 16 URLs for Applied Physiology, Nutrition, and Metabolism
	Corrected number of article titles: 16
Journal #0.01: Applied Physiology, Nutrition, and Metabolism
Journal #0.0: Applied Physiology, Nutrition, and Metabolism
	Applied Physiology, Nutrition, and Metabolism
		Article attributes: ['journal', 'title', 'url', 'iframe', 'text']


In [6]:
next(iter(article_dict.values()))['text']

'\n<div class="core-container"><div data-article-access="free" data-article-access-type="free" class="meta-panel"><div class="meta-panel__left-content"><div class="meta-panel__access meta-panel__access--free"><span>Free access</span></div><div class="meta-panel__editor-award"><i aria-hidden="true" class="icon-editor-choice"></i>Editor\'s Choice</div><div class="meta-panel__type"><a href="/topic/paper-type/review-article">Review</a></div></div><div class="meta-panel__right-content"><div class="meta-panel__share">\n\n\n\n        \n        <!-- Go to https://www.addtoany.com/buttons/customize/ to customize your tools --><script type="text/javascript" defer src="https://static.addtoany.com/menu/page.js"></script><div class="share__block share__inline-links"><span class="sr-only">Share on</span><ul class="rlist--inline a2a a2a_kit a2a_kit_size_32 a2a_default_style"><li class="a2a_listitem_custom"><a role="link" title="Facebook" class="a2a_button_facebook"><i aria-hidden="true" class="at-ico

# Scrape website

In [15]:

from scrapy.http import FormRequest, request
class crawler_website(scrapy.Spider):
    name = "crawler_website"
    
    def __init__(self, n_articles):
        self.n_articles = n_articles
    

    def parse(self, response):
        # Handle the login response here, check if login was successful
        if "Welcome" in response.text:
            self.logger.info("Login successful!")
            yield scrapy.Request(url='https://examine.com/research-feed/filter/?filter=categories&value=healthy-aging-longevity', callback=self.parse_protected_page)

    def parse_protected_page(self, response):

        page_headings = response.xpath('//h1|//h2').extract()
        article_dict['headings'] = page_headings

    def start_requests(self):
        # This method is called to initiate the login process
        return [FormRequest(
            url='https://examine.com/login/',
            formdata={'email': os.environ['email'], 'password': os.environ['examine_password']},
            callback=self.parse
        )]

    # def start_requests(self):
    #     sites = {
    #         'Examine': 'https://examine.com/login/'}
        # for index, site in enumerate(sites):
        #     yield scrapy.Request(
        #         url=journals[journal], callback=self.parse_front, 
        #         cb_kwargs={'site': site, 'index': index, 'article_dict': article_dict}
        #         )
    
    # def parse_front(self, response, site, index, article_dict):
    #     # response.selector.remove_namespaces() # This is needed for any Atom feeds
    #     try:
    #         if self.n_articles != 1:
    #             article_title = response.xpath('//entry/title/text()').getall()
    #             article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
    #             if article_url == []:
    #                 print(f'\tExtracting using method 2 for {journal}')
    #                 article_title = response.xpath('//item/title/text()').getall()
    #                 article_url = response.css('item > link::text').getall()
    #         else:
    #             article_title = [response.xpath('//entry/title/text()').get()]
    #             article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
    #             if article_url[0] is None:
    #                 print(f'\tExtracting using method 2 for {journal}')
    #                 article_title = [response.xpath('//item/title/text()').get()]
    #                 article_url = [response.css('item > link::text').get()]
    #     except:
    #         print('fail')
    #     print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

    #     # This is required for BMJ Open, which for some reason repeats each article title.
    #     if len(article_title) == len(article_url) * 2:
    #         unique_article_title = []
    #         [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
    #         article_title = unique_article_title
    #         print(f'\tCorrected number of article titles: {len(article_title)}')
    #     if type(self.n_articles) == int:
    #         article_url = article_url[:self.n_articles]

    #     for index, url in enumerate(article_url):
    #         # print(url)
    #         key = round(journal_index + index/100, 2)
    #         article_dict[key] = {
    #             'journal': journal,
    #             'title': article_title[index],
    #             'url': url
    #         }
    #         yield response.follow(
    #             url=url, callback=self.parse_pages, 
    #             cb_kwargs={'journal': journal, 'key': key, 'article_dict': article_dict})
                
    
    # def parse_pages(self, response, journal, key, article_dict):
    #     print(f'Journal #{key}: {journal}')
    #     if journal != 'Applied Physiology, Nutrition, and Metabolism':
    #         text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
    #     else:
    #         iframe = response.xpath('//iframe/@src').extract()
    #         article_dict[key]['iframe'] = iframe
    #         text = response.xpath('//div[@class="core-container"]').extract()
    #     article_dict[key]['text'] = ''.join(['\n'+line for line in text])
    #     if key - int(key) == 0:
    #         print(f'\t{article_dict[key]["journal"]}')
    #         print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(10)
def run_website_spider(n_articles='all'):
    """
    Scrape articles from websites. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_website, n_articles)
    return crawler

iteration = 2
text_id = iteration
n_articles = 2
####
article_dict = dict()
crawler_dict = {iteration: run_website_spider(n_articles)}
root_article_dict[iteration] = article_dict

In [18]:
vars(crawler_dict[iteration])

{'settings': {'AJAXCRAWL_ENABLED': False,
 'ASYNCIO_EVENT_LOOP': None,
 'AUTOTHROTTLE_DEBUG': False,
 'AUTOTHROTTLE_ENABLED': False,
 'AUTOTHROTTLE_MAX_DELAY': 60.0,
 'AUTOTHROTTLE_START_DELAY': 5.0,
 'AUTOTHROTTLE_TARGET_CONCURRENCY': 1.0,
 'BOT_NAME': 'scrapybot',
 'CLOSESPIDER_ERRORCOUNT': 0,
 'CLOSESPIDER_ITEMCOUNT': 0,
 'CLOSESPIDER_PAGECOUNT': 0,
 'CLOSESPIDER_TIMEOUT': 0,
 'COMMANDS_MODULE': '',
 'COMPRESSION_ENABLED': True,
 'CONCURRENT_ITEMS': 100,
 'CONCURRENT_REQUESTS': 16,
 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
 'CONCURRENT_REQUESTS_PER_IP': 0,
 'COOKIES_DEBUG': False,
 'COOKIES_ENABLED': True,
 'DEFAULT_ITEM_CLASS': 'scrapy.item.Item',
 'DEFAULT_REQUEST_HEADERS': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                             'Accept-Language': 'en'},
 'DEPTH_LIMIT': 0,
 'DEPTH_PRIORITY': 0,
 'DEPTH_STATS_VERBOSE': False,
 'DNSCACHE_ENABLED': True,
 'DNSCACHE_SIZE': 10000,
 'DNS_RESOLVER': 'scrapy.resolver.CachingThreadedResolver',

In [19]:
crawler_dict[iteration].response

AttributeError: 'CrawlerRunner' object has no attribute 'response'

In [12]:
article_dict

{}

## Iteration 2.1

In [20]:
class crawler_website(scrapy.Spider):
    name = "crawler_website"
    
    def __init__(self, n_articles):
        self.n_articles = n_articles
    
    def login(self):
        # This method is called to initiate the login process
        return [FormRequest(
            url='https://examine.com/login/',
            formdata={'email': os.environ['email'], 'password': os.environ['examine_password']},
            callback=self.parse
        )]

    def parse(self, response):
        # Handle the login response here, check if login was successful
        if "Welcome" in response.text:
            self.logger.info("Login successful!")
            yield scrapy.Request(url='https://examine.com/research-feed/filter/?filter=categories&value=healthy-aging-longevity', callback=self.parse_protected_page)

    def parse_protected_page(self, response):

        page_headings = response.xpath('//h1|//h2').extract()
        article_dict['headings'] = page_headings


    # def start_requests(self):
    #     sites = {
    #         'Examine': 'https://examine.com/login/'}
        # for index, site in enumerate(sites):
        #     yield scrapy.Request(
        #         url=journals[journal], callback=self.parse_front, 
        #         cb_kwargs={'site': site, 'index': index, 'article_dict': article_dict}
        #         )
    
    # def parse_front(self, response, site, index, article_dict):
    #     # response.selector.remove_namespaces() # This is needed for any Atom feeds
    #     try:
    #         if self.n_articles != 1:
    #             article_title = response.xpath('//entry/title/text()').getall()
    #             article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
    #             if article_url == []:
    #                 print(f'\tExtracting using method 2 for {journal}')
    #                 article_title = response.xpath('//item/title/text()').getall()
    #                 article_url = response.css('item > link::text').getall()
    #         else:
    #             article_title = [response.xpath('//entry/title/text()').get()]
    #             article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
    #             if article_url[0] is None:
    #                 print(f'\tExtracting using method 2 for {journal}')
    #                 article_title = [response.xpath('//item/title/text()').get()]
    #                 article_url = [response.css('item > link::text').get()]
    #     except:
    #         print('fail')
    #     print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

    #     # This is required for BMJ Open, which for some reason repeats each article title.
    #     if len(article_title) == len(article_url) * 2:
    #         unique_article_title = []
    #         [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
    #         article_title = unique_article_title
    #         print(f'\tCorrected number of article titles: {len(article_title)}')
    #     if type(self.n_articles) == int:
    #         article_url = article_url[:self.n_articles]

    #     for index, url in enumerate(article_url):
    #         # print(url)
    #         key = round(journal_index + index/100, 2)
    #         article_dict[key] = {
    #             'journal': journal,
    #             'title': article_title[index],
    #             'url': url
    #         }
    #         yield response.follow(
    #             url=url, callback=self.parse_pages, 
    #             cb_kwargs={'journal': journal, 'key': key, 'article_dict': article_dict})
                
    
    # def parse_pages(self, response, journal, key, article_dict):
    #     print(f'Journal #{key}: {journal}')
    #     if journal != 'Applied Physiology, Nutrition, and Metabolism':
    #         text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
    #     else:
    #         iframe = response.xpath('//iframe/@src').extract()
    #         article_dict[key]['iframe'] = iframe
    #         text = response.xpath('//div[@class="core-container"]').extract()
    #     article_dict[key]['text'] = ''.join(['\n'+line for line in text])
    #     if key - int(key) == 0:
    #         print(f'\t{article_dict[key]["journal"]}')
    #         print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(10)
def run_website_spider(n_articles='all'):
    """
    Scrape articles from websites. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_website, n_articles)
    return crawler

iteration = 2.1
text_id = iteration
n_articles = 2
####
article_dict = dict()
crawler_dict = {iteration: run_website_spider(n_articles)}
root_article_dict[iteration] = article_dict

In [21]:
article_dict

{}

## 2.2

In [8]:
import scrapy
from scrapy.crawler import CrawlerRunner
from crochet import setup, wait_for
import time

setup()

class crawler_website(scrapy.Spider):
    name = "crawler_website"
    start_urls = ['https://examine.com/login/']
    
    def __init__(self, n_articles):
        self.n_articles = n_articles

    def start_requests(self):
        sites = {
            'Examine': 'https://examine.com/login/'}
        for index, site in enumerate(sites):
            yield scrapy.Request(
                url=sites[site], callback=self.login, 
                cb_kwargs={'site': site, 'index': index, 'article_dict': article_dict}
                )
    
    def login(self, response, site, index, article_dict):
        # This method is called to initiate the login process
        print('Logging in...')
        return [scrapy.FormRequest.from_response(
            response,
            formdata={'email': os.environ['email'], 'password': os.environ['examine_password']},
            callback=self.after_login
        )]

    def after_login(self, response):
        # Handle the login response here, check if login was successful
        print(f'{response.status}')
        if "Welcome" in response.text:
            self.logger.info("Login successful!")
            yield scrapy.Request(url='https://examine.com/research-feed/filter/?filter=categories&value=healthy-aging-longevity', callback=self.parse_protected_page)

    def parse_protected_page(self, response):

        page_headings = response.xpath('//h1|//h2').extract()
        article_dict['headings'] = page_headings
        article_dict['response'] = response


    # def start_requests(self):
    #     sites = {
    #         'Examine': 'https://examine.com/login/'}
        # for index, site in enumerate(sites):
        #     yield scrapy.Request(
        #         url=journals[journal], callback=self.parse_front, 
        #         cb_kwargs={'site': site, 'index': index, 'article_dict': article_dict}
        #         )
    
    # def parse_front(self, response, site, index, article_dict):
    #     # response.selector.remove_namespaces() # This is needed for any Atom feeds
    #     try:
    #         if self.n_articles != 1:
    #             article_title = response.xpath('//entry/title/text()').getall()
    #             article_url = response.css('entry > link[rel="alternate"]::attr(href)').getall()
    #             if article_url == []:
    #                 print(f'\tExtracting using method 2 for {journal}')
    #                 article_title = response.xpath('//item/title/text()').getall()
    #                 article_url = response.css('item > link::text').getall()
    #         else:
    #             article_title = [response.xpath('//entry/title/text()').get()]
    #             article_url = [response.css('entry > link[rel="alternate"]::attr(href)').get()]
    #             if article_url[0] is None:
    #                 print(f'\tExtracting using method 2 for {journal}')
    #                 article_title = [response.xpath('//item/title/text()').get()]
    #                 article_url = [response.css('item > link::text').get()]
    #     except:
    #         print('fail')
    #     print(f'Found {len(article_title)} articles and {len(article_url)} URLs for {journal}')

    #     # This is required for BMJ Open, which for some reason repeats each article title.
    #     if len(article_title) == len(article_url) * 2:
    #         unique_article_title = []
    #         [unique_article_title.append(article) for article in article_title if article not in unique_article_title]
    #         article_title = unique_article_title
    #         print(f'\tCorrected number of article titles: {len(article_title)}')
    #     if type(self.n_articles) == int:
    #         article_url = article_url[:self.n_articles]

    #     for index, url in enumerate(article_url):
    #         # print(url)
    #         key = round(journal_index + index/100, 2)
    #         article_dict[key] = {
    #             'journal': journal,
    #             'title': article_title[index],
    #             'url': url
    #         }
    #         yield response.follow(
    #             url=url, callback=self.parse_pages, 
    #             cb_kwargs={'journal': journal, 'key': key, 'article_dict': article_dict})
                
    
    # def parse_pages(self, response, journal, key, article_dict):
    #     print(f'Journal #{key}: {journal}')
    #     if journal != 'Applied Physiology, Nutrition, and Metabolism':
    #         text = response.xpath('//h2|//p|//h3|//h4|//div[@role="paragraph"]').extract()
    #     else:
    #         iframe = response.xpath('//iframe/@src').extract()
    #         article_dict[key]['iframe'] = iframe
    #         text = response.xpath('//div[@class="core-container"]').extract()
    #     article_dict[key]['text'] = ''.join(['\n'+line for line in text])
    #     if key - int(key) == 0:
    #         print(f'\t{article_dict[key]["journal"]}')
    #         print(f'\t\tArticle attributes: {[key for key in article_dict[key].keys()]}')
        
@wait_for(10)
def run_website_spider(n_articles='all'):
    """
    Scrape articles from websites. Must instantiate a blank dictionary as `article_dict` before running the script.
    Parameters:
        - n_articles (int): Number of articles to scrape from each journal. 
            If 'all' or other non-integer value, scrape all articles. Default is 'all'.

    How to call the function: 
    ```
    article_dict = dict()
    run_RSS_spider(n_articles)

    ```
    """
    crawler = CrawlerRunner()
    d = crawler.crawl(crawler_website, n_articles)
    return crawler

iteration = 2.2
text_id = iteration
n_articles = 2
####
article_dict = dict()
crawler_dict[2.2] = run_website_spider(n_articles)
root_article_dict[iteration] = article_dict

Logging in...
200


### response

In [12]:
article_dict['response'].text

'<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /><title class="capitalize">Showing the latest research summaries for Healthy Aging &amp; Longevity</title><meta name="next-head-count" content="3" /><link rel="icon" href="/favicon.ico" /><link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png" /><link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png" /><link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png" /><link rel="manifest" href="/site.webmanifest" /><script id="hotjar" data-nscript="beforeInteractive">(function(h,o,t,j,a,r){\n                h.hj=h.hj||function(){(h.hj.q=h.hj.q||[]).push(arguments)};\n                h._hjSettings={hjid:3102592,hjsv:6};\n                a=o.getElementsByTagName(\'head\')[0];\n                r=o.createElement(\'script\');r.async=1;\n                r.src=t+h._hjSettings.hjid+j+h._hjSettings.hjsv;\n                

In [13]:
article_dict['response'].xpath('//h1').extract()

['<h1 class="font-lora font-medium capitalize">Latest Studies on<!-- --> <!-- -->Healthy Aging &amp; Longevity<!-- --></h1>']

In [14]:
article_dict['response'].xpath('//span').extract()

['<span class="font-bold">Summer Sale Early Bird:</span>',
 '<span class="overview max-w-3xl pt-4 text-lg -tracking-2 lg:text-xl">Every day, the Examine team reviews, analyzes, and summarizes the latest health and nutrition research to help you be healthier.<!-- --></span>',
 '<span>© 2011–<!-- -->2023<!-- --> Examine.com Inc.<!-- --></span>',
 '<span class="mt-1.5">|</span>',
 '<span class="lowercase first-letter:uppercase group-hover:underline ">About Us<!-- --> <!-- --></span>',
 '<span class="lowercase first-letter:uppercase group-hover:underline ">Our story<!-- --> <!-- --></span>',
 '<span class="lowercase first-letter:uppercase group-hover:underline ">Our team<!-- --> <!-- --></span>',
 '<span class="lowercase first-letter:uppercase group-hover:underline ">Our editorial policy<!-- --> <!-- --></span>',
 '<span class="lowercase first-letter:uppercase group-hover:underline ">Minimizing Bias<!-- --> <!-- --></span>',
 '<span class="lowercase first-letter:uppercase group-hover:under

### Failed at saving the response

In [22]:
help(save_output)

Help on function save_output in module silvhua:

save_output(df, filename=None, description=None, append_version=True, iteration_id=None, index=False, csv_path='C:\\Users\\silvh\\OneDrive\\lighthouse\\Ginkgo coding\\content-summarization\\output\\CSV', pickle_path='C:\\Users\\silvh\\OneDrive\\lighthouse\\Ginkgo coding\\content-summarization\\output\\pickles')
    Save an Python object as both pickle and CSV. Automatically create filename using date and time 
    if not provided.



In [23]:
def save_output(df, filename=None, description=None, append_version=True, iteration_id=None, index=False,
    csv_path=r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output\CSV',
    pickle_path=r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\output\pickles'
    ):
    """
    Save an Python object as both pickle and CSV. Automatically create filename using date and time 
    if not provided.
    
    """
    if description:
        filename = f'{description}_{datetime.now().strftime("%Y-%m-%d_%H%M")}'
        append_version = False
    elif filename == None:
        filename = f'{datetime.now().strftime("%Y-%m-%d_%H%M")}_outputs'
        append_version = False
    if iteration_id:
        filename += f'_{"{:02d}".format(iteration_id)}'
    try:
        savepickle(df, filename=filename, path=pickle_path, append_version=append_version)
        print('\tObject saved as pickle')
    except Exception as error:
        print('Unable to save pickle')
        print(f'\t{error}')
    if (type(df) == pd.core.frame.DataFrame) & (csv_path != None):
        save_csv(df, filename=filename, path=csv_path, append_version=append_version, index=index)
        print('\tDataFrame saved as CSV')
    elif (type(df) == dict) & (csv_path != None):
        try:
            save_csv(pd.DataFrame(df), filename=filename, path=csv_path, append_version=append_version)
            print('\tDictionary converted to CSV')
        except:
            print('\tUnable to save CSV')

In [24]:
from datetime import datetime
description = 'article_dict'
# path = r'C:\Users\silvh\OneDrive\lighthouse\Ginkgo coding\content-summarization\web_articles\2023-08-05 web'
path = r'../web_articles/2023-08-05 web'
save_output(
    article_dict, description = description, pickle_path=path, csv_path=None
)

Unable to save pickle
	can't pickle Selector objects


# *End of Page*