In [2]:
"""
Class for pulling and downloading arxiv papers.

Authors:
Garrett Williams
"""
import arxiv
import os
from datetime import datetime, timedelta
import logging
import re
import inspect
import time


# Initialize logging.
logging_location = os.path.join(os.getcwd(), 'logging')
if not (os.path.isdir(logging_location) and os.path.exists(logging_location)):
    os.mkdir(logging_location)
log_file = os.path.join(os.getcwd(), 'logging', 'logging.log')

logging.basicConfig(filename=log_file, level=logging.INFO,
                    format=' % (asctime)s - %(levelname)s - %(message)s')


class Arxiv():
    """
    Class for pulling and downloading Arxiv data according to some logic.

    Uses the arxiv python wrapper "arxiv" found at:
        https://github.com/lukasschwab/arxiv.py

    Parameters:
    -----------
    number_of_repeats: int, optional
        Number of times to make call to API. Due to time outs and other issues a single call 
        may fail to return all desired results.

    sort_by: string, optional
        Logic for sorting results

    max_results: int, optional
        Maximum number of results to return from an arxiv query request.

    max_chunk_results: int, optional
        Maximum number of results to return from a single request to the arxiv API.

    iterative: bool, optional
        Whether to return results or iterator over results.
    """

    def __init__(self, number_of_repeats=5, sort_by="submittedDate", max_results=1000, max_chunk_results=10, iterative=True):
        args, _, _, values = inspect.getargvalues(inspect.currentframe())
        values.pop("self")

        for arg, val in values.items():
            setattr(self, arg, val)

        self.have_queried_papers = False
        self.have_filtered_papers = True

    def _search_query(self, categories_filter=True, authors_filter=True, key_phrases_filter=False):
        """
        Function for generating search query for arxiv. The actual categories, authors and key phrases 
        to filter by are stored in external text files. 

        Currently the search is generated by taking all authors, key phrases and categories and combining
        into a single string by repeated logical ORs.

        Parameters:
        -----------
        categories_filter: Bool, optional
            Whether to filter by categories.

        author_filter: Bool, optional
            Whether to filter by authors.

        key_phrases_filter: Bool, optional
            Whether to filter using key words.

        Returns:
        --------
        search_query: string
            Arxiv API compatible search query string.
        """

        logging.info('Generated search query.')

        categories = ""
        key_phrases = ""
        authors = ""

        if categories_filter:
            categories_path = os.getcwd()
            categories_path = os.path.join(categories_path, 'categories.txt')

            with open(categories_path, 'r') as f:
                categories = f.read().splitlines()
            categories = '(cat:' + ' OR cat:'.join(categories) + ')'

        if authors_filter:
            authors_path = os.getcwd()
            authors_path = os.path.join(authors_path, 'authors.txt')

            with open(authors_path, 'r') as f:
                authors = f.read().splitlines()
            authors = '(au:' + ' OR au:'.join(authors) + ')'

        if key_phrases_filter:
            key_phrases_path = os.getcwd()
            key_phrases_path = os.path.join(
                key_phrases_path, 'key_phrases.txt')

            with open(key_phrases_path, 'r') as f:
                key_phrases = f.read().splitlines()
            key_phrases = '(au:' + ' OR au:'.join(key_phrases) + ')'

        self.search_query_string = authors + " OR " + categories + " OR " + key_phrases

    def _generate_papers(self, search_query=None):
        """
        Pulls information on arxiv papers including authors, summary, link, and more. Based off 
        a wrapper of the Arxiv api found here:
        https://github.com/lukasschwab/arxiv.py


        Parameters:
        -----------
        search_query: string
            Arxiv API search query.

        Returns:
        --------
        all_papers: list
            All papers found by query. 
        """

        logging.info('Generating unfiltered list of papers.')

        query = self._search_query()

        all_papers = []
        for _ in range(self.number_of_repeats):
            result = arxiv.query(search_query=query,
                                 sort_by=self.sort_by,
                                 max_results=self.max_results,
                                 max_chunk_results=self.max_chunk_results,
                                 iterative=self.iterative)
            for paper in result():
                all_papers.append(paper)
            time.sleep(3)

        self.papers = list(set(all_papers))

        self.have_queried_papers = True

    def _filter_papers_time(self, date='Today'):
        """
        Takes a list of arxiv papers and filters the results by date.

        Parameter:
        ----------
        date: str, optional
            Date to filter by. If 'All' does no time filtering. If not today or 'All' must be in the form '%Y-%m-%d %H:%M:%S'.

        Returns:
        --------
        filtered_results:
        """

        logging.info('Filtering papers by date.')

        if not self.have_queried_papers:
            self._generate_papers()
            if date == 'Today':
                today = datetime.today().replace(minute=0, second=0, hour=0,
                                                 microsecond=0) - timedelta(days=2)
                filtered = []
                for paper in self.papers:
                    paper_published = datetime.strptime(
                        ' '.join(paper['published'].split('T'))[:-1], '%Y-%m-%d %H:%M:%S')
                    if paper_published >= today:
                        filtered.append(paper)
                self.filtered_papers = filtered
            elif date == 'All':
                self.filtered_papers = self.papers
            else:
                filter_date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                filtered = []
                for paper in self.papers:
                    paper_published = datetime.strptime(
                        ' '.join(paper['published'].split('T'))[:-1], '%Y-%m-%d %H:%M:%S')
                    if paper_published >= today:
                        filtered.append(paper)
                self.filtered_papers = filtered
        elif self.have_queried_papers:
            if date == 'Today':
                today = datetime.today().replace(minute=0, second=0, hour=0,
                                                 microsecond=0) - timedelta(days=2)
                filtered = []
                for paper in self.papers:
                    paper_published = datetime.strptime(
                        ' '.join(paper['published'].split('T'))[:-1], '%Y-%m-%d %H:%M:%S')
                    if paper_published >= today:
                        filtered.append(paper)
                self.filtered_papers = filtered
            elif date == 'All':
                self.filtered_papers = self.papers
            else:
                filter_date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                filtered = []
                for paper in self.papers:
                    paper_published = datetime.strptime(
                        ' '.join(paper['published'].split('T'))[:-1], '%Y-%m-%d %H:%M:%S')
                    if paper_published >= today:
                        filtered.append(paper)
                self.filtered_papers = filtered

        self.have_filtered_papers = True

    def arxiv_papers(self, papers_desired=5, store_papers=False):
        """
        Takes results of _filter_papers_time and filters down to N papers.

        Parameters:
        -----------
        papers_desired: int, optional
            Number of papers to keep.

        store_papers: Bool, optional
            If true, download papers.
        """

        logging.info('Applying user defined filters.')

        self._filter_papers_time()

        if self.have_filtered_papers:
            self.fav_papers = self.filtered_papers[:papers_desired]

        if store_papers:
            self._download_papers()

    def _custom_slugify(self, obj):
        return re.sub(r'([^\s\w]|_)+', '', ' '.join(obj['title'].split()))

    def _download_papers(self):
        """
        Method for downloading retrieved papers.
        """

        logging.info('Downloading papers locally.')

        paper_dump = os.path.join(os.path.normpath(
            os.getcwd() + os.sep + os.pardir), 'arxiv_papers')

        if not (os.path.isdir(paper_dump) and os.path.exists(paper_dump)):
            os.mkdir(paper_dump)
        for paper in self.fav_papers:
            arxiv.download(paper, dirpath=paper_dump,
                           slugify=self._custom_slugify)

In [3]:
example = Arxiv()
example._search_query()

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\Garrett\Anaconda3\lib\logging\__init__.py", line 1034, in emit
    msg = self.format(record)
  File "C:\Users\Garrett\Anaconda3\lib\logging\__init__.py", line 880, in format
    return fmt.format(record)
  File "C:\Users\Garrett\Anaconda3\lib\logging\__init__.py", line 622, in format
    s = self.formatMessage(record)
  File "C:\Users\Garrett\Anaconda3\lib\logging\__init__.py", line 591, in formatMessage
    return self._style.format(record)
  File "C:\Users\Garrett\Anaconda3\lib\logging\__init__.py", line 433, in format
    return self._fmt % record.__dict__
ValueError: unsupported format character '(' (0x28) at index 3
Call stack:
  File "C:\Users\Garrett\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\Garrett\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\Garrett\Anaconda3\lib\site-packages\ipykernel_launcher.py",

In [None]:
example.favorite_papers(papers_desired=15, store_papers=True)

In [4]:
logging.handlers

<module 'logging.handlers' from 'C:\\Users\\Garrett\\Anaconda3\\lib\\logging\\handlers.py'>