In [2]:
import requests
import xml.etree.ElementTree as ET

def search_arxiv(keyword, max_results=10):
    # Construct the query URL
    url = 'http://export.arxiv.org/api/query'
    params = {
        'search_query': f'all:{keyword}',
        'start': 0,
        'max_results': max_results
    }

    # Make the HTTP request to the arXiv API
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        # Parse the XML response
        root = ET.fromstring(response.content)
        
        # Namespace to access tags correctly; using the Atom namespace
        ns = {'': 'http://www.w3.org/2005/Atom'}
        
        # Loop through the entries and extract information
        for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):
            title = entry.find('{http://www.w3.org/2005/Atom}title').text
            summary = entry.find('{http://www.w3.org/2005/Atom}summary').text
            published = entry.find('{http://www.w3.org/2005/Atom}published').text
            # Extracting authors
            authors = [author.find('{http://www.w3.org/2005/Atom}name').text for author in entry.findall('{http://www.w3.org/2005/Atom}author')]
            
            print('Title:', title)
            print('Summary:', summary[:150] + '...')  # Print the first 150 characters of the summary
            print('Published:', published)
            print('Authors:', ', '.join(authors))
            print('---')
    else:
        print('Failed to fetch data from arXiv.')

# Example usage
search_arxiv('self supervised', max_results=5)


Title: Self-supervised self-supervision by combining deep learning and
  probabilistic logic
Summary:   Labeling training examples at scale is a perennial challenge in machine
learning. Self-supervision methods compensate for the lack of direct
supervi...
Published: 2020-12-23T04:06:41Z
Authors: Hunter Lang, Hoifung Poon
---
Title: Targeted Self Supervision for Classification on a Small COVID-19 CT Scan
  Dataset
Summary:   Traditionally, convolutional neural networks need large amounts of data
labelled by humans to train. Self supervision has been proposed as a method ...
Published: 2020-11-20T03:07:17Z
Authors: Nicolas Ewen, Naimul Khan
---
Title: Analyzing the Sample Complexity of Self-Supervised Image Reconstruction
  Methods
Summary:   Supervised training of deep neural networks on pairs of clean image and noisy
measurement achieves state-of-the-art performance for many image recon...
Published: 2023-05-30T14:42:04Z
Authors: Tobit Klug, Dogukan Atik, Reinhard Heckel
---
Title: Bet

In [1]:
import requests
import xml.etree.ElementTree as ET
import os
import re

def sanitize_filename(filename):
    """
    Sanitizes filenames to remove characters that are illegal or problematic in file names across various operating systems.
    """
    # Remove illegal characters for filenames
    filename = re.sub(r'[<>:"/\\|?*\n\r]+', '_', filename)
    # Remove leading and trailing whitespaces
    filename = filename.strip()
    # Shorten the filename to avoid OS limitations
    filename = filename[:250]
    return filename

def download_pdf(url, save_path):
    response = requests.get(url)
    with open(save_path, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded PDF to {save_path}")

def search_arxiv(keyword, max_results=10, sort_by='relevance'):
    url = 'http://export.arxiv.org/api/query'
    params = {
        'search_query': f'all:{keyword}',
        'start': 0,
        'max_results': max_results,
        'sortBy': sort_by,
        'sortOrder': 'descending' if sort_by == 'submittedDate' else 'ascending'
    }

    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        root = ET.fromstring(response.content)
        
        for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):
            title = entry.find('{http://www.w3.org/2005/Atom}title').text.strip()
            # Extract the link to the PDF
            links = entry.findall('{http://www.w3.org/2005/Atom}link')
            pdf_link = [link.get('href') for link in links if link.get('type') == 'application/pdf'][0]
            
            # Sanitize and shorten the filename
            pdf_filename = sanitize_filename(title) + '.pdf'
            save_path = os.path.join('data', pdf_filename)  # Ensure 'data' directory exists or adjust path as needed
            download_pdf(pdf_link, save_path)
            
            print('Title:', title)
            print('PDF Link:', pdf_link)
            print('---')
    else:
        print('Failed to fetch data from arXiv.')

# Example usage
search_arxiv('self supervised', max_results=5, sort_by='submittedDate')


Downloaded PDF to data\Self-similar solutions in cylindrical magneto-hydrodynamic blast waves_  with energy injection at the centre.pdf
Title: Self-similar solutions in cylindrical magneto-hydrodynamic blast waves
  with energy injection at the centre
PDF Link: http://arxiv.org/pdf/2403.16675v1
---
Downloaded PDF to data\Who is bragging more online_ A large scale analysis of bragging in_  social media.pdf
Title: Who is bragging more online? A large scale analysis of bragging in
  social media
PDF Link: http://arxiv.org/pdf/2403.16668v1
---
Downloaded PDF to data\Phase separation dynamics in a symmetric binary mixture of ultrasoft_  particles.pdf
Title: Phase separation dynamics in a symmetric binary mixture of ultrasoft
  particles
PDF Link: http://arxiv.org/pdf/2403.16663v1
---
Downloaded PDF to data\Graph Augmentation for Recommendation.pdf
Title: Graph Augmentation for Recommendation
PDF Link: http://arxiv.org/pdf/2403.16656v1
---
Downloaded PDF to data\CLHA_ A Simple yet Effective 

In [9]:
import requests
import os
import re

def search_google_scholar(keyword, max_results=10):
    """
    Search Google Scholar for papers related to the keyword and sort by citation count.
    This is a hypothetical function and does not actually query Google Scholar due to their restrictions.
    """
    # Hypothetical URL and params for Google Scholar search
    url = 'https://api.google.scholar.com/search'
    params = {
        'q': keyword,
        'max_results': max_results,
        'sort_by': 'citations',  # Hypothetical parameter for sorting by citations
    }

    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        papers = response.json()['papers']  # Hypothetical structure

        for paper in papers:
            title = paper['title']
            pdf_link = paper.get('pdf_link', None)  # Hypothetical direct link to PDF
            if pdf_link:
                pdf_filename = sanitize_filename(title) + '.pdf'
                save_path = os.path.join('data', pdf_filename)
                download_pdf(pdf_link, save_path)

            print(f"Title: {title}")
            print(f"Citations: {paper['citations']}")
            if pdf_link:
                print(f"PDF Link: {pdf_link}")
            print('---')

    except requests.RequestException as e:
        print(f"Failed to fetch data from Google Scholar: {e}")


def sanitize_filename(filename):
    filename = re.sub(r'[<>:"/\\|?*\n\r]+', '_', filename)
    filename = filename.strip()
    filename = filename[:250]
    return filename

def download_pdf(url, save_path):
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded PDF to {save_path}")
    except requests.RequestException as e:
        print(f"Failed to download {url}: {e}")

def search_semantic_scholar(keyword, max_results=5):
    url = 'https://api.semanticscholar.org/graph/v1/paper/search'
    params = {
        'query': keyword,
        'limit': max_results,
        'fields': 'title,arxivId,citationCount,url'
    }
    headers = {
        'x-api-key': 'fIfWm6EQgm6SAtyzLec2B2w4L6T4q9iN41raJ9pn'  # Replace with your actual API key
    }

    response = requests.get(url, params=params, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        for paper in data['data']:
            title = paper['title']
            citation_count = paper['citationCount']
            arxiv_id = paper.get('arxivId', None)
            print(f"Title: {title}")
            print(f"Citations: {citation_count}")

            if arxiv_id:
                pdf_url = f"http://arxiv.org/pdf/{arxiv_id}.pdf"
                pdf_filename = sanitize_filename(title) + '.pdf'
                save_path = os.path.join('data', pdf_filename)
                download_pdf(pdf_url, save_path)
            else:
                print("No arXiv link available for this paper.")
            print('---')
    else:
        print('Failed to fetch data from Semantic Scholar.')

# Example usage
search_semantic_scholar('quantum computing')



Failed to fetch data from Semantic Scholar.
