# Download all pdf from HAL in relation with Mnemosyne

### Get all articles links

In [1]:
from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm 
import os 


# Define headers to mimic a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

# Get all pages
print('Getting all articles...')
page_links = []
for i in tqdm(range(1, 16)):
    # Get page
    link = f"https://inria.hal.science/search/index/?q=%2A&rows=30&sort=producedDate_tdate+desc&structId_i=188658&page={i}#"
    page = requests.get(link, headers=headers)

    # Found all articles page links
    soup = BeautifulSoup(page.content, 'html.parser')
    articles = soup.find_all('td', class_='pl-4 pl-sm-0')
    page_links += [a.find('a').get('href') for a in articles]

    # Sleep to avoid getting blocked
    time.sleep(1)

print(f'{len(page_links)} articles found')

Getting all articles...


100%|██████████| 15/15 [01:43<00:00,  6.89s/it]

443 articles found





### Download all pdf

In [None]:
import os

# Create folder to store pdfs
folder = 'pdfs'
if not os.path.exists(folder):
    os.makedirs(folder)

# For each page, get the pdf link & download it
print('Downloading pdfs...')
for page_name in tqdm(page_links):
    # Get page
    link = f"https://inria.hal.science{page_name}"
    page = requests.get(link, headers=headers)

    try:
        # Parse page
        soup = BeautifulSoup(page.content, 'html.parser')
        section = soup.find('div', class_="section-content section-shadow hal-visualize-button widget-files")
        link_to_pdf = section.find('a').get('href')

        # Download pdf
        pdf = requests.get(link_to_pdf, headers=headers)
        with open(f'{folder}/{page_name.split("/")[-1]}.pdf', 'wb') as f:
            f.write(pdf.content)
    
    except:
        print(f'Error downloading {page_name}')
    
    # Sleep to avoid getting blocked
    time.sleep(1)

# how many files are in the directory
path, dirs, files = next(os.walk("pdfs"))
file_count = len(files)
print(f'{file_count} pdfs downloaded')

### Convert all pdf to markdown

In [3]:
import os
from pdfminer.high_level import extract_text
from markdownify import markdownify as md
from tqdm import tqdm

# Create folder to store markdown files
if not os.path.exists('markdown'):
    os.makedirs('markdown')

def pdf_to_markdown(pdf_path, md_path):
    # Extract text from the PDF
    text = extract_text(pdf_path)
    
    # Convert the extracted text to Markdown
    markdown_text = md(text)
    
    # Write the Markdown text to a file
    with open(md_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_text)


for pdf in tqdm(os.listdir('pdfs')):
    try:
        pdf_path = f'pdfs/{pdf}'
        md_path = f'markdown/{pdf.replace(".pdf", ".md")}'
        pdf_to_markdown(pdf_path, md_path)
    except:
        print(f'Error converting {pdf}')

file_count = len(os.listdir('markdown'))
print(f'Converted {file_count} pdfs to markdown')

  1%|          | 3/315 [00:01<01:51,  2.80it/s]

Error converting hal-03145162v1.pdf


  7%|▋         | 22/315 [00:08<01:01,  4.80it/s]

Error converting hal-00828011v1.pdf


 52%|█████▏    | 164/315 [04:45<19:38,  7.81s/it] 

Error converting hal-00826704v1.pdf


 68%|██████▊   | 213/315 [06:39<02:19,  1.37s/it]

Error converting hal-03844358v1.pdf


 73%|███████▎  | 229/315 [06:50<00:56,  1.51it/s]

Error converting hal-01444568v1.pdf


100%|██████████| 315/315 [09:54<00:00,  1.89s/it]

Converted 369 pdfs to markdown



