In [1]:
import json
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup



In [108]:
from datetime import datetime
# Date stamp (two-digit year, month, day) taken when this cell runs.
# strftime already returns a str, so no extra formatting layer is needed.
time_format = datetime.now().strftime("%y%m%d")
# Emits a single empty line.
print()

dict_keys(['tmlr', 'jmlr', 'pmlr'])

In [32]:

# Scraping configuration keyed by venue name.
# Each value is a list whose first element is a URL string (three of them
# contain a '%s' placeholder). Where present, the second element is a
# [first, last] pair of volume numbers; the scraping loops treat it as an
# inclusive range. 'pmlr' carries a third element, a list of 'R*' strings.
journal_pattern = dict(
    pmlr=[
        'https://proceedings.mlr.press/v%s/',
        [1, 240],
        ['R0', 'R1', 'R2', 'R3', 'R4', 'R5'],
    ],
    jmlr=[
        'https://www.jmlr.org/papers/v%s/',
        [1, 25],
    ],
    dmlr=[
        'https://data.mlr.press/volumes/%s.html',
        [1, 1],
    ],
    tmlr=['https://jmlr.org/tmlr/papers/'],
    mloss=['https://www.jmlr.org/mloss/'],
)

# Accumulator filled by the scraping sections: venue name -> list of paper dicts.
all_papers = dict()

# TMLR: every paper is read from a single listing page.
tmlr_url = 'https://jmlr.org/tmlr/papers/'
response = requests.get(tmlr_url)
# Decode the body with the encoding detected from its content.
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'html.parser')

# Entries are the <li> elements selected by class 'item nocertificate'.
item_nocertificate = soup.find_all('li', {'class': 'item nocertificate'})
print(f"Number of papers in TMLR: {len(item_nocertificate)}")

tmlr_papers = []
for item in item_nocertificate:
    heading = item.find('h4')
    details = item.find('p')
    paper = {
        'title': heading.text,
        'url': heading.find('a')['href'],
        'author': details.find('i').text,
        # Last ', '-separated piece of the text that precedes the first '['.
        'time_pub': details.get_text(strip=True).split('[')[0].split(', ')[-1],
    }
    hrefs = details.find_all('a')
    for link in hrefs:
        target = link['href']
        # Only links whose text contains 'bib' get the site root prepended;
        # every other link is stored exactly as found.
        paper[link.text] = f"https://jmlr.org{target}" if 'bib' in link.text else target
    tmlr_papers.append(paper)

print(f"Total number in tmlr: {len(tmlr_papers)}")
all_papers['tmlr'] = tmlr_papers


# DMLR: one page per volume, named with a zero-padded two-digit volume number.
dmlr_papers = []
dmlr_pattern = journal_pattern['dmlr']
first_volume, last_volume = dmlr_pattern[1]
for volume in range(first_volume, last_volume + 1):
    dmlr_url = f'https://data.mlr.press/volumes/{volume:02d}.html'
    response = requests.get(dmlr_url)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')

    # Papers are the <dl> elements inside the 'post-content' <div>.
    post_content = soup.find('div', {'class': 'post-content'})
    dl_soup = post_content.find_all('dl')
    print(f"Number of paper in volume {volume} url: {dmlr_url} : {len(dl_soup)}")

    for pp_infor in dl_soup:
        paper = {
            'title': pp_infor.find('dt').get_text(strip=True),
            # Put a space after every comma in the author text.
            'author': pp_infor.find('dd').get_text(strip=True).replace(',', ', '),
        }
        paper['abstract'] = pp_infor.find('details').find('p').text

        hrefs = pp_infor.find_all('a')
        for link in hrefs:
            label = link.text
            if '[' in label:
                # Key is the label without its first and last character, lower-cased.
                key = label[1:-1].lower()
                if 'bib' in label:
                    paper[key] = f"https://data.mlr.press/{link['href']}"
                else:
                    paper[key] = link['href']
        dmlr_papers.append(paper)

print(f"Total number in dmlr: {len(dmlr_papers)}")
all_papers['dmlr'] = dmlr_papers


# JMLR: one index page per volume; the inclusive volume range comes from
# journal_pattern['jmlr'][1]. Results are collected in jmlr_papers and stored
# under all_papers['jmlr'].
jmlr_papers = []
jmlr_pattern = journal_pattern['jmlr']
for volume in range(jmlr_pattern[1][0], jmlr_pattern[1][1] + 1):
    jmlr_url = f'https://www.jmlr.org/papers/v{volume}'
    # A timeout keeps one stalled connection from hanging the whole scrape.
    data = requests.get(jmlr_url, timeout=30)
    data.encoding = data.apparent_encoding
    soup = BeautifulSoup(data.text, 'html.parser')
    print(f"Process to url volume: {jmlr_url}")
    # Volumes 1-4 are searched for <tr> entries, later volumes for <dl> entries.
    paper_list = soup.find_all('tr' if volume < 5 else 'dl')

    print(f"Number of paper in this volume {volume} : {len(paper_list)} ")
    for item in paper_list:
        paper = {}
        paper['title'] = item.find('dt').text.split('\n')[0]
        dd_soup = item.find('dd')
        paper['author'] = dd_soup.find('b').find('i').text
        # Bug fix: read the publication string from THIS entry's <dd>.
        # It used to be read from paper_list[0], so every paper in a volume
        # received the first entry's value.
        paper['time_pub'] = dd_soup.get_text(strip=True).split('[')[0].split('\n')[-1]
        hrefs = dd_soup.find_all('a')
        for href in hrefs:
            # Accept both '[abs]' and 'abs' as link text; slicing off the
            # first and last character turned an un-bracketed 'abs' into 'b'.
            key = href.text.strip().strip('[]').lower()
            # Resolve against the volume page so that page-relative hrefs
            # ('meila00a.html'), root-relative hrefs ('/papers/...') and
            # already-absolute URLs all end up as valid absolute URLs.
            paper[key] = urljoin(f"{jmlr_url}/", href['href'])
        jmlr_papers.append(paper)
print(f"Total number in jmlr: {len(jmlr_papers)}")
all_papers['jmlr'] = jmlr_papers

# PMLR: one page per proceedings volume; volumes whose page reports
# 'File not found' are skipped.
pmlr_papers = []
pmlr_pattern = journal_pattern['pmlr']
for volume in range(pmlr_pattern[1][0], pmlr_pattern[1][1] + 1):
    pmlr_url = f'https://proceedings.mlr.press/v{volume}'

    response = requests.get(pmlr_url)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    if 'File not found' in soup.text:
        print(f"*****Skip this url volume: {pmlr_url}")
        continue

    print(f"Process to url volume: {pmlr_url}")

    # The first <h2> on the page is used as the proceedings name.
    proceedings_name = soup.find('h2').text
    print(f"The name of volume: {proceedings_name}")
    paper_list = soup.find_all('div', {'class': 'paper'})
    print(f"Number of paper in volume {volume}: {len(paper_list)}")

    for item in paper_list:
        paper = {
            'title': item.find('p', {'class': 'title'}).text,
            # Non-breaking spaces in the author list become plain spaces.
            'author': item.find('span', {'class': 'authors'}).text.replace('\xa0', ' '),
            'info': item.find('span', {'class': 'info'}).text,
            'proceedings': proceedings_name,
        }
        hrefs = item.find('p', {'class': 'links'}).find_all('a')

        for link in hrefs:
            label = link.text.lower()
            # Any label containing 'pdf' is stored under the plain 'pdf' key.
            key = 'pdf' if 'pdf' in label else label
            paper[key] = link['href']
        pmlr_papers.append(paper)

print(f"Total number in pmlr: {len(pmlr_papers)}")
all_papers['pmlr'] = pmlr_papers


# MLOSS track: a single listing page where every <dl> is one paper.
# Results are collected in mloss_papers and stored under all_papers['mloss'].
mloss_papers = []
# A timeout keeps a stalled connection from hanging the cell.
data = requests.get('https://www.jmlr.org/mloss/', timeout=30)
data.encoding = data.apparent_encoding
soup = BeautifulSoup(data.text, 'html.parser')

paper_list = soup.find_all('dl')

print(f"Number of paper in mloss track : {len(paper_list)} ")
for item in paper_list:
    paper = {}
    paper['title'] = item.find('dt').text.split('\n')[0]
    dd_soup = item.find('dd')
    paper['author'] = dd_soup.find('b').find('i').text
    # Bug fix: take the publication string from THIS entry's <dd>; it used to
    # be read from paper_list[0], giving every paper the first entry's value.
    paper['time_pub'] = dd_soup.get_text(strip=True).split('[')[0].split('; ')[-1]
    hrefs = dd_soup.find_all('a')
    for href in hrefs:
        if any(tag in href.text for tag in ('abs', 'pdf', 'bib')):
            # These links get the site root prepended.
            paper[href.text] = f"https://www.jmlr.org{href['href']}"
        else:
            paper[href.text] = href['href']
    # Bug fix: the parsed paper was never stored, so all_papers['mloss']
    # always ended up as an empty list.
    mloss_papers.append(paper)

all_papers['mloss'] = mloss_papers


# Persist everything scraped so far as pretty-printed JSON.
output_path = './tmp/prml_jmlr_dmlr_tmlr_mloss_240508.json'
# Create the target directory first: open() raises FileNotFoundError when
# './tmp' is missing, which would discard the results of the whole scrape.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as fw:
    json.dump(all_papers, fw, indent=2)

Number of papers in TMLR: 861


In [134]:
# Re-write the JSON dump after all_papers has been updated by other cells.
output_path = './tmp/prml_jmlr_dmlr_tmlr_mloss_240508.json'
# Create the target directory first: open() raises FileNotFoundError when
# './tmp' is missing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as fw:
    json.dump(all_papers, fw, indent=2)

In [122]:
# MLOSS track: a single listing page where every <dl> is one paper.
# Results are collected in mloss_papers and stored under all_papers['mloss'].
mloss_papers = []
# A timeout keeps a stalled connection from hanging the cell.
data = requests.get('https://www.jmlr.org/mloss/', timeout=30)
data.encoding = data.apparent_encoding
soup = BeautifulSoup(data.text, 'html.parser')

paper_list = soup.find_all('dl')

print(f"Number of paper in mloss track : {len(paper_list)} ")
for item in paper_list:
    paper = {}
    paper['title'] = item.find('dt').text.split('\n')[0]
    dd_soup = item.find('dd')
    paper['author'] = dd_soup.find('b').find('i').text
    # Bug fix: take the publication string from THIS entry's <dd>; it used to
    # be read from paper_list[0], giving every paper the first entry's value.
    paper['time_pub'] = dd_soup.get_text(strip=True).split('[')[0].split('; ')[-1]
    hrefs = dd_soup.find_all('a')
    for href in hrefs:
        if any(tag in href.text for tag in ('abs', 'pdf', 'bib')):
            # These links get the site root prepended.
            paper[href.text] = f"https://www.jmlr.org{href['href']}"
        else:
            paper[href.text] = href['href']
    mloss_papers.append(paper)
all_papers['mloss'] = mloss_papers

Number of paper in mloss track : 195 


In [118]:
# Scratch check of the MLOSS listing: every entry is parsed into `paper`,
# but nothing is stored in a result list.
data = requests.get('https://www.jmlr.org/mloss/', timeout=30)
data.encoding = data.apparent_encoding
soup = BeautifulSoup(data.text, 'html.parser')

paper_list = soup.find_all('dl')

# Fix: the message used to interpolate `volume`, a name this cell never
# defines, so it only ran when another cell had left that variable behind.
print(f"Number of paper in mloss track : {len(paper_list)} ")
for item in paper_list:
    paper = {}
    paper['title'] = item.find('dt').text.split('\n')[0]
    dd_soup = item.find('dd')
    paper['author'] = dd_soup.find('b').find('i').text
    # Bug fix: take the publication string from THIS entry's <dd>; it used to
    # be read from paper_list[0], giving every paper the first entry's value.
    paper['time_pub'] = dd_soup.get_text(strip=True).split('[')[0].split('; ')[-1]
    hrefs = dd_soup.find_all('a')
    for href in hrefs:
        if any(tag in href.text for tag in ('abs', 'pdf', 'bib')):
            # These links get the site root prepended.
            paper[href.text] = f"https://www.jmlr.org{href['href']}"
        else:
            paper[href.text] = href['href']

Number of paper in this volume 1 : 195 
0 {'title': 'QDax: A Library for Quality-Diversity and Population-based Algorithms with Hardware Acceleration', 'author': 'Felix Chalumeau, Bryan Lim, Raphaël Boige, Maxime Allard, Luca Grillotti, Manon Flageat, Valentin Macé, Guillaume Richard, Arthur Flajolet, Thomas Pierrot, Antoine Cully', 'time_pub': '(108):1−16, 2024.', 'abs': 'https://www.jmlr.org/papers/v25/23-1027.html', 'pdf': 'https://www.jmlr.org/papers/volume25/23-1027/23-1027.pdf', 'bib': 'https://www.jmlr.org/papers/v25/23-1027.bib', 'code': 'https://github.com/adaptive-intelligent-robotics/QDax'}


In [85]:
# Text of the first <h2> in the currently parsed page.
# NOTE(review): `soup` is not created in this cell; it must already exist
# from a previously executed cell.
proceedings_name = soup.find('h2').get_text()
print(proceedings_name)

Volume 1: Gaussian Processes in Practice, 12-13 June 2006, Bletchley Park, UK


In [96]:

# Scratch inspection: displays a 2-tuple made of the text of the 'authors'
# <span> (with '\xa0' replaced by a plain space) and the text of the 'info'
# <span>, both looked up inside `item`.
# NOTE(review): `item` is not defined in this cell; presumably it is left over
# from a loop in another cell - confirm before relying on a fresh-kernel run.
item.find('span', {'class': 'authors'}).text.replace('\xa0', ' '), item.find('span', {'class': 'info'}).text


('Cedric Archambeau, Dan Cornford, Manfred Opper, John Shawe-Taylor',
 'Gaussian Processes in Practice, PMLR 1:1-16')

In [102]:
# Scratch check of the link extraction for one entry.
# NOTE(review): `item` and `paper` are not created in this cell; they must
# already exist from a previously executed cell.
hrefs = item.find('p', {'class': 'links'}).find_all('a')

for link in hrefs:
    label = link.text.lower()
    # Any label containing 'pdf' is stored under the plain 'pdf' key.
    paper['pdf' if 'pdf' in label else label] = link['href']
print(paper)

{'abs': 'https://proceedings.mlr.press/v1/archambeau07a.html', 'pdf': 'http://proceedings.mlr.press/v1/archambeau07a/archambeau07a.pdf'}


In [69]:
# Scratch inspection: displays the full text of the first <dt> found inside `item`.
# NOTE(review): `item` is not defined in this cell; presumably it is left over
# from a loop in another cell - confirm.
item.find('dt').text

'Learning with Mixtures of Trees\nMarina Meila, Michael I. Jordan; \n1(Oct):1-48, 2000.\n[abs]\n[pdf]\n[ps.gz]\n[ps]\n[html]\n'

In [119]:
# TMLR: every paper is read from a single listing page.
tmlr_url = 'https://jmlr.org/tmlr/papers/'
response = requests.get(tmlr_url)
# Decode the body with the encoding detected from its content.
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'html.parser')

# Entries are the <li> elements selected by class 'item nocertificate'.
item_nocertificate = soup.find_all('li', {'class': 'item nocertificate'})
print(f"Number of papers in TMLR: {len(item_nocertificate)}")

tmlr_papers = []
for item in item_nocertificate:
    heading = item.find('h4')
    details = item.find('p')
    paper = {
        'title': heading.text,
        'url': heading.find('a')['href'],
        'author': details.find('i').text,
        # Last ', '-separated piece of the text that precedes the first '['.
        'time_pub': details.get_text(strip=True).split('[')[0].split(', ')[-1],
    }
    hrefs = details.find_all('a')
    for link in hrefs:
        target = link['href']
        # Only links whose text contains 'bib' get the site root prepended;
        # every other link is stored exactly as found.
        paper[link.text] = f"https://jmlr.org{target}" if 'bib' in link.text else target
    tmlr_papers.append(paper)

print(f"Total number in tmlr: {len(tmlr_papers)}")
all_papers['tmlr'] = tmlr_papers

Number of papers in TMLR: 861
Total number in tmlr: 861


In [133]:
# JMLR: one index page per volume; the inclusive volume range comes from
# journal_pattern['jmlr'][1]. Results are collected in jmlr_papers and stored
# under all_papers['jmlr'].
jmlr_papers = []
jmlr_pattern = journal_pattern['jmlr']
for volume in range(jmlr_pattern[1][0], jmlr_pattern[1][1] + 1):
    jmlr_url = f'https://www.jmlr.org/papers/v{volume}'
    # A timeout keeps one stalled connection from hanging the whole scrape.
    data = requests.get(jmlr_url, timeout=30)
    data.encoding = data.apparent_encoding
    soup = BeautifulSoup(data.text, 'html.parser')
    print(f"Process to url volume: {jmlr_url}")
    # Volumes 1-4 are searched for <tr> entries, later volumes for <dl> entries.
    paper_list = soup.find_all('tr' if volume < 5 else 'dl')

    print(f"Number of paper in this volume {volume} : {len(paper_list)} ")
    for item in paper_list:
        paper = {}
        paper['title'] = item.find('dt').text.split('\n')[0]
        dd_soup = item.find('dd')
        paper['author'] = dd_soup.find('b').find('i').text
        # Bug fix: read the publication string from THIS entry's <dd>.
        # It used to be read from paper_list[0], so every paper in a volume
        # received the first entry's value.
        paper['time_pub'] = dd_soup.get_text(strip=True).split('[')[0].split('\n')[-1]
        hrefs = dd_soup.find_all('a')
        for href in hrefs:
            # Accept both '[abs]' and 'abs' as link text; slicing off the
            # first and last character turned an un-bracketed 'abs' into 'b'.
            key = href.text.strip().strip('[]').lower()
            # Resolve against the volume page so that page-relative hrefs
            # ('meila00a.html'), root-relative hrefs ('/papers/...') and
            # already-absolute URLs all end up as valid absolute URLs.
            paper[key] = urljoin(f"{jmlr_url}/", href['href'])
        jmlr_papers.append(paper)
print(f"Total number in jmlr: {len(jmlr_papers)}")
all_papers['jmlr'] = jmlr_papers


Process to url volume: https://www.jmlr.org/papers/v1
Number of paper in this volume 1 : 11 
Process to url volume: https://www.jmlr.org/papers/v2
Number of paper in this volume 2 : 31 
Process to url volume: https://www.jmlr.org/papers/v3
Number of paper in this volume 3 : 59 
Process to url volume: https://www.jmlr.org/papers/v4
Number of paper in this volume 4 : 59 
Process to url volume: https://www.jmlr.org/papers/v5
Number of paper in this volume 5 : 56 
Process to url volume: https://www.jmlr.org/papers/v6
Number of paper in this volume 6 : 73 
Process to url volume: https://www.jmlr.org/papers/v7
Number of paper in this volume 7 : 100 
Process to url volume: https://www.jmlr.org/papers/v8
Number of paper in this volume 8 : 91 
Process to url volume: https://www.jmlr.org/papers/v9
Number of paper in this volume 9 : 95 
Process to url volume: https://www.jmlr.org/papers/v10
Number of paper in this volume 10 : 100 
Process to url volume: https://www.jmlr.org/papers/v11
Number of p

In [130]:
# Scratch inspection: displays the 'href' attribute of the first element of `hrefs`.
# NOTE(review): `hrefs` is not defined in this cell; presumably it is left over
# from a loop in another cell - confirm.
hrefs[0]['href']

'meila00a.html'