In [35]:
import requests
from bs4 import BeautifulSoup
import lxml
import pandas as pd
import time
import json
import pathlib

In [5]:
session = requests.Session()

In [6]:
session.headers

{'User-Agent': 'python-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': '*/*', 'Connection': 'keep-alive'}

In [7]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; en-US; rv:76.0) Gecko/20100101 Firefox/76.0'}
session.headers.update(headers)

In [8]:
params_search = {
    'fillQuickSearch':'false',
    'target':'advanced',
    'expand':'dl',
    'field1':'AllField',
    'text1':"Text AND Mining OR Clustering OR Classification OR Machine AND Learning OR Data AND Mining",
    'Ppub':"[20170315 TO 20220315]",
    'pageSize':50,
    'startPage':0,
    'rel':'nofollow',
    'ContentItemType':'research-article'
}

In [10]:
response = session.get('https://dl.acm.org/action/doSearch', params=params_search)

In [11]:
response.status_code

200

In [12]:
response.url

'https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=AllField&text1=Text+AND+Mining+OR+Clustering+OR+Classification+OR+Machine+AND+Learning+OR+Data+AND+Mining&Ppub=%5B20170315+TO+20220315%5D&pageSize=50&startPage=0&rel=nofollow&ContentItemType=research-article'

In [13]:
pages = 40
soup = BeautifulSoup(response.text, 'lxml')
count_articles = int(''.join(soup.select_one('.hitsLength').text.split(','))) // 50
if count_articles < pages:
    pages = count_articles

In [14]:
def parse_page(params, page):
    params['startPage'] = page
    titles_links = [[title.text, f"https://dl.acm.org{title.select_one('a')['href']}"] for title in soup.select('.hlFld-Title')]
    return titles_links

In [15]:
def parse_info(url):
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    authors = [author.text for author in soup.select('.loa__author-name')]
    meta = soup.select_one('.issue-item__detail a')['title']
    date = soup.select_one('.epub-section__date')
    if date is None:
        date = None
    else:
        date = date.text.strip()
    abstract = soup.select_one('.abstractSection').text.strip()
    return authors, meta, date, abstract

In [16]:
all_articles = {'title': [], 'authors': [], 'date': [], 'source': [], 'abstract': [], 'link': []}
for page in range(pages):
    titles_links = parse_page(params_search, page)
    for title, link in titles_links:
        authors, meta, date, abstract = parse_info(link)
        all_articles['title'] += [title]
        all_articles['authors'] += [authors]
        all_articles['date'] += [date]
        all_articles['source'] += [meta]
        all_articles['abstract'] += [abstract]
        all_articles['link'] += [link]

In [53]:
data = pd.DataFrame(all_articles)

In [54]:
data.iloc[1968,:]

title                             Text mining with HathiTrust
authors                [Eleanor Dickson Koehl, Ryan Dubnicek]
date                                                June 2019
source      JCDL '19: Proceedings of the 18th Joint Confer...
abstract    This tutorial will introduce attendees to the ...
link           https://dl.acm.org/doi/10.1109/JCDL.2019.00115
Name: 1968, dtype: object

In [55]:
data[:10]

Unnamed: 0,title,authors,date,source,abstract,link
0,Text mining for incoming tasks based on the ur...,[Yasser Ali Alshehri],March 2020,ICCDA 2020: Proceedings of the 2020 the 4th In...,"In workplaces, there is a massive amount of un...",https://dl.acm.org/doi/10.1145/3388142.3388153
1,Text mining for malware classification using m...,"[Konstantinos F. Xylogiannopoulos, Panagiotis ...",August 2019,ASONAM '19: Proceedings of the 2019 IEEE/ACM I...,Mobile phones have become nowadays a commodity...,https://dl.acm.org/doi/10.1145/3341161.3350841
2,Comparative Study between Traditional Machine ...,"[Cannannore Nidhi Kamath, Syed Saqib Bukhari, ...",August 2018,DocEng '18: Proceedings of the ACM Symposium o...,"In this contemporaneous world, it is an obliga...",https://dl.acm.org/doi/10.1145/3209280.3209526
3,Identification of Overpricing in the Purchase ...,"[Marco Aurelio O. S. Correa, Adriano Galindo L...",August 2018,ICCBDC '18: Proceedings of the 2018 2nd Intern...,Increasing the transparency level in his actio...,https://dl.acm.org/doi/10.1145/3264560.3264569
4,Computational Estimation by Scientific Data Mi...,[Aparna S. Varde],,ACM Transactions on Knowledge Discovery from Data,Experimental results are often plotted as 2-di...,https://dl.acm.org/doi/10.1145/3502736
5,Where is the road for issue reports classifica...,"[Qiang Fan, Yue Yu, Gang Yin, Tao Wang, Huaimi...",November 2017,ESEM '17: Proceedings of the 11th ACM/IEEE Int...,"Currently, open source projects receive variou...",https://dl.acm.org/doi/10.1109/ESEM.2017.19
6,Feature-based Facebook reviews process model f...,"[Anish Kumar Varudharajulu, Yongsheng Ma]",January 2019,IC4E '19: Proceedings of the 10th Internationa...,The data generated from online communication a...,https://dl.acm.org/doi/10.1145/3306500.3306514
7,Discriminative Topic Mining via Category-Name ...,"[Yu Meng, Jiaxin Huang, Guangyuan Wang, Zihan ...",April 2020,WWW '20: Proceedings of The Web Conference 2020,Mining a set of meaningful and distinctive top...,https://dl.acm.org/doi/10.1145/3366423.3380278
8,Comparative Study of Heart Disease Diagnosis U...,[I. Ketut Agung Enriko],June 2019,ICFET '19: Proceedings of the 5th Internationa...,"Data mining has been used for many purposes, e...",https://dl.acm.org/doi/10.1145/3338188.3338220
9,Text Mining Approach for Identifying Research ...,[Snezhana Sulova],June 2021,CompSysTech '21: Proceedings of the 22nd Inter...,"With the increase of unstructured data, the is...",https://dl.acm.org/doi/10.1145/3472410.3472433


In [69]:
with open('articles.json', 'w', encoding='utf-8') as json_file:
    json.dump(data.to_dict(orient='records'), json_file, ensure_ascii=False, indent=4)