In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
""" 
@author:Lee HaiBo
@filename: journal_statistics.py
@Software:PyCharm
@time: 2021/8/27, 10:19
@email:lntulhb@163.com
@function: Springer journal statistics for recent articles
"""
import requests
import re
import os
import csv
from lxml import etree
import time


In [5]:
import requests
import re
import os
import csv
import time
from lxml import etree

# Browser-like User-Agent header passed to every requests.get() call below
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0'
}

# Directory that receives the generated CSV files. exist_ok=True replaces the
# separate os.path.exists() check, which could race with another process
# creating the directory between the check and the mkdir call.
os.makedirs('./Journal_Acceptance_Statistics', exist_ok=True)


# Turn every space character of a string into "+"
def replace_space(string):
    """Return *string* with each ' ' character swapped for '+'.

    Only the plain space character is affected; tabs, newlines and other
    whitespace are left untouched.
    """
    pieces = string.split(' ')
    return '+'.join(pieces)


# Query volume and issue for specific journal
def query_volume_issue():
    """Interactively export the article dates of one volume/issue to CSV.

    Prompts for a ``<volume>-<issue>`` string, downloads the matching issue
    page of the journal identified by the module-level ``journal_id``,
    fetches every article linked from that page and writes one CSV row per
    article into ``./Journal_Acceptance_Statistics``.

    Reads the module-level names ``journal_id`` and ``headers``.

    Returns:
        None. If the input is malformed or the journal title cannot be found
        on the issue page, a message is printed and no file is written.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (including a timeout after 30 seconds).
    """
    # Local import so the block does not depend on the file-level imports.
    from urllib.parse import urljoin

    print('Enter the volume and issue numbers (e.g., 1-1):')
    vol_issue_input = input('Volume-Issue (e.g., 1-1): ').strip()

    # Validate before building the URL; previously a malformed value raised
    # IndexError when the regex found no match.
    vol_issue_match = re.fullmatch(r'(\d+)-(\d+)', vol_issue_input)
    if vol_issue_match is None:
        print('Invalid volume-issue format. Expected something like 1-1.')
        return
    volume, issue = vol_issue_match.groups()

    # Construct URL
    url = f'https://link.springer.com/journal/{journal_id}/volumes-and-issues/{vol_issue_input}'
    print(f"Fetching data from URL: {url}")

    # Fetch the webpage; the timeout keeps a stalled connection from
    # blocking the program forever.
    response = requests.get(url=url, headers=headers, timeout=30).text

    # Extract article links
    article_link_pattern = r'<h3 class="c-card__title">.*?<a href="(.*?)".*?</h3>'
    article_links = re.findall(article_link_pattern, response, re.S)

    # Extract journal title (first anchor whose href starts with /journal)
    title_pattern = r'<a href="/journal.*?>(.*?)</a>'
    title_matches = re.findall(title_pattern, response, re.S)
    if not title_matches:
        print('Could not find the journal title on the issue page.')
        return
    journal_title = title_matches[0].strip()

    # Extract the number of articles; fall back to the number of links found
    # when the page does not contain the expected paragraph.
    article_count_pattern = r'<p class="app-volumes-and-issues__copy">(.*?)articles in this issue<\/p>'
    count_matches = re.findall(article_count_pattern, response, re.S)
    article_count = count_matches[0].strip() if count_matches else len(article_links)

    print(f'Starting to fetch articles for {journal_title}, Volume {volume}, Issue {issue}...')
    print(f'Total articles: {article_count}')

    # Labels and <time datetime="..."> values are matched separately and
    # paired up by position.
    date_label_pattern = r'<li class="c-bibliographic-information__list-item"><p>(.*?)<span class="u-hide">: .*?</span>.*?</p></li>'
    date_pattern = r'<span class="c-bibliographic-information__value"><time datetime="(.*?)".*?</span>'

    # Fetch articles
    all_articles = []
    for position, article_href in enumerate(article_links, start=1):
        print(f"Fetching article {position}/{article_count}...")
        # urljoin leaves absolute links untouched and resolves relative ones
        # against the issue page URL.
        article_url = urljoin(url, article_href)
        article_response = requests.get(url=article_url, headers=headers, timeout=30).text

        date_labels = re.findall(date_label_pattern, article_response, re.S)
        dates = re.findall(date_pattern, article_response, re.S)

        article_data = dict(zip((label.strip() for label in date_labels), dates))
        all_articles.append(article_data)

    # Save results to CSV
    clean_title = re.sub(r'[<>:"/\\|?*]', '', journal_title)
    file_path = f'./Journal_Acceptance_Statistics/{clean_title}_Volume_{volume}_Issue_{issue}.csv'
    fieldnames = ['Received', 'Revised', 'Accepted', 'Published', 'Issue Date']

    with open(file_path, 'w', encoding='utf-8', newline='') as file:
        # extrasaction='ignore': a label outside fieldnames is dropped
        # instead of aborting the export with ValueError.
        writer = csv.DictWriter(file, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(all_articles)

    print(f"Data saved to {file_path}")


# Query recent articles for a journal
def query_recent_articles():
    """Interactively export the dates of the N most recent articles to CSV.

    Prompts for a positive article count, then walks the module-level
    ``journal_issue_links`` in order, fetching every article linked from
    each issue page until the requested number has been collected.

    Reads the module-level names ``journal_issue_links``, ``journal_name``
    and ``headers``, and appends one dict per article to the module-level
    list ``articles_data``.

    Returns:
        None. The CSV in ``./Journal_Acceptance_Statistics`` is rewritten
        after every article, so the file on disk always reflects the
        articles fetched so far.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (including a timeout after 30 seconds).
    """
    answer = input('Enter the number of recent articles to query: ').strip()

    # str.isdigit() also accepts characters such as superscript digits that
    # int() rejects; isdecimal() only accepts what int() can parse. Zero is
    # refused because the loop below would still fetch one article.
    while not (answer.isdecimal() and int(answer) > 0):
        answer = input('Invalid input. Please enter a valid number: ').strip()

    num_articles = int(answer)

    # Loop-invariant values, computed once instead of once per article.
    clean_title = re.sub(r'[<>:"/\\|?*]', '', journal_name)
    file_path = f'./Journal_Acceptance_Statistics/{clean_title}_latest_{num_articles}_articles.csv'
    fieldnames = ['Received', 'Revised', 'Accepted', 'Published', 'Issue Date']

    vol_issue_pattern = r'volumes-and-issues/(\d+)-(\d+)$'
    article_link_pattern = r'<a href="(https://link\.springer\.com/article/.*?)"'
    # Labels and <time datetime="..."> values are matched separately and
    # paired up by position.
    date_label_pattern = r'<li class="c-bibliographic-information__list-item"><p>(.*?)<span class="u-hide">: .*?</span>.*?</p></li>'
    date_pattern = r'<span class="c-bibliographic-information__value"><time datetime="(.*?)".*?</span>'

    count = 0
    for issue_url in journal_issue_links:
        print(f"Processing URL: {issue_url}")

        # Parse volume and issue numbers; skip links that do not end in
        # "<volume>-<issue>" instead of failing with IndexError.
        vol_issue_match = re.search(vol_issue_pattern, issue_url)
        if vol_issue_match is None:
            print(f"Skipping URL without a volume-issue suffix: {issue_url}")
            continue
        volume, issue = vol_issue_match.groups()

        response = requests.get(url=issue_url, headers=headers, timeout=30).text

        # Extract article links; dict.fromkeys drops repeated URLs while
        # keeping page order, so an article linked twice is counted once.
        article_links = list(dict.fromkeys(re.findall(article_link_pattern, response)))

        print(f"Fetching articles for Volume {volume}, Issue {issue}...")

        # Iterate through articles
        for article_url in article_links:
            article_page = requests.get(url=article_url, headers=headers, timeout=30).text

            date_labels = re.findall(date_label_pattern, article_page, re.S)
            dates = re.findall(date_pattern, article_page, re.S)

            article_data = dict(zip((label.strip() for label in date_labels), dates))
            articles_data.append(article_data)

            count += 1
            print(f"Fetching article {count}/{num_articles}...")

            # Save results to a CSV file
            with open(file_path, 'w', encoding='utf-8', newline='') as file:
                # extrasaction='ignore': a label outside fieldnames is
                # dropped instead of aborting the export with ValueError.
                writer = csv.DictWriter(file, fieldnames=fieldnames, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(articles_data)

            # Stop fetching if the desired number of articles is reached
            if count >= num_articles:
                print('Query completed!')
                return

            # Pause between consecutive article requests.
            time.sleep(2)

    # Every issue link was processed without reaching the requested count.
    print(f'Only {count} of {num_articles} requested articles were available.')


# Main function to keep the program running
# Each pass asks for a journal name, looks the journal up on Springer, lists
# its issues and dispatches to one of the two query functions. Network
# failures are reported and the loop continues with a new prompt instead of
# terminating the whole session.
while True:
    # Fresh per-journal state; query_recent_articles() reads
    # journal_issue_links and appends to articles_data.
    articles_data = []
    journal_issue_links = []

    print('Input rules: Words are case-insensitive, replace "&" with a space, and ensure proper spacing for "-" or "&".')
    journal_name = input('Enter the journal name (Enter "q" or "Q" to quit): ').strip()

    if journal_name.lower() == 'q':
        break

    if not journal_name:
        print("Journal name must not be empty.")
        continue

    # Replace spaces in the journal name
    journal_name_query = replace_space(journal_name)

    try:
        # Construct the Springer search URL
        search_url = f'https://link.springer.com/search?facet-content-type=Journal&query={journal_name_query}'
        search_page = requests.get(url=search_url, headers=headers, timeout=30).text
        search_tree = etree.HTML(search_page)

        # Extract the journal ID from the search results
        href = search_tree.xpath('//a[@class="app-card-open__link"]/@href')
        if not href:
            print("No results found for the given journal name.")
            continue

        # The ID is the last path segment of the first hit; rstrip guards
        # against a trailing slash producing an empty ID.
        journal_id = href[0].rstrip('/').split('/')[-1]

        # Construct the journal issues URL
        journal_url = f'https://link.springer.com/journal/{journal_id}/volumes-and-issues'
        journal_page = requests.get(url=journal_url, headers=headers, timeout=30).text
        journal_tree = etree.HTML(journal_page)

        # Extract issue links and names
        issue_links = journal_tree.xpath('//li[@class="c-list-group__item"]/a[@class="c-list-group__link c-list-group__link--bold"]/@href')
        issue_names = journal_tree.xpath('//li[@class="c-list-group__item"]/a[@class="c-list-group__link c-list-group__link--bold"]/text()')

        # Construct full URLs for the issues (same order as on the page)
        journal_issue_links.extend(f'https://link.springer.com{link}' for link in issue_links)

        # Display available issues in reverse page order
        issue_data = dict(zip(issue_links[::-1], issue_names[::-1]))
        for issue, name in issue_data.items():
            print(f"Issue: {issue}, Name: {name}")

        # Choose mode
        mode = input('Enter 0 to query recent articles, 1 to query a specific volume-issue, or q to quit: ').strip()
        if mode == '0':
            query_recent_articles()
        elif mode == '1':
            query_volume_issue()
        elif mode.lower() == 'q':
            break
        else:
            print("Invalid input. Please try again.")
    except requests.RequestException as exc:
        # Covers connection errors and timeouts raised by any request made
        # during this pass, including those inside the query functions.
        print(f"Network error: {exc}")


Input rules: Words are case-insensitive, replace "&" with a space, and ensure proper spacing for "-" or "&".
Issue: /journal/11134/volumes-and-issues/1-1, Name: Issue 1
Issue: /journal/11134/volumes-and-issues/1-2, Name: Issue 2
Issue: /journal/11134/volumes-and-issues/1-3, Name: Issue 3
Issue: /journal/11134/volumes-and-issues/1-4, Name: Issue 4
Issue: /journal/11134/volumes-and-issues/2-1, Name: Issue 1
Issue: /journal/11134/volumes-and-issues/2-2, Name: Issue 2
Issue: /journal/11134/volumes-and-issues/2-3, Name: Issue 3
Issue: /journal/11134/volumes-and-issues/2-4, Name: Issue 4
Issue: /journal/11134/volumes-and-issues/3-1, Name: Issue 1
Issue: /journal/11134/volumes-and-issues/3-2, Name: Issue 2
Issue: /journal/11134/volumes-and-issues/3-3, Name: Issue 3
Issue: /journal/11134/volumes-and-issues/3-4, Name: Issue 4
Issue: /journal/11134/volumes-and-issues/4-1, Name: Issue 1
Issue: /journal/11134/volumes-and-issues/4-2, Name: Issue 2
Issue: /journal/11134/volumes-and-issues/4-3, Name: