# Collection of articles - Scholarly
### Author : Maela Guillaume-Le Gall
#### Date : 13/02/2025

**The purpose of this code is to systematically extract academic articles on the environmental impacts of AI in Europe. It uses the `scholarly` package for systematic searches on Google Scholar.**


This code sets up a headless Chrome browser using Selenium and installs the necessary dependencies to scrape full abstracts from academic articles. It searches Google Scholar for the query "Environmental Impacts Artificial Intelligence", retrieves up to 20 results, keeps those published since 2020, ranks them by relevance, year and citation count, and attempts to fetch and display the full abstract of the ten best-ranked articles from each article's URL, handling cookie banners and potential variations in webpage structure. The results are printed and exported to `articles.xlsx`, after which the Selenium WebDriver is closed. **Known problem: not all of the abstracts are displayed.**

In [1]:
#!pip install scholarly

# Import necessary libraries
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Configure Chrome to run without a visible window. The extra flags are the
# ones commonly needed for headless runs in sandboxed/containerised setups.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
# Update the binary location to your own Chrome installation path
# (the value below is the default Windows install location):
chrome_options.binary_location = r'C:\Program Files\Google\Chrome\Application\chrome.exe'

# Install a ChromeDriver matching the local Chrome version *before* starting
# any browser session.
chromedriver_autoinstaller.install()

# Start exactly one WebDriver session. Creating a second session and rebinding
# `driver` would orphan the first browser process, which is then never quit.
driver = webdriver.Chrome(options=chrome_options)

In [32]:
import re
import time
from urllib.parse import urlparse
import pandas as pd


from scholarly import scholarly
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def clean_abstract(text):
    """
    Clean raw abstract text scraped from a publisher page.

    The text is processed line by line:

    - Once a line starts with "Highlights", that line and the following ones
      are dropped until a line starting with "abstract" is met; that line is
      kept and normal processing resumes.
    - As soon as a line starts with "This is a preview", processing stops and
      that line and everything after it are discarded.
    - Lines starting with "Keywords:", "Download", "Graphical abstract",
      "Fig.", "Table", "Cookies", "Cookie Settings", "©" or
      "All content on this site" are dropped.
    - Lines containing "Cite this article" are dropped.

    All matches are case-insensitive and ignore leading whitespace. Kept lines
    are stripped of surrounding whitespace. A leading "Abstract" heading is
    NOT removed by this function.

    Args:
        text: Raw text, possibly spanning several lines.

    Returns:
        The kept lines joined with newlines, with leading and trailing
        whitespace removed (an empty string if nothing is kept).
    """
    # Prefixes that mark boilerplate lines (figure captions, cookie notices...).
    boilerplate_prefix = (
        r'^\s*(Keywords:|Download|Graphical abstract|'
        r'Fig\.|Table|Cookies|Cookie Settings|©|All content on this site)'
    )

    cleaned_lines = []
    skip_until_abstract = False

    for line in text.split('\n'):
        if skip_until_abstract:
            # Inside a "Highlights" section: ignore everything until the
            # abstract heading shows up, and keep that heading line.
            if re.match(r'^\s*abstract', line, re.IGNORECASE):
                skip_until_abstract = False
                cleaned_lines.append(line.strip())
            continue

        # A preview notice means the rest of the page text is not part of
        # the abstract: stop here and discard the remainder.
        if re.match(r'^\s*This is a preview', line, re.IGNORECASE):
            break

        if re.match(r'^\s*Highlights', line, re.IGNORECASE):
            skip_until_abstract = True
            continue

        if re.match(boilerplate_prefix, line, re.IGNORECASE):
            continue

        if re.search(r'Cite this article', line, re.IGNORECASE):
            continue

        cleaned_lines.append(line.strip())

    return "\n".join(cleaned_lines).strip()

def get_full_abstract(url, driver):
    """
    Fetch the abstract of the article page at ``url`` using ``driver``.

    The page is loaded, a cookie banner is dismissed when a matching button
    is found, and a table of per-domain selectors is tried. If no selector
    yields text, every ``<p>`` element holding more than five words is
    concatenated as a generic fallback. The collected text is then passed
    through ``clean_abstract``.

    Args:
        url: Address of the article page.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        The cleaned abstract text; ``"No abstract found"`` when nothing
        remains after cleaning; or ``"Error fetching abstract: <message>"``
        when loading or querying the page raised an exception.
    """
    try:
        driver.get(url)
        # Fixed pause after navigation (presumably to let dynamic content render).
        time.sleep(3)

        # Best effort: try to close a cookie banner.
        try:
            cookie_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH,
                    "//button[contains(text(),'Accept') or contains(text(),'Close') or contains(text(),'I agree')]"))
            )
            cookie_button.click()
            time.sleep(2)
        except Exception:
            # No banner appeared in time, or it could not be clicked: carry on.
            # `Exception` (not a bare `except`) so that Ctrl-C and SystemExit
            # still propagate.
            pass

        domain = urlparse(url).netloc.lower()
        abstract_text = ""

        # Candidate selectors per publisher domain, tried in order.
        selectors = {
            "mdpi.com": [
                ("css", "div#art-abstract"),
                ("css", "section.abstract"),
                ("css", "div.art-abstract")
            ],
            "sciencedirect.com": [
                ("css", "section#abstract"),
                ("css", "section#abstracts"),
                ("css", "div.Abstracts")
            ],
            "springer.com": [
                ("xpath", "//section[contains(@class,'Abstract')]"),
                ("xpath", "//div[contains(@class,'c-article-section__content')]")
            ],
            "arxiv.org": [
                ("css", "blockquote.abstract"),
                ("css", "blockquote.abstract.mathjax")
            ],
            "nature.com": [
                ("xpath", "//div[contains(@class,'Abstract')]"),
                ("xpath", "//section[contains(@class,'abstract')]")
            ]
        }

        found = False
        for key, sel_list in selectors.items():
            if key not in domain:
                continue
            for sel_type, selector in sel_list:
                # Anything that is not "css" is treated as an XPath expression.
                by = By.CSS_SELECTOR if sel_type == "css" else By.XPATH
                try:
                    elem = WebDriverWait(driver, 5).until(
                        EC.visibility_of_element_located((by, selector))
                    )
                    abstract_text = elem.text
                    found = True
                    break
                except Exception:
                    # Selector absent or not visible in time: try the next one.
                    continue
            if found:
                break

        if not abstract_text:
            # Generic fallback: gather every paragraph with more than five words.
            chunks = []
            for para in driver.find_elements(By.TAG_NAME, 'p'):
                text_para = para.text.strip()
                if text_para and len(text_para.split()) > 5:
                    chunks.append(text_para + "\n")
            abstract_text = "".join(chunks)

        abstract_text = clean_abstract(abstract_text)
        if not abstract_text.strip():
            return "No abstract found"
        return abstract_text

    except Exception as e:
        # Report the failure as text so one bad page does not stop the caller.
        return f"Error fetching abstract: {str(e)}"

def relevance_score(title, query):
    """
    Count how many distinct words the title and the query have in common.

    Both strings are lower-cased and split into ``\\w+`` tokens; the score is
    the size of the intersection of the two resulting word sets.
    """
    word_pattern = r'\w+'
    query_terms = set(re.findall(word_pattern, query.lower()))
    title_terms = set(re.findall(word_pattern, title.lower()))
    return len(title_terms & query_terms)

def _safe_int(value):
    """Return ``int(value)``, or 0 when ``value`` cannot be converted."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


if __name__ == "__main__":
    # Search Google Scholar, rank the results, scrape each abstract, print
    # the articles and export them to an Excel file.
    driver = webdriver.Chrome()
    try:
        query = "Environmental Impacts Artificial Intelligence"
        search_query = scholarly.search_pubs(query)

        # Take at most the first 20 results. `zip` stops as soon as either the
        # counter or the result iterator is exhausted.
        articles = [article for _, article in zip(range(20), search_query)]

        # Attach the ranking keys to every article.
        for article in articles:
            bib = article.get('bib', {})
            article['relevance'] = relevance_score(bib.get('title', ''), query)
            article['year'] = _safe_int(bib.get('pub_year', 0))
            article['citations'] = _safe_int(article.get('num_citations', 0))

        # Keep articles from 2020 onwards; best relevance first, then most
        # recent, then most cited.
        sorted_articles = sorted(
            [a for a in articles if a.get('year', 0) >= 2020],
            key=lambda a: (a['relevance'], a['year'], a['citations']),
            reverse=True
        )

        data = []

        for i, article in enumerate(sorted_articles[:10], start=1):
            # Collect the article's metadata.
            title = article['bib'].get('title', 'N/A')
            authors = article['bib'].get('author', 'N/A')
            # A list of authors is flattened into a single string.
            if isinstance(authors, list):
                authors = ", ".join(authors)
            year = article.get('year', 'N/A')
            citations = article.get('citations', 'N/A')
            relevance = article.get('relevance', 'N/A')

            pub_url = article.get('pub_url', '')
            if pub_url:
                full_abstract = get_full_abstract(pub_url, driver)
            else:
                full_abstract = "No URL provided"
            # Drop a leading "Abstract" heading from the text, if present.
            full_abstract = re.sub(r'^\s*Abstract[:]*\s*', '', full_abstract, flags=re.IGNORECASE)

            # Console output.
            print(f"Article {i}:")
            print(f"Title: {title}")
            print(f"Author(s): {authors}")
            print(f"Year: {year}")
            print(f"Citations: {citations}")
            print(f"Relevance Score: {relevance}")
            print(f"Full Abstract: {full_abstract}")
            print(f"URL: {pub_url}")
            print("-" * 40)

            # Row for the DataFrame.
            data.append({
                "Article": f"Article {i}",
                "Title": title,
                "Author(s)": authors,
                "Year": year,
                "Citations": citations,
                "Relevance Score": relevance,
                "Full Abstract": full_abstract,
                "URL": pub_url
            })

        # Build the DataFrame and export it to Excel.
        df = pd.DataFrame(data, columns=["Article", "Title", "Author(s)", "Year", "Citations", "Relevance Score", "Full Abstract", "URL"])
        df.to_excel("articles.xlsx", index=False)
        print("Export vers articles.xlsx réalisé avec succès.")

        # Show a download link (for Jupyter Notebook). `display` is imported
        # explicitly instead of relying on the notebook-injected builtin.
        from IPython.display import FileLink, display
        display(FileLink("articles.xlsx"))
    finally:
        # Always close the browser, even when the search, the scraping or the
        # export raised.
        driver.quit()

Article 1:
Title: Environmental Impacts of Industrial Processes in Industry 4.0 Ecosystem: Artificial Intelligence Approach
Author(s): P Bhambri, S Rani, IS Dhanoa
Year: 2024
Citations: 6
Relevance Score: 4
Full Abstract: Please note, due to scheduled maintenance, eCommerce will be unavailable on Saturday 22nd of March 2025, between 02:30am to 12:00am GMT. We regret any inconvenience this may cause.
The chapter titled “Environmental Impacts of Industrial Processes in Industry 4.0 Ecosystem: Artificial Intelligence Approach” provides a comprehensive exploration of the increasingly critical intersection between Industry 4.0 and environmental sustainability. It delves into the transformative potential of advanced technologies, with a specific focus on artificial intelligence (AI), in monitoring, mitigating, and optimizing the environmental consequences of modern industrial processes. Through a rigorous and data-driven approach, the chapter examines how AI-driven solutions can be effective