In [11]:
import numpy as np
import pandas as pd
import subprocess
from src.utility import get_root

In [12]:
import json
import re
from collections import namedtuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import os

# Define the namedtuple for storing paper information
Paper = namedtuple('Paper', ['id', 'title'])

def get_paper_links_from_github(readme_url):
    response = requests.get(readme_url)
    if response.status_code == 200:
        readme_content = response.text

        # Extract paper links and titles from the README.md
        paper_links_with_titles = re.findall(r'\* (.*?) \[(.*?)\]\((https://arxiv\.org/abs/\d{4}\.\d{5}(?:v\d+)?)\)', readme_content)
        
        # Create Paper objects with extracted titles and IDs
        papers = [Paper(id=link[2].split('/')[-1], title=f"{link[0]} [{link[1]}]") for link in paper_links_with_titles]
        return papers
    else:
        print("Failed to fetch README.md from GitHub")
        return []

# Fetch text content from arXiv-Vanity
def get_text_from_arxiv_vanity(paper):
    url = f"https://www.arxiv-vanity.com/papers/{paper.id}/"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        text_content = ""
        abstract_selector = "div.ltx_abstract"
        section_selectors = [f"#S{i}" for i in range(1, 16)] + [f"#Ch{i}" for i in range(1, 16)]

        # Get abstract
        abstract_element = soup.select_one(abstract_selector)
        if abstract_element:
            text_content += "Abstract:\n" + abstract_element.get_text() + "\n\n"
        
        # Get sections
        for selector in section_selectors:
            section_element = soup.select_one(selector)
            if section_element:
                text_content += section_element.get_text() + "\n\n"
            else:
                break
        
        return text_content.strip()
    return f"Failed to fetch text for paper {paper.id}"

# Save the fetched content to a text file
def save_paper_text(title, text_content):
    # Ensure directory exists
    if not os.path.exists("output"):
        os.makedirs("output")

    file_path = os.path.join("output", f"{title}.txt")
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text_content)
    print(f"Saved {file_path}")

if __name__ == "__main__":
    readme_url = "https://raw.githubusercontent.com/samuelrince/awesome-green-ai/main/README.md"

    # Extract paper links and titles from the README.md
    papers = get_paper_links_from_github(readme_url)
    if not papers:
        print("No papers found.")
        exit(1)

    # Parallelize the fetching of text content
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(get_text_from_arxiv_vanity, paper): paper for paper in papers}
        
        for future in tqdm(as_completed(futures), total=len(futures)):
            paper = futures[future]
            text_content = future.result()
            if text_content and not text_content.startswith("Failed"):
                print(f"Successfully loaded paper {paper.title}")
                save_paper_text(paper.title, text_content)
            else:
                print(f"Failed to load paper {paper.title}")

    print("Finished fetching texts.")


  4%|▍         | 1/23 [00:00<00:11,  1.94it/s]

Successfully loaded paper Estimating the Carbon Footprint of BLOOM a 176B Parameter Language Model - [Luccioni et al. (2022)]
Saved output/Estimating the Carbon Footprint of BLOOM a 176B Parameter Language Model - [Luccioni et al. (2022)].txt
Successfully loaded paper Energy and Policy Considerations for Deep Learning in NLP - [Strubell et al. (2019)]


  9%|▊         | 2/23 [00:00<00:08,  2.49it/s]

Saved output/Energy and Policy Considerations for Deep Learning in NLP - [Strubell et al. (2019)].txt


 13%|█▎        | 3/23 [00:01<00:09,  2.08it/s]

Failed to load paper Carbon Emissions and Large Neural Network Training - [Patterson, et al. (2021)]
Successfully loaded paper Quantifying the Carbon Emissions of Machine Learning - [Lacoste et al. (2019)]


 17%|█▋        | 4/23 [00:01<00:07,  2.44it/s]

Saved output/Quantifying the Carbon Emissions of Machine Learning - [Lacoste et al. (2019)].txt
Successfully loaded paper Unraveling the Hidden Environmental Impacts of AI Solutions for Environment Life Cycle Assessment of AI Solutions - [Ligozat et al. (2022)]
Saved output/Unraveling the Hidden Environmental Impacts of AI Solutions for Environment Life Cycle Assessment of AI Solutions - [Ligozat et al. (2022)].txt


 26%|██▌       | 6/23 [00:02<00:06,  2.61it/s]

Successfully loaded paper Measuring the Carbon Intensity of AI in Cloud Instances - [Dodge et al. (2022)]
Saved output/Measuring the Carbon Intensity of AI in Cloud Instances - [Dodge et al. (2022)].txt
Successfully loaded paper Carbontracker: Tracking and Predicting the Carbon Footprint of Training Deep Learning Models - [Anthony et al. (2020)]
Saved output/Carbontracker: Tracking and Predicting the Carbon Footprint of Training Deep Learning Models - [Anthony et al. (2020)].txt
Failed to load paper Towards the Systematic Reporting of the Energy and Carbon Footprints of Machine Learning - [Henderson et al. (2022)]


 43%|████▎     | 10/23 [00:03<00:03,  3.90it/s]

Successfully loaded paper Eco2AI: carbon emissions tracking of machine learning models as the first step towards sustainable AI - [Budennyy et al. (2022)]
Saved output/Eco2AI: carbon emissions tracking of machine learning models as the first step towards sustainable AI - [Budennyy et al. (2022)].txt
Failed to load paper The Carbon Footprint of Machine Learning Training Will Plateau, Then Shrink - [Patterson et al. (2022)]


 48%|████▊     | 11/23 [00:03<00:03,  3.38it/s]

Successfully loaded paper Bridging Fairness and Environmental Sustainability in Natural Language Processing - [Hessenthaler et al. (2022)]
Saved output/Bridging Fairness and Environmental Sustainability in Natural Language Processing - [Hessenthaler et al. (2022)].txt
Failed to load paper Toward Sustainable HPC: Carbon Footprint Estimation and Environmental Implications of HPC Systems - [Li et al. (2023)]


 61%|██████    | 14/23 [00:04<00:02,  3.57it/s]

Failed to load paper Towards Sustainable Artificial Intelligence: An Overview of Environmental Protection Uses and Issues - [Pachot et al. (2022)]
Successfully loaded paper Sustainable AI: Environmental Implications, Challenges and Opportunities - [Wu et al. (2022)]
Saved output/Sustainable AI: Environmental Implications, Challenges and Opportunities - [Wu et al. (2022)].txt
Successfully loaded paper Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - [Luccioni et al. (2023)]


 70%|██████▉   | 16/23 [00:04<00:01,  4.57it/s]

Saved output/Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - [Luccioni et al. (2023)].txt
Successfully loaded paper Making AI Less "Thirsty": Uncovering and Addressing the Secret Water Footprint of AI Models - [Li et al. (2023)]
Saved output/Making AI Less "Thirsty": Uncovering and Addressing the Secret Water Footprint of AI Models - [Li et al. (2023)].txt


 78%|███████▊  | 18/23 [00:05<00:01,  3.87it/s]

Successfully loaded paper A Survey on Green Deep Learning - [Xu et al. (2021)]
Saved output/A Survey on Green Deep Learning - [Xu et al. (2021)].txt
Successfully loaded paper Towards Greener LLMs: Bringing Energy-Efficiency to the Forefront of LLM Inference - [Stojkovic et al. (2024)]
Saved output/Towards Greener LLMs: Bringing Energy-Efficiency to the Forefront of LLM Inference - [Stojkovic et al. (2024)].txt


 83%|████████▎ | 19/23 [00:05<00:01,  3.20it/s]

Successfully loaded paper Green AI: Exploring Carbon Footprints, Mitigation Strategies, and Trade Offs in Large Language Model Training - [Liu et al. (2024)]
Saved output/Green AI: Exploring Carbon Footprints, Mitigation Strategies, and Trade Offs in Large Language Model Training - [Liu et al. (2024)].txt


 96%|█████████▌| 22/23 [00:06<00:00,  4.42it/s]

Failed to load paper Beyond Efficiency: Scaling AI Sustainably - [Wu et al. (2024)]
Successfully loaded paper Power Hungry Processing: Watts Driving the Cost of AI Deployment? - [Luccioni et al. (2023)]
Saved output/Power Hungry Processing: Watts Driving the Cost of AI Deployment? - [Luccioni et al. (2023)].txt
Successfully loaded paper LLMCarbon: Modeling the End-To-End Carbon Footprint of Large Language Models - [Faiz et al. (2023)]
Saved output/LLMCarbon: Modeling the End-To-End Carbon Footprint of Large Language Models - [Faiz et al. (2023)].txt


100%|██████████| 23/23 [00:07<00:00,  3.21it/s]

Successfully loaded paper Perseus: Removing Energy Bloat from Large Model Training - [Chung et al. (2023)]
Saved output/Perseus: Removing Energy Bloat from Large Model Training - [Chung et al. (2023)].txt
Finished fetching texts.





In [13]:
import re
from collections import namedtuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import os

# Define the namedtuple for storing paper information
Paper = namedtuple('Paper', ['id', 'title'])

def get_paper_links_from_github(readme_url):
    response = requests.get(readme_url)
    if response.status_code == 200:
        readme_content = response.text

        # Extract paper links and titles from the README.md
        paper_links_with_titles = re.findall(r'- (.*?) \([^()]*\) \[\[Paper\]\]\((https://arxiv\.org/(?:pdf|abs)/\d{4}\.\d{5}(?:v\d+)?\.pdf)\)', readme_content)
        
        # Create Paper objects with extracted titles and IDs
        papers = [Paper(id=link[1].split('/')[-1].replace('.pdf', ''), title=link[0]) for link in paper_links_with_titles]
        return papers
    else:
        print("Failed to fetch README.md from GitHub")
        return []

# Fetch text content from arXiv-Vanity
def get_text_from_arxiv_vanity(paper):
    url = f"https://www.arxiv-vanity.com/papers/{paper.id}/"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        text_content = ""
        abstract_selector = "div.ltx_abstract"
        section_selectors = [f"#S{i}" for i in range(1, 16)] + [f"#Ch{i}" for i in range(1, 16)]

        # Get abstract
        abstract_element = soup.select_one(abstract_selector)
        if abstract_element:
            text_content += "Abstract:\n" + abstract_element.get_text() + "\n\n"
        
        # Get sections
        for selector in section_selectors:
            section_element = soup.select_one(selector)
            if section_element:
                text_content += section_element.get_text() + "\n\n"
            else:
                break
        
        return text_content.strip()
    return f"Failed to fetch text for paper {paper.id}"

# Save the fetched content to a text file
def save_paper_text(title, text_content):
    # Ensure directory exists
    if not os.path.exists("output"):
        os.makedirs("output")

    # Sanitize title to be used as a filename
    safe_title = re.sub(r'[\\/*?:"<>|]', "_", title)
    file_path = os.path.join("output", f"{safe_title}.txt")
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text_content)
    print(f"Saved {file_path}")

if __name__ == "__main__":
    readme_url = "https://raw.githubusercontent.com/ejhusom/green-ai/main/README.md"

    # Extract paper links and titles from the README.md
    papers = get_paper_links_from_github(readme_url)
    if not papers:
        print("No papers found.")
        exit(1)

    # Parallelize the fetching of text content
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(get_text_from_arxiv_vanity, paper): paper for paper in papers}
        
        for future in tqdm(as_completed(futures), total=len(futures)):
            paper = futures[future]
            text_content = future.result()
            if text_content and not text_content.startswith("Failed"):
                print(f"Successfully loaded paper {paper.title}")
                save_paper_text(paper.title, text_content)
            else:
                print(f"Failed to load paper {paper.title}")

    print("Finished fetching texts.")


  5%|▌         | 1/20 [00:00<00:14,  1.35it/s]

Successfully loaded paper **Energy and Policy Considerations for Deep Learning in NLP**
Saved output/__Energy and Policy Considerations for Deep Learning in NLP__.txt
Successfully loaded paper Estimating the Carbon Footprint of BLOOM a 176B Parameter Language Model


 10%|█         | 2/20 [00:00<00:07,  2.30it/s]

Saved output/Estimating the Carbon Footprint of BLOOM a 176B Parameter Language Model.txt
Successfully loaded paper Quantifying the Carbon Emissions of Machine Learning


 20%|██        | 4/20 [00:01<00:04,  3.48it/s]

Saved output/Quantifying the Carbon Emissions of Machine Learning.txt
Successfully loaded paper Unraveling the Hidden Environmental Impacts of AI Solutions for Environment Life Cycle Assessment of AI Solutions
Saved output/Unraveling the Hidden Environmental Impacts of AI Solutions for Environment Life Cycle Assessment of AI Solutions.txt


 25%|██▌       | 5/20 [00:01<00:05,  2.63it/s]

Successfully loaded paper Carbontracker: Tracking and Predicting the Carbon Footprint of Training Deep Learning Models
Saved output/Carbontracker_ Tracking and Predicting the Carbon Footprint of Training Deep Learning Models.txt


 30%|███       | 6/20 [00:02<00:04,  2.86it/s]

Failed to load paper Towards the Systematic Reporting of the Energy and Carbon Footprints of Machine Learning


 40%|████      | 8/20 [00:03<00:04,  2.82it/s]

Successfully loaded paper Chasing Carbon: The Elusive Environmental Footprint of Computing
Saved output/Chasing Carbon_ The Elusive Environmental Footprint of Computing.txt
Successfully loaded paper Measuring the Carbon Intensity of AI in Cloud Instances
Saved output/Measuring the Carbon Intensity of AI in Cloud Instances.txt
Successfully loaded paper Eco2AI: carbon emissions tracking of machine learning models as the first step towards sustainable AI


 55%|█████▌    | 11/20 [00:03<00:01,  5.50it/s]

Saved output/Eco2AI_ carbon emissions tracking of machine learning models as the first step towards sustainable AI.txt
Successfully loaded paper Bridging Fairness and Environmental Sustainability in Natural Language Processing
Saved output/Bridging Fairness and Environmental Sustainability in Natural Language Processing.txt
Successfully loaded paper Sustainable AI: Environmental Implications, Challenges and Opportunities
Saved output/Sustainable AI_ Environmental Implications, Challenges and Opportunities.txt


 60%|██████    | 12/20 [00:04<00:02,  3.03it/s]

Successfully loaded paper Making AI Less "Thirsty": Uncovering and Addressing the Secret Water Footprint of AI Models
Saved output/Making AI Less _Thirsty__ Uncovering and Addressing the Secret Water Footprint of AI Models.txt


 65%|██████▌   | 13/20 [00:04<00:02,  2.75it/s]

Successfully loaded paper Power Hungry Processing: Watts Driving the Cost of AI Deployment?
Saved output/Power Hungry Processing_ Watts Driving the Cost of AI Deployment_.txt


 75%|███████▌  | 15/20 [00:05<00:01,  3.03it/s]

Successfully loaded paper Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning
Saved output/Counting Carbon_ A Survey of Factors Influencing the Emissions of Machine Learning.txt
Successfully loaded paper A Synthesis of Green Architectural Tactics for ML-Enabled Systems
Saved output/A Synthesis of Green Architectural Tactics for ML-Enabled Systems.txt


 80%|████████  | 16/20 [00:05<00:01,  2.14it/s]

Successfully loaded paper LLMCarbon: Modeling the End-To-End Carbon Footprint of Large Language Models
Saved output/LLMCarbon_ Modeling the End-To-End Carbon Footprint of Large Language Models.txt
Successfully loaded paper **A Systematic Review of Green AI**
Saved output/__A Systematic Review of Green AI__.txt


 90%|█████████ | 18/20 [00:06<00:00,  2.56it/s]

Successfully loaded paper A first look into the carbon footprint of federated learning
Saved output/A first look into the carbon footprint of federated learning.txt


 95%|█████████▌| 19/20 [00:06<00:00,  2.48it/s]

Successfully loaded paper A framework for energy and carbon footprint analysis of distributed and federated edge learning
Saved output/A framework for energy and carbon footprint analysis of distributed and federated edge learning.txt


100%|██████████| 20/20 [00:07<00:00,  2.75it/s]

Successfully loaded paper **A Survey on Green Deep Learning**
Saved output/__A Survey on Green Deep Learning__.txt
Finished fetching texts.





In [16]:
import fitz  # PyMuPDF
import os

def pdf_to_text(pdf_path, txt_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    text = ""

    # Iterate through each page
    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document.load_page(page_num)
        # Extract text from the page
        text += page.get_text()

    # Write the extracted text to a text file
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

def process_pdfs_in_folder(folder_path, txt_folder):
    # Iterate through all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            txt_path = os.path.join(txt_folder, filename[:-4] + '.txt')
            pdf_to_text(pdf_path, txt_path)
            print(f"Processed {filename}")

txt_folder = 'output'
folder_path = os.path.join(get_root(), 'data', 'pdfs')
process_pdfs_in_folder(folder_path, txt_folder)


Processed 2021.sustainlp-1.2.pdf
Processed Advanced Science - 2021 - Lannelongue - Green Algorithms  Quantifying the Carbon Footprint of Computation.pdf
Processed A_Practical_Guide_to_Quantifying_Carbon_Emissions.pdf
Processed A simplified machine learning product carbon footprint evaluation.pdf.pdf
Processed CHASING LOW-CARBON ELECTRICITY FOR PRACTICAL AND SUSTAINABLE DNN TRAINING.pdf
Processed Environmental assessment of projects involving AI.pdf
Processed Estimating the environmental impact of Generative-AI.pdf
Processed GreenAI - ROY SCHWARTZ, JESSE DODGE.pdf
Processed Kaack_2021_Aligning.pdf
Processed MEASURING THE ENVIRONMENTAL IMPACTS OF AI COMPUTING.pdf
Processed New universal sustainability metrics to assess edge intelligence.pdf
Processed Timeshifting strategies for carbon-efficient long-running large language model training.pdf
Processed Zeus: Understanding and Optimizing GPU Energy.pdf
Processed A Survey on Green Deep Learning.pdf


In [17]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def read_paper_files(directory="output"):
    papers = {}
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
                papers[filename] = f.read()
    return papers

def find_duplicates(papers, threshold=0.9):
    filenames = list(papers.keys())
    texts = list(papers.values())

    vectorizer = TfidfVectorizer().fit_transform(texts)
    vectors = vectorizer.toarray()

    similarity_matrix = cosine_similarity(vectors)

    duplicates = set()
    duplicate_pairs = []
    for i in range(len(filenames)):
        for j in range(i + 1, len(filenames)):
            if similarity_matrix[i, j] > threshold:
                if len(filenames[i]) < len(filenames[j]):
                    duplicates.add(filenames[j])
                    duplicate_pairs.append((filenames[i], filenames[j]))
                else:
                    duplicates.add(filenames[i])
                    duplicate_pairs.append((filenames[j], filenames[i]))

    return duplicates, duplicate_pairs

def delete_files(files, directory="output"):
    for filename in files:
        file_path = os.path.join(directory, filename)
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"Deleted {filename}")

if __name__ == "__main__":
    paper_texts = read_paper_files()
    duplicate_files, duplicate_pairs = find_duplicates(paper_texts)

    if duplicate_files:
        print("Found duplicates:")
        for dup1, dup2 in duplicate_pairs:
            print(f"{dup2} is similar to {dup1}\n")

        delete_files(duplicate_files)
    else:
        print("No duplicates found.")


No duplicates found.


In [27]:
import os
import json
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import namedtuple

# Load the Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

Paper = namedtuple('Paper', ['id', 'title', 'abstract', 'journal_ref'])


def get_greenai_articles(data_file):
    with open(data_file, 'r') as f:
        for line in f:
            paper_dict = json.loads(line)
            id_value = paper_dict.get('id', "")
            title = paper_dict.get('title', "").lower()
            abstract = paper_dict.get('abstract', "").lower()
            yield Paper(id_value, title, abstract, paper_dict.get('journal_ref'))

def get_text_from_arxiv_vanity(paper):
    url = f"https://www.arxiv-vanity.com/papers/{paper}/"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        text_content = ""
        abstract_selector = "div.ltx_abstract"
        section_selectors = [f"#{prefix}{i}" for prefix in ["S", "Ch"] for i in range(1, 16)]

        # Get abstract
        abstract_element = soup.select_one(abstract_selector)
        if abstract_element:
            text_content += "Abstract:\n" + abstract_element.get_text() + "\n\n"
        
        # Get sections
        for selector in section_selectors:
            section_element = soup.select_one(selector)
            if section_element:
                text_content += section_element.get_text() + "\n\n"
            else:
                break
        
        return text_content.strip()
    return f"Failed to fetch text for paper {paper}"

def read_paper_file(filename):
    with open(filename, 'r') as file:
        lines = file.readlines()
        content = ' '.join([line.strip() for line in lines])
    return content

def encode_abstract(abstract):
    return embed([abstract])[0]

def encode_abstracts_parallel(abstracts):
    with ThreadPoolExecutor() as executor:
        abstract_embeddings = list(tqdm(executor.map(encode_abstract, abstracts), total=len(abstracts)))
    return abstract_embeddings

def find_similar_papers(query_abstract_embedding, existing_abstracts, existing_filenames, upper_thresh=0.92, lower_thresh=0.72):
    similar_papers = {}
    similarities = cosine_similarity([query_abstract_embedding], existing_abstracts)[0]
    for i, sim in enumerate(similarities):
        if sim < upper_thresh and sim > lower_thresh:
            paper_title = os.path.splitext(os.path.basename(existing_filenames[i]))[0]
            similar_papers[paper_title] = sim
    return similar_papers

if __name__ == "__main__":
    root = get_root()
    data_file = os.path.join(root, 'data/arxiv/arxiv-metadata-oai-snapshot.json')

    # Fetch new papers from ArXiv
    papers_generator = get_greenai_articles(data_file)
    
    print("ArXiv generator is loaded")
    # Load existing papers
    existing_paper_folder = "output"
    existing_filenames = [os.path.join(existing_paper_folder, filename) for filename in os.listdir(existing_paper_folder) if filename.endswith(".txt")]

    existing_abstracts = []
    for filename in tqdm(existing_filenames):
        text_content = read_paper_file(filename)
        existing_abstracts.append(encode_abstract(text_content))  # Encode the text to get the numerical representation
    
    SimilarPaper = namedtuple("SimilarPaper", ['paper', 'similar_to'])
    similar_papers = []
    # Loop through new papers to find similar ones in existing papers
    for paper in tqdm(papers_generator):
        abstract_embedding = encode_abstract(paper.abstract)
        similar_to = find_similar_papers(abstract_embedding, existing_abstracts, existing_filenames)
        
        if similar_to:
            print(paper.title)
            similar_papers.append(SimilarPaper(paper, similar_to))
    if similar_papers:
        print("Found similar papers:")
        for similar_paper in similar_papers:
            print(f"{similar_paper.paper.title} is similar to: {', '.join(similar_paper.similar_to.keys())}")
            print("\n")
            text_content = get_text_from_arxiv_vanity(similar_paper.paper.id)
            save_paper_text(similar_paper.paper.title, text_content)
            print("\n")
    else:
        print("No similar papers found.")

ReadTimeout: HTTPSConnectionPool(host='www.arxiv-vanity.com', port=443): Read timed out. (read timeout=None)

In [28]:
if __name__ == "__main__":
    duplicate_files, duplicate_pairs = find_duplicates(paper_texts)

    if duplicate_files:
        print("Found duplicates:")
        for dup1, dup2 in duplicate_pairs:
            print(f"{dup2} is similar to {dup1}\n")

        delete_files(duplicate_files)
    else:
        print("No duplicates found.")


Found duplicates:
Quantifying the Carbon Emissions of Machine Learning - [Lacoste et al. (2019)].txt is similar to Quantifying the Carbon Emissions of Machine Learning.txt

Estimating the Carbon Footprint of BLOOM a 176B Parameter Language Model - [Luccioni et al. (2022)].txt is similar to Estimating the Carbon Footprint of BLOOM a 176B Parameter Language Model.txt

Measuring the Carbon Intensity of AI in Cloud Instances - [Dodge et al. (2022)].txt is similar to Measuring the Carbon Intensity of AI in Cloud Instances.txt

Unraveling the Hidden Environmental Impacts of AI Solutions for Environment Life Cycle Assessment of AI Solutions - [Ligozat et al. (2022)].txt is similar to Unraveling the Hidden Environmental Impacts of AI Solutions for Environment Life Cycle Assessment of AI Solutions.txt

Energy and Policy Considerations for Deep Learning in NLP - [Strubell et al. (2019)].txt is similar to __Energy and Policy Considerations for Deep Learning in NLP__.txt

Carbontracker: Tracking a