In [None]:
import pandas as pd
import requests

In [None]:
# Pipeline: ADS_Search -> Download & Clean Papers -> Summarize -> Concatenate -> N-Gram -> Output CSV

# Download Papers

In [None]:
import requests
import os
from PyPDF2 import PdfReader
import io
import re
from pathlib import Path

def process_text(text):
    """Run extracted paper text through every cleaning stage, in order.

    The text is first truncated at the earliest references/acknowledgements
    heading, then passed through the bracket and line-level filters. Each
    stage receives the output of the one before it.
    """
    section_markers = ["REFERENCES", "ACKNOWLEDGEMENTS", "References", "Acknowledgements"]
    cleaned = find_and_remove_references(text, section_markers)

    # Order matters: the line filters work on the already-truncated text, and
    # empty lines are dropped last because earlier stages leave blanks behind.
    stages = (
        remove_text_in_brackets,
        remove_lines_starting_with_number_or_symbol,
        remove_lines_with_one_word,
        remove_empty_lines,
    )
    for stage in stages:
        cleaned = stage(cleaned)

    return cleaned

def find_and_remove_references(text, keywords):
    """Truncate the text at the earliest occurrence of any keyword.

    Every keyword is searched for as a plain, case-sensitive substring. When
    at least one is present, everything from the earliest hit onwards is
    discarded; when none is present the text is returned unchanged.
    """
    hits = [idx for idx in map(text.find, keywords) if idx >= 0]
    if not hits:
        return text
    return text[:min(hits)]

def remove_text_in_brackets(text):
    """Strip every (...) and [...] span, delimiters included.

    Matching is not nesting-aware: a span ends at the first closing delimiter
    of the same kind. A span may run across line breaks.
    """
    bracketed = re.compile(r'\([^)]*\)|\[[^\]]*\]')
    return bracketed.sub('', text)

def remove_lines_starting_with_number_or_symbol(text):
    """Blank out every line whose first character is a digit or a non-word character.

    A matching line has its content removed but its line break is kept, so the
    number of lines in the text does not change. Whitespace counts as a
    non-word character, so lines with leading spaces or tabs are blanked too.
    Empty lines are left alone.

    Args:
        text: Multi-line string to filter.

    Returns:
        The text with the content of every matching line removed.
    """
    # The newline itself must be excluded from the leading character class:
    # \W matches '\n', so without the lookahead an empty line would match on
    # its own line break and `.*` would then swallow the whole *next* line,
    # even when that line starts with a letter.
    pattern = r'^(?!\n)[\d\W].*$'
    return re.sub(pattern, '', text, flags=re.MULTILINE)

def remove_lines_with_one_word(text):
    """Drop each line that consists of exactly one word.

    A word here is a single unbroken run of word characters (letters, digits,
    underscore), optionally padded with whitespace. Blank lines, lines with
    several words and lines containing punctuation are kept.
    """
    single_word = re.compile(r'^\s*\w+\s*$')
    kept = []
    for row in text.split('\n'):
        if single_word.match(row) is None:
            kept.append(row)
    return '\n'.join(kept)

def remove_empty_lines(text):
    """Discard lines that are empty or contain only whitespace."""
    return '\n'.join(row for row in text.split('\n') if row.strip())

def process_arxiv_paper(arxiv_id, save_path='content', timeout=30):
    """Download an arXiv PDF, extract and clean its text, and save it as a .txt file.

    Args:
        arxiv_id: arXiv identifier, e.g. '0910.2715'. Identifiers containing a
            slash (e.g. 'astro-ph/0601001') are accepted; the slash is replaced
            by an underscore in the output file name so that the file lands
            directly inside ``save_path``.
        save_path: Directory the text file is written to; created if missing.
        timeout: Timeout in seconds handed to ``requests.get`` so that a
            stalled connection cannot block the notebook forever.

    Returns:
        None. Success and failure are reported with ``print``; download and
        processing errors are caught and not propagated.

    Raises:
        OSError: If ``save_path`` cannot be created.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_path, exist_ok=True)

    # Format the PDF URL
    url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'

    try:
        # Download the PDF; a timeout surfaces as a RequestException below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the PDF straight from memory, no temporary file needed.
        pdf_reader = PdfReader(io.BytesIO(response.content))

        # Extract text from all pages, one trailing newline per page. A page
        # without extractable text contributes an empty string.
        text = "".join(
            (page.extract_text() or "") + "\n" for page in pdf_reader.pages
        )

        # Apply text processing steps
        text = process_text(text)

        # Clean the text (remove special characters but keep whitespace)
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

        # A slash in the identifier would otherwise point into a subdirectory
        # that does not exist and make open() fail.
        safe_name = arxiv_id.replace('/', '_')
        txt_filename = os.path.join(save_path, f'{safe_name}.txt')
        with open(txt_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        print(f"Successfully downloaded, converted, and cleaned: {txt_filename}")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF: {e}")
    except Exception as e:
        # Deliberate best-effort: report and carry on so one bad paper does
        # not abort a batch run.
        print(f"Error processing file: {e}")

# Example run on a single paper: needs network access and writes
# 0910.2715.txt under the default save_path ('content').
arxiv_id = '0910.2715'
process_arxiv_paper(arxiv_id)

In [None]:
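# NOTE: superseded draft, kept commented out for reference. It appears to match the
# active cell above except that its final cleanup regex also strips whitespace.
# import requests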
# import os
# from PyPDF2 import PdfReader
# import io
# import re
# from pathlib import Path

# def process_text(text):
#     """Applies a series of cleaning steps to the text."""
#     keywords = ["REFERENCES", "ACKNOWLEDGEMENTS", "References", "Acknowledgements"]
    
#     text = find_and_remove_references(text, keywords)
#     text = remove_text_in_brackets(text)
#     text = remove_lines_starting_with_number_or_symbol(text)
#     text = remove_lines_with_one_word(text)
#     text = remove_empty_lines(text)
    
#     return text

# def find_and_remove_references(text, keywords):
#     """Finds and removes text after the first occurrence of any of the given keywords."""
#     earliest_position = float('inf')
#     for keyword in keywords:
#         position = text.find(keyword)
#         if position != -1:
#             earliest_position = min(position, earliest_position)
    
#     if earliest_position != float('inf'):
#         text = text[:earliest_position]
#     return text

# def remove_text_in_brackets(text):
#     """Removes text enclosed in parentheses or square brackets."""
#     pattern = r'\([^)]*\)|\[[^\]]*\]'
#     return re.sub(pattern, '', text)

# def remove_lines_starting_with_number_or_symbol(text):
#     """Removes lines starting with a number or symbol."""
#     pattern = r'^[\d\W].*$'
#     return re.sub(pattern, '', text, flags=re.MULTILINE)

# def remove_lines_with_one_word(text):
#     """Removes lines containing only one word."""
#     lines = text.split('\n')
#     pattern = r'^\s*\w+\s*$'
#     filtered_lines = [line for line in lines if not re.match(pattern, line)]
#     return '\n'.join(filtered_lines)

# def remove_empty_lines(text):
#     """Removes empty lines."""
#     lines = text.split('\n')
#     non_empty_lines = [line for line in lines if line.strip() != '']
#     return '\n'.join(non_empty_lines)

# def process_arxiv_paper(arxiv_id, save_path='content'):
#     # Create directory if it doesn't exist
#     if not os.path.exists(save_path):
#         os.makedirs(save_path)
    
#     # Format the PDF URL
#     url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'
    
#     try:
#         # Download PDF and convert to text
#         response = requests.get(url)
#         response.raise_for_status()
        
#         # Create a PDF reader object from the downloaded content
#         pdf_file = io.BytesIO(response.content)
#         pdf_reader = PdfReader(pdf_file)
        
#         # Extract text from all pages
#         text = ""
#         for page in pdf_reader.pages:
#             text += page.extract_text() + "\n"
        
#         # Apply text processing steps
#         text = process_text(text)
        
#         # Clean the text (remove special characters and whitespace)
#         cleaned_text = re.sub(r'[^a-zA-Z0-9]', '', text)
        
#         # Save as text file
#         txt_filename = os.path.join(save_path, f'{arxiv_id}.txt')
#         with open(txt_filename, 'w', encoding='utf-8') as f:
#             f.write(cleaned_text)
            
#         print(f"Successfully downloaded, converted, and cleaned: {txt_filename}")
        
#     except requests.exceptions.RequestException as e:
#         print(f"Error downloading PDF: {e}")
#     except Exception as e:
#         print(f"Error processing file: {e}")

# arxiv_id = '0910.2715'
# process_arxiv_paper(arxiv_id)

Successfully downloaded, converted, and cleaned: content\0910.2715.txt


In [None]:
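# NOTE: earlier draft, kept commented out for reference. It downloads an arXiv PDF
# and saves the raw extracted text without any cleaning.
# import requests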
# import os
# from PyPDF2 import PdfReader
# import io

# def pdf_to_text(arxiv_id, save_path='content'):
#     # Create directory if it doesn't exist
#     if not os.path.exists(save_path):
#         os.makedirs(save_path)
    
#     # Format the PDF URL
#     url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'
    
#     try:
#         # Download PDF
#         response = requests.get(url)
#         response.raise_for_status()
        
#         # Create a PDF reader object from the downloaded content
#         pdf_file = io.BytesIO(response.content)
#         pdf_reader = PdfReader(pdf_file)
        
#         # Extract text from all pages
#         text = ""
#         for page in pdf_reader.pages:
#             text += page.extract_text() + "\n"
        
#         # Save as text file
#         txt_filename = os.path.join(save_path, f'{arxiv_id}.txt')
#         with open(txt_filename, 'w', encoding='utf-8') as f:
#             f.write(text)
            
#         print(f"Successfully converted and saved to: {txt_filename}")
        
#     except requests.exceptions.RequestException as e:
#         print(f"Error downloading PDF: {e}")
#     except Exception as e:
#         print(f"Error processing PDF: {e}")


# arxiv_id = '2411.04177'
# pdf_to_text(arxiv_id)

In [None]:
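# NOTE: earlier draft, kept commented out for reference. It rewrites every .txt file
# under a directory in place, stripping all non-alphanumeric characters (whitespace included).
# import os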
# import re
# from pathlib import Path

# def clean_text_files(directory):
#     # Convert directory to Path object
#     dir_path = Path(directory)
    
#     # Walk through all files in directory and subdirectories
#     for file_path in dir_path.rglob('*.txt'):
#         try:
#             # Read the original content
#             with open(file_path, 'r', encoding='utf-8') as file:
#                 content = file.read()
            
#             # First remove special characters (keeping letters and numbers)
#             cleaned_content = re.sub(r'[^a-zA-Z0-9]', '', content)
            
#             # Write the cleaned content back to the file
#             with open(file_path, 'w', encoding='utf-8') as file:
#                 file.write(cleaned_content)
            
#             print(f"Cleaned: {file_path}")
            
#         except Exception as e:
#             print(f"Error processing {file_path}: {str(e)}")

# clean_text_files('./content/')