In [11]:
import ast
import fnmatch
import io
import operator
import re

import ads
import nltk
import pandas as pd
import requests
from nltk import ngrams, word_tokenize, bigrams, trigrams
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize
from PyPDF2 import PdfReader

from TextAnalysis import stopword_loader, count_words, topwords, topbigrams, toptrigrams

In [12]:
# Pipeline here: ADS_Search -> Download Clean Papers -> Summarize -> Concat Together -> N-Gram -> Output CSV

# Download Papers

In [13]:
import requests
import io
from PyPDF2 import PdfReader
import re

def clean_text(text):
    """Clean and process the extracted text.

    Processing steps, in order:
      1. Cut the text at the earliest occurrence of a references or
         acknowledgements keyword (both the "Acknowledgements" and the
         "Acknowledgments" spellings are recognised).
      2. Remove every ``(...)`` and ``[...]`` span.
      3. Drop blank lines, lines made of a single word, and lines whose
         first character is a digit or a non-word character.
      4. Join the surviving lines with single spaces.

    Args:
        text: Raw text extracted from a PDF. Empty or ``None`` yields "".

    Returns:
        The cleaned text as one space-joined string.
    """
    if not text:
        return ""

    # Remove references and acknowledgements sections: everything from the
    # earliest keyword hit onward is discarded. This is a plain substring
    # search, so a keyword appearing in the body text also truncates there.
    keywords = (
        "REFERENCES", "References",
        "ACKNOWLEDGEMENTS", "Acknowledgements",
        "ACKNOWLEDGMENTS", "Acknowledgments",
    )
    hits = [pos for pos in map(text.find, keywords) if pos != -1]
    if hits:
        text = text[:min(hits)]

    # Remove bracketed content such as "(see Fig. 1)" or "[12]".
    text = re.sub(r'\([^)]*\)|\[[^\]]*\]', '', text)

    # Process text line by line.
    cleaned_lines = []
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue  # empty line
        if re.match(r'^\s*\w+\s*$', line):
            continue  # a single word on its own line
        if re.match(r'^[\d\W].*$', line):
            continue  # starts with a number or symbol
        cleaned_lines.append(line)

    return ' '.join(cleaned_lines)

def get_arxiv_text(arxiv_id, timeout=30):
    """Get cleaned text from an arXiv PDF.

    Args:
        arxiv_id: Identifier such as ``'arXiv:0910.2715'`` or ``'0910.2715'``.
            Anything up to and including a leading ``'arXiv:'`` is dropped
            before the download URL is built.
        timeout: Seconds to wait for the arXiv server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The text of all pages passed through ``clean_text``, or "" when the
        download or the PDF parsing fails (the error is printed).
    """
    arxiv_id = arxiv_id.split('arXiv:')[-1].strip()
    url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'
    try:
        # No stream=True: the whole body is read through .content anyway, and
        # a streamed response that fails raise_for_status() would keep its
        # connection open because its body is never consumed.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        pdf = PdfReader(io.BytesIO(response.content))
        # Guard against extract_text() yielding None for a page without a
        # text layer, so one such page does not discard the whole paper.
        text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
        return clean_text(text)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF: {e}")
        return ""
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return ""

In [14]:
def get_summary(arxiv_id):
    """Return the "summary" of a paper: the cleaned text from get_arxiv_text."""
    paper_text = get_arxiv_text(arxiv_id)
    return paper_text

In [15]:
def _parse_identifier_list(raw_identifiers):
    """Parse one 'Identifier' cell into a list of identifiers.

    Raises ValueError, SyntaxError or TypeError when the cell is not a
    Python literal holding a string or a list/tuple/set.
    """
    if not isinstance(raw_identifiers, str):
        # Empty CSV cells are read by pandas as NaN (a float).
        raise TypeError(
            f"expected a string, got {type(raw_identifiers).__name__}"
        )
    # NOTE(security): this used to be eval(), which executes arbitrary code
    # found in the CSV. literal_eval only accepts Python literals.
    parsed = ast.literal_eval(raw_identifiers)
    if isinstance(parsed, str):
        return [parsed]
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    raise ValueError(f"expected a list of identifiers, got {type(parsed).__name__}")


def process_csv(csv_filepath, directorypath):
    """Download, combine and n-gram the papers listed in each CSV row.

    For every row, the 'Identifier' cell is parsed as a Python list literal,
    each entry is passed to ``get_summary``, and the non-empty results are
    joined with spaces. When the combined text is non-empty, ``topwords``,
    ``topbigrams`` and ``toptrigrams`` are called on it.

    Args:
        csv_filepath: Path of a CSV file with an 'Identifier' column.
        directorypath: Forwarded unchanged as the second argument of
            ``topwords`` / ``topbigrams`` / ``toptrigrams``.

    Returns:
        The DataFrame read from ``csv_filepath`` with four added columns:
        'summaries', 'topwords', 'topbigrams' and 'toptrigrams'. A row gets
        ``None`` in all four when its identifier cell cannot be parsed or no
        text could be retrieved.

    Note:
        Only identifier parsing and per-paper retrieval errors are caught
        (and printed). Errors raised by the n-gram helpers propagate.
    """
    df = pd.read_csv(csv_filepath)
    result_columns = ('summaries', 'topwords', 'topbigrams', 'toptrigrams')
    collected = {column: [] for column in result_columns}

    for index, raw_identifiers in df['Identifier'].items():
        # Every result defaults to None; filled in only on success.
        row_result = dict.fromkeys(result_columns)

        try:
            arxiv_ids = _parse_identifier_list(raw_identifiers)
        except (ValueError, SyntaxError, TypeError) as e:
            print(f"Error evaluating identifier string in row {index}: {e}")
            arxiv_ids = []

        all_summaries = []
        for arxiv_id in arxiv_ids:
            try:
                summary = get_summary(arxiv_id)
            except Exception as e:
                # Best effort: one bad identifier must not drop the row.
                print(f"Error processing {arxiv_id}: {e}")
                continue
            if summary:
                all_summaries.append(summary)

        combined_summary = ' '.join(all_summaries)
        if combined_summary:
            # Use functions from the TextAnalysis module
            row_result['topwords'] = topwords(combined_summary, directorypath)
            row_result['topbigrams'] = topbigrams(combined_summary, directorypath)
            row_result['toptrigrams'] = toptrigrams(combined_summary, directorypath)
            row_result['summaries'] = combined_summary

        for column in result_columns:
            collected[column].append(row_result[column])

    # Build each column once as object dtype so cells can hold lists or None.
    for column in result_columns:
        df[column] = pd.Series(collected[column], index=df.index, dtype=object)

    return df

In [21]:
# Inputs for this run: the path handed to process_csv as `directorypath`
# and the CSV file holding the 'Identifier' column.
directorypath = 'stopwords.txt'
csv_file = 'small_identifier_sample.csv'

# Download the NLTK 'punkt' and 'wordnet' packages before running the pipeline.
for nltk_package in ('punkt', 'wordnet'):
    nltk.download(nltk_package)

# Run the pipeline, preview the first rows, then save the full table.
processed_df = process_csv(csv_file, directorypath)
print(processed_df.head())
processed_df.to_csv('combined_output.csv', index=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Error downloading PDF: 404 Client Error: Not Found for url: https://arxiv.org/pdf/10.1088/0004-637X/707/1/103.pdf
Error downloading PDF: 404 Client Error: Not Found for url: http://arxiv.org/pdf/2009ApJ...707..103E
Error downloading PDF: 404 Client Error: Not Found for url: http://arxiv.org/pdf/10.48550/arXiv.0910.2715
                                          Identifier  ...                                        toptrigrams
0  ['arXiv:0910.2715', '10.1088/0004-637X/707/1/1...  ...  [((density, pro, le), 7), ((envelope, infall, ...

[1 rows x 5 columns]


In [17]:
# import requests
# import os
# from PyPDF2 import PdfReader
# import io
# import re
# from pathlib import Path

# def process_text(text):
#     """Applies a series of cleaning steps to the text."""
#     keywords = ["REFERENCES", "ACKNOWLEDGEMENTS", "References", "Acknowledgements"]
    
#     text = find_and_remove_references(text, keywords)
#     text = remove_text_in_brackets(text)
#     text = remove_lines_starting_with_number_or_symbol(text)
#     text = remove_lines_with_one_word(text)
#     text = remove_empty_lines(text)
    
#     return text

# def find_and_remove_references(text, keywords):
#     """Finds and removes text after the first occurrence of any of the given keywords."""
#     earliest_position = float('inf')
#     for keyword in keywords:
#         position = text.find(keyword)
#         if position != -1:
#             earliest_position = min(position, earliest_position)
    
#     if earliest_position != float('inf'):
#         text = text[:earliest_position]
#     return text

# def remove_text_in_brackets(text):
#     """Removes text enclosed in parentheses or square brackets."""
#     pattern = r'\([^)]*\)|\[[^\]]*\]'
#     return re.sub(pattern, '', text)

# def remove_lines_starting_with_number_or_symbol(text):
#     """Removes lines starting with a number or symbol."""
#     pattern = r'^[\d\W].*$'
#     return re.sub(pattern, '', text, flags=re.MULTILINE)

# def remove_lines_with_one_word(text):
#     """Removes lines containing only one word."""
#     lines = text.split('\n')
#     pattern = r'^\s*\w+\s*$'
#     filtered_lines = [line for line in lines if not re.match(pattern, line)]
#     return '\n'.join(filtered_lines)

# def remove_empty_lines(text):
#     """Removes empty lines."""
#     lines = text.split('\n')
#     non_empty_lines = [line for line in lines if line.strip() != '']
#     return '\n'.join(non_empty_lines)

# def process_arxiv_paper(arxiv_id, save_path='content'):
#     # Create directory if it doesn't exist
#     if not os.path.exists(save_path):
#         os.makedirs(save_path)
    
#     # Format the PDF URL
#     url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'
    
#     try:
#         # Download PDF and convert to text
#         response = requests.get(url)
#         response.raise_for_status()
        
#         # Create a PDF reader object from the downloaded content
#         pdf_file = io.BytesIO(response.content)
#         pdf_reader = PdfReader(pdf_file)
        
#         # Extract text from all pages
#         text = ""
#         for page in pdf_reader.pages:
#             text += page.extract_text() + "\n"
        
#         # Apply text processing steps
#         text = process_text(text)
        
#         # Clean the text (remove special characters and whitespace)
#         cleaned_text = re.sub(r'[^a-zA-Z0-9]', '', text)
        
#         # Save as text file
#         txt_filename = os.path.join(save_path, f'{arxiv_id}.txt')
#         with open(txt_filename, 'w', encoding='utf-8') as f:
#             f.write(cleaned_text)
            
#         print(f"Successfully downloaded, converted, and cleaned: {txt_filename}")
        
#     except requests.exceptions.RequestException as e:
#         print(f"Error downloading PDF: {e}")
#     except Exception as e:
#         print(f"Error processing file: {e}")

# arxiv_id = '0910.2715'
# process_arxiv_paper(arxiv_id)

In [18]:
# import requests
# import os
# from PyPDF2 import PdfReader
# import io

# def pdf_to_text(arxiv_id, save_path='content'):
#     # Create directory if it doesn't exist
#     if not os.path.exists(save_path):
#         os.makedirs(save_path)
    
#     # Format the PDF URL
#     url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'
    
#     try:
#         # Download PDF
#         response = requests.get(url)
#         response.raise_for_status()
        
#         # Create a PDF reader object from the downloaded content
#         pdf_file = io.BytesIO(response.content)
#         pdf_reader = PdfReader(pdf_file)
        
#         # Extract text from all pages
#         text = ""
#         for page in pdf_reader.pages:
#             text += page.extract_text() + "\n"
        
#         # Save as text file
#         txt_filename = os.path.join(save_path, f'{arxiv_id}.txt')
#         with open(txt_filename, 'w', encoding='utf-8') as f:
#             f.write(text)
            
#         print(f"Successfully converted and saved to: {txt_filename}")
        
#     except requests.exceptions.RequestException as e:
#         print(f"Error downloading PDF: {e}")
#     except Exception as e:
#         print(f"Error processing PDF: {e}")


# arxiv_id = '2411.04177'
# pdf_to_text(arxiv_id)

In [19]:
# import os
# import re
# from pathlib import Path

# def clean_text_files(directory):
#     # Convert directory to Path object
#     dir_path = Path(directory)
    
#     # Walk through all files in directory and subdirectories
#     for file_path in dir_path.rglob('*.txt'):
#         try:
#             # Read the original content
#             with open(file_path, 'r', encoding='utf-8') as file:
#                 content = file.read()
            
#             # First remove special characters (keeping letters and numbers)
#             cleaned_content = re.sub(r'[^a-zA-Z0-9]', '', content)
            
#             # Write the cleaned content back to the file
#             with open(file_path, 'w', encoding='utf-8') as file:
#                 file.write(cleaned_content)
            
#             print(f"Cleaned: {file_path}")
            
#         except Exception as e:
#             print(f"Error processing {file_path}: {str(e)}")

# clean_text_files('./content/')

In [20]:
from benchmark import benchmark

# Call the project's benchmark helper on a pair of sample sentences. The bare
# call stays the last expression so the notebook displays its return value.
sample_sentences = ('the cat is on the mat', 'the feline pet sat on the rug')
benchmark(*sample_sentences)

0.8099421958128612