In [172]:
# Importing the required libraries. 
import requests
import xml.etree.ElementTree as ET
import csv
import re
import pandas as pd

In [173]:
# Helper that strips LaTeX markup and boilerplate phrases from abstract text

def clean_latex(text):
    """Strip LaTeX markup and a few boilerplate phrases from ``text``.

    Steps, in order:

    1. Remove math: ``$$...$$``, ``$...$`` and ``\\begin{..}...\\end{..}``
       spans, including spans that are hard-wrapped across several lines.
    2. Remove reference-style commands together with their argument
       (``\\cite``/``\\citep``/..., ``\\ref``, ``\\eqref``, ``\\label``).
    3. Unwrap every other ``\\command{argument}`` so the argument text is
       kept (``\\textbf{robust}`` -> ``robust``).
    4. Remove stand-alone commands (``\\alpha``, ``\\item`` ...), unescape
       ``\\$ \\% \\& \\_ \\#`` and drop leftover grouping braces.
    5. Remove ``Figure N`` / ``Table N`` mentions and dataset phrases such as
       ``10 epochs`` or ``training set`` (case-insensitive).

    Whitespace is left untouched, so removed spans may leave double spaces.

    Args:
        text: The raw string to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned string; ``''`` when ``text`` is ``None`` or empty.
    """
    if not text:
        return ''

    # --- Math -------------------------------------------------------------
    # Display math goes first; otherwise "$$x$$" would be consumed as two
    # empty inline spans and the formula body would survive.
    # The (?<!\\) look-behinds keep an escaped "\$" (a literal dollar sign)
    # from being mistaken for a math delimiter. DOTALL lets a span continue
    # over the line breaks of a hard-wrapped abstract.
    text = re.sub(r'(?<!\\)\$\$.*?(?<!\\)\$\$', '', text, flags=re.DOTALL)
    text = re.sub(r'(?<!\\)\$.*?(?<!\\)\$', '', text, flags=re.DOTALL)
    text = re.sub(r'\\begin{.*?}.*?\\end{.*?}', '', text, flags=re.DOTALL)

    # --- Commands ---------------------------------------------------------
    # Citations and cross-references carry no readable text: drop them
    # entirely (optional star and [..] arguments included).
    text = re.sub(
        r'\\(?:cite[a-zA-Z]*|ref|eqref|label)\*?(?:\[[^\]]*\])*\{.*?\}',
        '',
        text,
    )
    # Any other command with an argument (\textbf{..}, \emph{..}, ...):
    # keep the argument, which is part of the sentence.
    text = re.sub(r'\\[a-zA-Z]+\{(.*?)\}', r'\1', text)
    # Stand-alone commands such as \alpha or \item.
    text = re.sub(r'\\[a-zA-Z]+\s*', '', text)
    # Escaped special characters become the plain character.
    text = re.sub(r'\\([$%&_#])', r'\1', text)

    # Remove curly braces used in LaTeX for grouping.
    text = text.replace('{', '').replace('}', '')

    # --- Boilerplate ------------------------------------------------------
    # Figure/table references (e.g., "Figure 1", "Table 2").
    text = re.sub(r'(Figure|Table) \d+', '', text)

    # Experiment-setup phrases that add no value to a summary.
    text = re.sub(
        r'\d+\s+samples|\d+\s+epochs|training\s+set|validation\s+set|test\s+set',
        '',
        text,
        flags=re.IGNORECASE,
    )

    return text

In [174]:
# Define the ArXiv API endpoint and query parameters
# Category taxonomy reference: https://arxiv.org/category_taxonomy

# Endpoint that receives the query.
url = "http://export.arxiv.org/api/query"

# Query-string arguments sent along with the request.
params = {
    "search_query": "(cat:cs.LG)",  # restrict the search to the cs.LG category
    "start": 0,                     # offset of the first result
    "max_results": 200,             # number of entries requested in one call
    "sortBy": "relevance",
    "sortOrder": "descending",
}

In [175]:
# Send request to ArXiv API.
# - timeout: without one, requests waits indefinitely on a stalled connection
#   and the cell never finishes.
# - raise_for_status(): fail here on an HTTP error instead of handing an error
#   body to the XML parser in the next cell.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()

In [176]:
# Parse the raw response body into an ElementTree root element.
# parse_papers() below walks this tree looking for Atom <entry> elements.

root = ET.fromstring(response.content)

# Function to extract useful information from each paper

def parse_papers(root):
    """Extract title, summary and link from every Atom entry under ``root``.

    Args:
        root: An ElementTree element whose direct children include Atom
            ``<entry>`` elements (namespace ``http://www.w3.org/2005/Atom``).

    Returns:
        A list with one dict per entry, in document order, with the keys
        ``'title'``, ``'summary'`` and ``'link'`` (taken from ``<id>``).
        A field whose element is missing or empty is returned as ``''``
        rather than ``None``, so downstream string operations never fail.
    """
    atom = {'atom': 'http://www.w3.org/2005/Atom'}

    def field(entry, tag):
        # findtext() yields the default for a missing element and '' for an
        # element without text, so the result is always a string.
        return entry.findtext(f'atom:{tag}', default='', namespaces=atom)

    return [
        {
            'title': field(entry, 'title'),
            'summary': field(entry, 'summary'),
            'link': field(entry, 'id'),
        }
        for entry in root.findall('atom:entry', atom)
    ]

In [177]:
# Phrases searched for in each summary; a paper is kept when at least one occurs.
keywords = [
    'model selection',
    'cross-validation',
    'hyperparameter tuning',
    'grid search',
    'Bayesian optimization',
    'train-test split',
    'performance metrics',
    'k-fold cross-validation',
    'leave-one-out cross-validation',
    'regularization',
    'L1 regularization',
    'L2 regularization',
    'AUC-ROC',
    'hyperparameter optimization',
    'early stopping',
    'overfitting prevention',
    'bias-variance tradeoff',
    'dropout',
    'weight decay',
]

In [178]:
# Function to filter papers by keywords

def filter_papers_by_keywords(papers, keywords):
    """Return the papers whose summary mentions at least one keyword.

    Matching is a case-insensitive substring test. Runs of whitespace
    (including line breaks) are collapsed to a single space on both sides
    first, so a multi-word keyword still matches when the summary is
    hard-wrapped in the middle of the phrase ("model\\nselection").

    Args:
        papers: Iterable of dicts; the ``'summary'`` value is searched.
            A missing or ``None`` summary is treated as empty text.
        keywords: Iterable of keyword strings.

    Returns:
        A new list holding the matching paper dicts (the same objects, not
        copies) in their original order.
    """
    def normalise(value):
        # Lower-case and squeeze all whitespace runs down to single spaces.
        return ' '.join((value or '').lower().split())

    # Normalise the keywords once instead of once per paper.
    needles = [normalise(keyword) for keyword in keywords]

    filtered_papers = []
    for paper in papers:
        haystack = normalise(paper.get('summary'))
        if any(needle in haystack for needle in needles):
            filtered_papers.append(paper)
    return filtered_papers

In [179]:
# Filtering
# Turn the parsed XML tree into a list of paper dicts (title / summary / link),
# then keep only the papers whose summary contains one of the keywords.

papers = parse_papers(root)
filtered_papers = filter_papers_by_keywords(papers, keywords)

In [180]:
# Store a LaTeX-free copy of each abstract next to the original text.
# The dicts are modified in place; the raw 'summary' value is left untouched.
for entry in filtered_papers:
    raw_abstract = entry['summary']
    entry['cleaned_summary'] = clean_latex(raw_abstract)

In [181]:
# Print every retained paper (numbered from 1) with its raw and cleaned summary.
# One print call per paper: the four fields are joined with newlines, and the
# trailing "\n" on the link line leaves a blank line between papers.
for idx, paper in enumerate(filtered_papers, start=1):
    print(
        f"Filtered Paper {idx}: {paper['title']}",
        f"Original Summary: {paper['summary']}",
        f"Cleaned Summary: {paper['cleaned_summary']}",
        f"Link: {paper['link']}\n",
        sep="\n",
    )

Filtered Paper 1: Efficient algorithms for decision tree cross-validation
Original Summary:   Cross-validation is a useful and generally applicable technique often
employed in machine learning, including decision tree induction. An important
disadvantage of straightforward implementation of the technique is its
computational overhead. In this paper we show that, for decision trees, the
computational overhead of cross-validation can be reduced significantly by
integrating the cross-validation with the normal decision tree induction
process. We discuss how existing decision tree algorithms can be adapted to
this aim, and provide an analysis of the speedups these adaptations may yield.
The analysis is supported by experimental results.

Cleaned Summary:   Cross-validation is a useful and generally applicable technique often
employed in machine learning, including decision tree induction. An important
disadvantage of straightforward implementation of the technique is its
computational over

In [184]:
def save_to_csv(papers, filename="filtered_papers (ArXiv).csv"):
    """Write the title, link and cleaned summary of each paper to a CSV file.

    The file gets the header ``title,link,summary``; the ``summary`` column
    holds each paper's ``'cleaned_summary'`` value. When ``papers`` is empty
    a message is printed and no file is created or modified.

    Args:
        papers: Sequence of dicts with the keys ``'title'``, ``'link'`` and
            ``'cleaned_summary'``.
        filename: Destination path; an existing file is overwritten.
    """
    # Nothing to write: report it and leave the filesystem alone.
    if not papers:
        print("No filtered papers to save.")
        return

    columns = ['title', 'link', 'summary']

    # Lazily map each paper onto the CSV columns; 'cleaned_summary' is
    # exported under the shorter column name 'summary'.
    records = (
        {
            'title': paper['title'],
            'link': paper['link'],
            'summary': paper['cleaned_summary'],
        }
        for paper in papers
    )

    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(records)

# Save the filtered papers with cleaned summaries to the default CSV path.
# Each paper must already carry a 'cleaned_summary' key at this point.
save_to_csv(filtered_papers)


#  Counting total number of papers in the CSV file.

def count_papers(filename="filtered_papers (ArXiv).csv"):
    """Return the number of data rows (header excluded) in a CSV file.

    Args:
        filename: Path of the CSV file to read with pandas.

    Returns:
        The row count of the loaded DataFrame as an ``int``.
    """
    row_count, _ = pd.read_csv(filename).shape
    return row_count

# Example usage: re-read the CSV written above and report its row count.
# NOTE(review): save_to_csv() writes nothing when the list is empty, so in that
# case this reads whatever file is already on disk (or fails if none exists).
total_papers = count_papers("filtered_papers (ArXiv).csv")
print(f"Total number of research papers in the CSV: {total_papers}")

Total number of research papers in the CSV: 17
