In [55]:
# We will be using firecrawl to index documentation sites and subpages.
# Install with pip install firecrawl
import os

from firecrawl import FirecrawlApp

# Read the API key from the environment so the secret never lives in the
# notebook source or its saved outputs. Export FIRECRAWL_API_KEY before
# starting the kernel.
# NOTE(review): a key was previously hard-coded in this cell; it should be
# treated as leaked and rotated.
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY")
if not firecrawl_api_key:
    raise RuntimeError(
        "FIRECRAWL_API_KEY is not set; export it before running this cell."
    )

app = FirecrawlApp(api_key=firecrawl_api_key)

# Crawl starting from the given URL and ask for markdown output.
# 'limit' is passed through to Firecrawl unchanged (presumably a cap on the
# number of pages crawled; confirm against the Firecrawl crawl options).
crawl_result = app.crawl_url(
    'https://link.springer.com/content/pdf/10.1007/s11427-023-2561-0.pdf',
    params={
        'limit': 10,
        'scrapeOptions': {
            'formats': ['markdown'],
        },
    },
)

In [56]:
# List of crawled sites

# https://www.nature.com/articles/s41586-020-03182-8
# https://link.springer.com/content/pdf/10.1007/s11427-023-2561-0.pdf
# https://en.wikipedia.org/wiki/Single-cell_transcriptomics
# https://en.wikipedia.org/wiki/Single-cell_sequencing

# https://hpcdocs.hpc.arizona.edu/
# https://satijalab.org/seurat/articles/
# https://bioconductor.org/packages/release/bioc/vignettes/methylKit/inst/doc/methylKit.html
# https://lashlock.github.io/compbio/R_presentation.html


In [None]:
# Inspecting the structure of crawl_result
import json
# ensure_ascii=False keeps non-ASCII characters from the crawled text readable
# instead of printing them as \uXXXX escapes; this matches the setting used
# when the processed entries are written to disk.
print(json.dumps(crawl_result, indent=2, ensure_ascii=False))

In [51]:
import os
import json
import markdown
from bs4 import BeautifulSoup
import re

def preprocess_markdown(markdown_content):
    """
    Turn a markdown string into whitespace-normalized plain text.

    The markdown is rendered to HTML, the HTML is parsed with BeautifulSoup
    (using the built-in "html.parser"), and the text nodes are joined with
    single spaces. Every run of whitespace is then collapsed to one space and
    the result is stripped at both ends.

    Args:
      markdown_content (str): The markdown-formatted string.

    Returns:
      str: Cleaned plain text.
    """
    rendered_html = markdown.markdown(markdown_content)
    # separator=" " keeps text from adjacent tags from being glued together.
    extracted = BeautifulSoup(rendered_html, "html.parser").get_text(separator=" ")
    return re.sub(r'\s+', ' ', extracted).strip()

# # Assume 'crawl_result' is loaded from your JSON source
# # For demonstration, let's say it's loaded from a file:
# with open("crawl_result_collection.json", "r", encoding="utf-8") as f:
#     crawl_result = json.load(f)

# Build the cleaned collection: one record per crawled page that was fetched
# successfully and actually carries markdown content.
processed_entries = []
for page in crawl_result.get("data", []):
    page_meta = page.get("metadata", {})
    page_markdown = page.get("markdown")
    # A missing statusCode is treated as a success (200); pages with any other
    # status, or with empty/missing markdown, are left out.
    if page_meta.get("statusCode", 200) == 200 and page_markdown:
        processed_entries.append({
            "processed_text": preprocess_markdown(page_markdown),
            "metadata": page_meta,
        })

In [None]:
# Preview the first processed entry (change the index to check other entries).
# The list can be empty when every crawled page was filtered out, so guard the
# lookup instead of failing with an IndexError.
if processed_entries:
    # ensure_ascii=False shows non-ASCII characters as-is, matching the file
    # format used when the entries are saved.
    print(json.dumps(processed_entries[0], indent=2, ensure_ascii=False))
else:
    print("No processed entries to display.")

In [53]:
import os
import json

# Output location, relative to the current working directory.
# (Alternative target used for other collections: os.path.join("..", "data", "KBs"))
target_dir = os.path.join("..", "data", "pdfs")
# UPDATE the file name for each new collection.
file_path = os.path.join(target_dir, "proc2txt_s11427-023-2561-0.json")

# Create the directory tree if it is not there yet; no error if it already exists.
os.makedirs(target_dir, exist_ok=True)

# Write processed_entries as indented UTF-8 JSON, keeping non-ASCII characters as-is.
with open(file_path, 'w', encoding='utf-8') as out_file:
    json.dump(processed_entries, out_file, ensure_ascii=False, indent=2)

print("Processed entries have been saved to:", file_path)


Processed entries have been saved to: ../data/pdfs/proc2txt_s11427-023-2561-0.json


In [1]:
############
import transformers
import flashinfer
# from transformers.models.mllama.image_processing_mllama import is_valid_list_of_images
