# ArXiv to Wiki Conversion
This notebook demonstrates downloading an arXiv paper, converting the HTML to wiki markup using Pandoc, cleaning the markup with an LLM, and preparing it for upload.

In [None]:
from marovi.modules.download import ArXivDownloader
from marovi.modules.parsing.pandoc import PandocParser
from marovi.modules.steps.marovi_api import CleanTextStep
from marovi.storage.document.paper_storage import PaperStorage
from marovi.pipelines.context import PipelineContext

# arXiv identifier of the paper this notebook processes; change it to run on another paper.
arxiv_id = '2301.00001'

# Storage handle shared by the cells below.
# NOTE(review): 'papers' is presumably the base directory for stored papers — confirm against PaperStorage.
storage = PaperStorage('papers')


In [None]:
# Download the paper identified by `arxiv_id`, using the shared `storage` handle.
downloader = ArXivDownloader(storage)
# The returned value is used below as a path-like directory (joined with `/`).
paper_dir = downloader.download_document(arxiv_id)
# Bare expression: show the download location as the cell output.
paper_dir


In [None]:
# Convert the downloaded HTML to wiki markup via PandocParser.
# Expects `<arxiv_id>.html` directly inside `paper_dir`.
html_path = paper_dir / f'{arxiv_id}.html'
html_text = html_path.read_text(encoding='utf-8')
parser = PandocParser(html_text)

# The wiki output goes in a 'wiki' subdirectory next to the HTML file.
wiki_path = paper_dir / 'wiki' / f'{arxiv_id}.wiki'
# Create the output directory up front: nothing shown here guarantees it exists
# after a fresh download, and writing into a missing directory would fail.
# `exist_ok=True` keeps the cell idempotent on re-runs.
wiki_path.parent.mkdir(parents=True, exist_ok=True)
parser.convert_html_to_wiki(output_file=str(wiki_path))

# Bare expression: show where the wiki file was written.
wiki_path


In [None]:
# Load the wiki markup produced by the conversion cell.
raw_wiki = wiki_path.read_text(encoding='utf-8')

# Cleaning step configured for wiki-formatted text.
cleaner = CleanTextStep(format_value='wiki')

# The step is called with a one-element list and a fresh PipelineContext;
# element 0 of the result is kept as the cleaned text for this document.
cleaned_batch = cleaner.run_with_retries([raw_wiki], PipelineContext())
cleaned = cleaned_batch[0]

# Preview only the first 500 characters to keep the cell output short.
print(cleaned[:500])
