In [47]:
import sys
import os
import json

# Make the project package importable: it is expected in the `23127011`
# folder that sits next to the current working directory.
# Change the folder name below if the notebook is launched from elsewhere.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, '23127011'))
if sys.path.count(project_root) == 0:
    # Append (not prepend) so already-importable modules keep priority.
    sys.path.append(project_root)

print(f"Added to path: {project_root}")

from src.parser import find_root_tex_file, LatexFlattener, LatexStructureBuilder, LatexContentProcessor
from src.processing import ContentDeduplicator

Added to path: d:\Coding\School\Y3-K1\Intro2DS\DS - LAB 2\Milestone2_Project\23127011


In [48]:
!pip install bibtexparser texsoup



In [49]:
import os
import re
import bibtexparser
from bibtexparser.bparser import BibTexParser
from TexSoup import TexSoup

class ReferenceProcessor:
    r"""Collect the bibliography entries that one paper version actually cites.

    Entries are gathered from external ``.bib`` files (via bibtexparser),
    external ``.bbl`` files and embedded ``thebibliography`` environments
    (both via a ``\bibitem`` regex), then filtered down to the keys that are
    cited in the flattened LaTeX source.

    Attributes:
        paper_id: Identifier of the paper (stored; not otherwise used here).
        version: Version label, used in log messages.
        root_dir: Directory in which bibliography files are looked up.
        raw_refs: Mapping ``key -> reference record`` of everything parsed so
            far. It is a dict so that a key seen in several sources is kept once.
    """

    # Citation commands whose last argument is a comma-separated key list.
    CITE_COMMANDS = ('cite', 'citet', 'citep', 'nocite', 'citeauthor', 'citeyear')
    # Commands whose last argument names external bibliography files.
    BIB_COMMANDS = ('bibliography', 'addbibresource')

    # Helpers for the regex fallback (used only when TexSoup cannot parse).
    # Drops everything after an unescaped '%' on a line.
    _RE_LINE_COMMENT = re.compile(r'(?<!\\)%.*')
    _RE_EMBEDDED_BIB = re.compile(
        r'\\begin\{thebibliography\}.*?\\end\{thebibliography\}', re.DOTALL
    )

    def __init__(self, paper_id, version, root_dir):
        self.paper_id = paper_id
        self.version = version
        self.root_dir = root_dir

        # Dict instead of list: duplicates across sources collapse by key.
        self.raw_refs = {}

        # One \bibitem: optional [label], {key}, then the entry text up to the
        # next \bibitem, the end of the environment, or the end of the input.
        self.REGEX_BIBITEM = re.compile(
            r'\\bibitem\s*(?:\[(.*?)\])?\s*\{(.*?)\}\s*(.*?)(?=(?:\\bibitem)|(?:\\end\{thebibliography\})|\Z)', 
            re.DOTALL
        )

    def process_references(self, flat_content):
        r"""Extract the cited references of a flattened LaTeX document.

        The source is analysed with TexSoup. If TexSoup raises, the same
        information (cited keys, bibliography file names, embedded
        ``thebibliography`` blocks) is recovered with a regex scan instead of
        giving up, so a document TexSoup cannot parse still yields references.

        Args:
            flat_content (str): Flattened LaTeX source.

        Returns:
            tuple: ``(flat_content, final_refs)`` where ``flat_content`` is
            returned unchanged and ``final_refs`` is the list of reference
            records whose key is cited (all records if ``*`` is cited, as in
            ``\nocite{*}``).
        """
        print(f"   üîç Scanning references for {self.version} using Hybrid Parser...")

        # --- STEP 1: find cited keys, bibliography files and embedded blocks ---
        try:
            soup = TexSoup(flat_content)
        except Exception as e:
            print(f"      ‚ö†Ô∏è TexSoup parsing failed (syntax error?): {e}")
            used_keys, bib_names, embedded_blocks = self._scan_with_regex(flat_content)
        else:
            used_keys, bib_names, embedded_blocks = self._scan_with_texsoup(soup)

        # External files: prefer the .bib, fall back to a .bbl of the same name.
        processed_files = set()
        for fname in bib_names:
            if fname in processed_files:
                continue
            if self._try_parse_bib(fname) or self._try_parse_bbl(fname):
                processed_files.add(fname)

        # Embedded references are parsed last, so keys already loaded from
        # external files are not overwritten (see _parse_bibitem_content).
        for block in embedded_blocks:
            self._parse_bibitem_content(block, source_type="embedded")

        # --- STEP 2: keep only the references that are cited ---
        is_wildcard = '*' in used_keys
        final_refs = [
            ref_obj for key, ref_obj in self.raw_refs.items()
            if is_wildcard or key in used_keys
        ]
        skipped_count = len(self.raw_refs) - len(final_refs)

        print(f"      Found {len(self.raw_refs)} raw refs. Kept {len(final_refs)} unique refs (Skipped {skipped_count}).")

        return flat_content, final_refs

    @staticmethod
    def _split_list(text):
        """Split a comma-separated argument into stripped, non-empty items."""
        if not text:
            return []
        return [item.strip() for item in text.split(',') if item.strip()]

    @staticmethod
    def _command_regex(names):
        r"""Build a regex for ``\name[opt]...{arg}`` capturing ``arg``.

        The negative lookahead keeps ``\bibliography`` from matching
        ``\bibliographystyle`` and ``\cite`` from matching ``\citet``.
        """
        alternatives = '|'.join(re.escape(name) for name in names)
        return re.compile(
            r'\\(?:' + alternatives + r')\*?(?![A-Za-z])\s*(?:\[[^\]]*\]\s*)*\{([^{}]*)\}'
        )

    def _scan_with_texsoup(self, soup):
        """Return ``(used_keys, bib_names, embedded_blocks)`` from a TexSoup tree."""
        used_keys = set()
        for cmd in soup.find_all(list(self.CITE_COMMANDS)):
            # The last argument is taken as the key list.
            if cmd.args:
                used_keys.update(self._split_list(cmd.args[-1].string))

        bib_names = []
        for cmd in soup.find_all(list(self.BIB_COMMANDS)):
            if cmd.args:
                bib_names.extend(self._split_list(cmd.args[-1].string))

        # Nodes are turned back into text so \bibitem can be parsed by regex.
        embedded_blocks = [str(env) for env in soup.find_all('thebibliography')]
        return used_keys, bib_names, embedded_blocks

    def _scan_with_regex(self, flat_content):
        """Regex fallback for `_scan_with_texsoup`, working on raw text.

        Line comments are removed first so commented-out citations are ignored.
        """
        text = self._RE_LINE_COMMENT.sub('', flat_content)

        used_keys = set()
        for match in self._command_regex(self.CITE_COMMANDS).finditer(text):
            used_keys.update(self._split_list(match.group(1)))

        bib_names = []
        for match in self._command_regex(self.BIB_COMMANDS).finditer(text):
            bib_names.extend(self._split_list(match.group(1)))

        embedded_blocks = self._RE_EMBEDDED_BIB.findall(text)
        return used_keys, bib_names, embedded_blocks

    def _try_parse_bib(self, filename):
        """Load ``filename`` (``.bib`` appended if missing) with bibtexparser.

        Every entry with a non-empty ID is stored in ``raw_refs``, replacing
        any record already stored under that key.

        Returns:
            bool: True if the file existed and was parsed, False if it does
            not exist or parsing raised.
        """
        if not filename.lower().endswith('.bib'): filename += '.bib'
        path = os.path.join(self.root_dir, filename)

        if not os.path.exists(path): return False

        print(f"      üìñ Reading .bib file (Lib): {filename}")
        try:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                parser = BibTexParser(common_strings=True)
                parser.ignore_nonstandard_types = True 
                parser.homogenise_fields = False

                db = bibtexparser.load(f, parser=parser)

            count = 0
            for entry in db.entries:
                key = entry.get('ID', '').strip()
                if not key: continue

                self.raw_refs[key] = {
                    "key": key,
                    "raw_text": self._dict_to_bibtex_string(entry),
                    "type": f"bib_{entry.get('ENTRYTYPE', 'misc').lower()}",
                    "parsed_fields": entry,
                    "source": filename
                }
                count += 1

            print(f"      -> Parsed {count} entries from .bib")
            return True
        except Exception as e:
            print(f"      ‚ùå Error parsing .bib: {e}")
            return False

    def _try_parse_bbl(self, filename):
        r"""Parse the ``.bbl`` file sharing ``filename``'s base name.

        The file content is handed to `_parse_bibitem_content`.

        Returns:
            bool: True if the file existed and was read, False otherwise.
        """
        # Replace whatever extension was given with .bbl.
        base_name = os.path.splitext(filename)[0]
        path = os.path.join(self.root_dir, base_name + '.bbl')

        if not os.path.exists(path): return False

        print(f"      üìñ Reading .bbl file: {base_name}.bbl")
        try:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            count_before = len(self.raw_refs)
            self._parse_bibitem_content(content, source_type=f"{base_name}.bbl")
            print(f"      -> Parsed {len(self.raw_refs) - count_before} new entries from .bbl")
            return True
        except Exception as e:
            print(f"      ‚ùå Error parsing .bbl: {e}")
            return False

    def _parse_bibitem_content(self, text, source_type="bibitem"):
        r"""Parse every ``\bibitem`` in ``text`` into ``raw_refs``.

        A sentinel ``\bibitem`` is appended so the regex lookahead always finds
        a following item after the last real one; the sentinel is skipped.
        Keys already present in ``raw_refs`` are left untouched, so records
        parsed earlier (e.g. from a ``.bib`` file) win over ``\bibitem`` text.

        Args:
            text (str): ``.bbl`` content or a ``thebibliography`` block.
            source_type (str): Stored in each new record's ``source`` field.
        """
        text += "\n\\bibitem{SENTINEL_MARKER_FIX}"

        matches = self.REGEX_BIBITEM.findall(text)

        for label, key, content in matches:
            clean_key = key.strip()

            # Skip the artificial trailing item.
            if clean_key == "SENTINEL_MARKER_FIX": continue

            # Collapse all whitespace runs (including newlines) to one space.
            clean_content = re.sub(r'\s+', ' ', content).strip()

            if clean_key not in self.raw_refs:
                self.raw_refs[clean_key] = {
                    "key": clean_key,
                    "raw_text": clean_content,
                    "type": "bibitem",
                    "label": label.strip() if label else None,
                    "source": source_type
                }

    def _dict_to_bibtex_string(self, entry):
        """Rebuild a BibTeX entry string from a bibtexparser entry dict."""
        lines = [f"@{entry.get('ENTRYTYPE', 'misc')}{{{entry.get('ID', '')},"]
        for k, v in entry.items():
            # ENTRYTYPE and ID already form the header line.
            if k in ['ENTRYTYPE', 'ID']: continue
            lines.append(f"  {k} = {{{v}}},")
        lines.append("}")
        return "\n".join(lines)

In [50]:
import os
import re
import bibtexparser
from bibtexparser.bparser import BibTexParser
from TexSoup import TexSoup

class ReferenceParser:
    r"""Extract the cited bibliography entries of a flattened LaTeX source.

    Entries come from external ``.bib`` files (bibtexparser), external
    ``.bbl`` files and embedded ``thebibliography`` environments (both via
    a ``\bibitem`` regex). Everything parsed is kept in ``raw_refs``.
    """

    def __init__(self, root_dir):
        # Directory against which bibliography file names are resolved.
        self.root_dir = root_dir
        # key -> reference record; a dict so repeated keys collapse to one entry.
        self.raw_refs = {}

        # One \bibitem: optional [label], {key}, then the entry text up to the
        # next \bibitem, the end of the environment, or the end of the input.
        self.REGEX_BIBITEM_CONTENT = re.compile(
            r'\\bibitem\s*(?:\[(.*?)\])?\s*\{(.*?)\}\s*(.*?)(?=(?:\\bibitem)|(?:\\end\{thebibliography\})|\Z)', 
            re.DOTALL
        )

    def parse(self, flat_tex_content):
        r"""Parse a flattened ``.tex`` source.

        Args:
            flat_tex_content (str): Flattened LaTeX source.

        Returns:
            tuple: ``(used_keys, references)`` - the list of cited keys and
            the list of reference records whose key is cited (all records when
            ``*`` is among the cited keys). ``([], [])`` if TexSoup raises.
        """
        print("üöÄ Starting Reference Parser...")

        # --- STEP 1: build the TexSoup tree ---
        try:
            soup = TexSoup(flat_tex_content)
        except Exception as e:
            print(f"‚ö†Ô∏è TexSoup failed to parse main content (using raw fallback): {e}")
            return [], []

        # 1.1 Cited keys: the last argument of each citation command is split on commas.
        used_keys = set()
        for cite_node in soup.find_all(['cite', 'citet', 'citep', 'nocite']):
            if not cite_node.args:
                continue
            joined_keys = cite_node.args[-1].string
            if joined_keys:
                used_keys.update(part.strip() for part in joined_keys.split(','))

        print(f"‚ÑπÔ∏è Found {len(used_keys)} unique citations in text.")

        # --- STEP 2: reference sources ---
        # Cases 1 and 2: external files named by \bibliography / \addbibresource.
        # There is no separate .bbl step: _process_external_bib switches to the
        # .bbl itself when only that file exists.
        for bib_node in soup.find_all(['bibliography', 'addbibresource']):
            if not bib_node.args:
                continue
            joined_names = bib_node.args[-1].string
            if joined_names:
                for name in joined_names.split(','):
                    self._process_external_bib(name.strip())

        # Case 3: references written directly in the .tex file.
        for bib_env in soup.find_all('thebibliography'):
            print("‚ÑπÔ∏è Found embedded 'thebibliography' environment.")
            # The node is converted back to text and parsed with the regex.
            self._parse_bibitem_content(str(bib_env), source="embedded")

        # --- STEP 3: keep cited records only ---
        cite_everything = '*' in used_keys
        final_results = [
            record
            for ref_key, record in self.raw_refs.items()
            if ref_key in used_keys or cite_everything
        ]

        print(f"‚úÖ Finished. Captured {len(final_results)} references.")
        return list(used_keys), final_results

    def _process_external_bib(self, filename):
        """Load a ``.bib`` file with bibtexparser into ``raw_refs``.

        When ``filename`` lacks the ``.bib`` suffix, ``filename.bib`` is used
        if it exists; otherwise ``filename.bbl`` is processed instead if it
        exists; otherwise nothing happens. Parse errors are printed, not raised.
        """
        if not filename.endswith('.bib'):
            bib_candidate = filename + '.bib'
            bbl_candidate = filename + '.bbl'
            if os.path.exists(os.path.join(self.root_dir, bib_candidate)):
                filename = bib_candidate
            elif os.path.exists(os.path.join(self.root_dir, bbl_candidate)):
                # Only a .bbl is available: delegate to the .bbl reader.
                return self._process_external_bbl(bbl_candidate)
            else:
                return

        bib_path = os.path.join(self.root_dir, filename)
        print(f"   üìñ Parsing .bib file: {filename}")

        try:
            with open(bib_path, 'r', encoding='utf-8', errors='ignore') as handle:
                bib_parser = BibTexParser(common_strings=True)
                bib_parser.ignore_nonstandard_types = True
                bib_parser.homogenise_fields = False

                database = bibtexparser.load(handle, parser=bib_parser)

            for record in database.entries:
                entry_id = record.get('ID')
                if not entry_id:
                    continue
                # Later .bib entries overwrite earlier records with the same key.
                self.raw_refs[entry_id] = {
                    'key': entry_id,
                    'type': record.get('ENTRYTYPE'),
                    'raw_text': str(record),
                    'data': record,
                    'source_file': filename
                }
        except Exception as e:
            print(f"   ‚ùå Error parsing .bib: {e}")

    def _process_external_bbl(self, filename):
        r"""Read a ``.bbl`` file (if present) and parse its ``\bibitem`` entries."""
        bbl_path = os.path.join(self.root_dir, filename)
        print(f"   üìñ Parsing .bbl file: {filename}")

        if not os.path.exists(bbl_path):
            return
        with open(bbl_path, 'r', encoding='utf-8', errors='ignore') as handle:
            bbl_text = handle.read()
            self._parse_bibitem_content(bbl_text, source=filename)

    def _parse_bibitem_content(self, text_content, source="unknown"):
        r"""Store every ``\bibitem`` found in ``text_content`` in ``raw_refs``.

        A sentinel item is appended so the lookahead for a following
        ``\bibitem`` also succeeds after the last real item. Keys already in
        ``raw_refs`` are kept as they are.
        """
        sentinel_key = "__SENTINEL_END__"
        text_content += "\n\\bibitem{__SENTINEL_END__}"

        for found in self.REGEX_BIBITEM_CONTENT.finditer(text_content):
            label, raw_key, body = found.groups()
            ref_key = raw_key.strip()
            if ref_key == sentinel_key:
                continue

            # Collapse whitespace runs, including line breaks, to single spaces.
            flattened_body = re.sub(r'\s+', ' ', body).strip()

            # setdefault keeps an existing record for this key untouched.
            self.raw_refs.setdefault(ref_key, {
                'key': ref_key,
                'type': 'bibitem',
                'raw_text': flattened_body,
                'label': label.strip() if label else None,
                'source_file': source
            })

# --- USAGE ---
# parser = ReferenceParser(root_dir="./my_paper_folder")
#
# # Read the main .tex file (assuming it has already been flattened)
# with open("./my_paper_folder/main_flat.tex", "r", encoding="utf-8") as f:
#     flat_content = f.read()
#
# used_keys, refs = parser.parse(flat_content)
#
# for r in refs:
#     print(f"[{r['key']}] -> {r.get('raw_text')[:50]}...")

In [51]:
# Configuration: the raw data is expected in `data_raw`, a sibling of the
# current working directory.
DATA_RAW_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data_raw'))

print(f"Data raw path: {DATA_RAW_PATH}")

# Report whether the directory is present; nothing is created or raised here.
if os.path.exists(DATA_RAW_PATH):
    print("‚úÖ Data raw path found.")
else:
    print("‚ùå Warning: Data raw path does not exist. Please check the path.")

Data raw path: d:\Coding\School\Y3-K1\Intro2DS\DS - LAB 2\Milestone2_Project\data_raw
‚úÖ Data raw path found.


In [52]:
# Paper under test. Despite its plural name, `versions_to_process` is a single
# version-directory name (a string), built from the paper id plus "v2".
paper_base_id = '2403-00531'
versions_to_process = paper_base_id + 'v2'


In [53]:
# %%
# --- TEST RUNNER FOR ReferenceProcessor ---

# 1. Directory holding the TeX sources of the selected version
ver_path = os.path.join(DATA_RAW_PATH, paper_base_id, 'tex', versions_to_process)
print(f"üìÇ Testing directory: {ver_path}")

# 2. Prerequisite: flatten the sources first (ReferenceProcessor takes flat text)
root_file = find_root_tex_file(ver_path)

if root_file:
    print(f"‚úÖ Root file found: {os.path.basename(root_file)}")

    # Flatten, keeping the bibliography commands in the text
    flattener = LatexFlattener(
        root_file,
        paper_id=paper_base_id,
        version=versions_to_process,
        remove_references=False,
    )
    flat_result = flattener.flatten()
    flat_content = flat_result['content']
    print(f"üìÑ Flattened content size: {len(flat_content)} characters")
    # Debug check: is the \bibliography{apssamp} command still in the flat text?
    print('\\bibliography{apssamp}' in flat_content)

    # 3. Create and run the ReferenceProcessor
    print("\n--- START REFERENCE PROCESSING ---")
    print(ver_path)
    ref_processor = ReferenceProcessor(paper_base_id, versions_to_process, ver_path)

    # The call logs which .bib/.bbl files were read and how many refs were kept
    content_after_process, final_refs = ref_processor.process_references(flat_content)

    print("--- END REFERENCE PROCESSING ---\n")

    # 4. Inspect the result
    print(f"üìä Total Valid References: {len(final_refs)}")

    if not final_refs:
        print("‚ö†Ô∏è No references found. Check if the paper actually has citations or if regex needs adjustment.")
    else:
        print("\nüîé Sample Reference 1:")
        print(json.dumps(final_refs[0], indent=2, ensure_ascii=False))

        # Show the key and the record type of the first reference
        print(f"\nKey: {final_refs[0]['key']}")
        print(f"Source Type: {final_refs[0]['type']}")
else:
    print("‚ùå Error: Could not find root tex file.")

üìÇ Testing directory: d:\Coding\School\Y3-K1\Intro2DS\DS - LAB 2\Milestone2_Project\data_raw\2403-00531\tex\2403-00531v2
‚úÖ Root file found: apssamp.tex
üìù Kh·ªüi t·∫°o LatexFlattener cho Paper: 2403-00531, Version: 2403-00531v2
   Remove references: No
üìÑ Flattened content size: 72207 characters
True

--- START REFERENCE PROCESSING ---
d:\Coding\School\Y3-K1\Intro2DS\DS - LAB 2\Milestone2_Project\data_raw\2403-00531\tex\2403-00531v2
   üîç Scanning references for 2403-00531v2 using Hybrid Parser...
      üìñ Reading .bib file (Lib): apssamp.bib
      -> Parsed 67 entries from .bib
      Found 67 raw refs. Kept 67 unique refs (Skipped 0).
--- END REFERENCE PROCESSING ---

üìä Total Valid References: 67

üîé Sample Reference 1:
{
  "key": "Lelli:2016zqa",
  "raw_text": "@article{Lelli:2016zqa,\n  year = {2016},\n  pages = {157},\n  volume = {152},\n  journal = {Astron. J.},\n  doi = {10.3847/0004-6256/152/6/157},\n  primaryclass = {astro-ph.GA},\n  archiveprefix = {arXiv},\n 

In [54]:
# Persist the filtered references as pretty-printed UTF-8 JSON in the
# current working directory (the "-v2" suffix is fixed in the file name).
refs_json_text = json.dumps(final_refs, indent=2, ensure_ascii=False)
with open(f'references-{paper_base_id}-v2.json', 'w', encoding = 'utf-8') as f:
    f.write(refs_json_text)

In [55]:

# Show only the head of the flattened source: printing the whole document
# floods the notebook output and bloats the saved .ipynb file.
PREVIEW_CHARS = 2000  # raise this (or slice differently) to inspect more
print(flat_content[:PREVIEW_CHARS])





















\documentclass[
 reprint,










 amsmath,amssymb,
 aps,
pra,




floatfix,
]{revtex4-2}

\usepackage{graphicx}

\usepackage{csvsimple}
\usepackage{dcolumn}
\usepackage{array} 


\usepackage{bm}
\usepackage[dvipsnames]{xcolor}

\usepackage{subcaption}
\usepackage{multirow}
\usepackage{times}
\usepackage{placeins}
\usepackage{hyperref}
\usepackage{booktabs}
\bibliographystyle{apsrev4-1}

\begin{document}
\hbadness=99999

\preprint{APS/123-QED}

\title{Phenomenology of renormalization group improved gravity from the kinematics of SPARC galaxies.}


\author{Esha Bhatia}
\email{b.esha@iitg.ac.in}
\author{Sayan Chakrabarti}
\email{sayan.chakrabarti@iitg.ac.in}
\author{Sovan Chakraborty}
\email{sovan@iitg.ac.in}



\affiliation{Department of Physics, Indian Institute of Technology, Guwahati 781039, India}




\date{\today}
             

\begin{abstract}
Renormalization Group correction to General Relativity (RGGR) proposes a logarithmic running of the gravitational co

In [56]:
import hashlib
import re

class ReferenceDeduplicator:
    """Merge identical references across paper versions under canonical keys.

    Two references are considered identical when their text has the same
    fingerprint (see `_create_fingerprint`). Each distinct reference gets a
    canonical key ``ref_0``, ``ref_1``, ... in order of first appearance.

    Attributes:
        unique_refs_pool: ``{fingerprint: record}``; each record holds
            ``canonical_key``, ``raw_text`` and ``original_refs`` (the list of
            ``{'version', 'key'}`` pairs that were merged into it).
        canonical_keys: Canonical keys in creation order.
        version_maps: ``{version: {old_key: canonical_key}}`` used to rewrite
            citations in each version's text.
    """

    def __init__(self):
        self.unique_refs_pool = {} 
        self.canonical_keys = []
        self.version_maps = {}

    def _create_fingerprint(self, text):
        """Return an MD5 hex digest of ``text`` reduced to letters and digits.

        The text is lower-cased and everything that is not a letter or digit
        (punctuation, whitespace, underscores) is removed. Letters and digits
        of any script are kept: stripping non-ASCII characters would make
        distinct non-Latin references collide on whatever digits they share.
        For pure-ASCII input the result equals keeping only ``[a-z0-9]``.
        """
        clean_text = re.sub(r'[\W_]+', '', text.lower())
        return hashlib.md5(clean_text.encode('utf-8')).hexdigest()

    def add_references(self, version, refs_list):
        """Register the references of one version and build its key mapping.

        Args:
            version: Version label; any existing mapping for it is reset.
            refs_list: Iterable of records with at least ``key`` and
                ``raw_text`` entries.
        """
        print(f"   üß© Deduplicating refs for {version}...")
        self.version_maps[version] = {}

        for ref in refs_list:
            old_key = ref['key']
            raw_text = ref['raw_text']

            fp = self._create_fingerprint(raw_text)
            pooled = self.unique_refs_pool.get(fp)

            if pooled is None:
                # First time this reference is seen: mint the next canonical key.
                canonical_key = f"ref_{len(self.canonical_keys)}"
                pooled = {
                    'canonical_key': canonical_key,
                    'raw_text': raw_text,
                    'original_refs': []
                }
                self.unique_refs_pool[fp] = pooled
                self.canonical_keys.append(canonical_key)
            else:
                canonical_key = pooled['canonical_key']
                # Keep the longer of the two texts as the representative one.
                if len(raw_text) > len(pooled['raw_text']):
                    pooled['raw_text'] = raw_text

            # Record provenance so a canonical key can be traced back.
            pooled['original_refs'].append({'version': version, 'key': old_key})

            # Identity mappings are skipped; they would be no-op replacements.
            if old_key != canonical_key:
                self.version_maps[version][old_key] = canonical_key

    def get_replacements(self, version):
        """Return the ``{old_key: canonical_key}`` map of ``version`` (or ``{}``)."""
        return self.version_maps.get(version, {})

    def export_bib_string(self):
        """Render the pool as BibTeX text, one ``@misc`` entry per reference.

        Entries appear in pool insertion order, and each stores the full
        reference text in a single ``text`` field.
        """
        entries = [
            f"@misc{{{item['canonical_key']},\n  text = {{{item['raw_text']}}}\n}}\n\n"
            for item in self.unique_refs_pool.values()
        ]
        return "".join(entries)

def replace_citations_in_text(text, replacement_map):
    r"""Rewrite citation keys inside ``\cite``-style commands.

    Only keys that appear inside the braces of a citation command are
    replaced; the same word elsewhere in the text is left alone. The command
    name (``\cite``, ``\citep``, ``\nocite``, starred forms, ...) and any
    optional ``[...]`` arguments are preserved as written.

    Args:
        text (str): LaTeX source.
        replacement_map (dict): ``{old_key: new_key}``. Keys not in the map
            are kept unchanged.

    Returns:
        str: The text with keys replaced. Keys within one command are
        re-joined with ``", "``. ``text`` is returned untouched when the map
        is empty.
    """
    if not replacement_map:
        return text

    def replace_match(match):
        # group 1: command up to and including "{", group 2: key list, group 3: "}"
        prefix, inner, closing = match.groups()
        new_keys = [
            replacement_map.get(key, key)
            for key in (part.strip() for part in inner.split(','))
        ]
        # Re-emit the original prefix so the command variant and its optional
        # arguments survive the substitution.
        return f"{prefix}{', '.join(new_keys)}{closing}"

    # \cite..., \nocite, optional "*", any number of [optional] arguments, then {keys}
    pattern = re.compile(
        r'(\\(?:no)?cite[a-z]*\*?\s*(?:\[[^\]]*\]\s*)*\{)([^}]+)(\})',
        re.IGNORECASE,
    )

    return pattern.sub(replace_match, text)

In [57]:
# 1. Setup
paper_base_id = '2403-00531'
versions = ['2403-00531v1', '2403-00531v2'] # List c√°c version
ref_deduplicator = ReferenceDeduplicator()
content_deduplicator = ContentDeduplicator()

# L∆∞u tr·ªØ n·ªôi dung ph·∫≥ng t·∫°m th·ªùi ƒë·ªÉ parse sau khi dedup ref
temp_flat_contents = {} 

print("üöÄ STEP 1: GATHERING & REFERENCE EXTRACTION")
for ver in versions:
    ver_path = os.path.join(DATA_RAW_PATH, paper_base_id, 'tex', ver)
    
    # A. Flatten
    root_file = find_root_tex_file(ver_path)
    if not root_file: continue
    
    flattener = LatexFlattener(root_file, paper_id=paper_base_id, version=ver,remove_references=False)
    flat_res = flattener.flatten()
    flat_text = flat_res['content']
    
    # B. Reference Extract (D√πng class m·ªõi b·∫°n ƒë√£ test)
    ref_proc = ReferenceProcessor(paper_base_id, ver, ver_path)
    flat_text, refs = ref_proc.process_references(flat_text)
    
    # C. Add to Dedup
    ref_deduplicator.add_references(ver, refs)
    
    # L∆∞u l·∫°i text g·ªëc ƒë·ªÉ t√≠ n·ªØa s·ª≠a
    temp_flat_contents[ver] = flat_text

print("\nüöÄ STEP 2: TEXT REPLACEMENT & PARSING")
for ver in versions:
    if ver not in temp_flat_contents: continue
    
    # A. L·∫•y map thay th·∫ø (Old -> New)
    replacements = ref_deduplicator.get_replacements(ver)
    raw_text = temp_flat_contents[ver]
    
    # B. S·ª≠a Key trong Text
    # H√†m n√†y s·∫Ω ƒë·ªïi \cite{hinton06} th√†nh \cite{ref_0}
    updated_text = replace_citations_in_text(raw_text, replacements)
    
    print(f"   üìù Parsed {ver} with {len(replacements)} citation updates.")
    
    # C. Parse Structure (D√πng text ƒê√É S·ª¨A)
    builder = LatexStructureBuilder(updated_text, paper_base_id, ver)
    root_node = builder.build_coarse_tree()
    
    # D. Clean Content
    processor = LatexContentProcessor(paper_base_id, ver)
    processor.process_tree(root_node)
    
    # E. Content Dedup
    content_deduplicator.process_version(ver, root_node)

print("\nüöÄ STEP 3: EXPORT")

# 1. Export refs.bib
bib_str = ref_deduplicator.export_bib_string()
with open('refs.bib', 'w', encoding='utf-8') as f:
    f.write(bib_str)
print("‚úÖ Exported refs.bib")

# 2. Export hierarchy.json
final_hier = content_deduplicator.get_final_json()
with open('hierarchy.json', 'w', encoding='utf-8') as f:
    json.dump(final_hier, f, indent=2, ensure_ascii=False)
print("‚úÖ Exported hierarchy.json")

üöÄ STEP 1: GATHERING & REFERENCE EXTRACTION
üìù Kh·ªüi t·∫°o LatexFlattener cho Paper: 2403-00531, Version: 2403-00531v1
   Remove references: No
   üîç Scanning references for 2403-00531v1 using Hybrid Parser...
      üìñ Reading .bib file (Lib): ref.bib
      -> Parsed 54 entries from .bib
      Found 54 raw refs. Kept 49 unique refs (Skipped 5).
   üß© Deduplicating refs for 2403-00531v1...
üìù Kh·ªüi t·∫°o LatexFlattener cho Paper: 2403-00531, Version: 2403-00531v2
   Remove references: No
   üîç Scanning references for 2403-00531v2 using Hybrid Parser...
      üìñ Reading .bib file (Lib): apssamp.bib
      -> Parsed 67 entries from .bib
      Found 67 raw refs. Kept 67 unique refs (Skipped 0).
   üß© Deduplicating refs for 2403-00531v2...

üöÄ STEP 2: TEXT REPLACEMENT & PARSING
   üìù Parsed 2403-00531v1 with 49 citation updates.
üîç X·ª≠ l√Ω Preamble ƒë·ªÉ tr√≠ch xu·∫•t Title, Authors, Abstract...
üîÑ Processing hierarchy for version: 1 (from 2403-00531v1)
   üìù P