In [41]:
import re

def _split_top_level_args(code, start):
    """Split a parenthesised argument list into its top-level pieces.

    ``start`` is the index of the first character after the opening ``(``.
    Nested parentheses (for example ``dereferenceable(8)`` or a
    function-pointer type) neither terminate the list nor split it.

    Returns the raw argument strings, or ``None`` when the matching ``)``
    is never found.
    """
    depth = 1
    pieces = []
    piece_start = start
    for idx in range(start, len(code)):
        ch = code[idx]
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth == 0:
                pieces.append(code[piece_start:idx])
                return pieces
        elif ch == ',' and depth == 1:
            # Only commas directly inside the outer parentheses separate arguments.
            pieces.append(code[piece_start:idx])
            piece_start = idx + 1
    return None


def extract_function_signatures(code):
    """Build a ``declare`` line for every ``define`` found in LLVM IR text.

    Args:
        code: Text of an LLVM IR module.

    Returns:
        Dict mapping function name to ``"declare <prefix> @<name>(<args>)"``,
        where ``<prefix>`` is everything between ``define`` and ``@<name>``
        and ``<args>`` is the argument list with a trailing ``%name`` token
        removed from each argument. Definitions whose argument list is never
        closed are skipped.
    """
    signatures = {}
    # Match only up to the opening parenthesis; the argument list itself is
    # scanned with nesting awareness because a plain ``[^)]*`` stops at the
    # first ``)`` and would cut off arguments such as ``dereferenceable(8)``.
    for match in re.finditer(r"define\s+([^{]+?)\s+@(\w+)\(", code):
        ret_type, func_name = match.groups()
        raw_args = _split_top_level_args(code, match.end())
        if raw_args is None:
            continue

        arg_list = []
        for arg in raw_args:
            parts = arg.strip().split()
            if not parts:
                continue
            # Drop the variable name (e.g. %0); declarations only need types/attributes.
            if parts[-1].startswith('%'):
                parts = parts[:-1]
            arg_list.append(' '.join(parts))
        cleaned_args = ', '.join(arg_list)

        signatures[func_name] = f"declare {ret_type} @{func_name}({cleaned_args})"
    return signatures


In [42]:
import os
import re
import subprocess
from collections import defaultdict

def _find_body_end(code, open_idx):
    """Return the index of the ``}`` matching the ``{`` at ``open_idx``.

    Braces inside double-quoted strings and inside ``;`` line comments are
    ignored. Returns ``None`` when no matching brace exists.
    """
    depth = 0
    idx = open_idx
    length = len(code)
    while idx < length:
        ch = code[idx]
        if ch == '"':
            # Jump to the closing quote so braces inside the string are not counted.
            closing = code.find('"', idx + 1)
            if closing == -1:
                return None
            idx = closing
        elif ch == ';':
            # Skip the remainder of a line comment.
            newline = code.find('\n', idx)
            if newline == -1:
                return None
            idx = newline
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return idx
        idx += 1
    return None


def extract_header_and_functions(code):
    """Split LLVM IR text into its leading header and its function definitions.

    Args:
        code: Text of an LLVM IR module.

    Returns:
        ``(header, function_blocks)``. ``header`` is the stripped text that
        precedes the first line starting with ``define``, or ``""`` when no
        such line exists. ``function_blocks`` lists each definition from the
        ``define`` keyword through the brace that closes its body.
    """
    header_match = re.search(r"^(.*?)(?=^define\s)", code, re.DOTALL | re.MULTILINE)
    header = header_match.group(1).strip() if header_match else ""

    # Bodies may contain nested braces (e.g. struct types such as
    # ``{ i32, i32 }``), so the closing brace is found by balancing rather
    # than by stopping at the first ``}``.
    body_opener = re.compile(r"define\s+[^{]+\{")
    function_blocks = []
    pos = 0
    while True:
        opener = body_opener.search(code, pos)
        if opener is None:
            break
        end = _find_body_end(code, opener.end() - 1)
        if end is None:
            # Unbalanced input: nothing further can be delimited reliably.
            break
        function_blocks.append(code[opener.start():end + 1])
        pos = end + 1
    return header, function_blocks

def get_function_name(func_block):
    """Return the first ``@name`` directly followed by ``(`` in ``func_block``.

    Returns ``None`` when the text contains no such occurrence.
    """
    found = re.search(r"@([\w\d_]+)\(", func_block)
    if found is None:
        return None
    return found.group(1)

def get_called_functions(func_block):
    """Collect the names captured from ``call ... @name(`` occurrences.

    The pattern is greedy within a line, so for each matching line the last
    ``@name(`` on that line is the one captured. Returns a set of names.
    """
    callee_pattern = re.compile(r"call.*@([\w\d_]+)\(")
    return {hit.group(1) for hit in callee_pattern.finditer(func_block)}

def partition_and_fix_declarations(input_ll_path, output_dir):
    """Write each function of an LLVM IR module to its own ``.ll`` file.

    Every output file contains the module header, a ``declare`` line for
    each *other* function of the same module that the function calls, and
    the function definition itself.

    Args:
        input_ll_path: Path of the LLVM IR file to split.
        output_dir: Directory for the per-function files; created if missing.

    Returns:
        List of the written file paths, in the order the functions were found.

    Note:
        Callees that are not defined in the input module get no declaration
        from this function.
        NOTE(review): the header is only the text before the first
        ``define``, so declarations or attribute groups located after the
        definitions are presumably not carried over - confirm against real
        compiler output.
    """
    os.makedirs(output_dir, exist_ok=True)
    with open(input_ll_path, "r") as f:
        code = f.read()

    header, functions = extract_header_and_functions(code)

    # Signatures derived from the actual definitions, keyed by function name.
    func_signatures = extract_function_signatures(code)

    defined_funcs = {}
    calls_in_func = defaultdict(set)

    for func in functions:
        name = get_function_name(func)
        if name:
            defined_funcs[name] = func
            calls_in_func[name] = get_called_functions(func)

    output_paths = []

    for func_name, func_body in defined_funcs.items():
        output_path = os.path.join(output_dir, f"{func_name}.ll")

        declared = []
        # Sorted so the written file is identical from run to run; plain set
        # iteration order for strings can change between interpreter runs.
        for called in sorted(calls_in_func[func_name]):
            # Skip callees outside this module and direct recursion.
            if called not in defined_funcs or called == func_name:
                continue
            decl = func_signatures.get(called)
            if decl:
                declared.append(decl)

        with open(output_path, "w") as f:
            f.write(header + "\n\n")
            for d in declared:
                f.write(d + "\n")
            f.write("\n" + func_body + "\n")

        output_paths.append(output_path)
        print(f"[✔] Wrote {output_path} with {len(declared)} declarations.")

    return output_paths



In [43]:
# Split the input module into one .ll file per function inside output_dir.
input_ll = "demo.ll"
output_dir = "out_parts"
# part_files receives the list of per-function file paths that were written.
part_files = partition_and_fix_declarations(input_ll, output_dir)


[✔] Wrote out_parts/square.ll with 0 declarations.
[✔] Wrote out_parts/increment.ll with 0 declarations.
[✔] Wrote out_parts/main.ll with 2 declarations.


In [44]:
import re

def normalize_ll_code(code: str) -> str:
    """
    Produce a canonical text form of LLVM IR, used for content hashing.

    - Lines that start with a comment (which covers ``; ModuleID``), the
      ``source_filename`` line and the ``target datalayout`` line are removed
    - Every remaining line is stripped and blank lines are dropped
    - ``declare`` lines are sorted and placed before all other lines
    - All other lines keep their original relative order

    The ``target triple`` assignment, when present, is emitted first. It is
    not removed from the remaining lines, so it also appears again at its
    original position.
    """
    removable_patterns = (
        r"^\s*;.*",
        r'^\s*source_filename\s*=.*',
        r'^\s*target datalayout\s*=.*',
    )
    for pattern in removable_patterns:
        code = re.sub(pattern, "", code, flags=re.MULTILINE)

    stripped = (raw.strip() for raw in code.splitlines())
    code = "\n".join(text for text in stripped if text)
    lines = code.splitlines()

    declare_lines = sorted(text for text in lines if text.startswith("declare"))
    other_lines = [text for text in lines if not text.startswith("declare")]

    result = []
    triple = re.search(r'(target triple\s*=\s*".*?")', code)
    if triple:
        result.append(triple.group(1))
    result += declare_lines
    result += other_lines

    return "\n".join(result)


In [45]:
import hashlib
import shutil

def compute_file_hash(file_path):
    """Return the SHA-256 hex digest of the normalized text of ``file_path``.

    The file is read as text, passed through ``normalize_ll_code`` and the
    result is hashed as UTF-8.
    """
    with open(file_path, 'r') as handle:
        canonical_text = normalize_ll_code(handle.read())
    digest = hashlib.sha256()
    digest.update(canonical_text.encode('utf-8'))
    return digest.hexdigest()



In [46]:
# Directory that stores compiled object files; it is created when this cell runs.
CACHE_DIR = "obj_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

def compile_with_cache(ll_path):
    """Compile an LLVM IR file to an object file, reusing a cached result.

    The cache key is the hash of the file's normalized contents. On a miss
    the file is compiled with ``llc``; on a hit the cached object is reused.
    In both cases the object is copied next to ``ll_path``.

    Args:
        ll_path: Path of the ``.ll`` file to compile.

    Returns:
        Path of the ``.o`` file placed beside ``ll_path``, or ``None`` when
        compilation fails.
    """
    file_hash = compute_file_hash(ll_path)
    cached_obj = os.path.join(CACHE_DIR, f"{file_hash}.o")

    # If cached object exists, reuse it
    if os.path.exists(cached_obj):
        print(f"[🌀] Cache hit: {ll_path} → {cached_obj}")
    else:
        # Compile into a per-process temporary file and publish it with an
        # atomic rename. Writing straight to the cache path could leave a
        # partially written object that a later (or concurrent) call would
        # treat as a valid cache hit.
        tmp_obj = f"{cached_obj}.{os.getpid()}.tmp"
        try:
            result = subprocess.run(
                ["llc", ll_path, "-filetype=obj", "-o", tmp_obj],
                capture_output=True, text=True
            )
            if result.returncode != 0:
                print(f"[✘] Compile failed: {ll_path}\n{result.stderr}")
                return None
            os.replace(tmp_obj, cached_obj)
        finally:
            # Present only when compilation failed or was interrupted.
            if os.path.exists(tmp_obj):
                os.remove(tmp_obj)
        print(f"[✔] Compiled & cached: {ll_path} → {cached_obj}")

    # Copy cached object to the output folder
    base = os.path.splitext(os.path.basename(ll_path))[0]
    o_path = os.path.join(os.path.dirname(ll_path), f"{base}.o")
    shutil.copyfile(cached_obj, o_path)
    return o_path


In [47]:
from multiprocessing import Pool

def parallel_compile_ll_with_cache(ll_files, num_workers=None):
    """Compile several ``.ll`` files in parallel worker processes.

    Args:
        ll_files: Paths of the LLVM IR files to compile.
        num_workers: Number of worker processes. Defaults to the smaller of
            the number of files and the CPU count.

    Returns:
        Object file paths for the files that compiled successfully; entries
        for failed compilations are dropped. An empty input yields ``[]``.
    """
    # Nothing to do; this also avoids asking for a pool with zero workers,
    # which multiprocessing rejects.
    if not ll_files:
        return []
    if num_workers is None:
        # os.cpu_count() may return None when the count cannot be determined.
        num_workers = min(len(ll_files), os.cpu_count() or 1)
    with Pool(num_workers) as pool:
        object_files = pool.map(compile_with_cache, ll_files)
    return [f for f in object_files if f]


In [48]:
# Compile every per-function file in parallel; files that fail to compile are left out of the list.
object_files = parallel_compile_ll_with_cache(part_files)


[🌀] Cache hit: out_parts/square.ll → obj_cache/016609967ae7315418e851b478458aa0a124c982ba63ce37791c430b655a7c0c.o[🌀] Cache hit: out_parts/increment.ll → obj_cache/f1f1d3a1646f32af048bf5cd74d32243fdd253eb5ce4c3993de35d5c586888cd.o[🌀] Cache hit: out_parts/main.ll → obj_cache/0b84befafe0532807469e965defbf36224da434a5caff224e40ab9468c219ab5.o




In [12]:
import os

def get_exec_name_from_ll(ll_path):
    """Derive an executable name from a path: file name without extension plus ``_exec``."""
    file_name = os.path.basename(ll_path)
    stem, _extension = os.path.splitext(file_name)
    return stem + "_exec"


In [13]:
def link_objects(object_files, output_exec="final_exec"):
    """Link object files into an executable by invoking ``clang``.

    Args:
        object_files: List of object file paths passed to ``clang``.
        output_exec: Name of the executable to produce.

    Prints a success message, or the linker's stderr on failure. Returns
    nothing in either case.
    """
    command = ["clang"] + object_files + ["-o", output_exec]
    outcome = subprocess.run(command, capture_output=True, text=True)
    if outcome.returncode != 0:
        print(f"[✘] Linking failed:\n{outcome.stderr}")
        return
    print(f"[✅] Linked executable created: {output_exec}")


In [14]:
# Name the executable after the input file and link all compiled objects into it.
# NOTE(review): the partition cell reads "demo.ll" while this cell names the
# executable after "demo2.ll" - confirm which file name is intended.
exec_name = get_exec_name_from_ll("demo2.ll")  # replace "demo2.ll" with your actual input file name
link_objects(object_files, exec_name)


[✅] Linked executable created: demo2_exec
