## This notebook converts models to the EXL2 format so they can be served with the ExLlamaV2 inference backend, which generates more tokens per second and thereby reduces server rental cost.

In [1]:
import os, sys, shutil, subprocess, shlex
from pathlib import Path

In [2]:
# EXL2 conversion parameters.
# Every setting is defined once as a Python constant and then mirrored into
# os.environ under the same keys as before (LOCAL_PATH, LLM_MODEL, BITS_TARGET,
# TEMP_PATH, OUT_BASE, CALIB_PATH).

# root folder that holds the source model folders
LOCAL_PATH = "G:/LLM_MODELS"

# the folder (inside LOCAL_PATH) that contains the model to convert.
# Other candidate model folders:
#   "llama3-8b-cpt-sahabatai-v1-instruct"
#   "Llama-SEA-LION-v3-8B-IT"
LLM_MODEL = "Qwen2.5-7B-Instruct"

# target bits per weight, kept as a string because it is passed on a command line
# e.g. 4.25 to approximate nf4
BITS_TARGET = "4.65"

# temp folder for jobs
TEMP_PATH = f"./LLM_MODELS_EXL2/{LLM_MODEL}-EXL2-Indonesia-Focus-temp"

# base output folder for converted models
OUT_BASE_PATH = f"./LLM_MODELS_EXL2/{LLM_MODEL}-EXL2-Indonesia-Focus"
OUT_BASE = OUT_BASE_PATH

# optional calibration parquet file (leave blank to skip)
CALIB_PATH = "./misc/ind_corpus.parquet"

# Export the settings; a single update keeps the key <-> value mapping in one place.
os.environ.update({
    "LOCAL_PATH": LOCAL_PATH,
    "LLM_MODEL": LLM_MODEL,
    "BITS_TARGET": BITS_TARGET,
    "TEMP_PATH": TEMP_PATH,
    "OUT_BASE": OUT_BASE,
    "CALIB_PATH": CALIB_PATH,
})

In [3]:
# Resolve the source model folder: <LOCAL_PATH>/<LLM_MODEL>.
input_dir = Path(LOCAL_PATH) / LLM_MODEL
# If LLM_MODEL is written as "org/name" but the model was cloned into a folder
# named only "name", accept <LOCAL_PATH>/<name> as well.
repo_only_dir = Path(LOCAL_PATH) / LLM_MODEL.split('/')[-1]
if not input_dir.exists() and repo_only_dir.exists():
    input_dir = repo_only_dir

if not input_dir.exists():
    raise FileNotFoundError(f"Input model folder not found: {input_dir}")

# LLM_MODEL with "/" replaced by "_" (not referenced by the cells shown here).
safe_name = LLM_MODEL.replace("/", "_")
out_dir = Path(OUT_BASE)
temp_dir = Path(TEMP_PATH)
# NOTE: Path("") normalises to ".", so test CALIB_PATH itself (not calib_file)
# to find out whether calibration was left blank.
calib_file = Path(CALIB_PATH)

# Fail fast on a wrong calibration path, before the previous output is deleted below.
if CALIB_PATH and not calib_file.is_file():
    raise FileNotFoundError(f"Calibration file not found: {calib_file}")

# Start from an empty output folder: any earlier result is removed.
if out_dir.exists():
    print(f"Removing existing output folder: {out_dir}")
    shutil.rmtree(out_dir)

out_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Summarise the conversion that is about to run.
calib_label = CALIB_PATH or 'None'  # a blank CALIB_PATH is reported as "None"
summary_lines = (
    f"Converting model from {input_dir} to {out_dir} with bitwidth {BITS_TARGET}",
    f"Using calibration file: {calib_label}",
)
print("\n".join(summary_lines))

Converting model from G:\LLM_MODELS\Qwen2.5-7B-Instruct to LLM_MODELS_EXL2\Qwen2.5-7B-Instruct-EXL2-Indonesia-Focus with bitwidth 4.65
Using calibration file: ./misc/ind_corpus.parquet


In [5]:
# Locate ExLlamaV2's convert.py. Search order:
#   1. ../exllamav2/convert.py, relative to the current working directory
#   2. a convert*.py inside the installed exllamav2 package
#   3. the EXLLAMA_CONVERT environment variable, which overrides 1 and 2 when set
import traceback

# Built from components instead of the literal "..\exllamav2\convert.py":
# "\e" and "\c" are invalid escape sequences in a normal string literal, and a
# backslash is not a path separator outside Windows.
convert_script = Path("..") / "exllamav2" / "convert.py"
if not convert_script.exists():
    try:
        import exllamav2
        pkg_dir = Path(exllamav2.__file__).resolve().parent
        candidates = [
            pkg_dir / "convert.py",
            pkg_dir / "scripts" / "convert.py",
            pkg_dir / "tools" / "convert.py",
        ]
        # first well-known location that exists, if any
        found = next((c for c in candidates if c.exists()), None)
        if found is None:
            # fallback: any convert*.py directly inside the package folder
            found = next(
                (
                    c for c in pkg_dir.iterdir()
                    if c.is_file() and c.name.lower().startswith("convert") and c.suffix == ".py"
                ),
                None,
            )
        if found is None:
            raise FileNotFoundError(f"convert.py not found in exllamav2 package dir {pkg_dir}")
        convert_script = found
    except Exception:
        # Best effort: report the problem and carry on, so that EXLLAMA_CONVERT
        # (handled below) can still supply the script.
        traceback.print_exc()

# allow user override
explicit = os.environ.get("EXLLAMA_CONVERT")
if explicit:
    convert_script = Path(explicit)
    if not convert_script.exists():
        raise FileNotFoundError(f"EXLLAMA_CONVERT set but file not found: {convert_script}")

# Stop here with a clear message rather than launching Python on a missing
# script in a later cell.
if not convert_script.exists():
    raise FileNotFoundError(
        f"convert.py not found (last candidate: {convert_script}); "
        "set EXLLAMA_CONVERT to the full path of ExLlamaV2's convert.py"
    )

In [6]:
# Build the convert.py command line.
# Per the captured output of a previous run: -o is the job's working/output
# folder, -b the target bits per weight, -hb the head bits, -c the calibration
# dataset. NOTE(review): -cf and -nr are passed through unchanged; confirm their
# meaning against the installed ExLlamaV2 version.
cmd = [
    sys.executable, str(convert_script),
    "-i", str(input_dir),
    "-o", str(temp_dir),
    "-b", str(BITS_TARGET),
    "-cf", str(out_dir),
]
# Calibration is optional: a blank CALIB_PATH must omit "-c" entirely
# (Path("") would otherwise be passed as ".").
if CALIB_PATH:
    cmd += ["-c", str(calib_file)]
cmd += ["-nr", "-hb", "8"]

# Display only (POSIX-style quoting); the process is started from the list, not this string.
print("Running:", " ".join(shlex.quote(x) for x in cmd))
# errors="replace": bytes that the locale encoding cannot decode become U+FFFD
# instead of raising UnicodeDecodeError after the conversion has already finished.
proc = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
    errors="replace",
)

print("\n== RETURN CODE:", proc.returncode)
print("\n== STDOUT (first 2000 chars) ==\n")
print(proc.stdout[:2000])
print("\n== STDERR (first 8000 chars) ==\n")
print(proc.stderr[:8000])

# Keep the complete (untruncated) output next to the converted model.
out_dir.mkdir(parents=True, exist_ok=True)
(out_dir / "convert_stdout.log").write_text(proc.stdout, encoding="utf-8")
(out_dir / "convert_stderr.log").write_text(proc.stderr, encoding="utf-8")
print(f"\nLogs written to {out_dir}/convert_*.log")

# Raise CalledProcessError on a non-zero exit so "Run All" stops at a failed
# conversion; this runs after the logs are written so they are always available.
proc.check_returncode()

Running: 'g:\Documents\Schools\University\UI\Sem_12\TA\Aristotle\venv\Scripts\python.exe' '..\exllamav2\convert.py' -i 'G:\LLM_MODELS\Qwen2.5-7B-Instruct' -o 'LLM_MODELS_EXL2\Qwen2.5-7B-Instruct-EXL2-Indonesia-Focus-temp' -b 4.65 -cf 'LLM_MODELS_EXL2\Qwen2.5-7B-Instruct-EXL2-Indonesia-Focus' -c 'misc\ind_corpus.parquet' -nr -hb 8

== RETURN CODE: 0

== STDOUT (first 2000 chars) ==

Loading exllamav2_ext extension (JIT)...

Building C++/CUDA extension ------------------------------   0% 0:00:00 -:--:--
Building C++/CUDA extension ------------------------------   0% 0:00:00 -:--:--
 -- Created output directory: LLM_MODELS_EXL2\Qwen2.5-7B-Instruct-EXL2-Indonesia-Focus-temp
 -- Beginning new job
 -- Input: G:\LLM_MODELS\Qwen2.5-7B-Instruct
 -- Output: LLM_MODELS_EXL2\Qwen2.5-7B-Instruct-EXL2-Indonesia-Focus-temp
 -- Calibration dataset: misc\ind_corpus.parquet, 100 / 16 rows, 2048 tokens per sample
 -- Target bits per weight: 4.65 (decoder), 8 (head)
 -- Max shard size: 8192 MB
 -- Full mo