Release llamafile v0.8.1
jart committed Apr 26, 2024
1 parent 7e6a488 commit 2095d50
Showing 5 changed files with 55 additions and 25 deletions.
1 change: 1 addition & 0 deletions llamafile/copy.sh
@@ -15,6 +15,7 @@ scp llama.cpp/ggml-cuda.cu \
  llamafile/tinyblas.cu \
  llamafile/llamafile.h \
  llamafile/rocm.bat \
+ llamafile/rocm.sh \
  llamafile/cuda.bat \
  llamafile/cuda.sh \
  $HOST:lfbuild/
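This hunk adds the new llamafile/rocm.sh build script to the set of files that copy.sh stages on a remote build machine over scp. A rough usage sketch follows (an assumption for illustration; the diff does not show how $HOST is defined or how the script is normally run):

# Hypothetical invocation; "gpu-buildbox" stands in for whatever $HOST
# resolves to in the author's environment.
HOST=gpu-buildbox sh llamafile/copy.sh   # copies the sources into gpu-buildbox:lfbuild/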
9 changes: 7 additions & 2 deletions llamafile/cuda.bat
@@ -4,8 +4,13 @@
  :: NVCUDA DLLs are provided by the installation of the windows GPU
  :: driver on a Windows system that has a CUDA-capable GPU installed.

- nvcc -arch=all ^
- --shared ^
+ nvcc --shared ^
+ -gencode=arch=compute_50,code=sm_50 ^
+ -gencode=arch=compute_60,code=sm_60 ^
+ -gencode=arch=compute_70,code=sm_70 ^
+ -gencode=arch=compute_75,code=sm_75 ^
+ -gencode=arch=compute_80,code=sm_80 ^
+ -gencode=arch=compute_90,code=sm_90 ^
  --forward-unknown-to-host-compiler ^
  -Xcompiler="/nologo /EHsc /O2 /GR /MT" ^
  -DNDEBUG ^
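The change above drops -arch=all in favor of one explicit -gencode pair per target, so the Windows build embeds device code for compute capability 5.0 (Maxwell) through 9.0 (Hopper) regardless of what "all" happens to mean for the installed CUDA toolkit. As a hedged sanity check, not part of this commit, the architectures baked into the resulting library can be listed with cuobjdump (the output filename is an assumption, since the -o line falls in the collapsed part of the diff):

# List the per-architecture device images embedded in the built module.
# "ggml-cuda.dll" is an assumed output name for the Windows build.
cuobjdump --list-elf ggml-cuda.dll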
59 changes: 43 additions & 16 deletions llamafile/cuda.sh
@@ -1,19 +1,46 @@
  #!/bin/sh

+ TMP=$(mktemp -d) || exit
+
+ cp llama.cpp/ggml-cuda.cu \
+ llama.cpp/ggml-cuda.h \
+ llama.cpp/ggml-impl.h \
+ llama.cpp/ggml-alloc.h \
+ llama.cpp/ggml-common.h \
+ llama.cpp/ggml-backend.h \
+ llama.cpp/ggml-backend-impl.h \
+ llama.cpp/ggml.h \
+ llamafile/tinyblas.h \
+ llamafile/tinyblas.cu \
+ llamafile/llamafile.h \
+ llamafile/rocm.bat \
+ llamafile/rocm.sh \
+ llamafile/cuda.bat \
+ llamafile/cuda.sh \
+ "$TMP" || exit
+
+ cd "$TMP"
+
  /usr/local/cuda/bin/nvcc \
- -arch=all \
- --shared \
- --forward-unknown-to-host-compiler \
- -use_fast_math \
- --compiler-options "-fPIC -O3 -march=native -mtune=native" \
- -DNDEBUG \
- -DGGML_BUILD=1 \
- -DGGML_SHARED=1 \
- -DGGML_CUDA_MMV_Y=1 \
- -DGGML_MULTIPLATFORM \
- -DGGML_CUDA_DMMV_X=32 \
- -DK_QUANTS_PER_ITERATION=2 \
- -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
- -DGGML_USE_TINYBLAS \
- -o ggml-cuda.so \
- ggml-cuda.cu
+ --shared \
+ -gencode=arch=compute_50,code=sm_50 \
+ -gencode=arch=compute_60,code=sm_60 \
+ -gencode=arch=compute_70,code=sm_70 \
+ -gencode=arch=compute_75,code=sm_75 \
+ -gencode=arch=compute_80,code=sm_80 \
+ -gencode=arch=compute_90,code=sm_90 \
+ --forward-unknown-to-host-compiler \
+ --compiler-options \
+ "-fPIC -O3 -march=native -mtune=native" \
+ -DNDEBUG \
+ -DGGML_BUILD=1 \
+ -DGGML_SHARED=1 \
+ -DGGML_CUDA_MMV_Y=1 \
+ -DGGML_MULTIPLATFORM \
+ -DGGML_CUDA_DMMV_X=32 \
+ -DK_QUANTS_PER_ITERATION=2 \
+ -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
+ -DGGML_USE_TINYBLAS \
+ -o ~/ggml-cuda.so \
+ ggml-cuda.cu \
+ -lcuda
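The rewritten cuda.sh now stages the sources in a throwaway mktemp directory, builds there, pins the same -gencode list as cuda.bat instead of -arch=all, drops -use_fast_math, writes the module to ~/ggml-cuda.so, and links -lcuda explicitly. A minimal post-build check, assuming the script completed on a CUDA build host (a sketch, not part of this commit):

# Confirm the module was produced and see which CUDA libraries it pulls in;
# with -DGGML_USE_TINYBLAS there should be no cuBLAS dependency.
ls -l ~/ggml-cuda.so
ldd ~/ggml-cuda.so | grep -iE 'libcuda|cublas'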
9 changes: 3 additions & 6 deletions llamafile/rocm.sh
@@ -10,20 +10,17 @@ hipcc \
  -fPIC \
  -shared \
  -DNDEBUG \
- -march=native \
- -mtune=native \
- -use_fast_math \
  -DGGML_BUILD=1 \
  -DGGML_SHARED=1 \
  -Wno-return-type \
  -Wno-unused-result \
  -DGGML_USE_HIPBLAS \
+ -DGGML_USE_TINYBLAS \
  -DGGML_CUDA_MMV_Y=1 \
  -DGGML_MULTIPLATFORM \
  -DGGML_CUDA_DMMV_X=32 \
  -DK_QUANTS_PER_ITERATION=2 \
  -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
  --amdgpu-target=gfx1100,gfx1031,gfx1030,gfx1032,gfx906,gfx1101,gfx1102,gfx1103 \
  -o ggml-rocm.so \
- ggml-cuda.cu \
- -lhipblas \
- -lrocblas
+ ggml-cuda.cu
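With -DGGML_USE_TINYBLAS defined, the matrix kernels come from tinyBLAS compiled into the module, so the -lhipblas and -lrocblas link flags are dropped along with the host-specific -march/-mtune and -use_fast_math options. A quick way to confirm the shipped library no longer depends on the BLAS runtimes (a sketch, assuming the build ran in the current directory):

# Expect no output if ggml-rocm.so really dropped the rocBLAS/hipBLAS deps.
ldd ggml-rocm.so | grep -iE 'hipblas|rocblas'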
2 changes: 1 addition & 1 deletion llamafile/version.h
@@ -2,7 +2,7 @@

  #define LLAMAFILE_MAJOR 0
  #define LLAMAFILE_MINOR 8
- #define LLAMAFILE_PATCH 0
+ #define LLAMAFILE_PATCH 1
  #define LLAMAFILE_VERSION \
  (100000000 * LLAMAFILE_MAJOR + 1000000 * LLAMAFILE_MINOR + LLAMAFILE_PATCH)

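The patch bump means LLAMAFILE_VERSION for this release evaluates to 8000001 under the macro above. A quick check of the arithmetic (a sketch; just re-computing the macro by hand):

# 100000000 * MAJOR(0) + 1000000 * MINOR(8) + PATCH(1)
echo $((100000000 * 0 + 1000000 * 8 + 1))   # prints 8000001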
