From 1867a0c6923eaebb7a53965f6cdbc0ace55142a3 Mon Sep 17 00:00:00 2001
From: Incarnas <119618389+bit-incarnas@users.noreply.github.com>
Date: Sun, 17 May 2026 21:37:12 -0700
Subject: [PATCH 1/5] update bid to match each layers MTP source (#23237)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* update bid to match each layers MTP source

* Update conversion/qwen.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
---
 conversion/qwen.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conversion/qwen.py b/conversion/qwen.py
index 4b86404262a..45d1f98c266 100644
--- a/conversion/qwen.py
+++ b/conversion/qwen.py
@@ -600,6 +600,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             if name.find("layers.") != -1:
                 assert bid is not None
                 name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + n_layer}")
+                bid = bid + n_layer
             else:
                 remapper = {
                     "mtp.fc":                    "model.layers.{bid}.eh_proj",

From e98bcfec2812f83bee29fdd4a4bb827f6cd273b2 Mon Sep 17 00:00:00 2001
From: Neo Zhang <zhang.jianyu@outlook.com>
Date: Mon, 18 May 2026 13:11:19 +0800
Subject: [PATCH 2/5] sycl : fix error when use -mg 1 error (#23140)

---
 examples/sycl/start-svr.sh      | 1 -
 examples/sycl/test.sh           | 1 -
 examples/sycl/win-start-svr.bat | 1 -
 examples/sycl/win-test.bat      | 1 -
 4 files changed, 4 deletions(-)

diff --git a/examples/sycl/start-svr.sh b/examples/sycl/start-svr.sh
index b7ea3096486..ce31ec51d2b 100755
--- a/examples/sycl/start-svr.sh
+++ b/examples/sycl/start-svr.sh
@@ -111,7 +111,6 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
     echo "Use $GGML_SYCL_DEVICE as main GPU"
     #use signle GPU only
     GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
-    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
     echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
 else
     echo "Use all Intel GPUs, including iGPU & dGPU"
diff --git a/examples/sycl/test.sh b/examples/sycl/test.sh
index 38d2e926896..116047cd2ea 100755
--- a/examples/sycl/test.sh
+++ b/examples/sycl/test.sh
@@ -119,7 +119,6 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
     echo "Use $GGML_SYCL_DEVICE as main GPU"
     #use signle GPU only
     GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
-    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
     echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
 else
     echo "Use all Intel GPUs, including iGPU & dGPU"
diff --git a/examples/sycl/win-start-svr.bat b/examples/sycl/win-start-svr.bat
index 4d850cbaa6f..13b5159e002 100644
--- a/examples/sycl/win-start-svr.bat
+++ b/examples/sycl/win-start-svr.bat
@@ -164,7 +164,6 @@ if not "%GGML_SYCL_DEVICE%"=="-1" (
   echo Use %GGML_SYCL_DEVICE% as main GPU
   REM Use single GPU only.
   set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
-  set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
   echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
 ) else (
   echo Use all Intel GPUs, including iGPU ^& dGPU
diff --git a/examples/sycl/win-test.bat b/examples/sycl/win-test.bat
index 781d17705db..39640908b07 100644
--- a/examples/sycl/win-test.bat
+++ b/examples/sycl/win-test.bat
@@ -186,7 +186,6 @@ if not "%GGML_SYCL_DEVICE%"=="-1" (
   echo Use %GGML_SYCL_DEVICE% as main GPU
   REM Use single GPU only.
   set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
-  set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
   echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
 ) else (
   echo Use all Intel GPUs, including iGPU ^& dGPU

From 5511965b197c131b38d8b5a0beb3ea7606d80b10 Mon Sep 17 00:00:00 2001
From: Intel AI Get-to Market Customer Success and Solutions
 <ai.gtm.css@gmail.com>
Date: Sun, 17 May 2026 22:11:51 -0700
Subject: [PATCH 3/5] sycl: route small f32 matmuls to oneMKL, bypass oneDNN
 (#22150)

Signed-off-by: Chun Tao <chun.tao@intel.com>
Co-authored-by: Chun Tao <chun.tao@intel.com>
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index f5d10b56de0..ebe7c5b351c 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2385,21 +2385,25 @@ inline void ggml_sycl_op_mul_mat_sycl(
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
         const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
 
+        {
+            const int64_t gemm_flops = (int64_t)row_diff * src1_ncols * ne10;
+            const bool use_mkl_direct = gemm_flops < 256 * 256 * 256;
 #if GGML_SYCL_DNNL
-        if (!g_ggml_sycl_disable_dnn) {
-            DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
-                                      DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
-                                      dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
-        }
-        else
+            if (!g_ggml_sycl_disable_dnn && !use_mkl_direct) {
+                DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i,
+                                          DnnlGemmWrapper::to_dt<float>(), src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
+                                          dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
+            }
+            else
 #endif
-        {
-            const float alpha = 1.0f;
-            const float beta  = 0.0f;
-            SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
-                *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
-                src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
-                dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
+            {
+                const float alpha = 1.0f;
+                const float beta  = 0.0f;
+                SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
+                    *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
+                    src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
+                    dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
+            }
         }
     }
     GGML_UNUSED(dst);

From 0caf2a1d48d2b678f2ea2fdfcb07ee35816f9f5e Mon Sep 17 00:00:00 2001
From: Intel AI Get-to Market Customer Success and Solutions
 <ai.gtm.css@gmail.com>
Date: Sun, 17 May 2026 22:12:21 -0700
Subject: [PATCH 4/5] sycl: scalar SWAR byte-subtract in Q6_K MMVQ dot product
 (#22156)

Signed-off-by: Chun Tao <chun.tao@intel.com>
Co-authored-by: Chun Tao <chun.tao@intel.com>
---
 ggml/src/ggml-sycl/vecdotq.hpp | 99 ++++++++++++++++------------------
 1 file changed, 46 insertions(+), 53 deletions(-)

diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp
index d7770047424..16b2d65d271 100644
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -85,6 +85,32 @@ static __dpct_inline__ int get_int_from_uint8_aligned(
       (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
+static __dpct_inline__ int byte_sub_4(const int a, const int b) {
+  const uint32_t ua = static_cast<uint32_t>(a);
+  const uint32_t ub = static_cast<uint32_t>(b);
+  return static_cast<int>(((ua | 0x80808080u) - ub) ^ 0x80808080u);
+}
+
+static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq_scalar(
+    const int vl, const int vh, const int u0, const int u1, const int8_t sc0,
+    const int8_t sc1, const float d, const float d80, const float d81) {
+    static_assert(QR6_K == 2, "q6_K MMVQ scalar fast path assumes QR6_K == 2");
+
+    const int vil0 = (vl >> 0) & 0x0F0F0F0F;
+    const int vih0 = ((vh >> 0) << 4) & 0x30303030;
+    const int vi0 = byte_sub_4(vil0 | vih0, 0x20202020);
+
+    const int vil1 = (vl >> 4) & 0x0F0F0F0F;
+    const int vih1 = ((vh >> 4) << 4) & 0x30303030;
+    const int vi1 = byte_sub_4(vil1 | vih1, 0x20202020);
+
+    const float sumf =
+        d80 * (dpct::dp4a(vi0, u0, 0) * sc0) +
+        d81 * (dpct::dp4a(vi1, u1, 0) * sc1);
+
+    return d * sumf;
+}
+
 static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
                                                   const uint8_t *values,
                                                   int &val1, int &val2) {
@@ -279,24 +305,8 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
                             const int *__restrict__ u,
                             const int8_t *__restrict__ scales, const float &d,
                             const float *__restrict__ d8) {
-
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = scales[4*i];
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-
-        const int vi = dpct::vectorized_binary<sycl::char4>(
-            (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32
-
-        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
+    return vec_dot_q6_K_q8_1_impl_mmvq_scalar(
+        vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]);
 }
 
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
@@ -542,23 +552,8 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
     __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(const int vl, const int vh, const int * __restrict__ u,
                                                       const int8_t * __restrict__ scales, const float d,
                                                       const float * __restrict__ d8) {
-        float sumf = 0.0f;
-
-#pragma unroll
-        for (int i = 0; i < QR6_K; ++i) {
-            const int sc = scales[4 * i];
-
-            const int vil = (vl >> (4 * i)) & 0x0F0F0F0F;
-
-            const int vih = ((vh >> (4 * i)) << 4) & 0x30303030;
-
-            const int vi = dpct::vectorized_binary<sycl::char4>((vil | vih), 0x20202020,
-                                                                dpct::sub_sat());  // vi = (vil | vih) - 32
-
-            sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc);                        // SIMD dot product
-        }
-
-        return d * sumf;
+        return vec_dot_q6_K_q8_1_impl_mmvq_scalar(
+            vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]);
     }
 
     __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
@@ -579,16 +574,15 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
 
         const int8_t * scs = scales + scale_offset;
 
-        int   u[QR6_K];
-        float d8[QR6_K];
+        const int u0 = get_int_from_int8_aligned(
+            q8_1_quant_ptr + bq8_offset * QK8_1, iqs % QI8_1);
+        const int u1 = get_int_from_int8_aligned(
+            q8_1_quant_ptr + (bq8_offset + 2) * QK8_1, iqs % QI8_1);
+        const float d80 = (*(q8_1_ds + bq8_offset + 0))[0];
+        const float d81 = (*(q8_1_ds + bq8_offset + 2))[0];
 
-#pragma unroll
-        for (int i = 0; i < QR6_K; ++i) {
-            u[i] = get_int_from_int8_aligned(q8_1_quant_ptr + (bq8_offset + 2 * i) * QK8_1, iqs % QI8_1);
-            const sycl::half2 ds_values = *(q8_1_ds + bq8_offset + 2 * i);
-            d8[i]                       = ds_values[0];
-        }
-        return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scs, *d, d8);
+        return vec_dot_q6_K_q8_1_impl_mmvq_scalar(
+            vl, vh, u0, u1, scs[0], scs[4], *d, d80, d81);
     }
 };
 #define VDR_Q4_0_Q8_1_MMVQ 2
@@ -1167,16 +1161,15 @@ vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
 
     const int8_t * scales = bq6_K->scales + scale_offset;
 
-    int    u[QR6_K];
-    float d8[QR6_K];
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2 * i].ds[0];
-    }
+    const int u0 = get_int_from_int8_aligned(
+        bq8_1[bq8_offset + 0].qs, iqs % QI8_1);
+    const int u1 = get_int_from_int8_aligned(
+        bq8_1[bq8_offset + 2].qs, iqs % QI8_1);
+    const float d80 = bq8_1[bq8_offset + 0].ds[0];
+    const float d81 = bq8_1[bq8_offset + 2].ds[0];
 
-    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
+    return vec_dot_q6_K_q8_1_impl_mmvq_scalar(
+        vl, vh, u0, u1, scales[0], scales[4], bq6_K->d, d80, d81);
 }
 
 

From c3f95c1f069c91e21b8063b09907a5fba38d1695 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 18 May 2026 08:57:28 +0300
Subject: [PATCH 5/5] scripts : allow wc2wt with an existing branch (#23189)

---
 scripts/wc2wt.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/wc2wt.sh b/scripts/wc2wt.sh
index 157881b458f..b6e92f86934 100755
--- a/scripts/wc2wt.sh
+++ b/scripts/wc2wt.sh
@@ -37,7 +37,7 @@ dir=$(basename $(pwd))
 # sanitize branch name for directory name (replace / with -)
 dir_suffix=$(echo "$BRANCH" | tr '/' '-')
 
-git worktree add -b "$BRANCH" "../$dir-$dir_suffix" HEAD
+git worktree add "../$dir-$dir_suffix" "$BRANCH" || git worktree add -b "$BRANCH" "../$dir-$dir_suffix" HEAD
 
 og_path=$(pwd)
 wt_path=$(cd "../$dir-$dir_suffix" && pwd)