# In [3]:
def accumulator_simd(accm_op, variables, func="_mm512_add_pd", indent_level=1):
    """Emit C statements that pairwise-reduce ``variables`` into the first one.

    Each round combines neighbouring names ``(v0, v1), (v2, v3), ...`` with
    ``func`` and stores the result back into the left name of the pair; an
    unpaired last name is carried over to the next round unchanged. Rounds
    repeat until a single name is left, which is always ``variables[0]``.

    Args:
        accm_op: List that receives the generated lines. A ``"\\n"`` separator
            entry is appended at the start of every round, including the
            final one that only detects a single remaining name.
        variables: Names of the C variables to reduce. Not modified.
        func: Name of the two-argument C function/intrinsic used to combine
            a pair. It is applied at every round of the reduction.
        indent_level: Number of 4-space indents placed before each statement.

    Returns:
        The name that holds the fully reduced value.

    Raises:
        ValueError: If ``variables`` is empty (there is nothing to reduce).
    """
    if not variables:
        raise ValueError("variables must contain at least one name")

    indent = " " * 4 * indent_level
    current = list(variables)

    while True:
        # Round separator; kept so the emitted text matches the layout the
        # callers already rely on.
        accm_op.append("\n")
        if len(current) == 1:
            return current[0]

        survivors = []
        # zip() drops the unpaired tail element of an odd-length list.
        for left, right in zip(current[0::2], current[1::2]):
            accm_op.append(f"{indent}{left} ={func}({left}, {right});")
            survivors.append(f"{left}")

        if len(current) % 2 == 1:
            survivors.append(f"{current[-1]}")

        current = survivors

# Generate one C source file per outer-loop unroll factor (1..8). Each file
# contains a horizontal-sum helper, a "normal" kernel that reads the matrix as
# a[i + k*lda], and a "transposed" kernel that reads it as a_t[k + i*ldx].
for unroll in range(1, 8+1, 1):
    with open(f"./small_dgemv_naive_c_implementation_ver3-2_ZmmNaive_Unroll-OuterLoop-{unroll}.c", "w") as p:
        p.write("#include <stddef.h>\n")
        p.write("#include <stdio.h>\n")
        p.write("#include <stdlib.h>\n")
        p.write("#include <stdint.h>\n")
        p.write("#include <math.h>\n")
        p.write("#include <omp.h>\n")
        p.write("#include <immintrin.h>\n") 
        p.write("#include <xmmintrin.h>\n")
        
        # Helper that adds the 8 doubles of one zmm register into a scalar
        # (512 -> 256 -> 128 -> 64 bit folding). Used by the transposed kernel.
        p.write(f"""
double sum_zmm_elements_v3_2_{unroll}(__m512d zmmX){{
    __m256d ymm_upper = _mm512_extractf64x4_pd(zmmX, 1);
    __m256d ymm_lower = _mm512_extractf64x4_pd(zmmX, 0);

    __m256d ymm_sum = _mm256_add_pd(ymm_upper, ymm_lower);

    __m128d xmm_sum = _mm_add_pd(_mm256_extractf128_pd(ymm_sum, 1), _mm256_extractf128_pd(ymm_sum, 0));

    __m128d xmm_high_low = _mm_add_pd(xmm_sum, _mm_unpackhi_pd(xmm_sum, xmm_sum));

    double result;
    _mm_store_sd(&result, xmm_high_low);

    return result;
}}  
       
        """)
        
        # Normal ----------------------------------------------------------------------------------------------------------
        # Register layout in the generated kernel:
        #   zmm0 .. zmm{unroll-1}          broadcast x[k+j]
        #   zmm{unroll}                    current 8 elements of y
        #   zmm{unroll+1} .. zmm{2*unroll} 8 elements of column k+j of a
        # Scalar fallback for the rows left over after the 8-wide i loop.
        remain_op_i = "".join([f"""
            y[i] += a[i + (k+{j})*lda] * x[k+{j}];""" for j in range(unroll)])
            
        load_x = "".join([f"""
        zmm{i} = _mm512_set1_pd(x[k+{i}]);""" for i in range(unroll)])
        
        load_y = f"""
            zmm{unroll+0} = _mm512_loadu_pd(&y[i]);"""
        
        load_a = "".join([f"""
            zmm{i+unroll+1} = _mm512_loadu_pd(&a[i+(k+{i})*lda]);""" for i in range(unroll)])
        
        mul_ax = "".join([f"""
            zmm{i+unroll+1} = _mm512_mul_pd(zmm{i+unroll+1}, zmm{i});""" for i in range(unroll)])
        
        # Pairwise-add y and all products; the result lands in the first
        # listed register (zmm{unroll}), which is what the kernel stores back.
        accm_op = []
        variables = [f"zmm{unroll+0}"] + [f"zmm{i+unroll+1}" for i in range(unroll)]
        accumulator_simd(accm_op, variables, indent_level=3)
        accm_op = "\n".join(accm_op)

        # zmm0 is written literally in the template; the rest are listed here.
        normal_regs = ", ".join([f"zmm{i}" for i in range(1, 2+unroll*2)])

        code = f"""
void mydgemv_n_ver3_2_unroll{unroll}(double a[], double x[], double y[], int64_t lda, int64_t ldx, int64_t ldy){{
    __m512d zmm0, {normal_regs};
    double tmp_x;
    for (int64_t i=0; i<lda; i++){{
        y[i] = 0.0;
    }}

    int64_t k_remain = ldx % {unroll}, i_remain = lda % 8;
    for (int64_t k=0; k<ldx-k_remain; k+={unroll}){{
        {load_x}
        for (int64_t i=0; i<lda-i_remain; i+=8){{
            {load_y}
            {load_a}
            {mul_ax}
            
            {accm_op}
            _mm512_storeu_pd(&y[i], zmm{unroll});
        }}
        for (int64_t i = lda-i_remain; i<lda; i++){{
            {remain_op_i}
        }}
    }}
    
    for (int64_t k=ldx-k_remain; k<ldx; k++){{
        zmm0 = _mm512_set1_pd(x[k]);        
        for (int64_t i=0; i<lda-i_remain; i+=8){{
            zmm1 = _mm512_loadu_pd(&y[i]);
            zmm2 = _mm512_loadu_pd(&a[i+k*lda]);
            
            zmm2 = _mm512_mul_pd(zmm0, zmm2);
            
            zmm1 = _mm512_add_pd(zmm1, zmm2);
            
            _mm512_storeu_pd(&y[i], zmm1);
        }}
        
        for (int64_t i = lda-i_remain; i<lda; i++){{
            y[i] += a[i+k*lda] * x[k+0];
        }}         
    }}
}}
        """
        p.write(code)

        # Transposed ----------------------------------------------------------------------------------------------------------
        # Register layout in the generated kernel:
        #   zmm0 .. zmm{unroll-1}          partial sums for y[i+j]
        #   zmm{unroll}                    8 elements of x
        #   zmm{unroll+1} .. zmm{2*unroll} 8 elements of row i+j of a_t
        # Rows left over after the unrolled i loop are handled with a plain
        # scalar dot product over the FULL row (k < ldx). The previous bound
        # k < ldx-k_remain dropped the last ldx % 8 elements.
        remain_op_i = f"""
    for (int64_t i=lda-i_remain; i<lda; i++){{
        double tmp=0.0;
        for (int64_t k=0; k<ldx; k++){{
            tmp += a_t[k+i*ldx] * x[k];
        }}
        y[i] = tmp;
    }}
        """
            
        init_y = "".join([f"""zmm{i} = _mm512_setzero_pd();
        """ for i in range(unroll)])
        
        load_x = f"zmm{unroll} = _mm512_loadu_pd(&x[k]);"
        
        load_a = "".join([f"""zmm{i+unroll+1} = _mm512_loadu_pd(&a_t[k+(i+{i})*ldx]);
            """ for i in range(unroll)])
        
        mul_ax = "".join([f"""zmm{i+unroll+1} = _mm512_mul_pd(zmm{unroll}, zmm{i+unroll+1});
            """ for i in range(unroll)])
        
        add_ax2y = "".join([f"""zmm{i} = _mm512_add_pd(zmm{i}, zmm{i+unroll+1});
            """ for i in range(unroll)])
        
        store_y = "".join([f"""y[i+{i}] = sum_zmm_elements_v3_2_{unroll}(zmm{i});
        """ for i in range(unroll)])

        # Scalar tail for the ldx % 8 elements the 8-wide k loop cannot cover.
        # It runs after store_y so it adds onto the already-reduced sums.
        tail_k = "".join([f"""y[i+{i}] += a_t[k+(i+{i})*ldx] * x[k];
            """ for i in range(unroll)])

        # zmm0 is written literally in the template; the rest are listed here.
        transposed_regs = ", ".join([f"zmm{i}" for i in range(1, 1+unroll*2)])
        
        code = f"""
void mydgemv_t_ver3_2_unroll{unroll}(double a_t[], double x[], double y[], int64_t lda, int64_t ldx, int64_t ldy){{
    __m512d zmm0, {transposed_regs};

    int64_t k_remain = ldx % 8, i_remain = lda % {unroll};
    for (int64_t i=0; i<lda-i_remain; i+={unroll}){{
        {init_y}
        for (int64_t k=0; k<ldx-k_remain; k+=8){{
            {load_x}
            
            {load_a}
            {mul_ax}
            {add_ax2y}
        }}
        {store_y}
        for (int64_t k=ldx-k_remain; k<ldx; k++){{
            {tail_k}
        }}
    }}
    {remain_op_i}
}}
        """
        p.write(code)
