In [5]:
def accumulator_simd(accm_op, variables, func="_mm_add_pd", indent_level=1):
    """Emit C statements that pairwise-reduce ``variables`` into one variable.

    Each pass combines neighbours ``(v0, v1), (v2, v3), ...`` with
    ``v0 =func(v0, v1);`` and keeps the left operand of every pair; an odd
    trailing variable is carried over to the next pass unchanged.  Passes
    repeat until a single variable is left.  A ``"\\n"`` separator entry is
    appended to ``accm_op`` at the start of every pass, including the final
    one that only detects that a single variable remains.

    Args:
        accm_op: List that receives the generated lines (mutated in place).
        variables: Names of the C variables to reduce; must not be empty.
        func: Name of the two-argument C function/intrinsic used to combine
            a pair.  It is applied on every pass.
        indent_level: Number of 4-space indents put in front of each
            generated statement.

    Returns:
        The name of the variable that holds the final reduced value (the
        first entry of ``variables``).

    Raises:
        ValueError: If ``variables`` is empty.
    """
    if not variables:
        # Without this guard an empty list would never reach the single
        # variable stop condition.
        raise ValueError("accumulator_simd() needs at least one variable to reduce")

    indent = " " * 4 * indent_level
    remaining = list(variables)

    while True:
        accm_op.append("\n")
        if len(remaining) == 1:
            return remaining[0]

        # Largest even prefix: these entries are combined two at a time.
        paired_len = len(remaining) - len(remaining) % 2
        survivors = []
        for left, right in zip(remaining[0:paired_len:2], remaining[1:paired_len:2]):
            accm_op.append(f"{indent}{left} ={func}({left}, {right});")
            survivors.append(f"{left}")

        # An unpaired last variable is handled in a later pass.
        if paired_len != len(remaining):
            survivors.append(f"{remaining[-1]}")

        remaining = survivors

# Generate one C source file per outer-loop unroll factor (1..8).  Every file
# contains a horizontal-sum helper, a "normal" kernel (a indexed as
# a[i + k*lda]) and a "transposed" kernel (a_t indexed as a_t[k + i*ldx]),
# both built from __m128d intrinsics that process two doubles at a time.
for unroll in range(1, 8+1, 1):
    with open(f"./small_dgemv_naive_c_implementation_ver1-2_XmmNaive_Unroll-OuterLoop-{unroll}.c", "w", encoding="utf-8") as p:
        p.writelines(f"#include <{header}>\n" for header in (
            "stddef.h",
            "stdio.h",
            "stdlib.h",
            "stdint.h",
            "math.h",
            "omp.h",
            "immintrin.h",
            "xmmintrin.h",
        ))

        # Helper that adds the two lanes of a __m128d and returns the scalar.
        # The unroll factor is part of the name, as in the kernels below.
        p.write(f"""
double sum_xmm_elements_v1_2_{unroll}(__m128d xmmX) {{
    __m128d xmm_high_low = _mm_add_pd(xmmX, _mm_unpackhi_pd(xmmX, xmmX));

    double result;
    _mm_store_sd(&result, xmm_high_low);

    return result;
}}
        """)

        # Normal ----------------------------------------------------------------------------------------------------------
        # Register layout inside the unrolled k loop:
        #   xmm0 .. xmm{unroll-1}         : x[k+j] broadcast to both lanes
        #   xmm{unroll}                   : y[i], y[i+1] (accumulator)
        #   xmm{unroll+1} .. xmm{2*unroll}: a[i + (k+j)*lda], two rows each

        # Scalar update for the leftover row when lda is odd.
        remain_op_i = "".join([f"""
            y[i] += a[i+(k+{j})*lda] * x[k+{j}];""" for j in range(unroll)])

        load_x = "".join([f"""
        xmm{j} = _mm_set1_pd(x[k+{j}]);""" for j in range(unroll)])

        load_y = f"""
            xmm{unroll+0} = _mm_loadu_pd(&y[i]);"""

        load_a = "".join([f"""
            xmm{j+unroll+1} = _mm_loadu_pd(&a[i+(k+{j})*lda]);""" for j in range(unroll)])

        mul_ax = "".join([f"""
            xmm{j+unroll+1} = _mm_mul_pd(xmm{j+unroll+1}, xmm{j});""" for j in range(unroll)])

        # Reduce y plus all products; the first listed register (the y
        # accumulator xmm{unroll}) is the left operand of the final add, so it
        # ends up holding the total and is the one stored back below.
        accm_op = []
        variables = [f"xmm{unroll+0}"] + [f"xmm{j+unroll+1}" for j in range(unroll)]
        accumulator_simd(accm_op, variables, indent_level=3)
        accm_op = "\n".join(accm_op)

        code = f"""
void mydgemv_n_ver1_2_unroll{unroll}(double a[], double x[], double y[], int64_t lda, int64_t ldx, int64_t ldy){{
    __m128d xmm0, {", ".join([f"xmm{j}" for j in range(1, 2+unroll*2)])};
    double tmp_x;
    for (int64_t i=0; i<lda; i++){{
        y[i] = 0.0;
    }}

    int64_t k_remain = ldx % {unroll}, i_remain = lda % 2;
    for (int64_t k=0; k<ldx-k_remain; k+={unroll}){{
        {load_x}
        for (int64_t i=0; i<lda-i_remain; i+=2){{
            {load_y}
            {load_a}
            {mul_ax}

            {accm_op}
            _mm_storeu_pd(&y[i], xmm{unroll});
        }}
        for (int64_t i = lda-i_remain; i<lda; i++){{
            {remain_op_i}
        }}
    }}

    for (int64_t k=ldx-k_remain; k<ldx; k++){{
        xmm0 = _mm_set1_pd(x[k]);
        for (int64_t i=0; i<lda-i_remain; i+=2){{
            xmm1 = _mm_loadu_pd(&y[i]);
            xmm2 = _mm_loadu_pd(&a[i+k*lda]);

            xmm2 = _mm_mul_pd(xmm0, xmm2);

            xmm1 = _mm_add_pd(xmm1, xmm2);

            _mm_storeu_pd(&y[i], xmm1);
        }}

        for (int64_t i = lda-i_remain; i<lda; i++){{
            y[i] += a[i+k*lda] * x[k+0];
        }}
    }}
}}
        """
        p.write(code)

        # Transposed ----------------------------------------------------------------------------------------------------------
        # Register layout inside the unrolled i loop:
        #   xmm0 .. xmm{unroll-1}         : per-row partial sums (two lanes)
        #   xmm{unroll}                   : x[k], x[k+1]
        #   xmm{unroll+1} .. xmm{2*unroll}: a_t[k + (i+j)*ldx], two columns each

        # Plain scalar dot product for the rows left over when lda is not a
        # multiple of the unroll factor.
        remain_op_i = f"""
    for (int64_t i=lda-i_remain; i<lda; i++){{
        double tmp=0.0;
        for (int64_t k=0; k<ldx; k++){{
            tmp += a_t[k+i*ldx] * x[k];
        }}
        y[i] = tmp;
    }}
        """

        init_y = "".join([f"""xmm{j} = _mm_setzero_pd();
        """ for j in range(unroll)])

        load_x = f"xmm{unroll} = _mm_loadu_pd(&x[k]);"

        load_a = "".join([f"""xmm{j+unroll+1} = _mm_loadu_pd(&a_t[k+(i+{j})*ldx]);
            """ for j in range(unroll)])

        mul_ax = "".join([f"""xmm{j+unroll+1} = _mm_mul_pd(xmm{unroll}, xmm{j+unroll+1});
            """ for j in range(unroll)])

        add_ax2y = "".join([f"""xmm{j} = _mm_add_pd(xmm{j}, xmm{j+unroll+1});
            """ for j in range(unroll)])

        store_y = "".join([f"""y[i+{j}] = sum_xmm_elements_v1_2_{unroll}(xmm{j});
        """ for j in range(unroll)])

        # The vector loop consumes k two at a time, so an odd ldx leaves one
        # column unprocessed; add its contribution with scalar code after the
        # horizontal sums have been stored.
        tail_k = "".join([f"""y[i+{j}] += a_t[k+(i+{j})*ldx] * x[k];
            """ for j in range(unroll)])

        code = f"""
void mydgemv_t_ver1_2_unroll{unroll}(double a_t[], double x[], double y[], int64_t lda, int64_t ldx, int64_t ldy){{
    __m128d xmm0, {", ".join([f"xmm{j}" for j in range(1, 1+unroll*2)])};

    int64_t k_remain = ldx % 2, i_remain = lda % {unroll};
    for (int64_t i=0; i<lda-i_remain; i+={unroll}){{
        {init_y}
        for (int64_t k=0; k<ldx-k_remain; k+=2){{
            {load_x}

            {load_a}
            {mul_ax}
            {add_ax2y}
        }}
        {store_y}
        for (int64_t k=ldx-k_remain; k<ldx; k++){{
            {tail_k}
        }}
    }}
    {remain_op_i}
}}
        """
        p.write(code)
