From d5339ea6090cab94650173b9ad677980b6c5a941 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Thu, 2 Jan 2014 15:08:14 -0600
Subject: [PATCH 01/12] move reducedim related stuff to a separate file

---
 base/abstractarray.jl | 25 -------------------------
 base/reducedim.jl     | 28 ++++++++++++++++++++++++++++
 base/sysimg.jl        |  1 +
 3 files changed, 29 insertions(+), 25 deletions(-)
 create mode 100644 base/reducedim.jl

diff --git a/base/abstractarray.jl b/base/abstractarray.jl
index c62a26b75a47e..c55b0ef651c30 100644
--- a/base/abstractarray.jl
+++ b/base/abstractarray.jl
@@ -1409,34 +1409,9 @@ function nnz{T}(a::AbstractArray{T})
     return n
 end
 
-# for reductions that expand 0 dims to 1
-reduced_dims(A, region) = ntuple(ndims(A), i->(in(i, region) ? 1 :
-                                               size(A,i)))
-
-# keep 0 dims in place
-reduced_dims0(A, region) = ntuple(ndims(A), i->(size(A,i)==0 ? 0 :
-                                                in(i, region) ? 1 :
-                                                size(A,i)))
-
-reducedim(f::Function, A, region, v0) =
-    reducedim(f, A, region, v0, similar(A, reduced_dims(A, region)))
-
-maximum{T}(A::AbstractArray{T}, region) =
-    isempty(A) ? similar(A,reduced_dims0(A,region)) : reducedim(scalarmax,A,region,typemin(T))
-minimum{T}(A::AbstractArray{T}, region) =
-    isempty(A) ? similar(A,reduced_dims0(A,region)) : reducedim(scalarmin,A,region,typemax(T))
-sum{T}(A::AbstractArray{T}, region)  = reducedim(+,A,region,zero(T))
-prod{T}(A::AbstractArray{T}, region) = reducedim(*,A,region,one(T))
-
-all(A::AbstractArray{Bool}, region) = reducedim(&,A,region,true)
-any(A::AbstractArray{Bool}, region) = reducedim(|,A,region,false)
-sum(A::AbstractArray{Bool}, region) = reducedim(+,A,region,0,similar(A,Int,reduced_dims(A,region)))
 sum(A::AbstractArray{Bool}) = nnz(A)
 prod(A::AbstractArray{Bool}) =
     error("use all() instead of prod() for boolean arrays")
-prod(A::AbstractArray{Bool}, region) =
-    error("use all() instead of prod() for boolean arrays")
-
 
 # a fast implementation of sum in sequential order (from left to right)
 function sum_seq{T}(a::AbstractArray{T}, ifirst::Int, ilast::Int)
diff --git a/base/reducedim.jl b/base/reducedim.jl
new file mode 100644
index 0000000000000..57d59d61b2fe2
--- /dev/null
+++ b/base/reducedim.jl
@@ -0,0 +1,28 @@
+
+# for reductions that expand 0 dims to 1
+reduced_dims(A, region) = ntuple(ndims(A), i->(in(i, region) ? 1 :
+                                               size(A,i)))
+
+# keep 0 dims in place
+reduced_dims0(A, region) = ntuple(ndims(A), i->(size(A,i)==0 ? 0 :
+                                                in(i, region) ? 1 :
+                                                size(A,i)))
+
+reducedim(f::Function, A, region, v0) =
+    reducedim(f, A, region, v0, similar(A, reduced_dims(A, region)))
+
+maximum{T}(A::AbstractArray{T}, region) =
+    isempty(A) ? similar(A,reduced_dims0(A,region)) : reducedim(scalarmax,A,region,typemin(T))
+minimum{T}(A::AbstractArray{T}, region) =
+    isempty(A) ? similar(A,reduced_dims0(A,region)) : reducedim(scalarmin,A,region,typemax(T))
+sum{T}(A::AbstractArray{T}, region)  = reducedim(+,A,region,zero(T))
+prod{T}(A::AbstractArray{T}, region) = reducedim(*,A,region,one(T))
+
+all(A::AbstractArray{Bool}, region) = reducedim(&,A,region,true)
+any(A::AbstractArray{Bool}, region) = reducedim(|,A,region,false)
+sum(A::AbstractArray{Bool}, region) = reducedim(+,A,region,0,similar(A,Int,reduced_dims(A,region)))
+
+prod(A::AbstractArray{Bool}, region) =
+    error("use all() instead of prod() for boolean arrays")
+
+    
\ No newline at end of file
diff --git a/base/sysimg.jl b/base/sysimg.jl
index e5da016db2fba..1503d7ebebf6b 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -42,6 +42,7 @@ include("pointer.jl")
 
 include("float.jl")
 include("reduce.jl")
+include("reducedim.jl")
 include("complex.jl")
 include("rational.jl")
 

From 994bbceb6813e8178caf1a94884c62ce52c04f59 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Thu, 2 Jan 2014 15:18:43 -0600
Subject: [PATCH 02/12] move reducedim codes from array.jl to reducedim.jl

---
 base/array.jl     | 76 -----------------------------------------------
 base/reducedim.jl | 75 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 77 deletions(-)

diff --git a/base/array.jl b/base/array.jl
index 197e203229bb6..7ddecb892de91 100644
--- a/base/array.jl
+++ b/base/array.jl
@@ -1401,82 +1401,6 @@ function indcopy(sz::Dims, I::(RangeIndex...))
 end
 
 
-## Reductions ##
-
-# TODO:
-# - find out why inner loop with dimsA[i] instead of size(A,i) is way too slow
-
-let reducedim_cache = nothing
-# generate the body of the N-d loop to compute a reduction
-function gen_reducedim_func(n, f)
-    ivars = { symbol(string("i",i)) for i=1:n }
-    # limits and vars for reduction loop
-    lo    = { symbol(string("lo",i)) for i=1:n }
-    hi    = { symbol(string("hi",i)) for i=1:n }
-    rvars = { symbol(string("r",i)) for i=1:n }
-    setlims = { quote
-        # each dim of reduction is either 1:sizeA or ivar:ivar
-        if in($i,region)
-            $(lo[i]) = 1
-            $(hi[i]) = size(A,$i)
-        else
-            $(lo[i]) = $(hi[i]) = $(ivars[i])
-        end
-               end for i=1:n }
-    rranges = { :( $(lo[i]):$(hi[i]) ) for i=1:n }  # lo:hi for all dims
-    body =
-    quote
-        _tot = v0
-        $(setlims...)
-        $(make_loop_nest(rvars, rranges,
-                         :(_tot = ($f)(_tot, A[$(rvars...)]))))
-        R[_ind] = _tot
-        _ind += 1
-    end
-    quote
-        local _F_
-        function _F_(f, A, region, R, v0)
-            _ind = 1
-            $(make_loop_nest(ivars, { :(1:size(R,$i)) for i=1:n }, body))
-        end
-        _F_
-    end
-end
-
-global reducedim
-function reducedim(f::Function, A, region, v0, R)
-    ndimsA = ndims(A)
-
-    if is(reducedim_cache,nothing)
-        reducedim_cache = Dict()
-    end
-
-    key = ndimsA
-    fname = :f
-
-    if  (is(f,+)     && (fname=:+;true)) ||
-        (is(f,*)     && (fname=:*;true)) ||
-        (is(f,scalarmax)   && (fname=:scalarmax;true)) ||
-        (is(f,scalarmin)   && (fname=:scalarmin;true)) ||
-        (is(f,&)     && (fname=:&;true)) ||
-        (is(f,|)     && (fname=:|;true))
-        key = (fname, ndimsA)
-    end
-
-    if !haskey(reducedim_cache,key)
-        fexpr = gen_reducedim_func(ndimsA, fname)
-        func = eval(fexpr)
-        reducedim_cache[key] = func
-    else
-        func = reducedim_cache[key]
-    end
-
-    func(f, A, region, R, v0)
-
-    return R
-end
-end
-
 ## Filter ##
 
 # given a function returning a boolean and an array, return matching elements
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 57d59d61b2fe2..0c129543f8e45 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -25,4 +25,77 @@ sum(A::AbstractArray{Bool}, region) = reducedim(+,A,region,0,similar(A,Int,reduc
 prod(A::AbstractArray{Bool}, region) =
     error("use all() instead of prod() for boolean arrays")
 
-    
\ No newline at end of file
+# TODO:
+# - find out why inner loop with dimsA[i] instead of size(A,i) is way too slow
+
+let reducedim_cache = nothing
+# generate the body of the N-d loop to compute a reduction
+function gen_reducedim_func(n, f)
+    ivars = { symbol(string("i",i)) for i=1:n }
+    # limits and vars for reduction loop
+    lo    = { symbol(string("lo",i)) for i=1:n }
+    hi    = { symbol(string("hi",i)) for i=1:n }
+    rvars = { symbol(string("r",i)) for i=1:n }
+    setlims = { quote
+        # each dim of reduction is either 1:sizeA or ivar:ivar
+        if in($i,region)
+            $(lo[i]) = 1
+            $(hi[i]) = size(A,$i)
+        else
+            $(lo[i]) = $(hi[i]) = $(ivars[i])
+        end
+               end for i=1:n }
+    rranges = { :( $(lo[i]):$(hi[i]) ) for i=1:n }  # lo:hi for all dims
+    body =
+    quote
+        _tot = v0
+        $(setlims...)
+        $(make_loop_nest(rvars, rranges,
+                         :(_tot = ($f)(_tot, A[$(rvars...)]))))
+        R[_ind] = _tot
+        _ind += 1
+    end
+    quote
+        local _F_
+        function _F_(f, A, region, R, v0)
+            _ind = 1
+            $(make_loop_nest(ivars, { :(1:size(R,$i)) for i=1:n }, body))
+        end
+        _F_
+    end
+end
+
+global reducedim
+function reducedim(f::Function, A, region, v0, R)
+    ndimsA = ndims(A)
+
+    if is(reducedim_cache,nothing)
+        reducedim_cache = Dict()
+    end
+
+    key = ndimsA
+    fname = :f
+
+    if  (is(f,+)     && (fname=:+;true)) ||
+        (is(f,*)     && (fname=:*;true)) ||
+        (is(f,scalarmax)   && (fname=:scalarmax;true)) ||
+        (is(f,scalarmin)   && (fname=:scalarmin;true)) ||
+        (is(f,&)     && (fname=:&;true)) ||
+        (is(f,|)     && (fname=:|;true))
+        key = (fname, ndimsA)
+    end
+
+    if !haskey(reducedim_cache,key)
+        fexpr = gen_reducedim_func(ndimsA, fname)
+        func = eval(fexpr)
+        reducedim_cache[key] = func
+    else
+        func = reducedim_cache[key]
+    end
+
+    func(f, A, region, R, v0)
+
+    return R
+end
+end
+

From 40603916d233d6247908f3f4235b6aca9d734177 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Fri, 3 Jan 2014 14:17:26 -0600
Subject: [PATCH 03/12] more efficient implementation of reduced_dims

---
 base/reducedim.jl | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 0c129543f8e45..b5f2be7400732 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -1,12 +1,43 @@
 
+## Functions to compute the reduced shape
+
 # for reductions that expand 0 dims to 1
-reduced_dims(A, region) = ntuple(ndims(A), i->(in(i, region) ? 1 :
-                                               size(A,i)))
+reduced_dims(a::AbstractArray, region) = reduced_dims(size(a), region)
+
+# for reductions that keep 0 dims as 0
+reduced_dims0(a::AbstractArray, region) = reduced_dims0(size(a), region)
+
+reduced_dims{N}(siz::NTuple{N,Int}, d::Int, rd::Int) = d == 1 ? tuple(rd, siz[d+1:N]...) :
+                                                       d == N ? tuple(siz[1:N-1]..., rd) :
+                                                       1 < d < N ? tuple(siz[1:d-1]..., rd, siz[d+1:N]...) : 
+                                                       siz
+
+reduced_dims{N}(siz::NTuple{N,Int}, d::Int) = reduced_dims(siz, d, 1)
+
+reduced_dims0{N}(siz::NTuple{N,Int}, d::Int) = reduced_dims(siz, d, (siz[d] == 0 ? 0 : 1))
+
+function reduced_dims{N}(siz::NTuple{N,Int}, region)
+    rsiz = [siz...]
+    for i in region
+        if 1 <= i <= N
+            rsiz[i] = 1
+        end
+    end
+    tuple(rsiz...)
+end
+
+function reduced_dims0{N}(siz::NTuple{N,Int}, region)
+    rsiz = [siz...]
+    for i in region
+        if i <= i <= N
+            rsiz[i] = (rsiz[i] == 0 ? 0 : 1)
+        end
+    end
+    tuple(rsiz...)
+end
+
 
-# keep 0 dims in place
-reduced_dims0(A, region) = ntuple(ndims(A), i->(size(A,i)==0 ? 0 :
-                                                in(i, region) ? 1 :
-                                                size(A,i)))
+# reduction codes
 
 reducedim(f::Function, A, region, v0) =
     reducedim(f, A, region, v0, similar(A, reduced_dims(A, region)))

From 579ab44d399bc688a43856228ca1d5cfed51b504 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Fri, 3 Jan 2014 22:14:13 -0600
Subject: [PATCH 04/12] Fast sum along dimensions over contiguous arrays

---
 base/reducedim.jl | 201 +++++++++++++++++++++++++++++++++++++++++++++-
 base/sysimg.jl    |   4 +-
 test/arrayops.jl  |   5 ++
 test/reducedim.jl |  44 ++++++++++
 4 files changed, 252 insertions(+), 2 deletions(-)
 create mode 100644 test/reducedim.jl

diff --git a/base/reducedim.jl b/base/reducedim.jl
index b5f2be7400732..f678e83145d27 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -37,7 +37,7 @@ function reduced_dims0{N}(siz::NTuple{N,Int}, region)
 end
 
 
-# reduction codes
+##### (original) reduction codes for arbitrary arrays
 
 reducedim(f::Function, A, region, v0) =
     reducedim(f, A, region, v0, similar(A, reduced_dims(A, region)))
@@ -130,3 +130,202 @@ function reducedim(f::Function, A, region, v0, R)
 end
 end
 
+
+
+##### Below are optimized codes for contiguous arrays
+
+function rcompress_dims{N}(siz::NTuple{N,Int}, region)
+    isrd = fill(false, N)
+    for k in region
+        if 1 <= k <= N
+            isrd[k] = true
+        end
+    end
+    
+    sdims = Int[]
+    sizehint(sdims, N)
+    b = isrd[1]
+    n = siz[1]
+    i = 2
+    while i <= N
+        if isrd[i] == b
+            n *= siz[i]
+        else
+            b = !b
+            push!(sdims, n)
+            n = siz[i]
+        end
+        i += 1
+    end
+    if n != 1
+        push!(sdims, n)
+    end
+    return (isrd[1], sdims)
+end
+
+function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
+    # Parameters:
+    #
+    # - fname:  the interface function name (e.g. sum, maximum)
+    # - comb:   the combination operation (e.g. +)
+    # - sker:   a kernel function that reduces a vector (or a range of it) to a scalar
+    # - ker0!:  a kernel that initializes an accumulator array using the first column of terms
+    # - ker1!:  a kernel that accumulates a term column to an accumulating column
+    #
+
+
+    fname! = symbol("$(fname)!")
+    fa! = symbol("$(fname)_a!")
+    fb! = symbol("$(fname)_b!")
+    
+    quote
+        global $(fname!)
+        function $(fname!)(dst::Array, a::Array, region)
+            isrd1, secs = rcompress_dims(size(a), region)
+            @assert prod(secs) == length(a)
+            if isrd1
+                $(fa!)(true, dst, 0, a, 0, secs[end:-1:1]...)
+            else
+                $(fb!)(true, dst, 0, a, 0, secs[end:-1:1]...)
+            end
+            dst
+        end
+    
+        # $(fa!)
+        global $(fa!)
+        function $(fa!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int)
+            if isinit
+                dst[od+1] = $(sker)(a, oa+1, oa+n1)
+            else
+                dst[od+1] = $(comb)(dst[od+1], $(sker)(a, oa+1, oa+n1))
+            end
+        end
+
+        function $(fa!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int, n2::Int)
+            if isinit
+                for j = 1:n1
+                    alast = oa + n2
+                    dst[od+j] = $(sker)(a, oa+1, alast)
+                    oa = alast
+                end
+            else
+                for j = 1:n1
+                    alast = oa + n2
+                    dst[od+j] = $(comb)(dst[od+j], $(sker)(a, oa+1, alast))
+                    oa = alast
+                end
+            end
+        end
+
+        function $(fa!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int, n2::Int, n3::Int, ns::Int...)
+            as::Int = *(n2, n3, ns...)
+            if length(ns) & 1 == 0
+                $(fa!)(isinit, dst, od, a, oa, n2, n3, ns...)
+                oa += as    
+    
+                for j = 2:n1
+                    $(fa!)(false, dst, od, a, oa, n2, n3, ns...)
+                    oa += as
+                end                                 
+            else 
+                ds::Int = *(n3, ns[2:2:end]...)
+                for j = 1:n1
+                    $(fa!)(isinit, dst, od, a, oa, n2, n3, ns...)
+                    od += ds
+                    oa += as
+                end   
+            end
+        end
+
+        # $(fb!)
+        global $(fb!)
+        function $(fb!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int)
+            if isinit
+                $(ker0!)(dst, od, a, oa, n1)
+            else
+                $(ker1!)(dst, od, a, oa, n1)
+            end
+        end
+
+        function $(fb!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int, n2::Int)
+            if isinit
+                $(ker0!)(dst, od, a, oa, n2)
+            else
+                $(ker1!)(dst, od, a, oa, n2)        
+            end
+            oa += n2
+    
+            for j = 2:n1
+                $(ker1!)(dst, od, a, oa, n2)
+                oa += n2
+            end
+        end
+
+        function $(fb!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int, n2::Int, n3::Int, ns::Int...)
+            as = *(n2, n3, ns...)
+            if length(ns) & 1 == 0
+                ds::Int = *(n3, ns[2:2:end]...)
+                for j = 1:n1
+                    $(fb!)(isinit, dst, od, a, oa, n2, n3, ns...)
+                    od += ds
+                    oa += as
+                end
+            else
+                $(fb!)(isinit, dst, od, a, oa, n2, n3, ns...)
+                oa += as
+    
+                for j = 2:n1
+                    $(fb!)(false, dst, od, a, oa, n2, n3, ns...)
+                    oa += as
+                end
+            end
+        end
+    end
+end
+
+macro code_reducedim(fname, comb, sker, ker0, ker1)
+    esc(generate_reducedim_funcs(fname, comb, sker, ker0, ker1))
+end
+
+
+# sum
+
+function vcopy!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
+    for i = 1:n
+        @inbounds dst[od+i] = a[oa+i]
+    end    
+end
+
+vcopy!{T}(dst::Array{T}, od::Int, a::Array{T}, oa::Int, n::Int) = copy!(dst, od+1, a, oa+1, n)
+
+function vadd!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
+    for i = 1:n
+        @inbounds dst[od+i] += a[oa+i]
+    end
+end
+
+@code_reducedim sum (+) sum_seq vcopy! vadd!
+function sum{T<:Number}(a::Array{T}, region)
+    dst = Array(T, reduced_dims(a, region))
+    if !isempty(dst)
+        if isempty(a)
+            fill!(dst, zero(T))
+        else
+            sum!(dst, a, region)
+        end
+    end
+    return dst
+end
+
+function sum(a::Array{Bool}, region)
+    dst = Array(Int, reduced_dims(a, region))
+    if !isempty(dst)
+        if isempty(a)
+            fill!(dst, 0)
+        else
+            sum!(dst, a, region)
+        end
+    end
+    return dst
+end
+
diff --git a/base/sysimg.jl b/base/sysimg.jl
index 1503d7ebebf6b..f019b3c14e9c4 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -42,7 +42,6 @@ include("pointer.jl")
 
 include("float.jl")
 include("reduce.jl")
-include("reducedim.jl")
 include("complex.jl")
 include("rational.jl")
 
@@ -180,6 +179,9 @@ importall .LinAlg
 include("broadcast.jl")
 importall .Broadcast
 
+# reduction along dims
+include("reducedim.jl")  # macros in this file relies on string.jl
+
 # signal processing
 include("fftw.jl")
 include("dsp.jl")
diff --git a/test/arrayops.jl b/test/arrayops.jl
index dbc8d5945d893..83be870ee8ce8 100644
--- a/test/arrayops.jl
+++ b/test/arrayops.jl
@@ -790,3 +790,8 @@ A = [NaN]; B = [NaN]
 @test !(A==B)
 @test isequal(A,B)
 @test A!==B
+
+# complete testsuite for reducedim
+
+include("reducedim.jl")
+
diff --git a/test/reducedim.jl b/test/reducedim.jl
new file mode 100644
index 0000000000000..0a79003ef0d88
--- /dev/null
+++ b/test/reducedim.jl
@@ -0,0 +1,44 @@
+
+# Supporting routines
+
+@test Base.rcompress_dims((3, 4), 1) == (true,  [3, 4])
+@test Base.rcompress_dims((3, 4), 2) == (false, [3, 4])
+@test Base.rcompress_dims((3, 4), 3) == (false, [12])
+@test Base.rcompress_dims((3, 4), (1, 2)) == (true, [12])
+
+@test Base.rcompress_dims((3, 4, 5), 1) == (true,  [3, 20])
+@test Base.rcompress_dims((3, 4, 5), 2) == (false, [3, 4, 5])
+@test Base.rcompress_dims((3, 4, 5), 3) == (false, [12, 5])
+@test Base.rcompress_dims((3, 4, 5), (1, 2)) == (true,  [12, 5])
+@test Base.rcompress_dims((3, 4, 5), (1, 3)) == (true,  [3, 4, 5])
+@test Base.rcompress_dims((3, 4, 5), (2, 3)) == (false, [3, 20])
+@test Base.rcompress_dims((3, 4, 5), (1, 2, 3)) == (true, [60])
+
+@test Base.rcompress_dims((3, 4, 5, 2), 1) == (true,  [3, 40])
+@test Base.rcompress_dims((3, 4, 5, 2), 2) == (false, [3, 4, 10])
+@test Base.rcompress_dims((3, 4, 5, 2), 3) == (false, [12, 5, 2])
+@test Base.rcompress_dims((3, 4, 5, 2), 4) == (false, [60, 2])
+@test Base.rcompress_dims((3, 4, 5, 2), (1, 2)) == (true,  [12, 10])
+@test Base.rcompress_dims((3, 4, 5, 2), (1, 3)) == (true,  [3, 4, 5, 2])
+@test Base.rcompress_dims((3, 4, 5, 2), (1, 4)) == (true,  [3, 20, 2])
+@test Base.rcompress_dims((3, 4, 5, 2), (2, 3)) == (false, [3, 20, 2])
+@test Base.rcompress_dims((3, 4, 5, 2), (2, 4)) == (false, [3, 4, 5, 2])
+@test Base.rcompress_dims((3, 4, 5, 2), (3, 4)) == (false, [12, 10])
+@test Base.rcompress_dims((3, 4, 5, 2), (1, 2, 3)) == (true,  [60, 2])
+@test Base.rcompress_dims((3, 4, 5, 2), (1, 2, 4)) == (true,  [12, 5, 2])
+@test Base.rcompress_dims((3, 4, 5, 2), (1, 3, 4)) == (true,  [3, 4, 10])
+@test Base.rcompress_dims((3, 4, 5, 2), (2, 3, 4)) == (false, [3, 40])
+@test Base.rcompress_dims((3, 4, 5, 2), (1, 2, 3, 4)) == (true, [120])
+
+# main tests
+
+safe_sum{T}(A::Array{T}, region) = reducedim(+,A,region,zero(T))
+
+Areduc = rand(3, 4, 5, 6)
+for region in {
+	1, 2, 3, 4, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), 
+	(1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)}
+
+	@test_approx_eq sum(Areduc, region) safe_sum(Areduc, region)
+end
+

From 2ffdbfaaf799df0b2c84100f1b46ed16f4c72acd Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Fri, 3 Jan 2014 23:17:10 -0600
Subject: [PATCH 05/12] specialized for cases with region being an integer

---
 base/reducedim.jl | 30 +++++++++++++++++++++++++-----
 test/reducedim.jl |  2 +-
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index f678e83145d27..c5edad11636a4 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -180,13 +180,33 @@ function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
     
     quote
         global $(fname!)
+        function $(fname!)(dst::Array, a::Array, dim::Integer)
+            nd = ndims(a)
+            siz = size(a)
+            if 1 <= dim <= nd
+                if dim == 1
+                    $(fa!)(true, dst, 0, a, 0, prod(siz[2:nd]), siz[1])
+                elseif dim == nd
+                    $(fb!)(true, dst, 0, a, 0, siz[nd], prod(siz[1:nd-1]))
+                else
+                    $(fb!)(true, dst, 0, a, 0, prod(siz[dim+1:nd]), siz[dim], prod(siz[1:dim-1]))
+                end
+            else
+                $(ker0!)(dst, 0, a, 0, length(a))
+            end
+            dst
+        end
+
         function $(fname!)(dst::Array, a::Array, region)
-            isrd1, secs = rcompress_dims(size(a), region)
-            @assert prod(secs) == length(a)
-            if isrd1
-                $(fa!)(true, dst, 0, a, 0, secs[end:-1:1]...)
+            if length(region) == 1
+                $(fname!)(dst, a, region[1])
             else
-                $(fb!)(true, dst, 0, a, 0, secs[end:-1:1]...)
+                isrd1, secs = rcompress_dims(size(a), region)
+                if isrd1
+                    $(fa!)(true, dst, 0, a, 0, secs[end:-1:1]...)
+                else
+                    $(fb!)(true, dst, 0, a, 0, secs[end:-1:1]...)
+                end                
             end
             dst
         end
diff --git a/test/reducedim.jl b/test/reducedim.jl
index 0a79003ef0d88..662098305ed6f 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -36,7 +36,7 @@ safe_sum{T}(A::Array{T}, region) = reducedim(+,A,region,zero(T))
 
 Areduc = rand(3, 4, 5, 6)
 for region in {
-	1, 2, 3, 4, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), 
+	1, 2, 3, 4, 5, (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4), 
 	(1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)}
 
 	@test_approx_eq sum(Areduc, region) safe_sum(Areduc, region)

From 01dfa4aa60b7b10703f2214e4c8856a527b94036 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Sat, 4 Jan 2014 14:26:59 -0600
Subject: [PATCH 06/12] some tuning of the vector kernels

---
 base/reducedim.jl | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index c5edad11636a4..920c6c53b026a 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -173,7 +173,6 @@ function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
     # - ker1!:  a kernel that accumulates a term column to an accumulating column
     #
 
-
     fname! = symbol("$(fname)!")
     fa! = symbol("$(fname)_a!")
     fb! = symbol("$(fname)_b!")
@@ -192,7 +191,7 @@ function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
                     $(fb!)(true, dst, 0, a, 0, prod(siz[dim+1:nd]), siz[dim], prod(siz[1:dim-1]))
                 end
             else
-                $(ker0!)(dst, 0, a, 0, length(a))
+                $(ker0!)(dst, 1, a, 1, length(a))
             end
             dst
         end
@@ -261,22 +260,22 @@ function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
         global $(fb!)
         function $(fb!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int)
             if isinit
-                $(ker0!)(dst, od, a, oa, n1)
+                $(ker0!)(dst, od+1, a, oa+1, n1)
             else
-                $(ker1!)(dst, od, a, oa, n1)
+                $(ker1!)(dst, od+1, a, oa+1, n1)
             end
         end
 
         function $(fb!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int, n2::Int)
             if isinit
-                $(ker0!)(dst, od, a, oa, n2)
+                $(ker0!)(dst, od+1, a, oa+1, n2)
             else
-                $(ker1!)(dst, od, a, oa, n2)        
+                $(ker1!)(dst, od+1, a, oa+1, n2)        
             end
             oa += n2
     
             for j = 2:n1
-                $(ker1!)(dst, od, a, oa, n2)
+                $(ker1!)(dst, od+1, a, oa+1, n2)
                 oa += n2
             end
         end
@@ -312,15 +311,19 @@ end
 
 function vcopy!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
     for i = 1:n
-        @inbounds dst[od+i] = a[oa+i]
+        @inbounds dst[od] = a[oa]
+        od += 1
+        oa += 1
     end    
 end
 
-vcopy!{T}(dst::Array{T}, od::Int, a::Array{T}, oa::Int, n::Int) = copy!(dst, od+1, a, oa+1, n)
+vcopy!{T}(dst::Array{T}, od::Int, a::Array{T}, oa::Int, n::Int) = copy!(dst, od, a, oa, n)
 
 function vadd!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
     for i = 1:n
-        @inbounds dst[od+i] += a[oa+i]
+        @inbounds dst[od] += a[oa]
+        od += 1
+        oa += 1
     end
 end
 

From c3771fe90d6f2e13f87f18ccd1249f2dbf3cc0c8 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Sat, 4 Jan 2014 14:32:28 -0600
Subject: [PATCH 07/12] allow specify the result type of sum.

---
 base/reducedim.jl | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 920c6c53b026a..7ba59a8d62dd3 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -328,23 +328,12 @@ function vadd!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
 end
 
 @code_reducedim sum (+) sum_seq vcopy! vadd!
-function sum{T<:Number}(a::Array{T}, region)
-    dst = Array(T, reduced_dims(a, region))
-    if !isempty(dst)
-        if isempty(a)
-            fill!(dst, zero(T))
-        else
-            sum!(dst, a, region)
-        end
-    end
-    return dst
-end
 
-function sum(a::Array{Bool}, region)
-    dst = Array(Int, reduced_dims(a, region))
+function sum{R}(rt::Type{R}, a::Array, region)
+    dst = Array(R, reduced_dims(a, region))
     if !isempty(dst)
         if isempty(a)
-            fill!(dst, 0)
+            fill!(dst, zero(R))
         else
             sum!(dst, a, region)
         end
@@ -352,3 +341,6 @@ function sum(a::Array{Bool}, region)
     return dst
 end
 
+sum{T}(a::Array{T}, region) = sum(T, a, region)
+sum(a::Array{Bool}, region) = sum(Int, a, region)
+

From 483e13f040f5c8c1ef67ccdb4d60c80df9c02335 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Sat, 4 Jan 2014 14:41:21 -0600
Subject: [PATCH 08/12] allow minimum and maximum over a subvector

---
 base/abstractarray.jl | 54 ++++++++++++-------------------------------
 1 file changed, 15 insertions(+), 39 deletions(-)

diff --git a/base/abstractarray.jl b/base/abstractarray.jl
index c55b0ef651c30..2dadc1ceb1364 100644
--- a/base/abstractarray.jl
+++ b/base/abstractarray.jl
@@ -1580,45 +1580,19 @@ function prod{T}(A::AbstractArray{T})
     v
 end
 
-function minimum{T<:Real}(A::AbstractArray{T})
-    if isempty(A); error("argument must not be empty"); end
-    v = A[1]
-    for i=2:length(A)
-        @inbounds x = A[i]
-        if x < v
-            v = x
-        end
-    end
-    v
-end
 
-function maximum{T<:Real}(A::AbstractArray{T})
-    if isempty(A); error("argument must not be empty"); end
-    v = A[1]
-    for i=2:length(A)
-        @inbounds x = A[i]
-        if x > v
-            v = x
-        end
-    end
-    v
-end
-
-# specialized versions for floating-point, which deal with NaNs
-
-function minimum{T<:FloatingPoint}(A::AbstractArray{T})
-    if isempty(A); error("argument must not be empty"); end
-    n = length(A)
+function minimum{T<:Real}(A::AbstractArray{T}, first::Int, last::Int)
+    if first > last; error("argument range must not be empty"); end
 
     # locate the first non NaN number
-    v = A[1]
-    i = 2
-    while v != v && i <= n
+    v = A[first]
+    i = first + 1
+    while v != v && i <= last
         @inbounds v = A[i]
         i += 1
     end
 
-    while i <= n
+    while i <= last
         @inbounds x = A[i]
         if x < v
             v = x
@@ -1628,19 +1602,18 @@ function minimum{T<:FloatingPoint}(A::AbstractArray{T})
     v
 end
 
-function maximum{T<:FloatingPoint}(A::AbstractArray{T})
-    if isempty(A); error("argument must not be empty"); end
-    n = length(A)
+function maximum{T<:Real}(A::AbstractArray{T}, first::Int, last::Int)
+    if first > last; error("argument range must not be empty"); end
 
     # locate the first non NaN number
-    v = A[1]
-    i = 2
-    while v != v && i <= n
+    v = A[first]
+    i = first + 1
+    while v != v && i <= last
         @inbounds v = A[i]
         i += 1
     end
 
-    while i <= n
+    while i <= last
         @inbounds x = A[i]
         if x > v
             v = x
@@ -1650,6 +1623,9 @@ function maximum{T<:FloatingPoint}(A::AbstractArray{T})
     v
 end
 
+minimum{T<:Real}(A::AbstractArray{T}) = minimum(A, 1, length(A))
+maximum{T<:Real}(A::AbstractArray{T}) = maximum(A, 1, length(A))
+
 # extrema
 
 function extrema{T<:Real}(A::AbstractArray{T})

From 54ba4700569a24d0d454824c030b96f92b49daaf Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Sat, 4 Jan 2014 14:48:54 -0600
Subject: [PATCH 09/12] apply the new reduction framework to maximum & minimum

---
 base/reducedim.jl | 24 ++++++++++++++++++++++++
 test/reducedim.jl |  6 +++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 7ba59a8d62dd3..fe969608bf936 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -344,3 +344,27 @@ end
 sum{T}(a::Array{T}, region) = sum(T, a, region)
 sum(a::Array{Bool}, region) = sum(Int, a, region)
 
+
+# maximum & minimum
+
+function vmax!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
+    for i = 1:n
+        @inbounds dst[od] = scalarmax(dst[od], a[oa])
+        od += 1
+        oa += 1
+    end
+end
+
+function vmin!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
+    for i = 1:n
+        @inbounds dst[od] = scalarmin(dst[od], a[oa])
+        od += 1
+        oa += 1
+    end
+end
+
+@code_reducedim maximum scalarmax maximum vcopy! vmax!
+@code_reducedim minimum scalarmin minimum vcopy! vmin!
+
+
+
diff --git a/test/reducedim.jl b/test/reducedim.jl
index 662098305ed6f..eb1b506f4863f 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -32,7 +32,9 @@
 
 # main tests
 
-safe_sum{T}(A::Array{T}, region) = reducedim(+,A,region,zero(T))
+safe_sum{T}(A::Array{T}, region) = reducedim(+, A, region, zero(T))
+safe_maximum{T}(A::Array{T}, region) = reducedim(Base.scalarmax, A, region, typemin(T))
+safe_minimum{T}(A::Array{T}, region) = reducedim(Base.scalarmin, A, region, typemax(T))
 
 Areduc = rand(3, 4, 5, 6)
 for region in {
@@ -40,5 +42,7 @@ for region in {
 	(1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)}
 
 	@test_approx_eq sum(Areduc, region) safe_sum(Areduc, region)
+	@test_approx_eq maximum(Areduc, region) safe_maximum(Areduc, region)
+	@test_approx_eq minimum(Areduc, region) safe_minimum(Areduc, region)
 end
 

From 3b1a8ead33aba5f1c2fabad8009c631e8126043c Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Sat, 4 Jan 2014 15:01:36 -0600
Subject: [PATCH 10/12] apply new reduction framework to prod

---
 base/abstractarray.jl | 15 +++++++++------
 base/reducedim.jl     | 26 ++++++++++++++++++++++++++
 test/reducedim.jl     |  2 ++
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/base/abstractarray.jl b/base/abstractarray.jl
index 2dadc1ceb1364..2ce2121296ac4 100644
--- a/base/abstractarray.jl
+++ b/base/abstractarray.jl
@@ -1569,16 +1569,19 @@ function cumsum_kbn{T<:FloatingPoint}(A::AbstractArray{T}, axis::Integer)
     return B + C
 end
 
-function prod{T}(A::AbstractArray{T})
-    if isempty(A)
+
+function prod_rgn{T}(A::AbstractArray{T}, first::Int, last::Int)
+    if first > last
         return one(T)
     end
-    v = A[1]
-    for i=2:length(A)
-        @inbounds v *= A[i]
+    i = first
+    v = A[i]
+    while i < last
+        @inbounds v *= A[i+=1]
     end
-    v
+    return v
 end
+prod{T}(A::AbstractArray{T}) = prod_rgn(A, 1, length(A))
 
 
 function minimum{T<:Real}(A::AbstractArray{T}, first::Int, last::Int)
diff --git a/base/reducedim.jl b/base/reducedim.jl
index fe969608bf936..b1fa2a14f3db2 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -344,6 +344,32 @@ end
 sum{T}(a::Array{T}, region) = sum(T, a, region)
 sum(a::Array{Bool}, region) = sum(Int, a, region)
 
+# prod
+
+function vmultiply!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
+    for i = 1:n
+        @inbounds dst[od] *= a[oa]
+        od += 1
+        oa += 1
+    end
+end
+
+@code_reducedim prod (*) prod_rgn vcopy! vmultiply!
+
+function prod{R}(rt::Type{R}, a::Array, region)
+    dst = Array(R, reduced_dims(a, region))
+    if !isempty(dst)
+        if isempty(a)
+            fill!(dst, one(R))
+        else
+            prod!(dst, a, region)
+        end
+    end
+    return dst
+end
+
+prod{T}(a::Array{T}, region) = prod(T, a, region)
+
 
 # maximum & minimum
 
diff --git a/test/reducedim.jl b/test/reducedim.jl
index eb1b506f4863f..b66d227d10ab6 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -33,6 +33,7 @@
 # main tests
 
 safe_sum{T}(A::Array{T}, region) = reducedim(+, A, region, zero(T))
+safe_prod{T}(A::Array{T}, region) = reducedim(*, A, region, one(T))
 safe_maximum{T}(A::Array{T}, region) = reducedim(Base.scalarmax, A, region, typemin(T))
 safe_minimum{T}(A::Array{T}, region) = reducedim(Base.scalarmin, A, region, typemax(T))
 
@@ -42,6 +43,7 @@ for region in {
 	(1, 2, 3), (1, 3, 4), (2, 3, 4), (1, 2, 3, 4)}
 
 	@test_approx_eq sum(Areduc, region) safe_sum(Areduc, region)
+	@test_approx_eq prod(Areduc, region) safe_prod(Areduc, region)
 	@test_approx_eq maximum(Areduc, region) safe_maximum(Areduc, region)
 	@test_approx_eq minimum(Areduc, region) safe_minimum(Areduc, region)
 end

From 9ecc3213ece7ecedec5ff8062f2e686e64a6f93f Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Sat, 4 Jan 2014 15:10:33 -0600
Subject: [PATCH 11/12] updates to the maximum & minimum codes

---
 base/abstractarray.jl |  8 ++++----
 base/reducedim.jl     | 20 +++++++++++++++++---
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/base/abstractarray.jl b/base/abstractarray.jl
index 2ce2121296ac4..eb1456b2042b9 100644
--- a/base/abstractarray.jl
+++ b/base/abstractarray.jl
@@ -1584,7 +1584,7 @@ end
 prod{T}(A::AbstractArray{T}) = prod_rgn(A, 1, length(A))
 
 
-function minimum{T<:Real}(A::AbstractArray{T}, first::Int, last::Int)
+function minimum_rgn{T<:Real}(A::AbstractArray{T}, first::Int, last::Int)
     if first > last; error("argument range must not be empty"); end
 
     # locate the first non NaN number
@@ -1605,7 +1605,7 @@ function minimum{T<:Real}(A::AbstractArray{T}, first::Int, last::Int)
     v
 end
 
-function maximum{T<:Real}(A::AbstractArray{T}, first::Int, last::Int)
+function maximum_rgn{T<:Real}(A::AbstractArray{T}, first::Int, last::Int)
     if first > last; error("argument range must not be empty"); end
 
     # locate the first non NaN number
@@ -1626,8 +1626,8 @@ function maximum{T<:Real}(A::AbstractArray{T}, first::Int, last::Int)
     v
 end
 
-minimum{T<:Real}(A::AbstractArray{T}) = minimum(A, 1, length(A))
-maximum{T<:Real}(A::AbstractArray{T}) = maximum(A, 1, length(A))
+minimum{T<:Real}(A::AbstractArray{T}) = minimum_rgn(A, 1, length(A))
+maximum{T<:Real}(A::AbstractArray{T}) = maximum_rgn(A, 1, length(A))
 
 # extrema
 
diff --git a/base/reducedim.jl b/base/reducedim.jl
index b1fa2a14f3db2..879c45ecc7863 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -14,7 +14,7 @@ reduced_dims{N}(siz::NTuple{N,Int}, d::Int, rd::Int) = d == 1 ? tuple(rd, siz[d+
 
 reduced_dims{N}(siz::NTuple{N,Int}, d::Int) = reduced_dims(siz, d, 1)
 
-reduced_dims0{N}(siz::NTuple{N,Int}, d::Int) = reduced_dims(siz, d, (siz[d] == 0 ? 0 : 1))
+reduced_dims0{N}(siz::NTuple{N,Int}, d::Int) = 1 <= d <= N ? reduced_dims(siz, d, (siz[d] == 0 ? 0 : 1)) : siz
 
 function reduced_dims{N}(siz::NTuple{N,Int}, region)
     rsiz = [siz...]
@@ -389,8 +389,22 @@ function vmin!(dst::Array, od::Int, a::Array, oa::Int, n::Int)
     end
 end
 
-@code_reducedim maximum scalarmax maximum vcopy! vmax!
-@code_reducedim minimum scalarmin minimum vcopy! vmin!
+@code_reducedim maximum scalarmax maximum_rgn vcopy! vmax!
+@code_reducedim minimum scalarmin minimum_rgn vcopy! vmin!
 
+function maximum{T}(a::Array{T}, region)
+    dst = Array(T, reduced_dims0(a, region))
+    if !isempty(dst)
+        maximum!(dst, a, region)
+    end
+    return dst
+end
 
+function minimum{T}(a::Array{T}, region)
+    dst = Array(T, reduced_dims0(a, region))
+    if !isempty(dst)
+        minimum!(dst, a, region)
+    end
+    return dst
+end
 

From da2d02c0da44a06f958bc2f73e912ea59175eb71 Mon Sep 17 00:00:00 2001
From: Dahua Lin <lindahua@gmail.com>
Date: Sat, 4 Jan 2014 15:32:58 -0600
Subject: [PATCH 12/12] more flexible code generation functions

Now the code-gen allows generating reduction functions that may contain more than one input arrays.
---
 base/reducedim.jl | 79 ++++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 36 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 879c45ecc7863..20f424146d56c 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -163,10 +163,13 @@ function rcompress_dims{N}(siz::NTuple{N,Int}, region)
     return (isrd[1], sdims)
 end
 
-function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
+function generate_reducedim_funcs(fname, params, args, sizexpr, comb, sker, ker0!, ker1!)
     # Parameters:
     #
-    # - fname:  the interface function name (e.g. sum, maximum)
+    # - fname:   the interface function name (e.g. sum, maximum)
+    # - params:  a list of input parameters in function signatures
+    # - args:    a list of argument symbols 
+    # - sizexpr: the expression that calculates the input size
     # - comb:   the combination operation (e.g. +)
     # - sker:   a kernel function that reduces a vector (or a range of it) to a scalar
     # - ker0!:  a kernel that initializes an accumulator array using the first column of terms
@@ -179,32 +182,32 @@ function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
     
     quote
         global $(fname!)
-        function $(fname!)(dst::Array, a::Array, dim::Integer)
-            nd = ndims(a)
-            siz = size(a)
+        function $(fname!)(dst::Array, $(params...), dim::Integer)
+            siz = $(sizexpr)
+            nd = length(siz)
             if 1 <= dim <= nd
                 if dim == 1
-                    $(fa!)(true, dst, 0, a, 0, prod(siz[2:nd]), siz[1])
+                    $(fa!)(true, dst, 0, $(args...), 0, prod(siz[2:nd]), siz[1])
                 elseif dim == nd
-                    $(fb!)(true, dst, 0, a, 0, siz[nd], prod(siz[1:nd-1]))
+                    $(fb!)(true, dst, 0, $(args...), 0, siz[nd], prod(siz[1:nd-1]))
                 else
-                    $(fb!)(true, dst, 0, a, 0, prod(siz[dim+1:nd]), siz[dim], prod(siz[1:dim-1]))
+                    $(fb!)(true, dst, 0, $(args...), 0, prod(siz[dim+1:nd]), siz[dim], prod(siz[1:dim-1]))
                 end
             else
-                $(ker0!)(dst, 1, a, 1, length(a))
+                $(ker0!)(dst, 1, $(args...), 1, prod(siz))
             end
             dst
         end
 
-        function $(fname!)(dst::Array, a::Array, region)
+        function $(fname!)(dst::Array, $(params...), region)
             if length(region) == 1
-                $(fname!)(dst, a, region[1])
+                $(fname!)(dst, $(args...), region[1])
             else
-                isrd1, secs = rcompress_dims(size(a), region)
+                isrd1, secs = rcompress_dims($(sizexpr), region)
                 if isrd1
-                    $(fa!)(true, dst, 0, a, 0, secs[end:-1:1]...)
+                    $(fa!)(true, dst, 0, $(args...), 0, secs[end:-1:1]...)
                 else
-                    $(fb!)(true, dst, 0, a, 0, secs[end:-1:1]...)
+                    $(fb!)(true, dst, 0, $(args...), 0, secs[end:-1:1]...)
                 end                
             end
             dst
@@ -212,44 +215,44 @@ function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
     
         # $(fa!)
         global $(fa!)
-        function $(fa!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int)
+        function $(fa!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int)
             if isinit
-                dst[od+1] = $(sker)(a, oa+1, oa+n1)
+                dst[od+1] = $(sker)($(args...), oa+1, oa+n1)
             else
-                dst[od+1] = $(comb)(dst[od+1], $(sker)(a, oa+1, oa+n1))
+                dst[od+1] = $(comb)(dst[od+1], $(sker)($(args...), oa+1, oa+n1))
             end
         end
 
-        function $(fa!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int, n2::Int)
+        function $(fa!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int, n2::Int)
             if isinit
                 for j = 1:n1
                     alast = oa + n2
-                    dst[od+j] = $(sker)(a, oa+1, alast)
+                    dst[od+j] = $(sker)($(args...), oa+1, alast)
                     oa = alast
                 end
             else
                 for j = 1:n1
                     alast = oa + n2
-                    dst[od+j] = $(comb)(dst[od+j], $(sker)(a, oa+1, alast))
+                    dst[od+j] = $(comb)(dst[od+j], $(sker)($(args...), oa+1, alast))
                     oa = alast
                 end
             end
         end
 
-        function $(fa!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int, n2::Int, n3::Int, ns::Int...)
+        function $(fa!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int, n2::Int, n3::Int, ns::Int...)
             as::Int = *(n2, n3, ns...)
             if length(ns) & 1 == 0
-                $(fa!)(isinit, dst, od, a, oa, n2, n3, ns...)
+                $(fa!)(isinit, dst, od, $(args...), oa, n2, n3, ns...)
                 oa += as    
     
                 for j = 2:n1
-                    $(fa!)(false, dst, od, a, oa, n2, n3, ns...)
+                    $(fa!)(false, dst, od, $(args...), oa, n2, n3, ns...)
                     oa += as
                 end                                 
             else 
                 ds::Int = *(n3, ns[2:2:end]...)
                 for j = 1:n1
-                    $(fa!)(isinit, dst, od, a, oa, n2, n3, ns...)
+                    $(fa!)(isinit, dst, od, $(args...), oa, n2, n3, ns...)
                     od += ds
                     oa += as
                 end   
@@ -258,43 +261,42 @@ function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
 
         # $(fb!)
         global $(fb!)
-        function $(fb!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int)
+        function $(fb!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int)
             if isinit
-                $(ker0!)(dst, od+1, a, oa+1, n1)
+                $(ker0!)(dst, od+1, $(args...), oa+1, n1)
             else
-                $(ker1!)(dst, od+1, a, oa+1, n1)
+                $(ker1!)(dst, od+1, $(args...), oa+1, n1)
             end
         end
 
-        function $(fb!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int, n2::Int)
+        function $(fb!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int, n2::Int)
             if isinit
-                $(ker0!)(dst, od+1, a, oa+1, n2)
+                $(ker0!)(dst, od+1, $(args...), oa+1, n2)
             else
-                $(ker1!)(dst, od+1, a, oa+1, n2)        
+                $(ker1!)(dst, od+1, $(args...), oa+1, n2)        
             end
             oa += n2
-    
             for j = 2:n1
-                $(ker1!)(dst, od+1, a, oa+1, n2)
+                $(ker1!)(dst, od+1, $(args...), oa+1, n2)
                 oa += n2
             end
         end
 
-        function $(fb!)(isinit::Bool, dst::Array, od::Int, a::Array, oa::Int, n1::Int, n2::Int, n3::Int, ns::Int...)
+        function $(fb!)(isinit::Bool, dst::Array, od::Int, $(params...), oa::Int, n1::Int, n2::Int, n3::Int, ns::Int...)
             as = *(n2, n3, ns...)
             if length(ns) & 1 == 0
                 ds::Int = *(n3, ns[2:2:end]...)
                 for j = 1:n1
-                    $(fb!)(isinit, dst, od, a, oa, n2, n3, ns...)
+                    $(fb!)(isinit, dst, od, $(args...), oa, n2, n3, ns...)
                     od += ds
                     oa += as
                 end
             else
-                $(fb!)(isinit, dst, od, a, oa, n2, n3, ns...)
+                $(fb!)(isinit, dst, od, $(args...), oa, n2, n3, ns...)
                 oa += as
     
                 for j = 2:n1
-                    $(fb!)(false, dst, od, a, oa, n2, n3, ns...)
+                    $(fb!)(false, dst, od, $(args...), oa, n2, n3, ns...)
                     oa += as
                 end
             end
@@ -302,6 +304,11 @@ function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
     end
 end
 
+function generate_reducedim_funcs(fname, comb, sker, ker0!, ker1!)
+    # specialized method to generate functions with single input arguments
+    generate_reducedim_funcs(fname, [:(a::Array)], [:a], :(size(a)), comb, sker, ker0!, ker1!)
+end
+
 macro code_reducedim(fname, comb, sker, ker0, ker1)
     esc(generate_reducedim_funcs(fname, comb, sker, ker0, ker1))
 end