Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Some folds are now working. Reducing arrays over tuples is still not …

…supported.
  • Loading branch information...
commit 12671d7d84ccc5e6877fe17be400cd2acc556b4c 1 parent 1435f84
@dybber dybber authored
View
26 Data/Array/Accelerate/OpenCL/CodeGen.hs
@@ -100,8 +100,8 @@ codeGenAcc acc vars =
-- computation nodes
--
Generate _ f -> mkGenerate (codeGenAccTypeDim acc) (codeGenFun f)
- -- Fold f e a -> mkFold (codeGenAccTypeDim a) (codeGenExp e) (codeGenFun f)
- -- Fold1 f a -> mkFold1 (codeGenAccTypeDim a) (codeGenFun f)
+ Fold f e a -> mkFold (codeGenAccTypeDim a) (seqexps $ codeGenExp e) (codeGenFun f)
+ Fold1 f a -> mkFold1 (codeGenAccTypeDim a) (codeGenFun f)
-- FoldSeg f e a s -> mkFoldSeg (codeGenAccTypeDim a) (codeGenAccType s) (codeGenExp e) (codeGenFun f)
-- Fold1Seg f a s -> mkFold1Seg (codeGenAccTypeDim a) (codeGenAccType s) (codeGenFun f)
-- Scanl f e _ -> mkScanl (codeGenExpType e) (codeGenExp e) (codeGenFun f)
@@ -125,17 +125,17 @@ codeGenAcc acc vars =
in
mkReplicate (codeGenAccType a) dimSl dimOut . seqexps . reverse $ extend sl 0
--- -- Index sl a slix ->
--- -- let dimCo = length (codeGenExpType slix)
--- -- dimSl = accDim acc
--- -- dimIn0 = accDim a
--- -- --
--- -- restrict :: SliceIndex slix sl co dim -> (Int,Int) -> [CExpr]
--- -- restrict (SliceNil) _ = []
--- -- restrict (SliceAll sliceIdx) (m,n) = mkPrj dimSl "sl" n : restrict sliceIdx (m,n+1)
--- -- restrict (SliceFixed sliceIdx) (m,n) = mkPrj dimCo "co" m : restrict sliceIdx (m+1,n)
--- -- in
--- -- mkIndex (codeGenAccType a) dimSl dimCo dimIn0 . reverse $ restrict sl (0,0)
+ Index sl a slix ->
+ let dimCo = length (codeGenExpType slix)
+ dimSl = accDim acc
+ dimIn0 = accDim a
+ --
+ restrict :: SliceIndex slix sl co dim -> (Int,Int) -> [C.Exp]
+ restrict (SliceNil) _ = []
+ restrict (SliceAll sliceIdx) (m,n) = mkPrj dimSl "sl" n : restrict sliceIdx (m,n+1)
+ restrict (SliceFixed sliceIdx) (m,n) = mkPrj dimCo "co" m : restrict sliceIdx (m+1,n)
+ in
+ mkIndex (codeGenAccType a) dimSl dimCo dimIn0 . seqexps . reverse $ restrict sl (0,0)
-- -- Stencil f bndy a ->
-- -- let ty0 = codeGenTupleTex (accType a)
View
21 Data/Array/Accelerate/OpenCL/CodeGen/Data.hs
@@ -12,28 +12,30 @@
module Data.Array.Accelerate.OpenCL.CodeGen.Data
(
- --CType, CMacro,
CUTranslSkel(..)
)
where
import Language.C
---import Text.PrettyPrint
---type CType = [TypeSpec]
---type CMacro = (Id, Maybe Exp)
-data CUTranslSkel = CUTranslSkel [Definition] --[CMacro]
--- FilePath
+
+data CUTranslSkel = CUTranslSkel [Definition]
+
instance Show CUTranslSkel where
- show (CUTranslSkel code) = header ++ (unlines $ map show code)
+ show (CUTranslSkel code) =
+ header
+ ++ (unlines $ map show code)
header :: String
header = "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n"
- ++ "#include <accelerate_opencl_shape.h>\n\n"
+ ++ include "accelerate_opencl_shape.h"
+include :: FilePath -> String
+include hdr = "#include <" ++ hdr ++ ">\n"
+
-- instance Pretty CUTranslSkel where
-- pretty (CUTranslSkel code defs skel) =
-- vcat [ include "accelerate_cuda_extras.h"
@@ -43,9 +45,6 @@ header = "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n"
-- ]
--- include :: FilePath -> Doc
--- include hdr = text "#include <" <> text hdr <> text ">"
-
-- macro :: CMacro -> Doc
-- macro (d,v) = text "#define" <+> text (identToString d)
-- <+> maybe empty (parens . pretty) v
View
184 Data/Array/Accelerate/OpenCL/CodeGen/Reduce.hs
@@ -0,0 +1,184 @@
+{-# LANGUAGE QuasiQuotes #-}
+-- |
+-- Module : Data.Array.Accelerate.OpenCL.CodeGen.Reduce
+-- Copyright : [2011] Martin Dybdal
+-- License : BSD3
+--
+-- Maintainer : Martin Dybdal <dybber@dybber.dk>
+-- Stability : experimental
+-- Portability : non-portable (GHC extensions)
+--
+-- Constructors for array computation skeletons
+--
+
+module Data.Array.Accelerate.OpenCL.CodeGen.Reduce
+ (
+ mkFold, mkFold1
+ --, mkFoldSeg, mkFold1Seg
+ )
+ where
+
+import Control.Applicative ((<$>))
+import Data.Maybe (fromMaybe)
+
+import qualified Language.C as C
+import qualified Language.C.Syntax
+import qualified Data.Loc
+import qualified Data.Symbol
+
+import Language.C.Quote.OpenCL
+
+import Data.Array.Accelerate.OpenCL.CodeGen.Data
+import Data.Array.Accelerate.OpenCL.CodeGen.Util
+import Data.Array.Accelerate.OpenCL.CodeGen.Tuple
+import Data.Array.Accelerate.OpenCL.CodeGen.Monad
+
+
+-- Exported functions
+-- ------------------
+mkFold :: ([C.Type],Int) -> C.Exp -> C.Exp -> CUTranslSkel
+mkFold ty identity apply = makeFold False ty (Just identity) apply
+
+mkFold1 :: ([C.Type],Int) -> C.Exp -> CUTranslSkel
+mkFold1 ty apply = makeFold True ty Nothing apply
+
+
+
+-- Reduction
+-- ---------
+
+makeFold :: Bool -> ([C.Type],Int) -> Maybe C.Exp -> C.Exp -> CUTranslSkel
+makeFold inclusive (ty, dim) identity apply = runCGM $ do
+ (d_out, d_inA : _) <- mkTupleTypeAsc 1 ty
+ fromMaybe (return ()) (mkIdentity <$> identity)
+ mkApplyAsc 2 apply
+ mkDim "DimInA" dim
+ mkDim "DimOut" (dim-1)
+ let mkSkel | dim == 1 = mkFoldAllSkel
+ | otherwise = mkFoldSkel
+ mkSkel d_out d_inA inclusive
+
+mkFoldSkel :: Arguments -> Arguments -> Bool -> CGM ()
+mkFoldSkel = error "folds for higher dimensions are not yet supported in the OpenCL backend for Accelerate"
+
+mkFoldAllSkel :: Arguments -> Arguments -> Bool -> CGM ()
+mkFoldAllSkel d_out d_inA inclusive = do
+ ps <- getParams
+
+ mkHandleSeed d_out inclusive
+
+ let include = "#include <reduce.cl>"
+ addDefinitions [cunit| $esc:include |]
+
+ addDefinitions
+ [cunit|
+ __kernel void fold (const typename Ix shape,
+ $params:ps) {
+
+ volatile __local typename TyOut s_data[100];
+ //__global ArrOutTy *s_data = partition(s_ptr, get_local_size(0));
+
+ /*
+ * Calculate first level of reduction reading into shared memory
+ */
+ const typename Ix tid = get_local_id(0);
+ const typename Ix blockSize = get_local_size(0);
+ const typename Ix gridSize = get_global_size(0);
+ typename Ix i = get_global_id(0);
+ typename TyOut sum;
+
+ /*
+ * Reduce multiple elements per thread. The number is determined by the
+ * number of active thread blocks (via gridDim). More blocks will result in
+ * a larger `gridSize', and hence fewer elements per thread
+ *
+ * The loop stride of `gridSize' is used to maintain coalescing.
+ */
+ if (i < shape)
+ {
+ sum = getA(i, $args:d_inA);
+ for (i += gridSize; i < shape; i += gridSize)
+ sum = apply(sum, getA(i, $args:d_inA));
+ }
+
+ /*
+ * Each thread puts its local sum into shared memory, then threads
+ * cooperatively reduce the shared array to a single value.
+ */
+ set_local(tid, sum, s_data);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ sum = reduce_block_n(s_data, sum, min(shape, blockSize));
+
+ /*
+ * Write the results of this block back to global memory. If we are the last
+ * phase of a recursive multi-block reduction, include the seed element.
+ */
+
+ if (tid == 0)
+ {
+ handleSeed(sum, $args:d_out, $args:d_inA);
+ }
+ }
+ |]
+
+
+mkHandleSeed :: Arguments -> Bool -> CGM ()
+mkHandleSeed d_out False = do
+ ps <- getParams
+ addDefinitions
+ [cunit|
+ inline void handleSeed(typename TyOut sum,
+ $params:ps)
+ {
+ typename Ix blockIdx = (get_global_id(0)-get_local_id(0)) / get_local_size(0);
+ set(blockIdx, sum, $args:d_out);
+ }
+ |]
+mkHandleSeed d_out True = do
+ ps <- getParams
+ addDefinitions
+ [cunit|
+ inline void handleSeed(typename TyOut sum,
+ $params:ps)
+ {
+ typename Ix blockIdx = (get_global_id(0)-get_local_id(0)) / get_local_size(0);
+ if (shape > 0) {
+ typename TyOut seed = get_global_size(0) == get_local_size(0)
+ ? apply(sum, identity())
+ : sum ;
+ set(blockIdx, seed, $args:d_out);
+ }
+ else
+ set(blockIdx, identity(), $args:d_out);
+ }
+ |]
+
+
+-- mkFoldSeg :: ([CType],Int) -> [CType] -> [CExpr] -> [CExpr] -> CUTranslSkel
+-- mkFoldSeg (ty,dim) int identity apply = CUTranslSkel code [] skel
+-- where
+-- skel = "foldSeg.inl"
+-- code = CTranslUnit
+-- ( mkTupleTypeAsc 2 ty ++
+-- [ mkTuplePartition "ArrOut" ty True
+-- , mkIdentity identity
+-- , mkApply 2 apply
+-- , mkTypedef "Int" False False (head int)
+-- , mkDim "DimIn0" dim
+-- , mkDim "DimOut" dim ])
+-- (mkNodeInfo (initPos skel) (Name 0))
+
+-- mkFold1Seg :: ([CType],Int) -> [CType] -> [CExpr] -> CUTranslSkel
+-- mkFold1Seg (ty,dim) int apply = CUTranslSkel code inc skel
+-- where
+-- skel = "foldSeg.inl"
+-- inc = [(internalIdent "INCLUSIVE", Just (fromBool True))]
+-- code = CTranslUnit
+-- ( mkTupleTypeAsc 2 ty ++
+-- [ mkTuplePartition "ArrOut" ty True
+-- , mkApply 2 apply
+-- , mkTypedef "Int" False False (head int)
+-- , mkDim "DimIn0" dim
+-- , mkDim "DimOut" dim ])
+-- (mkNodeInfo (initPos skel) (Name 0))
+
View
64 Data/Array/Accelerate/OpenCL/CodeGen/Skeleton.hs
@@ -14,7 +14,8 @@
module Data.Array.Accelerate.OpenCL.CodeGen.Skeleton
(
mkGenerate,
- -- mkFold, mkFold1, mkFoldSeg, mkFold1Seg,
+ mkFold, mkFold1,
+--, mkFoldSeg, mkFold1Seg,
mkMap, mkZipWith,
-- mkStencil, mkStencil2,
-- mkScanl, mkScanr, mkScanl', mkScanr', mkScanl1, mkScanr1,
@@ -34,6 +35,7 @@ import Data.Array.Accelerate.OpenCL.CodeGen.Data
import Data.Array.Accelerate.OpenCL.CodeGen.Util
import Data.Array.Accelerate.OpenCL.CodeGen.Tuple
import Data.Array.Accelerate.OpenCL.CodeGen.Monad
+import Data.Array.Accelerate.OpenCL.CodeGen.Reduce (mkFold, mkFold1)
--import Data.Array.Accelerate.CUDA.CodeGen.Stencil
@@ -63,66 +65,6 @@ mkGenerate (tyOut, dimOut) apply = runCGM $ do
}
|]
--- Reduction
--- ---------
-
--- mkFold :: ([CType],Int) -> [CExpr] -> [CExpr] -> CUTranslSkel
--- mkFold (ty,dim) identity apply = CUTranslSkel code [] skel
--- where
--- skel | dim == 1 = "foldAll.inl"
--- | otherwise = "fold.inl"
--- code = CTranslUnit
--- ( mkTupleTypeAsc 2 ty ++
--- [ mkTuplePartition "ArrOut" ty True
--- , mkIdentity identity
--- , mkApply 2 apply
--- , mkDim "DimIn0" dim
--- , mkDim "DimOut" (dim-1) ])
--- (mkNodeInfo (initPos skel) (Name 0))
-
--- mkFold1 :: ([CType],Int) -> [CExpr] -> CUTranslSkel
--- mkFold1 (ty,dim) apply = CUTranslSkel code inc skel
--- where
--- skel | dim == 1 = "foldAll.inl"
--- | otherwise = "fold.inl"
--- inc = [(internalIdent "INCLUSIVE", Just (fromBool True))]
--- code = CTranslUnit
--- ( mkTupleTypeAsc 2 ty ++
--- [ mkTuplePartition "ArrOut" ty True
--- , mkApply 2 apply
--- , mkDim "DimIn0" dim
--- , mkDim "DimOut" (dim-1) ])
--- (mkNodeInfo (initPos skel) (Name 0))
-
--- mkFoldSeg :: ([CType],Int) -> [CType] -> [CExpr] -> [CExpr] -> CUTranslSkel
--- mkFoldSeg (ty,dim) int identity apply = CUTranslSkel code [] skel
--- where
--- skel = "foldSeg.inl"
--- code = CTranslUnit
--- ( mkTupleTypeAsc 2 ty ++
--- [ mkTuplePartition "ArrOut" ty True
--- , mkIdentity identity
--- , mkApply 2 apply
--- , mkTypedef "Int" False False (head int)
--- , mkDim "DimIn0" dim
--- , mkDim "DimOut" dim ])
--- (mkNodeInfo (initPos skel) (Name 0))
-
--- mkFold1Seg :: ([CType],Int) -> [CType] -> [CExpr] -> CUTranslSkel
--- mkFold1Seg (ty,dim) int apply = CUTranslSkel code inc skel
--- where
--- skel = "foldSeg.inl"
--- inc = [(internalIdent "INCLUSIVE", Just (fromBool True))]
--- code = CTranslUnit
--- ( mkTupleTypeAsc 2 ty ++
--- [ mkTuplePartition "ArrOut" ty True
--- , mkApply 2 apply
--- , mkTypedef "Int" False False (head int)
--- , mkDim "DimIn0" dim
--- , mkDim "DimOut" dim ])
--- (mkNodeInfo (initPos skel) (Name 0))
-
-
-- Map
-- ---
mkMap :: [C.Type] -> [C.Type] -> C.Exp -> CUTranslSkel
View
49 Data/Array/Accelerate/OpenCL/CodeGen/Tuple.hs
@@ -12,7 +12,7 @@
module Data.Array.Accelerate.OpenCL.CodeGen.Tuple
(
mkInputTuple, mkOutputTuple, --Accessor (..),
- mkTupleTypeAsc
+ mkTupleTypeAsc, Arguments
-- mkTupleType, mkTuplePartition
)
where
@@ -35,13 +35,14 @@ import Data.Array.Accelerate.OpenCL.CodeGen.Util
-- data Accessor = Get (String -> Exp)
-- | Set (String -> String -> Exp)
+
type Arguments = [Exp]
mkInputTuple :: String -> [Type]-> CGM Arguments
-mkInputTuple subscript types = mkTupleType (Just subscript) types
+mkInputTuple subscript = mkTupleType (Just subscript)
mkOutputTuple :: [Type]-> CGM Arguments
-mkOutputTuple types = mkTupleType Nothing types
+mkOutputTuple = mkTupleType Nothing
mkTupleType :: Maybe String -> [Type] -> CGM Arguments
mkTupleType subscript types = do
@@ -54,8 +55,11 @@ mkTupleType subscript types = do
addDefinitions $ zipWith (mkTypedef volatile) tynames types
when (n > 1) $ addDefinition (mkStruct tuple_name volatile types)
- (args,ps) <- mkParameterList subscript n tynames
- (maybe mkSet mkGet subscript) n ps
+ (args,ps) <- mkParameterList Global subscript n tynames
+ (_,psLocal) <- mkParameterList Local subscript n tynames
+ (maybe mkSet mkGet subscript) n ps Global
+ (maybe mkSet mkGet subscript) n psLocal Local
+ addParams ps
return args
mkTupleTypeAsc :: Int -> [Type] -> CGM (Arguments, [Arguments])
@@ -65,22 +69,32 @@ mkTupleTypeAsc n typ = do
argsIn <- mapM (flip mkInputTuple typ) names
return $ (argsOut, argsIn)
-mkParameterList :: Maybe String -> Int -> [String] -> CGM (Arguments, [Param])
-mkParameterList subscript n tynames = do
+-- mkLocalAccessors :: Int -> [Type] -> CGM ()
+-- mkLocalAccessors subscript types = do
+-- let names = [ [chr $ ord 'A' + i] | i <- [0..n-1]]
+-- n = length types
+-- tynames
+-- | n > 1 = take n [tuple_name ++ "_" ++ show i | i <- [0..]] -- TyInA_0, TyInA_1, ...
+-- | otherwise = [tuple_name]
+-- (argsOut, psout) <- mkParameterList Local Nothing n
+-- argsIn <- mapM (flip mkInputTuple typ) names
+-- return $ (argsOut, argsIn)
+
+mkParameterList :: StorageQual -> Maybe String -> Int -> [String] -> CGM (Arguments, [Param])
+mkParameterList storage subscript n tynames = do
let ps = params (zip types' param_names)
- addParams ps
return (args, ps)
- where
+ where
param_prefix = maybe "out" ("in" ++) subscript
param_names
| n > 1 = take n [param_prefix ++ "_" ++ show i | i <- [0..]] -- inA_0, inB_0, ..
| otherwise = [param_prefix] -- inA or out
- types' = map (mkPtr . mkGlobal . typename) tynames
+ types' = map (mkPtr . changeStorage storage . typename) tynames
args = map (\p -> [cexp|$id:p|]) param_names
-mkGet :: String -> Int -> [Param] -> CGM ()
-mkGet prj n params = do
+mkGet :: String -> Int -> [Param] -> StorageQual -> CGM ()
+mkGet prj n params storage = do
addDefinition
[cedecl|
inline $ty:returnType $id:name($ty:ix idx, $params:params) {
@@ -91,7 +105,8 @@ mkGet prj n params = do
|]
where
parnames = ["in" ++ prj ++ "_" ++ show i | i <- [0..]]
- name = "get" ++ prj
+ name | storage == Local = "get" ++ prj ++ "_local"
+ | otherwise = "get" ++ prj
returnType = typename $ "TyIn" ++ prj
assign i name = let field = 'a' : show i
in [cstm|val.$id:field = $id:name [idx];|]
@@ -100,15 +115,17 @@ mkGet prj n params = do
| otherwise = [ [cstm|val = $id:("in" ++ prj) [idx];|] ]
-mkSet :: Int -> [Param] -> CGM ()
-mkSet n params =
+mkSet :: Int -> [Param] -> StorageQual -> CGM ()
+mkSet n params storage =
addDefinition
[cedecl|
- inline void set($ty:ix idx, const $ty:outType val, $params:params) {
+ inline void $id:name($ty:ix idx, const $ty:outType val, $params:params) {
$stms:assignments
}
|]
where
+ name | storage == Local = "set_local"
+ | otherwise = "set"
parnames = ["out" ++ "_" ++ show i | i <- [0..]]
assign i name = let field = 'a' : show i
in [cstm|$id:name [idx] = val.$id:field;|]
View
26 Data/Array/Accelerate/OpenCL/CodeGen/Util.hs
@@ -25,8 +25,8 @@ outType = typename "TyOut"
-- Common device functions
-- -----------------------
-mkIdentity :: Exp -> Definition
-mkIdentity = mkDeviceFun "identity" (typename "TyOut") []
+mkIdentity :: Exp -> CGM ()
+mkIdentity exp = addDefinition $ mkDeviceFun "identity" (typename "TyOut") [] exp
mkApply :: Int -> Exp -> CGM ()
mkApply argc exp
@@ -34,6 +34,14 @@ mkApply argc exp
(mkDeviceFun "apply" outType
$ params $ map (\c -> (typename ("TyIn"++ [c]), 'x' : [c])) $ reverse $ take argc ['A'..]) exp
+-- For associative apply function
+mkApplyAsc :: Int -> Exp -> CGM ()
+mkApplyAsc argc exp
+ = addDefinition $
+ (mkDeviceFun "apply" outType
+ $ params $ map (\c -> (typename ("TyInA"), 'x' : [c])) $ reverse $ take argc ['A'..]) exp
+
+
mkProject :: Direction -> Exp -> CGM ()
mkProject Forward exp =
addDefinition $
@@ -98,11 +106,17 @@ mkPtr (Type (DeclSpec storage quals typ l0) _ l1) =
Type (DeclSpec storage quals typ l0) (Ptr [] (DeclRoot noSrcLoc) noSrcLoc) l1
mkPtr _ = error "Not a DeclSpec"
-mkGlobal :: Type -> Type
-mkGlobal (Type (DeclSpec storage quals typ l0) _ l1) =
- Type (DeclSpec storage ((TCLGlobal noSrcLoc) : quals) typ l0) (DeclRoot noSrcLoc) l1
-mkGlobal _ = error "Not a DeclSpec"
+data StorageQual = Global | Local
+ deriving (Eq, Show)
+changeStorage :: StorageQual -> Type -> Type
+changeStorage stor (Type (DeclSpec storage quals typ l0) _ l1) =
+ Type (DeclSpec storage (s : quals) typ l0) (DeclRoot noSrcLoc) l1
+ where
+ s = case stor of
+ Global -> TCLGlobal noSrcLoc
+ Local -> TCLLocal noSrcLoc
+changeStorage _ _ = error "Not a DeclSpec"
mkTypedef :: Bool -> String -> Type -> Definition
mkTypedef volatile tyname typ | volatile = let typ' = mkVolatile typ
View
2  Data/Array/Accelerate/OpenCL/Execute.hs
@@ -373,7 +373,7 @@ foldOp c kernel bindings acc aenv (Array sh0 in0)
| dim sh0 == 1 = do
cfg@(_,_,(_,g,_)) <- configure kernel acc (size sh0)
res@(Array _ out) <- newArray (bool c 1 (g > 1)) (toElt (fst sh0,g)) :: CIO (Array (dim:.Int) e)
- dispatch cfg bindings aenv ((((),out),in0),size sh0)
+ dispatch cfg bindings aenv ((((),size sh0),out),in0)
freeArray in0
if g > 1 then foldOp c kernel bindings acc aenv res
else return (Array (fst sh0) out)
View
20 accelerate-opencl.cabal
@@ -11,6 +11,7 @@ Stability: Experimental
Category: Compilers/Interpreters, Concurrency, Data
Data-files: clbits/accelerate_opencl_shape.h
+ clbits/reduce.cl
Extra-source-files: include/accelerate.h
@@ -25,31 +26,29 @@ Flag bounds-checks
Flag unsafe-checks
Description: Enable bounds checking in unsafe operations
- Default: False
+ Default: True
Flag internal-checks
Description: Enable internal consistency checks
- Default: False
+ Default: True
Library
- Build-depends: base >= 4.3
- , accelerate >= 0.9
- , pretty >= 1.0
- , language-c-quote >= 0.3.0.0
+ Build-depends: base == 4.3.*
+ , accelerate == 0.9.*
+ , pretty == 1.0.*
+ , language-c-quote == 0.3.*
, mainland-pretty
, srcloc
, symbol
- , hopencl >= 0.1.1
- , mtl >= 2.0
+ , hopencl == 0.1.*
+ , mtl == 2.0.*
, fclabels >= 0.9 && < 1.0
, bytestring
, zlib
, transformers == 0.2.*
, directory
, filepath
- , binary
- , unix == 2.4.*
Exposed-modules: Data.Array.Accelerate.OpenCL
@@ -59,6 +58,7 @@ Library
Data.Array.Accelerate.OpenCL.CodeGen.Skeleton
Data.Array.Accelerate.OpenCL.CodeGen.Monad
Data.Array.Accelerate.OpenCL.CodeGen.Tuple
+ Data.Array.Accelerate.OpenCL.CodeGen.Reduce
Data.Array.Accelerate.OpenCL.Analysis.Hash
Data.Array.Accelerate.OpenCL.Analysis.Device
Data.Array.Accelerate.OpenCL.Array.Data
View
77 clbits/reduce.cl
@@ -0,0 +1,77 @@
+/* -----------------------------------------------------------------------------
+ *
+ * Kernel : Reduce
+ * Copyright : [2008..2011] Manuel M T Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell
+ * License : BSD3
+ *
+ * Maintainer : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
+ * Stability : experimental
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef __REDUCE__
+#define __REDUCE__
+
+
+/*
+ * Cooperatively reduce a single warp's segment of an array to a single value
+ */
+inline TyOut
+reduce_warp_n
+(
+ __local TyOut *s_data,
+ TyOut sum,
+ Ix n
+)
+{
+ int warpSize = 32;
+ const Ix tid = get_local_id(0);
+ const Ix lane = get_local_id(0) & (warpSize - 1);
+
+ if (n > 16 && lane + 16 < n) { sum = apply(sum, getA_local(tid+16, s_data)); set_local(tid, sum, s_data); }
+ if (n > 8 && lane + 8 < n) { sum = apply(sum, getA_local(tid+ 8, s_data)); set_local(tid, sum, s_data); }
+ if (n > 4 && lane + 4 < n) { sum = apply(sum, getA_local(tid+ 4, s_data)); set_local(tid, sum, s_data); }
+ if (n > 2 && lane + 2 < n) { sum = apply(sum, getA_local(tid+ 2, s_data)); set_local(tid, sum, s_data); }
+ if (n > 1 && lane + 1 < n) { sum = apply(sum, getA_local(tid+ 1, s_data)); }
+
+ return sum;
+}
+
+/*
+ * Block reduction to a single value
+ */
+inline TyOut
+reduce_block_n
+(
+ __local TyOut *s_data,
+ TyOut sum,
+ Ix n
+)
+{
+ const Ix tid = get_local_id(0);
+
+ if (n > 512) { if (tid < 512 && tid + 512 < n) { sum = apply(sum, getA_local(tid+512, s_data)); set_local(tid, sum, s_data); } }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (n > 256) { if (tid < 256 && tid + 256 < n) { sum = apply(sum, getA_local(tid+256, s_data)); set_local(tid, sum, s_data); } }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (n > 128) { if (tid < 128 && tid + 128 < n) { sum = apply(sum, getA_local(tid+128, s_data)); set_local(tid, sum, s_data); } }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (n > 64) { if (tid < 64 && tid + 64 < n) { sum = apply(sum, getA_local(tid+ 64, s_data)); set_local(tid, sum, s_data); } }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (n > 32) { if (tid < 32 && tid + 32 < n) { sum = apply(sum, getA_local(tid+ 32, s_data)); set_local(tid, sum, s_data); }}
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (n > 16) { if (tid < 16 && tid + 16 < n) { sum = apply(sum, getA_local(tid+ 16, s_data)); set_local(tid, sum, s_data); }}
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (n > 8) { if (tid < 8 && tid + 8 < n) { sum = apply(sum, getA_local(tid+ 8, s_data)); set_local(tid, sum, s_data); }}
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (n > 4) { if (tid < 4 && tid + 4 < n) { sum = apply(sum, getA_local(tid+ 4, s_data)); set_local(tid, sum, s_data); }}
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (n > 2) { if (tid < 2 && tid + 2 < n) { sum = apply(sum, getA_local(tid+ 2, s_data)); set_local(tid, sum, s_data); }}
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (n > 1) { if (tid == 0 && tid + 1 < n) { sum = apply(sum, getA_local(tid+ 1, s_data)); }}
+
+ return sum;
+}
+
+#endif
+
Please sign in to comment.
Something went wrong with that request. Please try again.