Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Initiating some work on launch configuration analysis, and fixed a bug regarding launch of fold-operations
  • Loading branch information...
commit 37bf017ca1c2fb0593af538d473ecfc41cb354f9 1 parent 6204e98
@dybber dybber authored
View
181 Data/Array/Accelerate/OpenCL/Analysis/Launch.hs
@@ -0,0 +1,181 @@
+{-# LANGUAGE CPP, GADTs, RankNTypes #-}
+-- |
+-- Module : Data.Array.Accelerate.OpenCL.Analysis.Launch
+-- Copyright : [2008..2011] Manuel M T Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell
+-- License : BSD3
+--
+-- Maintainer : Manuel M T Chakravarty <chak@cse.unsw.edu.au>
+-- Stability : experimental
+-- Portability : non-portable (GHC extensions)
+--
+
+module Data.Array.Accelerate.OpenCL.Analysis.Launch (launchConfig)
+ where
+
+-- friends
+import Data.Array.Accelerate.AST
+import Data.Array.Accelerate.Type
+import Data.Array.Accelerate.Array.Sugar (Array(..), EltRepr)
+import Data.Array.Accelerate.Analysis.Type hiding (accType, expType)
+import Data.Array.Accelerate.Analysis.Shape hiding (accDim)
+
+import Data.Array.Accelerate.OpenCL.State
+import Data.Array.Accelerate.OpenCL.Compile (ExecOpenAcc(..))
+import Data.Array.Accelerate.OpenCL.Config
+
+-- library
+import Data.Record.Label
+import Control.Monad.IO.Class
+
+import qualified Foreign.OpenCL.Bindings as OpenCL
+import qualified Foreign.Storable as F
+
+#include "accelerate.h"
+
+
+-- |Reify dimensionality of array computations
+--
-- | Reify the dimensionality (rank) of an array computation.  Only an
-- 'ExecAcc' node carries an array shape; applying this to a compiled array
-- function is a caller error.
accDim :: ExecOpenAcc aenv (Array sh e) -> Int
accDim exec = case exec of
  ExecAcc _ _ _ pacc -> preAccDim accDim pacc
  ExecAfun _ _       -> error "when I get sad, I stop being sad and be AWESOME instead."
+
+-- |Reify type of arrays and scalar expressions
+--
-- | Reify the element type of an array computation.  As with 'accDim', only
-- an 'ExecAcc' node has an element type to report.
accType :: ExecOpenAcc aenv (Array sh e) -> TupleType (EltRepr e)
accType exec = case exec of
  ExecAcc _ _ _ pacc -> preAccType accType pacc
  ExecAfun _ _       -> error "TRUE STORY."
+
-- | Reify the type of a scalar expression, delegating array sub-terms to
-- 'accType'.
expType :: PreOpenExp ExecOpenAcc aenv env t -> TupleType (EltRepr t)
expType e = preExpType accType e
+
-- | Determine launch parameters for a kernel on the given device: the
-- work-group size, the (rounded-up) global work size, and the dynamic local
-- memory requirement.  The local-memory figure is currently a placeholder 0;
-- the array computation and kernel arguments are accepted for a future,
-- occupancy-driven implementation (see the commented-out CUDA version below).
launchConfig :: OpenCL.DeviceID -> PreOpenAcc ExecOpenAcc aenv a -> Int -> OpenCL.Kernel -> CIO (Int, Int, Integer)
launchConfig dev _acc n _fn = liftIO $ do
  maxWorkGroupSize <- OpenCL.deviceMaxWorkGroupSize dev
  maxWorkItemSizes <- OpenCL.deviceMaxWorkItemSizes dev
  -- CL_DEVICE_MAX_WORK_ITEM_SIZES limits each dimension separately; we launch
  -- one-dimensionally, so only the first entry matters.  Guard against an
  -- (unexpected) empty list rather than using the partial (!! 0).
  let dim0Limit = case maxWorkItemSizes of
                    (s:_) -> fromIntegral s
                    []    -> fromIntegral maxWorkGroupSize
      groupSize  = minimum [n, fromIntegral maxWorkGroupSize, dim0Limit]
      globalSize = shrRoundUp groupSize n
  return (groupSize, globalSize, 0)
+
-- | Round @globalSize@ up to the nearest multiple of @groupSize@, so that the
-- global work size passed to clEnqueueNDRangeKernel is evenly divisible by
-- the work-group size.  A zero group size yields zero (nothing to launch).
shrRoundUp :: Int -> Int -> Int
shrRoundUp 0 _ = 0
shrRoundUp groupSize globalSize
  | r == 0    = globalSize
  | otherwise = globalSize + groupSize - r
  where
    r = globalSize `mod` groupSize
+
+-- launchConfig :: PreOpenAcc ExecOpenAcc aenv a -> Int -> OpenCL.Kernel -> CIO (Int, Int, Integer)
+-- launchConfig acc n fn = do
+-- regs <- liftIO $ CUDA.requires fn CUDA.NumRegs
+-- stat <- liftIO $ CUDA.requires fn CUDA.SharedSizeBytes -- static memory only
+-- prop <- getM deviceProps
+
+-- let dyn = sharedMem prop acc
+-- (cta, occ) = blockSize prop acc regs ((stat+) . dyn)
+-- mbk = CUDA.multiProcessorCount prop * CUDA.activeThreadBlocks occ
+
+-- return (cta,
+-- mbk `min` gridSize prop acc n cta,
+-- toInteger (dyn cta))
+
+-- -- |
+-- -- Determine kernel launch parameters for the given array computation (as well
+-- -- as compiled function module). This consists of the thread block size, number
+-- -- of blocks, and dynamically allocated shared memory (bytes), respectively.
+-- --
+-- -- By default, this launches the kernel with the minimum block size that gives
+-- -- maximum occupancy, and the grid size limited to the maximum number of
+-- -- physically resident blocks. Hence, kernels may need to process multiple
+-- -- elements per thread.
+-- --
+-- launchConfig :: PreOpenAcc ExecOpenAcc aenv a -> Int -> CUDA.Fun -> CIO (Int, Int, Integer)
+-- launchConfig acc n fn = do
+-- regs <- liftIO $ CUDA.requires fn CUDA.NumRegs
+-- stat <- liftIO $ CUDA.requires fn CUDA.SharedSizeBytes -- static memory only
+-- prop <- getM deviceProps
+
+-- let dyn = sharedMem prop acc
+-- (cta, occ) = blockSize prop acc regs ((stat+) . dyn)
+-- mbk = CUDA.multiProcessorCount prop * CUDA.activeThreadBlocks occ
+
+-- return (cta, mbk `min` gridSize prop acc n cta, toInteger (dyn cta))
+
+
+-- -- |
+-- -- Determine the optimal thread block size for a given array computation. Fold
+-- -- requires blocks with a power-of-two number of threads.
+-- --
+-- blockSize :: CUDA.DeviceProperties -> PreOpenAcc ExecOpenAcc aenv a -> Int -> (Int -> Int) -> (Int, CUDA.Occupancy)
+-- blockSize p (Fold _ _ _) r s = CUDA.optimalBlockSizeBy p CUDA.incPow2 (const r) s
+-- blockSize p (Fold1 _ _) r s = CUDA.optimalBlockSizeBy p CUDA.incPow2 (const r) s
+-- blockSize p _ r s = CUDA.optimalBlockSizeBy p CUDA.incWarp (const r) s
+
+
+-- -- |
+-- -- Determine the number of blocks of the given size necessary to process the
+-- -- given array expression. This should understand things like #elements per
+-- -- thread for the various kernels.
+-- --
+-- -- foldSeg: 'size' is the number of segments, require one warp per segment
+-- --
+-- gridSize :: CUDA.DeviceProperties -> PreOpenAcc ExecOpenAcc aenv a -> Int -> Int -> Int
+-- gridSize p acc@(FoldSeg _ _ _ _) size cta = split acc (size * CUDA.warpSize p) cta
+-- gridSize p acc@(Fold1Seg _ _ _) size cta = split acc (size * CUDA.warpSize p) cta
+-- gridSize p acc@(Fold _ _ a) size cta = if accDim a == 1 then split acc size cta else split acc (size * CUDA.warpSize p) cta
+-- gridSize p acc@(Fold1 _ a) size cta = if accDim a == 1 then split acc size cta else split acc (size * CUDA.warpSize p) cta
+-- gridSize _ acc size cta = split acc size cta
+
+-- split :: PreOpenAcc ExecOpenAcc aenv a -> Int -> Int -> Int
+-- split acc size cta = (size `between` eltsPerThread acc) `between` cta
+-- where
+-- between arr n = 1 `max` ((n + arr - 1) `div` n)
+-- eltsPerThread _ = 1
+
+
+-- |
+-- Analyse the given array expression, returning an estimate of dynamic shared
+-- memory usage as a function of thread block size. This can be used by the
+-- occupancy calculator to optimise kernel launch shape.
+--
+-- NOTE(review): returns the estimated bytes of dynamic local ("shared")
+-- memory needed by the generated kernel, as a function of the work-group
+-- size (the third argument).
+sharedMem :: DeviceProps -> PreOpenAcc ExecOpenAcc aenv a -> Int -> Int
+-- non-computation forms
+-- These AST nodes never reach kernel launch, so querying their local-memory
+-- use is an internal error.
+sharedMem _ (Let _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Let"
+sharedMem _ (Let2 _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Let2"
+sharedMem _ (PairArrays _ _) _
+ = INTERNAL_ERROR(error) "sharedMem" "PairArrays"
+sharedMem _ (Avar _) _ = INTERNAL_ERROR(error) "sharedMem" "Avar"
+sharedMem _ (Apply _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Apply"
+sharedMem _ (Acond _ _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Acond"
+sharedMem _ (Use _) _ = INTERNAL_ERROR(error) "sharedMem" "Use"
+sharedMem _ (Unit _) _ = INTERNAL_ERROR(error) "sharedMem" "Unit"
+sharedMem _ (Reshape _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Reshape"
+
+-- skeleton nodes
+-- Pointwise/permutation skeletons stage nothing in local memory.
+sharedMem _ (Generate _ _) _ = 0
+sharedMem _ (Replicate _ _ _) _ = 0
+sharedMem _ (Index _ _ _) _ = 0
+sharedMem _ (Map _ _) _ = 0
+sharedMem _ (ZipWith _ _ _) _ = 0
+sharedMem _ (Permute _ _ _ _) _ = 0
+sharedMem _ (Backpermute _ _ _) _ = 0
+sharedMem _ (Stencil _ _ _) _ = 0
+sharedMem _ (Stencil2 _ _ _ _ _) _ = 0
+-- Folds and scans stage one element per work-item, hence
+-- element-size * work-group-size bytes.
+sharedMem _ (Fold _ _ a) blockDim = sizeOf (accType a) * blockDim
+sharedMem _ (Fold1 _ a) blockDim = sizeOf (accType a) * blockDim
+sharedMem _ (Scanl _ x _) blockDim = sizeOf (expType x) * blockDim
+sharedMem _ (Scanr _ x _) blockDim = sizeOf (expType x) * blockDim
+sharedMem _ (Scanl' _ x _) blockDim = sizeOf (expType x) * blockDim
+sharedMem _ (Scanr' _ x _) blockDim = sizeOf (expType x) * blockDim
+sharedMem _ (Scanl1 _ a) blockDim = sizeOf (accType a) * blockDim
+sharedMem _ (Scanr1 _ a) blockDim = sizeOf (accType a) * blockDim
+-- Segmented folds additionally reserve four Int32 values per warp —
+-- presumably per-warp segment bookkeeping; TODO confirm against the foldSeg
+-- skeleton in CodeGen.  NOTE(review): Int32 is not imported in this module's
+-- visible import list; verify it is re-exported by one of the friend modules.
+sharedMem p (FoldSeg _ _ a _) blockDim =
+ (blockDim `div` warpSize p) * 4 * F.sizeOf (undefined::Int32) + blockDim * sizeOf (accType a)
+sharedMem p (Fold1Seg _ a _) blockDim =
+ (blockDim `div` warpSize p) * 4 * F.sizeOf (undefined::Int32) + blockDim * sizeOf (accType a)
View
130 Data/Array/Accelerate/OpenCL/CodeGen/Reduce.hs
@@ -53,10 +53,10 @@ makeFold inclusive (ty, dimIn) identity apply = runCGM $ do
(d_local,local_params) <- mkParameterList Local (Just "local") n tynames
fromMaybe (return ()) (mkIdentity <$> identity)
mkApplyAsc 2 apply
- mkDim "DimInA" dimIn
mkDim "DimOut" (dimIn-1)
+ mkDim "DimInA" dimIn
let mkSkel | dimIn == 1 = mkFoldAllSkel
- | otherwise = mkFoldSkel
+ | otherwise = mkFoldSkel dimIn
mkSkel d_out d_inA (d_local, local_params) inclusive
where
n = length ty
@@ -64,10 +64,6 @@ makeFold inclusive (ty, dimIn) identity apply = runCGM $ do
| n > 1 = take n ["TyOut" ++ "_" ++ show i | i <- [0..]]
| otherwise = ["TyOut"]
-
-mkFoldSkel :: Arguments -> Arguments -> (Arguments, [C.Param]) -> Bool -> CGM ()
-mkFoldSkel = error "folds for higher dimensions are not yet supported in the OpenCL backend for Accelerate"
-
mkFoldAllSkel :: Arguments -> Arguments -> (Arguments, [C.Param]) -> Bool -> CGM ()
mkFoldAllSkel d_out d_inA (d_local, local_params) inclusive = do
ps <- getParams
@@ -83,9 +79,6 @@ mkFoldAllSkel d_out d_inA (d_local, local_params) inclusive = do
$params:ps,
$params:local_params) {
- //volatile __local typename TyOut s_data[100];
- //__global ArrOutTy *s_data = partition(s_ptr, get_local_size(0));
-
/*
* Calculate first level of reduction reading into shared memory
*/
@@ -113,6 +106,7 @@ mkFoldAllSkel d_out d_inA (d_local, local_params) inclusive = do
* cooperatively reduce the shared array to a single value.
*/
set_local(tid, sum, $args:d_local);
+
barrier(CLK_LOCAL_MEM_FENCE);
sum = reduce_block_n(sum, min(shape, blockSize), $args:d_local);
@@ -127,6 +121,96 @@ mkFoldAllSkel d_out d_inA (d_local, local_params) inclusive = do
}
|]
+
+-- | OpenCL skeleton for folds over arrays of rank >= 2 (rank == @dimIn@):
+-- each warp of 32 work-items cooperatively reduces one projection along the
+-- innermost dimension ("segment") down to a single element of the
+-- rank-(dimIn-1) output.  Replaces the previous stub that rejected
+-- higher-dimensional folds in the OpenCL backend.
+mkFoldSkel :: Int -> Arguments -> Arguments -> (Arguments, [C.Param]) -> Bool -> CGM ()
+mkFoldSkel dimIn d_out d_inA (d_local, local_params) inclusive = do
+ ps <- getParams
+ let dimOut = dimIn - 1
+
+ -- Emit the helper functions referenced by the kernel body below:
+ -- handleSeed, reduce_warp_n and the block-level reduction.
+ mkHandleSeedFold d_out inclusive
+ mkWarpReduce local_params d_local
+ mkBlockReduce local_params d_local
+
+ addDefinitions
+ [cunit|
+ __kernel void fold(const typename DimOut shOut,
+ const typename DimInA shInA,
+ $params:ps,
+ $params:local_params) {
+ int warpSize = 32;
+
+ //typename TyOut* s_data = partition($args:d_local, blockDim.x);
+
+ const typename Ix num_elements = $id:(indexHead dimIn)(shInA);
+ const typename Ix num_segments = $id:(size dimOut)(shOut);
+
+ const typename Ix num_vectors = get_local_size(0) / warpSize * (get_global_size(0) / get_local_size(0));
+ const typename Ix thread_id = get_global_id(0);
+ const typename Ix vector_id = thread_id / warpSize;
+ const typename Ix thread_lane = get_local_id(0) & (warpSize - 1);
+
+ /*
+ * Each warp reduces elements along a projection through an innermost
+ * dimension to a single value
+ */
+ for (typename Ix seg = vector_id; seg < num_segments; seg += num_vectors)
+ {
+
+// printf("tid: %d, numseg: %d, numvec: %d, seg: %d, globsize: %d, locsize: %d\n",
+// thread_id, num_segments, num_vectors, seg, get_global_size(0), get_local_size(0));
+
+
+ const typename Ix start = seg * num_elements;
+ const typename Ix end = start + num_elements;
+ typename TyOut sum;
+
+ if (num_elements > warpSize)
+ {
+ /*
+ * Ensure aligned access to global memory, and that each thread
+ * initialises its local sum.
+ */
+ typename Ix i = start - (start & (warpSize - 1)) + thread_lane;
+ if (i >= start)
+ sum = getA(i, $args:d_inA);
+
+ if (i + warpSize < end)
+ {
+ typename TyOut tmp = getA(i + warpSize, $args:d_inA);
+
+ if (i >= start) sum = apply(sum, tmp);
+ else sum = tmp;
+ }
+
+ /*
+ * Now, iterate along the inner-most dimension collecting a local sum
+ */
+ for (i += 2 * warpSize; i < end; i += warpSize)
+ sum = apply(sum, getA(i, $args:d_inA));
+ }
+ else if (start + thread_lane < end)
+ {
+ sum = getA(start + thread_lane, $args:d_inA);
+ }
+
+ /*
+ * Each thread puts its local sum into shared memory, then cooperatively
+ * reduce the shared array to a single value.
+ */
+ set_local(get_local_id(0), sum, $args:d_local);
+ sum = reduce_warp_n(sum, min(num_elements, warpSize), $args:d_local);
+
+ /*
+ * Finally, the first thread writes the result for this segment
+ */
+ if (thread_lane == 0) {
+ handleSeed(seg, num_elements, sum, $args:d_out, $args:d_inA);
+ }
+ }
+ }
+ |]
+
+
-- | Cooperatively reduce a single warp's segment of an array to a single value
mkWarpReduce :: [C.Param] -> Arguments -> CGM ()
mkWarpReduce ps args = do
@@ -216,6 +300,34 @@ mkHandleSeed d_out True = do
|]
+-- | Emit the @handleSeed@ helper used by the fold skeleton to write one
+-- reduced segment to the output array.  The Bool argument is the caller's
+-- @inclusive@ flag: for False the running sum is written unchanged; for True
+-- it is first combined with @identity()@ (or replaced by it when the segment
+-- is empty).
+mkHandleSeedFold :: Arguments -> Bool -> CGM ()
+mkHandleSeedFold d_out False = do
+ ps <- getParams
+ addDefinitions
+ [cunit|
+ inline void handleSeed(typename Ix seg,
+ const typename Ix num_elements,
+ typename TyOut sum,
+ $params:ps)
+ {
+ set(seg, sum, $args:d_out);
+ }
+ |]
+mkHandleSeedFold d_out True = do
+ ps <- getParams
+ addDefinitions
+ [cunit|
+ inline void handleSeed(typename Ix seg,
+ const typename Ix num_elements,
+ typename TyOut sum,
+ $params:ps)
+ {
+ sum = num_elements > 0 ? apply(sum, identity()) : identity();
+ set(seg, sum, $args:d_out);
+ }
+ |]
+
+
-- mkFoldSeg :: ([CType],Int) -> [CType] -> [CExpr] -> [CExpr] -> CUTranslSkel
-- mkFoldSeg (ty,dim) int identity apply = CUTranslSkel code [] skel
-- where
View
6 Data/Array/Accelerate/OpenCL/CodeGen/Util.hs
@@ -132,6 +132,12 @@ toIndex dim = "toIndexDIM" ++ show dim
fromIndex :: Int -> String
fromIndex dim = "fromIndexDIM" ++ show dim
-- | Name of the generated identifier that projects the innermost extent of a
-- shape of the given dimensionality, e.g. @indexHeadDIM2@.
indexHead :: Int -> String
indexHead dim = concat ["indexHeadDIM", show dim]
+
-- | Name of the generated identifier that drops the innermost extent of a
-- shape of the given dimensionality, e.g. @indexTailDIM2@.
indexTail :: Int -> String
indexTail dim = concat ["indexTailDIM", show dim]
+
size :: Int -> String
size dim = "sizeDIM" ++ show dim
View
20 Data/Array/Accelerate/OpenCL/Config.hs
@@ -0,0 +1,20 @@
+-- |
+-- Module : Data.Array.Accelerate.OpenCL.Config
+-- Copyright : [2011] Martin Dybdal
+-- License : BSD3
+--
+-- Maintainer : Martin Dybdal <dybber@dybber.dk>
+-- Stability : experimental
+-- Portability : non-portable (GHC extensions)
+--
+
+module Data.Array.Accelerate.OpenCL.Config where
+
+
+-- | Static device properties consulted by launch analysis and code
+-- generation.
+data DeviceProps = DeviceProps
+ { warpSize :: Int -- ^ presumably the SIMD width in work-items; the code generator and 'defaultConfig' both assume 32
+ }
+
+
-- | Properties assumed when no device has been queried: a warp size of 32.
defaultConfig :: DeviceProps
defaultConfig = DeviceProps 32
View
26 Data/Array/Accelerate/OpenCL/Execute.hs
@@ -36,7 +36,7 @@ import Data.Array.Accelerate.OpenCL.State
import Data.Array.Accelerate.OpenCL.Compile
import Data.Array.Accelerate.OpenCL.CodeGen
import Data.Array.Accelerate.OpenCL.Array.Data
---import Data.Array.Accelerate.OpenCL.Analysis.Launch
+import Data.Array.Accelerate.OpenCL.Analysis.Launch (launchConfig)
-- libraries
import Prelude hiding (sum)
@@ -372,9 +372,9 @@ foldOp c kernel bindings acc aenv (Array sh0 in0)
-- case, which probably breaks reference counting.
--
| dim sh0 == 1 = do
- cfg@(_,_,(_,g,_)) <- configure kernel acc (size sh0)
- res@(Array _ out) <- newArray (bool c 1 (g > 1)) (toElt (fst sh0,g)) :: CIO (Array (dim:.Int) e)
- dispatch cfg bindings aenv (((((),size sh0),out),in0), LocalArray out (size sh0))
+ cfg@(_,_,(blockSize,g,_)) <- configure kernel acc (size sh0)
+ res@(Array _ out) <- newArray (bool c 1 (g > 1)) (toElt (fst sh0, g `div` blockSize)) :: CIO (Array (dim:.Int) e)
+ dispatch cfg bindings aenv (((((),size sh0),out),in0), LocalArray out blockSize)
freeArray in0
if g > 1 then foldOp c kernel bindings acc aenv res
else return (Array (fst sh0) out)
@@ -383,7 +383,7 @@ foldOp c kernel bindings acc aenv (Array sh0 in0)
--
| otherwise = do
res@(Array sh out) <- newArray c $ toElt (fst sh0)
- execute kernel bindings acc aenv (size (fst sh0)) (((((),out),in0),convertIx sh),convertIx sh0)
+ execute kernel bindings acc aenv (size (fst sh0)) ((((((),convertIx sh), convertIx sh0), out), in0), LocalArray out (size sh0))
freeArray in0
return res
@@ -748,7 +748,9 @@ configure :: AccKernel a
configure (name, program) acc n = do
mdl <- program
fun <- liftIO $ OpenCL.createKernel mdl name
- cfg <- return (n, 0, 0) --launchConfig acc n fun
+ (dev, _) <- head <$> getM cl_devices -- we only execute on one
+ -- device currently
+ cfg <- launchConfig dev acc n fun
return (mdl, fun, cfg)
@@ -771,19 +773,11 @@ dispatch (mdl, fun, cfg) fvs aenv args = do
launch :: Marshalable args => (Int,Int,Integer) -> OpenCL.Kernel -> args -> CIO ()
launch (cta,grid,smem) fn a = do
args <- marshal a
- (dev, queue) <- head <$> getM cl_devices
+ (_, queue) <- head <$> getM cl_devices
liftIO $ do
- maxWorkGroupSize <- OpenCL.deviceMaxWorkGroupSize dev
- let workGroups = fromIntegral $ min maxWorkGroupSize (fromIntegral cta)
OpenCL.setKernelArgs fn args
- _ <- OpenCL.enqueueNDRangeKernel queue fn [] [workGroups] [workGroups] []
+ _ <- OpenCL.enqueueNDRangeKernel queue fn [] [fromIntegral grid] [fromIntegral cta] []
return ()
- -- liftIO $ do
- -- CUDA.setParams fn args
- -- CUDA.setSharedSize fn smem
- -- CUDA.setBlockShape fn (cta,1,1)
- -- CUDA.launch fn (grid,1) Nothing
-
-- Memory management
-- -----------------
Please sign in to comment.
Something went wrong with that request. Please try again.