Mu2e · brownd1978 · Oct 1, 2025 · Sep 23, 2025 · Sep 23, 2025
diff --git a/JobConfig/ensemble/MakeSignalMDS_tutorial.md b/JobConfig/ensemble/MakeSignalMDS_tutorial.md
@@ -0,0 +1,52 @@
+# Introduction
+
+# Tutorial 
+
+1. Make a new directory in your working area and name it something like "ensemble_MDS2c_CeMLL_1e-14_2weeks", altering the fields to match what you want to make. Make sure you have access to Production (either via a musing or a clone).
+
+2. Enter the new directory. Run the following command:
+
+```
+Stage3_addsignal_easy.sh --known MDS2c --signal CeMLeadingLog --rate 1e-14 --nexp 10 --chooselivetime 1209600
+```
+
+here the parameters are:
+
+* ``known``: the tag of the mixed background sample you wish to sample from.
+* ``signal``: must be the primary name of the signal type you want to sample (e.g. CeMLeadingLog)
+* ``rate``: chosen signal rate (e.g. 1e-13)
+* ``nexp``: how many pseudo experiments (i.e. random samplings) you want to make
+* ``chooselivetime``: the livetime to sample, in seconds (check the config to ensure you don't try to make more than is available)
+
+The output of this command will include
+
+* a set of nexp mcs files; these contain random sets of expected signal events (sampled from a much larger set) and include Poisson statistical variations.
+* a set of nexp nts files that are EventNtuples of the analogous mcs files (names are the same)
+* nexp lists ``filenames_ChosenMixed_i`` containing random sets of MDS known ntuples and the new signal ntuples
+* a new directory called ``fcl`` containing the splitting and ntupling fcls for reference
+
+3. Now that you have random sets of ntuples, you will want to combine them into a merged dataset (for some blinding effect). To do that, run a second script:
+
+```
+combine_ntuples.sh 1 MDS2c
+```
+
+where the first argument is the iteration of the list (1 to nexp) and the second argument is the MDS tag version name (should be the same as before). You will need to run this once for each of the nexp lists.
+
+For every iteration i you will then find a directory ``merged_files_i`` in the current directory.
+
+In this directory there will be a set of files (the merge factor can be altered within the combine_ntuples.sh script). The filenames will be, for example:
+
+```
+nts.mu2e.ensembleMDS2cMix1BB_CeMLeadingLog_1e-14_1223190.2_16.root
+```
+
+Most of this is self-explanatory; the number 1223190 is the livetime of the sample in seconds.
+
+4. You should be able to analyze this file list as you would any other set of files. If you prefer remote/xrootd etc. file access you will need to upload them to SAM, but I recommend keeping them in your personal directories.
+
+
+
+
+
+
diff --git a/JobConfig/ensemble/fcl/split.fcl b/JobConfig/ensemble/fcl/split.fcl
@@ -0,0 +1,28 @@
+#include "Offline/fcl/minimalMessageService.fcl"
+
+process_name: artsplit
+
+# read only the first n events; maxEvents is a placeholder that the generated splitter_<i>.fcl overrides
+source: {
+  module_type: RootInput
+  fileNames: @nil
+  maxEvents: 0
+}
+
+services: { 
+  message: @local::default_message 
+}
+
+physics: { 
+  e1: [out]  
+  end_paths: [e1] 
+  trigger_paths: []
+}
+
+# write everything you read to a new file
+outputs: {
+  out: {
+    module_type: RootOutput
+    fileName: @nil
+  }
+}
diff --git a/JobConfig/ensemble/python/calculateEvents.py b/JobConfig/ensemble/python/calculateEvents.py
@@ -7,7 +7,7 @@ def main(args):
       getPOT(float(args.livetime), str(args.BB),True)
     if(args.prc == "CeMLeadingLog" or args.prc == "CePLeadingLog"):
       Yield = ce_normalization(float(args.livetime), float(args.rue), str(args.BB))
-      print("CeP",Yield)
+      print("Ce",Yield)
     if(args.prc == "GetRMUE"):
       Yield = get_ce_rmue(float(args.livetime), float(args.nsig), str(args.BB))
       print(Yield)

diff --git a/JobConfig/ensemble/python/normalizations.py b/JobConfig/ensemble/python/normalizations.py
@@ -3,7 +3,9 @@
 import argparse
 import ROOT
 import math
+import random
 import os
+import numpy as np
 
 # numbers
 captures_per_stopped_muon = 0.609 # from AL capture studies
@@ -101,9 +103,9 @@ def getPOT(onspilltime, run_mode = '1BB',printout=False, frac=1): #livetime in s
 # get CE normalization:
 def ce_normalization(onspilltime, rue, run_mode = '1BB'):
     POT = getPOT(onspilltime, run_mode)
-
-    #print(f"Expected CE's {POT * target_stopped_mu_per_POT * captures_per_stopped_muon * rue}")
-    return POT * target_stopped_mu_per_POT * captures_per_stopped_muon * rue
+    # the expected CE count is used as the Poisson mean; a sampled count (not the mean) is returned
+    expected_ce = POT * target_stopped_mu_per_POT * captures_per_stopped_muon * rue
+    return np.random.poisson(lam=expected_ce)
 
 # get DIO normalization:
 def dio_normalization(onspilltime, emin, run_mode = '1BB'):

diff --git a/JobConfig/ensemble/scripts/Stage3_addsignal.sh b/JobConfig/ensemble/scripts/Stage3_addsignal.sh
diff --git a/JobConfig/ensemble/scripts/Stage3_addsignal_easy.sh b/JobConfig/ensemble/scripts/Stage3_addsignal_easy.sh
@@ -0,0 +1,233 @@
+#!/usr/bin/bash
+usage() { echo "Usage: $0
+  e.g. Stage3_addsignal_easy.sh --known MDS2a --signal CeMLeadingLog --rate 1e-13 --nexp 3
+  usage:
+  --owner = the username of your account (or mu2e if you are using mu2epro);
+  --known = known physics tag e.g. MDS2a
+  --rate = chosen rate e.g. 1e-14 (note this could be edited during the process so check print outs)
+  --signal = primary name of chosen signal e.g. CeMLeadingLog for the e- ce leadinglog samples
+  --release = SimJob tag e.g. MDC2020aw
+  --dbpurpose = db purpose of input mcs files e.g. perfect or best
+  --dbversion = db version e.g. v1_3
+  --nexp = number of sets of mixed samples or 'pseudo experiments' to make default is 1
+  --chooselivetime = chose a livetime in seconds e.g 86000
+
+  NOTE: assumes signal and known are the same versions
+"
+}
+
+# Print the usage text, then terminate the script with exit status 1.
+exit_abnormal() {
+  usage
+  exit 1
+}
+OWNER="mu2e" # owner field of the dataset names built below
+KNOWN="MDS2a" #background sample tag
+RATE=1e-13 # signal rate, handed to calculateEvents.py as --rue
+SIGNAL="CeMLeadingLog" #name as given to primary during production
+RELEASE="MDC2020ba" # release field of the dataset names built below
+DBPURPOSE="best" # db purpose field of the dataset names built below
+DBVERSION="v1_3" # db version field of the dataset names built below
+NEXP=1 # number of pseudo-experiments, i.e. iterations of the sampling loop
+CHOOSE=0. # requested livetime in seconds; 0 keeps the livetime read from the config
+SETUP="" #musing path; NOTE(review): not referenced again in this script
+
+# Long options are handled with the getopts "-:" idiom: "-" is declared as an option that takes
+# an argument, so "--name" arrives as option "-" with OPTARG="name". The option's value is the
+# next command-line word, fetched with ${!OPTIND}; OPTIND is then bumped so getopts skips it.
+while getopts ":-:" options; do
+  case "${options}" in
+    -)
+      case "${OPTARG}" in
+        owner)
+          OWNER=${!OPTIND} OPTIND=$(( $OPTIND + 1 ))
+          ;;
+        known)
+          KNOWN=${!OPTIND} OPTIND=$(( $OPTIND + 1 ))
+          ;;
+        rate)
+          RATE=${!OPTIND} OPTIND=$(( $OPTIND + 1 ))
+          ;;
+        signal)
+          SIGNAL=${!OPTIND} OPTIND=$(( $OPTIND + 1 ))
+          ;;
+        release)
+          RELEASE=${!OPTIND} OPTIND=$(( $OPTIND + 1 ))
+          ;;
+        dbversion)
+          DBVERSION=${!OPTIND} OPTIND=$(( $OPTIND + 1 ))
+          ;;
+        dbpurpose)
+          DBPURPOSE=${!OPTIND} OPTIND=$(( $OPTIND + 1 ))
+          ;;
+        nexp)
+          NEXP=${!OPTIND} OPTIND=$(( $OPTIND + 1 ))
+          ;;
+        chooselivetime)
+          CHOOSE=${!OPTIND} OPTIND=$(( $OPTIND + 1 ))
+          ;;
+        *)
+          echo "Unknown option " ${OPTARG}
+          exit_abnormal
+          ;;
+        esac;;
+    :)                                    # If expected argument omitted:
+      echo "Error: -${OPTARG} requires an argument."
+      exit_abnormal                       # Exit abnormally.
+      ;;
+    *)                                    # If unknown (any other) option:
+      exit_abnormal                       # Exit abnormally.
+      ;;
+    esac
+done
+
+# step 1: check livetime of the tag
+GEN_LIVETIME=""
+GEN_JOBS=""
+# extract config file from disk:
+CONFIG=${KNOWN}.txt
+# NOTE(review): CURRENT is never assigned in this script, so ${CURRENT} expands to nothing - confirm
+echo "running: mu2eDatasetFileList cnf.${OWNER}.ensemble${KNOWN}.${RELEASE}${CURRENT}.txt"
+# overwrite rather than append, so paths left in config.txt by an earlier run are not reused
+mu2eDatasetFileList cnf.${OWNER}.ensemble${KNOWN}.${RELEASE}${CURRENT}.txt > config.txt
+# Read each line (file path) from the input file
+while IFS= read -r file_path; do
+    if [ -f "$file_path" ]; then
+        cp "$file_path" "${CONFIG}"
+    fi
+done < config.txt
+[ -f "${CONFIG}" ] || { echo "ERROR: no config file ${CONFIG} found for ${KNOWN}" >&2; exit 1; } # later steps need its livetime/BB
+# pick livetime, njobs and BB out of the config; each line is split on "=" and spaces
+while IFS='= ' read -r col1 col2
+do
+    if [[ "${col1}" == "livetime" ]] ; then
+      GEN_LIVETIME=${col2}
+      LIVETIME=${col2}
+    fi
+    if [[ "${col1}" == "njobs" ]] ; then
+      GEN_JOBS=${col2}
+    fi
+    if [[ "${col1}" == "BB" ]] ; then
+      BB=${col2}
+    fi
+done < "${CONFIG}"
+echo "extracted config for ${KNOWN}"
+echo "found ${GEN_LIVETIME} ${BB}"
+rm -f *.csv # -f: stay quiet when there are no csv files to remove
+# if user has chosen to sample only a smaller amount of livetime then override
+if (awk "BEGIN {exit !(${CHOOSE} != 0)}") ; then
+  echo "livetime chosen to be ${CHOOSE} s"
+  LIVETIME=$(awk "BEGIN {print ${CHOOSE}}")
+fi
+if (awk "BEGIN {exit !(${CHOOSE} > ${GEN_LIVETIME})}") ; then
+  echo "ERROR: users chosen livetime is larger than total sample size, defaulting to ${GEN_LIVETIME} s"
+  LIVETIME=$(awk "BEGIN {print ${GEN_LIVETIME}}")
+fi
+echo "livetime ${LIVETIME}s is initated, watch for changes...."
+
+# find how many known files are needed for the livetime: total file count comes from the "Files" line of samDatasetsSummary.sh
+N_TOTAL_KNOWN=$(samDatasetsSummary.sh mcs.${OWNER}.ensemble${KNOWN}Mix${BB}Triggered.${RELEASE}_${DBPURPOSE}_${DBVERSION}.art  | awk '/Files/ {print $2}')
+LIVETIME_PER_FILE=$(awk "BEGIN {printf \"%.0f\", ${GEN_LIVETIME}/${N_TOTAL_KNOWN}}")
+echo "livetime per file ${LIVETIME_PER_FILE}"
+N_KNOWN_FILES_TO_USE=$(awk "BEGIN {printf \"%.0f\", ${LIVETIME}/${LIVETIME_PER_FILE}}")
+echo "${N_KNOWN_FILES_TO_USE} files of ${KNOWN} to be used with livetime of ${LIVETIME} s"
+# both quotients above are printed with %.0f, i.e. rounded to whole numbers
+# actual livetime that will be used for normalization of signal depends on int number of files
+LIVETIME=$(awk "BEGIN {printf \"%.0f\", ${N_KNOWN_FILES_TO_USE}*${LIVETIME_PER_FILE}}")
+echo "IMPORTANT: livetime ${LIVETIME}s is selected based on need for integar number of files"
+
+# understand how many events are present, and what fraction we need to sample
+echo "accessing " mcs.${OWNER}.${SIGNAL}Mix${BB}Triggered.${RELEASE}_${DBPURPOSE}_${DBVERSION}.art
+NGEN=10000000 # hard-coded number of generated signal events; the SAM lookup on the next line is commented out (FIXME)
+#(samDatasetsSummary.sh mcs.${OWNER}.${SIGNAL}Mix${BB}Triggered.${RELEASE}_${DBPURPOSE}_${DBVERSION}.art  | awk '/Generated/ {print $2}') #FIXME
+
+echo "sample mcs.${OWNER}.${SIGNAL}Mix${BB}Triggered.${RELEASE}_${DBPURPOSE}_${DBVERSION}.art contains ${NGEN} gen events"
+
+# recheck rate for new Nfiles
+#RATE=$(calculateEvents.py --livetime ${LIVETIME} --BB ${BB} --nsig ${NSIG} --prc "GetRATE" )
+#echo "can only sample full files, sampling ${N_SIGNAL_FILES_TO_USE} files so ${NSIG} and ${RATE}"
+
+# store the settings of the combined sample by appending them to the end of the local config copy (${KNOWN}.txt): signal, rate, combined livetime, number of pseudo-experiments
+echo "======= combined samples info =========">> ${KNOWN}.txt
+echo "signal= ${SIGNAL}">> ${KNOWN}.txt
+echo "Rmue= ${RATE}">> ${KNOWN}.txt
+echo "livetime_combined= ${LIVETIME}">> ${KNOWN}.txt
+echo "npseudo_experiments= ${NEXP}">> ${KNOWN}.txt
+
+# build complete list
+# start from a clean slate: the glob covers both filenames_All_* lists and the per-experiment
+# filenames_Chosen* lists of an earlier run; -f keeps a first run (nothing to delete) quiet
+rm -f filenames_*
+echo "looking for mcs.${OWNER}.${SIGNAL}Mix${BB}Triggered.${RELEASE}_${DBPURPOSE}_${DBVERSION}.art"
+mu2eDatasetFileList "mcs.${OWNER}.${SIGNAL}Mix${BB}Triggered.${RELEASE}_${DBPURPOSE}_${DBVERSION}.art" > filenames_All_${SIGNAL}
+# NOTE(review): owner "mu2e" and ntuple version v06_06_00 are hard-coded here, unlike the mcs queries - confirm
+mu2eDatasetFileList nts.mu2e.ensemble${KNOWN}Mix${BB}Triggered.${RELEASE}_${DBPURPOSE}_${DBVERSION}_v06_06_00.root > filenames_All_${KNOWN}
+
+# step: split the signal files to get an exact number:
+i=1
+while [ $i -le ${NEXP} ]
+do
+  # remove old files (-f: nothing to remove on a first run)
+  rm -f ntuple_$i.fcl splitter_$i.fcl
+
+  # calculate yield of signal for chose rate, if > 0 then proceed --> use python scripts
+  NSIG=$(calculateEvents.py --livetime ${LIVETIME} --prc ${SIGNAL} --BB ${BB} --rue ${RATE})
+  echo "${RATE} for ${BB} and ${LIVETIME} s means ${NSIG} events will be sampled"
+  # NOTE(review): the script prints "Ce <n>"; awk reads Ce as an empty variable joined to <n>, leaving the count - confirm
+  NSIG=$(awk "BEGIN {printf \"%.0f\", ${NSIG}}")
+
+  # calculate number of files
+  N_TOTAL_SIGNAL=$(samDatasetsSummary.sh mcs.${OWNER}.${SIGNAL}Mix${BB}Triggered.${RELEASE}_${DBPURPOSE}_${DBVERSION}.art  | awk '/Files/ {print $2}')
+  EVENTS_PER_FILE=$(awk "BEGIN {printf \"%.0f\", ${NGEN}/${N_TOTAL_SIGNAL}}")
+  echo "signal sample has ${N_TOTAL_SIGNAL} files with ${EVENTS_PER_FILE} events per file"
+  N_SIGNAL_FILES_TO_USE=$(awk "BEGIN {printf \"%.0f\", ${NSIG}/${EVENTS_PER_FILE}}")
+
+  # if its < 1 file the above will be 0, so we need to make sure we use at least 1 file here
+  if (( N_SIGNAL_FILES_TO_USE == 0 )); then
+    N_SIGNAL_FILES_TO_USE=1
+  fi
+  echo "based on requested rate, will use ${N_SIGNAL_FILES_TO_USE} signal files"
+
+  # build the splitter .fcl file and run on the chosen samples
+  echo "will sample ${N_SIGNAL_FILES_TO_USE} signal files"
+  # randomly select a file here
+  shuf -n ${N_SIGNAL_FILES_TO_USE} filenames_All_${SIGNAL} > temp
+  shuf temp > filenames_ChosenSig_$i
+  rm temp
+  # construct .fcl; list entries are comma-separated, with no comma after the last one
+  echo "#include \"Production/JobConfig/ensemble/fcl/split.fcl\"" > splitter_$i.fcl
+  echo "source.fileNames: [" >> splitter_$i.fcl
+  sep=""
+  while IFS= read -r line; do
+    echo "adding file: " $line
+    echo "${sep}\"$line\"" >> splitter_$i.fcl
+    sep=","
+  done < "filenames_ChosenSig_$i"
+  echo "]" >> splitter_$i.fcl
+  echo "source.maxEvents: ${NSIG}" >> splitter_$i.fcl
+  echo "outputs.out.fileName: \"mcs.${OWNER}.${SIGNAL}Mix${BB}TriggeredSplit.${RELEASE}_${DBPURPOSE}_${DBVERSION}.${i}.art\"" >> splitter_$i.fcl
+  # keep the command as a string: \$(...) would run mu2e right away and then try to execute its output
+  cmd="mu2e -c splitter_$i.fcl"
+  echo "Running: $cmd"
+  # run the splitting function
+  $cmd
+
+  # make the ntuples
+  echo "#include \"EventNtuple/fcl/from_mcs-mockdata.fcl\"" > ntuple_$i.fcl
+  echo "services.TFileService.fileName: \"nts.${OWNER}.${SIGNAL}Mix${BB}TriggeredSplit.${RELEASE}_${DBPURPOSE}_${DBVERSION}.${i}.root\"" >> ntuple_$i.fcl
+  cmd="mu2e -c ntuple_$i.fcl mcs.${OWNER}.${SIGNAL}Mix${BB}TriggeredSplit.${RELEASE}_${DBPURPOSE}_${DBVERSION}.${i}.art"
+  echo "Running: $cmd"
+  $cmd
+  ls nts.${OWNER}.${SIGNAL}Mix${BB}TriggeredSplit.${RELEASE}_${DBPURPOSE}_${DBVERSION}.${i}.root > temp
+
+  # create randomly mixed list of ntuples
+  shuf -n ${N_KNOWN_FILES_TO_USE} filenames_All_${KNOWN} >> temp
+  shuf temp > filenames_ChosenMixed_$i
+  rm temp
+  i=$((i + 1))
+
+done
+mkdir -p fcl # -p: do not fail when the directory is left over from a previous run
+mv *.fcl fcl
+echo "finished compiling list of chosen ntuples"
+rm -f *.csv # -f: stay quiet when there are no csv files to remove