In [1]:
import os

# Diagnostic: show the PYTHONPATH this kernel was started with (presumably
# what makes the later `analysis.*` import resolvable -- confirm).
# .get() is used so that a fresh kernel without PYTHONPATH prints a
# placeholder instead of aborting the notebook with a KeyError.
print(os.environ.get('PYTHONPATH', '<PYTHONPATH is not set>'))

/home/matteoc/decaf/grinder:


In [2]:
import parsl
import os
from parsl.app.app import python_app, bash_app
from parsl.configs.local_threads import config

from parsl.providers import LocalProvider,CondorProvider,SlurmProvider
from parsl.channels import LocalChannel,SSHChannel
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.launchers import SrunLauncher

from parsl.addresses import address_by_hostname

# File name of the grid proxy for the user running this notebook
# (x509up_u<uid>); interpolated into the worker environment below.
x509_proxy = 'x509up_u%s'%(os.getuid())
year = '2018'

# Shell snippet handed to the provider as `worker_init`.
# The proxy file name is taken from `x509_proxy` rather than a hard-coded
# uid, so the notebook works for whichever user launches it. ${HOME} is left
# for the shell to expand (it contains no '%', so it is safe under
# %-formatting).
wrk_init = '''
export X509_USER_PROXY=${HOME}/%s
export X509_CERT_DIR=${HOME}/certs/
export XRD_RUNFORKHANDLER=1
'''%(x509_proxy)

# Per-CPU memory request passed to --mem-per-cpu (2048; 2 GB if Slurm reads
# the value in its default unit of megabytes -- confirm for this cluster).
twoGB = 2048
# CPUs requested per task; also used as the executor's max_workers below.
nproc = 48

# Extra #SBATCH directives injected into the submit script by the provider.
sched_opts = '''
#SBATCH --cpus-per-task=%d
#SBATCH --mem-per-cpu=%d
''' % (nproc, twoGB, ) 

# Parsl configuration: one HighThroughputExecutor backed by Slurm.
# Each block is a single node (nodes_per_block=1) and 72 blocks are requested
# both initially and as the maximum.
slurm_htex = Config(
    executors=[
        HighThroughputExecutor(
            label="coffea_parsl_slurm",
            address=address_by_hostname(),
            prefetch_capacity=0,  
            max_workers=nproc,
            provider=SlurmProvider(
                channel=LocalChannel(),
                launcher=SrunLauncher(),
                init_blocks=72,
                max_blocks=72,
                nodes_per_block=1,
                partition='general',
                scheduler_options=sched_opts,   # Enter scheduler_options if needed
                worker_init=wrk_init,         # Enter worker_init if needed
                walltime='02:00:00'
            ),
        )
    ],
    retries=10,
    strategy=None,
)

#parsl.set_stream_logger() # <-- log everything to stdout

# Load the configuration; the returned DataFlowKernel is passed to
# run_parsl_job further down as `data_flow`.
dfk = parsl.load(slurm_htex)

# Chunk size used when submitting the processing jobs
# (presumably events per work item -- confirm against coffea).
chunksize=500000


In [3]:
# Luminosity per data-taking year, keyed by the year as a string.
#Values from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PdmVAnalysisSummaryTable
lumis = {
    '2016': 35.92,
    '2017': 41.53,
    '2018': 59.97,
}
# Luminosity of the selected year scaled by 1000 (presumably fb^-1 -> pb^-1
# to match the cross-section units -- confirm).
lumi = float(lumis[year]) * 1000.

In [4]:
# Name fragments shared by all five lepton regions below; factored out so the
# per-region differences are easy to spot.
_common_bkg = ('WJets', 'DY', 'TT', 'ST', 'WW', 'WZ', 'ZZ')

# Region name -> tuple of name fragments. The processing loop matches these
# fragments as substrings of the dataset names to decide which regions a
# dataset belongs to.
samples = {
    "iszeroL": ('ZJets',) + _common_bkg + ('QCD', 'HToBB', 'MET'),
    "isoneM": _common_bkg + ('QCD', 'HToBB', 'MET'),
    "isoneE": _common_bkg + ('QCD', 'HToBB', 'SingleElectron', 'EGamma'),
    "istwoM": _common_bkg + ('HToBB', 'MET'),
    "istwoE": _common_bkg + ('HToBB', 'SingleElectron', 'EGamma'),
    "isoneA": ('GJets', 'QCD', 'SinglePhoton', 'EGamma'),
}

In [5]:
import json

# Per-year sample metadata: one JSON entry per dataset, each carrying at
# least an 'xs' value and a 'files' list (both are read in this notebook).
beans_path = "../harvester/beans/" + year + ".json"
with open(beans_path) as fin:
    samplefiles = json.load(fin)

# Dataset name -> its 'xs' entry (in the captured output, data samples
# such as MET/EGamma carry -1).
xsec = {name: meta['xs'] for name, meta in samplefiles.items()}

print(xsec)

{'MET____0_': -1, 'EGamma____0_': -1, 'ZJetsToNuNu_HT-100To200_13TeV-madgraph____0_': 280.5, 'ZJetsToNuNu_HT-200To400_13TeV-madgraph____0_': 77.7, 'ZJetsToNuNu_HT-400To600_13TeV-madgraph____0_': 10.71, 'ZJetsToNuNu_HT-600To800_13TeV-madgraph____0_': 2.562, 'ZJetsToNuNu_HT-800To1200_13TeV-madgraph____0_': 1.183, 'ZJetsToNuNu_HT-1200To2500_13TeV-madgraph____0_': 0.286, 'ZJetsToNuNu_HT-2500ToInf_13TeV-madgraph____0_': 0.006945, 'DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8____0_': 147.4, 'DYJetsToLL_M-50_HT-200to400_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8____0_': 40.99, 'DYJetsToLL_M-50_HT-400to600_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8____0_': 5.678, 'DYJetsToLL_M-50_HT-600to800_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8____0_': 1.367, 'DYJetsToLL_M-50_HT-800to1200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8____0_': 0.6304, 'DYJetsToLL_M-50_HT-1200to2500_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8____0_': 0.1514, 'DYJetsToLL_M-50_HT-2500toInf_TuneCP5_

In [None]:
# Get the analysis processor class by importing it from the `analysis`
# package.
# The commented-out lines below are the alternative route: loading a
# serialized processor from an lz4-compressed cloudpickle file. Note that
# `lz4f` is NOT imported while they stay commented out.
#import cloudpickle as cpkl
#import lz4.frame as lz4f
from analysis.darkhiggs import AnalysisProcessor

#processor_pkl = 'AnalysisProcessor.cpkl.lz4'
#AnalysisProcessor = None
#with lz4f.open(processor_pkl, mode="rb") as fin:
#    AnalysisProcessor = cpkl.load(fin)
# Sanity check: show which object was picked up.
print(AnalysisProcessor)

In [None]:
import time
from coffea import hist, processor
from coffea.processor import run_parsl_job
from coffea.processor.parsl.parsl_executor import parsl_executor
import gzip
import pickle
import cloudpickle
import numpy as np


# Which files of every dataset to keep; slice(None) keeps them all.
# Narrow it (e.g. slice(0, 1)) for a quick test run.
fileslice = slice(None)

# Dataset name -> list of its input files, taken from the sample metadata.
# To restrict the run to one dataset, add a filter clause such as
#     if your_wanted_dataset in dataset
# (an older variant also stripped the suffix: dataset.strip().split("____")[0]).
filelist = {
    dataset: list(info['files'][fileslice])
    for dataset, info in samplefiles.items()
}

        
# Make sure the output directory exists before any job result is written.
os.makedirs("pods/"+year, exist_ok=True)

# Process one dataset at a time: work out which regions it belongs to, run
# the processor over its files through parsl, and store the result.
selections = {}
for dataset in filelist:
    # A dataset contributes to a region when any of that region's name
    # fragments occurs in the dataset name. any() records each region at most
    # once. NOTE(review): the previous loop only ever executed `continue`, so
    # every dataset was processed with an empty region list -- confirm that
    # filling the list is the intended behavior.
    selections[dataset] = [
        selection
        for selection, tags in samples.items()
        if any(tag in dataset for tag in tags)
    ]

    fileset = {dataset: filelist[dataset]}
    processor_instance=AnalysisProcessor(selected_regions=selections[dataset], year=year, xsec=xsec, lumi=lumi)
    tstart = time.time()
    output = run_parsl_job(fileset,
                           treename='Events',
                           processor_instance=processor_instance,
                           executor=parsl_executor,
                           executor_args={'config':None, 'flatten': False},
                           data_flow=dfk,
                           chunksize=chunksize,  # defined next to the parsl config
                          )

    # Pickle is not very fast or memory efficient, will be replaced by something better soon
    # gzip matches the ".pkl.gz" suffix and is imported in this cell; the
    # former lz4f.open call referenced a module that is never imported.
    with gzip.open("pods/"+year+"/"+dataset+".pkl.gz", mode="wb", compresslevel=5) as fout:
        cloudpickle.dump(output, fout)
        
    dt = time.time() - tstart
    # Count bins over all histogram outputs (non-histogram entries are skipped).
    histograms = [h for h in output.values() if isinstance(h, hist.Hist)]
    nbins = sum(sum(arr.size for arr in h._sumw.values()) for h in histograms)
    nfilled = sum(sum(np.sum(arr > 0) for arr in h._sumw.values()) for h in histograms)
    print("Processed %s in %.1f s" % (dataset, dt))
    print("Filled %.1fM bins" % (nbins/1e6, ))
    # Guard against outputs without any histogram bins (would divide by zero
    # and abort the remaining datasets).
    if nbins:
        print("Nonzero bins: %.1f%%" % (100*nfilled/nbins, ))


In [None]:
# Tear down parsl at the end of the session: call cleanup() on the currently
# loaded DataFlowKernel, then parsl.clear().
# NOTE(review): presumably clear() is what allows parsl.load() to be called
# again in the same kernel -- confirm against the parsl documentation.
parsl.dfk().cleanup()
parsl.clear()
