In [1]:
%matplotlib inline

from ROOT import TFile,vector,TGraph
import ROOT
import matplotlib.pyplot as plt
import pickle
import pandas as pd

import numpy as np
from numpy import mean
from math import sqrt,acos,cos,sin,pi,exp,log,isnan,atan2
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from numpy import asarray
from root_pandas import read_root
from matplotlib import gridspec
from scipy import stats

Welcome to JupyROOT 6.14/08


In [2]:
def bless_tune1_reweight(row):
    """Return the per-event cross-section reweighting factor.

    The result is 1.0 unless ``row['nu_interaction_mode']`` equals 1001 and
    ``row['nu_pdg']`` is +-14 or +-12. In those cases it is the ratio of the
    "tune1" graph to the "mcc9" graph for that flavour, both evaluated at
    ``row['MC_energyInit']``.

    Relies on the module-level graphs ``xsec_tune1_graph_numu``,
    ``xsec_mcc9_graph_numu``, ``xsec_tune1_graph_nue`` and
    ``xsec_mcc9_graph_nue`` being defined before it is called.

    NOTE(review): bless_MC_labels compares ``nu_interaction_type`` (not
    ``nu_interaction_mode``) against 1001 -- confirm which column actually
    carries this code.
    """
    if not (row['nu_interaction_mode'] == 1001):
        return float(1)

    pdg = row['nu_pdg']
    if pdg in (14, -14):
        tune1_graph, mcc9_graph = xsec_tune1_graph_numu, xsec_mcc9_graph_numu
    elif pdg in (12, -12):
        tune1_graph, mcc9_graph = xsec_tune1_graph_nue, xsec_mcc9_graph_nue
    else:
        # Any other flavour keeps unit weight.
        return float(1)

    energy = row['MC_energyInit']
    return tune1_graph.Eval(energy) / mcc9_graph.Eval(energy)

def bless_MC_labels(row):
    """Build the ``'<flavour>_<channel>'`` truth label for one MC row.

    Flavour is ``'nue'`` for |nu_pdg| == 12, ``'numu'`` for |nu_pdg| == 14
    and the empty string otherwise.

    Channel is ``'offvtx'`` unless ``0 < row['MC_scedr'] <= 5.0`` (a NaN
    ``MC_scedr`` therefore also gives ``'offvtx'``). Otherwise it is chosen
    from ``row['nu_interaction_type']``: 1001 -> ``'CCQE'``,
    1000 -> ``'MEC'``, a code in ``pizero`` -> ``'pizero'``, a code in
    ``piplusminus`` -> ``'piplusminus'``, anything else -> ``'other'``.

    Returns:
        str: the two parts joined by an underscore, e.g. ``'numu_CCQE'``.
    """
    # NOTE(review): the original list contained 1090 twice; the duplicate is
    # dropped here. Confirm the second entry was not meant to be another code.
    pizero = (1090, 1086, 1080, 1015, 1013, 1011, 1008, 1006, 1004)
    piplusminus = (1085, 1079, 1032, 1017, 1014, 1007, 1005, 1003,
                   1028, 1021, 1016, 1012, 1010, 1009)

    abs_pdg = abs(row['nu_pdg'])
    if abs_pdg == 12:
        intlabel = 'nue'
    elif abs_pdg == 14:
        intlabel = 'numu'
    else:
        intlabel = ''

    if not 0 < row['MC_scedr'] <= 5.0:
        mclabel = 'offvtx'
    else:
        interaction = row['nu_interaction_type']
        if interaction == 1001:
            mclabel = 'CCQE'
        elif interaction == 1000:
            mclabel = 'MEC'
        elif interaction in pizero:
            mclabel = 'pizero'
        elif interaction in piplusminus:
            mclabel = 'piplusminus'
        else:
            mclabel = 'other'

    return '%s_%s' % (intlabel, mclabel)

def bless_leeweight(row):
    """Look up the binned weight for ``row['MC_energyInit']``.

    The energy is compared with the upper edge of each bin in turn; the
    weight of the first bin whose upper edge is strictly greater than the
    energy is returned. Energies at or above the last edge (3.000), and NaN,
    give 0.
    """
    wgts_unfolded = np.array((0, 5.03093, 4.50515, 3.50515, 2.31959, 1.31959, 0.64948, 0.27835, 0.11340, 0))
    binedges_unfolded = np.array((0,.200,.250,.300,.350,.400,.450,.500,.600,.800,3.000))

    energy = row['MC_energyInit']
    # Pair each weight with the upper edge of its bin (edges[1:]).
    for upper_edge, weight in zip(binedges_unfolded[1:], wgts_unfolded):
        if energy < upper_edge:
            return weight

    return 0

In [3]:
# Key columns that identify one event.
RSE = ['run', 'subrun', 'event']

# Good-runs list. Every listed run gets good == 1 so that, after a join on
# 'run', rows from unlisted runs can be removed with query("good==1").
# Alternative lists used for other run periods:
#   'data/goodruns_2020.txt'  (run1)
#   'data/goodruns_run3.txt'
good_df = pd.read_csv('data/goodruns_run2.txt').assign(good=1)

In [4]:
# Cross-section graphs consumed by bless_tune1_reweight, which returns
# tune1 / mcc9 for the matching neutrino flavour.
#
# The original cell opened 'xsec_graphs_tune1.root' into the xsec_mcc9_*
# variables and 'xsec_graphs_mcc9_v304.root' into the xsec_tune1_* variables,
# i.e. names and files were crossed, which inverts the ratio. Each variable
# now opens the file its name refers to.
# NOTE(review): this assumes the file names (not the variable names) describe
# the contents -- confirm against how the two ROOT files were produced.
#
# The TFile handles stay at module level so they remain open while the
# graphs are in use.
xsec_mcc9_file = ROOT.TFile('data/xsec_graphs_mcc9_v304.root')
xsec_mcc9_graph_numu = xsec_mcc9_file.Get('nu_mu_Ar40/qel_cc_n')
xsec_mcc9_graph_nue = xsec_mcc9_file.Get('nu_e_Ar40/qel_cc_n')
xsec_tune1_file = ROOT.TFile('data/xsec_graphs_tune1.root')
xsec_tune1_graph_numu = xsec_tune1_file.Get('nu_mu_Ar40/qel_cc_n')
xsec_tune1_graph_nue = xsec_tune1_file.Get('nu_e_Ar40/qel_cc_n')

In [5]:
# Beam quality: the 'bdq' tree of the beam-data-quality file.
beamq_df = read_root(
    'data/beamdataquality_remix_bnb5e19.root',
    'bdq',
)

# Suffix used in the names of the pickle files written by later cells.
tag = 'Sept_9_run2_MC'

# loading in all data from pi0 box

In [6]:
# Process every pi0-box data sample with one shared recipe. The original cell
# repeated the same ten lines five times, changing only the tag, the input
# file and (per run period) the good-runs list.
RSE=['run','subrun','event']

PI0BOX_INPUT_DIR = '/media/disk1/kmason'
PI0BOX_TREE = 'dlana/FinalVertexVariables'
PI0BOX_PRECUTS = "PassPMTPrecut ==1 and PassShowerReco ==1 and InFiducial ==1"

# One entry per run period: (good-runs CSV, [(tag, input ROOT file), ...]).
PI0BOX_SAMPLES = [
    ('data/goodruns_2020.txt', [  # run1
        ('Aug_21_pi0box_run1', 'mcc9_v29e_dl_run1_C1_bnb_dlfilter_pi0_v1_1_3_fvv.root'),
    ]),
    ('data/goodruns_run2.txt', [
        ('Aug_21_pi0box_run2D', 'mcc9_v29e_dl_run2_D2_bnb_dlfilter_pi0_v1_1_3_fvv.root'),
        ('Aug_21_pi0box_run2E', 'mcc9_v29e_dl_run2_E1_bnb_dlfilter_pi0_v1_1_3_fvv.root'),
    ]),
    ('data/goodruns_run3.txt', [
        ('Aug_21_pi0box_run3F', 'mcc9_v29e_dl_run3_F1_bnb_dlfilter_pi0_v1_1_3_fvv.root'),
        ('Aug_21_pi0box_run3G', 'mcc9_v29e_dl_run3_G1_bnb_dlfilter_pi0_v1_1_3_fvv.root'),
    ]),
]


def save_pi0box_selection(tag, root_path, good_df):
    """Select good-run, precut-passing rows of one sample and pickle them.

    Args:
        tag: label printed with the row counts and embedded in the pickle name.
        root_path: ROOT file holding the 'dlana/FinalVertexVariables' tree.
        good_df: good-runs table with a 'run' column and a 'good' == 1 column.

    Side effects:
        Prints the row count before and after the selection and writes
        'data/pickles/data_goodruns_precuts_<tag>.pickle'.
    """
    df_data = read_root(root_path, PI0BOX_TREE)
    print (tag, "start length",len(df_data))
    df_data = df_data.join(good_df.set_index('run'),on='run') #get good runs flag
    df_data_goodruns_precuts = df_data.query("good==1").query(PI0BOX_PRECUTS)
    # Optional one-row-per-event de-duplication (disabled in the original):
    # df_data_goodruns_precuts = df_data_goodruns_precuts.sort_values('shower1_E_Y',ascending=False).drop_duplicates(RSE).sort_index()
    print (tag, "end length",len(df_data_goodruns_precuts))
    df_data_goodruns_precuts.to_pickle('data/pickles/data_goodruns_precuts_%s.pickle'%tag)
    # The frames are local, so they are released when the function returns.


# As in the original cell, `good_df` and `tag` are module-level names and end
# up holding the values of the last sample processed (run3 list, run3G tag).
for goodruns_csv, samples in PI0BOX_SAMPLES:
    good_df = pd.read_csv(goodruns_csv)
    good_df['good'] = 1
    for tag, root_name in samples:
        save_pi0box_selection(tag, '%s/%s' % (PI0BOX_INPUT_DIR, root_name), good_df)

Aug_21_pi0box_run1 start length 1809
Aug_21_pi0box_run1 end length 1618
Aug_21_pi0box_run2D start length 2186
Aug_21_pi0box_run2D end length 2068
Aug_21_pi0box_run2E start length 701
Aug_21_pi0box_run2E end length 639
Aug_21_pi0box_run3F start length 507
Aug_21_pi0box_run3F end length 473
Aug_21_pi0box_run3G start length 2049
Aug_21_pi0box_run3G end length 1914


# MC BNB OVERLAY

In [13]:
tag ='Sept_run2_test'
# Good-runs list for run2 (use 'data/goodruns_run3.txt' for run3).
good_df = pd.read_csv('data/goodruns_run2.txt')
good_df['good'] = 1

# Input files that went with earlier tags, kept for provenance:
# tag ='July_28_run1_new'
# mcc9_v28_wctagger_bnboverlay_finalbdt.root
# mcc9_v28_wctagger_nueintrinsics_finalbdt.root
# mcc9_v28_wctagger_extbnb_finalbdt.root
# mcc9_v28_wctagger_5e19_finalbdt.root

# tag ='July_28_run3_old'
# mcc9_v29e_dl_run3b_bnb_nu_overlay_nocrtremerge_finalbdt.root
# mcc9_v29e_dl_run3b_bnb_intrinsic_nue_overlay_nocrtremerge_finalbdt.root
# mcc9_v29e_dl_run3_G1_extbnb_finalbdt.root
# mcc9_v29e_dl_run3_F1_bnb_dlfilter_pi0_v1_1_3_fvv.root

# tag = 'July_29_pi0box_run1'
# mcc9_v29e_dl_run1_C1_bnb_dlfilter_pi0_v1_1_3_fvv.root


# mcc9_v28_wctagger_run3_bnb1e19.root

RSE=['run','subrun','event']

df_bnb = read_root('/media/disk1/kmason/mcc9_v29e_dl_run2_bnb_nu_overlay_finalbdt.root','dlana/FinalVertexVariables')
print(len(df_bnb))
df_bnb_cvweight = read_root('data/weights_forCV_v40_bnb_nu_run2.root')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
cols = df_bnb_cvweight.columns.tolist()
print(cols)


print('Loaded Files')

# Attach the good-run flag (by run) and the truth/weight columns (by event).
# NOTE(review): the output recorded with this cell shows 143672 rows read but
# 244621 rows after these joins and the nue-CC removal, so at least one join
# matches several right-hand rows per key. drop_duplicates(RSE) further down
# collapses them again; confirm whether good_df or the weights file holds
# duplicate keys.
df_bnb = df_bnb.join(good_df.set_index('run'),on='run')
df_bnb = df_bnb.join(df_bnb_cvweight.set_index(RSE)[['nu_energy_true','nu_interaction_mode','nu_interaction_type','xsec_corr_weight','spline_weight','nu_interaction_ccnc','nu_pdg']],on=RSE)

# remove nue cc events
df_numu = df_bnb.query('not (nu_interaction_ccnc==0 and abs(nu_pdg)==12)')
print(len(df_numu))
df_numu_goodruns = df_numu.query('good==1')

print(len(df_numu_goodruns))
df_numu_goodruns_precuts = df_numu_goodruns.query("InFiducial==1 and PassPMTPrecut == 1")
print(len(df_numu_goodruns_precuts))

# Keep one row per event: the one with the largest shower1_E_Y.
df_numu_goodruns_precuts = df_numu_goodruns_precuts.sort_values('shower1_E_Y',ascending=False).drop_duplicates(RSE).sort_index()

print('Merged')

df_numu_goodruns_precuts.insert(0,'xsec_tune1_weight',df_numu_goodruns_precuts.apply(bless_tune1_reweight,axis=1))

print('Genie Reweighted')

df_numu_goodruns_precuts.insert(0,'mc_label',df_numu_goodruns_precuts.apply(bless_MC_labels,axis=1))

# Same selection string as the data cell. The original repeated the
# PassPMTPrecut requirement twice; the redundant clause is dropped, which
# selects exactly the same rows.
cuts = 'PassPMTPrecut == 1 and _pi0mass<400 and PassSimpleCuts ==1 and Proton_Edep>60.0  and Electron_Edep>35.0 and PassShowerReco==1  and shower1_E_Y>80 and ChargeNearTrunk >250 and Electron_ThetaRecoB_e1ep <1.5 and _shower_alpha <2.5 and _pi0mass>0 and BDTscore_1e1p<.7'
print(len(df_numu_goodruns_precuts.query(cuts)))



print ('MC Labeled')

143672
['run', 'subrun', 'event', 'nu_pdg', 'nu_energy_true', 'nu_interaction_ccnc', 'nu_interaction_mode', 'nu_interaction_type', 'nu_target_pdg', 'nu_L_true', 'spline_weight', 'rootino_weight', 'ub_tune_weight', 'xsec_corr_weight', 'lee_weight']
Loaded Files
244621
244621
205342
Merged
Genie Reweighted
1317
MC Labeled


In [12]:
# Persist the labelled, reweighted BNB-overlay selection under the current tag.
df_numu_goodruns_precuts.to_pickle('data/pickles/numu_goodruns_precuts_%s.pickle'%tag)

# The file written above is a pickle; the original message said "Parquet".
print('Saved pickle')

# Free the large frames. df_numu and df_numu_goodruns are deliberately kept
# (the POT cell reads df_numu_goodruns).
del df_bnb,df_bnb_cvweight,df_numu_goodruns_precuts

print('Cleaned up')

Saved Parquet
Cleaned up


# MC NUE OVERLAY

In [9]:
# Intrinsic-nue overlay sample and its per-event weights.
# NOTE(review): this cell uses good_df, RSE and tag as left by a previously
# executed cell -- make sure the run2 good-runs list is the one in memory.
nue_weight_columns = ['lee_weight','nu_interaction_mode','nu_interaction_type','xsec_corr_weight','spline_weight','nu_pdg']

df_nue = read_root(
    '/media/disk1/kmason/mcc9_v29e_dl_run2_bnb_intrinsics_nue_overlay_finalbdt.root',
    'dlana/FinalVertexVariables',
)
df_nue_cvweights = read_root('data/weights_forCV_v40_intrinsic_nue_run2.root')

print('Loaded Files')

# Good-run flag joined by run, weight/truth columns joined by event key.
df_nue = (
    df_nue
    .join(good_df.set_index('run'), on='run')
    .join(df_nue_cvweights.set_index(RSE)[nue_weight_columns], on=RSE)
)
df_nue_goodruns = df_nue.query("good==1")
# df_nue_goodruns = df_nue
df_nue_goodruns_precuts = df_nue_goodruns.query("InFiducial==1")

# One-row-per-event de-duplication is disabled for this sample:
# df_nue_goodruns_precuts = df_nue_goodruns_precuts.sort_values('shower1_E_Y',ascending=False).drop_duplicates(RSE).sort_index()
# NOTE(review): if either join matches more than one right-hand row per key,
# the duplicated rows stay in the pickle -- confirm this is intended.

print('Merged')

# NOTE(review): the BNB overlay cell stores this weight as
# 'xsec_tune1_weight'; here the column is 'xsec_tune_weight'. Confirm which
# name downstream code expects.
df_nue_goodruns_precuts.insert(
    0, 'xsec_tune_weight',
    df_nue_goodruns_precuts.apply(bless_tune1_reweight, axis=1),
)
df_nue_goodruns_precuts.insert(
    0, 'mc_label',
    df_nue_goodruns_precuts.apply(bless_MC_labels, axis=1),
)
# df_nue_goodruns_precuts.insert(0,'leeweight',df_nue_goodruns_precuts.apply(bless_leeweight,axis=1))

Loaded Files
Merged


In [10]:
# Persist the labelled intrinsic-nue selection under the current tag.
df_nue_goodruns_precuts.to_pickle('data/pickles/nue_goodruns_precuts_%s.pickle'%tag)

# The file written above is a pickle; the original message said "Parquet".
print('Saved pickle')

# Free the large frames (df_nue_goodruns is left in memory, as before).
del df_nue,df_nue_goodruns_precuts,df_nue_cvweights

print('Cleaned up')

Saved Parquet
Cleaned up


# EXT

In [10]:
# EXT sample.
# NOTE(review): the input is a run3 file, while good_df and tag are whatever
# the previously executed cell left behind -- confirm the matching good-runs
# list and tag are in memory before running this cell.
ext_path = '/media/disk1/kmason/mcc9_v29e_dl_run3_G1_extbnb_finalbdt.root'
df_ext = read_root(ext_path, 'dlana/FinalVertexVariables')

print('Loaded files')
print(len(df_ext))

# Flag rows from good runs, then keep those that pass the precuts.
good_by_run = good_df.set_index('run')
df_ext = df_ext.join(good_by_run, on='run')

df_ext_goodruns = df_ext.query("good==1")
df_ext_goodruns_precuts = df_ext_goodruns.query("PassPMTPrecut==1 and InFiducial==1")

# One-row-per-event de-duplication is disabled for this sample:
# df_ext_goodruns_precuts = df_ext_goodruns_precuts.sort_values('shower1_E_Y',ascending=False).drop_duplicates(RSE).sort_index()
print(len(df_ext_goodruns_precuts))

print('Merged')

# restrict to 5e19 run range
#df_ext_goodruns_pmtprecut = df_ext_goodruns_pmtprecut.query('run>=5119 and run<=5955')

Loaded files
102390
73310
Merged


In [11]:
# Persist the EXT selection under the current tag.
df_ext_goodruns_precuts.to_pickle('data/pickles/ext_goodruns_precuts_%s.pickle'%tag)

# The file written above is a pickle; the original message said "Parquet".
print('Saved pickle')

# Free the large frames.
del df_ext,df_ext_goodruns,df_ext_goodruns_precuts

print('Cleaned up')

Saved Parquet
Cleaned up


# Data

In [8]:
good_df = pd.read_csv('data/goodruns_2020.txt')
good_df['good'] = 1

RSE=['run','subrun','event']

tag = 'Dec_12_fake5_run1'

df_data = read_root('/media/disk1/kmason/mcc9_v29e_dl_set5_fakedata_run1_pi0_lowBDT_v1_1_3_fvv.root','dlana/FinalVertexVariables')
print (len(df_data))

print('Loaded files')
df_data = df_data.join(good_df.set_index('run'),on='run')
# Beam-quality variant (disabled):
# df_data = df_data.join(beamq_df.set_index(['run','subrun','event']),on=['run','subrun','event'])
# df_data_goodruns = df_data.query("good==1 and result==1")

# The original cell computed df_data.query("good==1") and overwrote it on the
# very next line with the unfiltered frame, so the good-runs filter was never
# applied (recorded output: 11075 rows before and after). That behaviour is
# kept, but made explicit and without the wasted query.
# NOTE(review): presumably intentional for this fake-data sample; set the
# switch to True if the good-runs filter should apply.
APPLY_GOODRUNS_FILTER = False
df_data_goodruns = df_data.query("good==1") if APPLY_GOODRUNS_FILTER else df_data
print (len(df_data_goodruns))
df_data_goodruns_precuts = df_data_goodruns.query( "PassShowerReco ==1")
print(len(df_data_goodruns_precuts))
df_data_goodruns_precuts = df_data_goodruns_precuts.query("InFiducial ==1  ")
print(len(df_data_goodruns_precuts))
# One-row-per-event de-duplication is disabled for this sample:
# df_data_goodruns_precuts = df_data_goodruns_precuts.sort_values('shower1_E_Y',ascending=False).drop_duplicates(RSE).sort_index()

# Selection used only to print how many rows pass; it is not applied to the
# frame that gets saved.
cuts = 'PassPMTPrecut == 1 and _pi0mass<400 and PassSimpleCuts ==1 and Proton_Edep>60.0  and Electron_Edep>35.0 and PassShowerReco==1  and shower1_E_Y>80 and ChargeNearTrunk >250 and Electron_ThetaRecoB_e1ep <1.5 and _shower_alpha <2.5 and _pi0mass>0 and BDTscore_1e1p<.7'
print(len(df_data_goodruns_precuts.query(cuts)))
# print(len(df_data_goodruns_precuts))
print('Merged')

11075
Loaded files
11075
11044
10785
4635
Merged


In [9]:
# Persist the data selection under the current tag.
df_data_goodruns_precuts.to_pickle('data/pickles/data_goodruns_precuts_%s.pickle'%tag)

# The file written above is a pickle; the original message said "Parquet".
print('Saved pickle')

# Free the large frames.
del df_data,df_data_goodruns,df_data_goodruns_precuts

print('Cleaned up')

Saved Parquet
Cleaned up


# Get POT for mc

In [None]:
# Sum the POT of every (run, subrun) present in the good-runs MC frame.
# Requires df_numu_goodruns from the BNB overlay cell to still be in memory.
RS = ['run', 'subrun']
df_pot = read_root(
    '../data/mcc9_v13_bnb_overlay/old/pot_scrape.root',
    'pot_tree',
    columns=RS + ['pot'],
)

# Attach POT to the events, then keep one row per (run, subrun) so each
# subrun is counted once in the sum.
df_pot = pd.merge(df_numu_goodruns, df_pot, on=RS).drop_duplicates(RS)
print(df_pot['pot'].values.sum())

In [None]:
# Get list of RS: write one "run subrun" line per distinct (run, subrun)
# found in df_data_goodruns (which must still be in memory).
RS=['run','subrun']
df_rs = df_data_goodruns.drop_duplicates(RS)

# The context manager closes the file even if a write fails; the original
# opened and closed it by hand.
with open('../data/RS_data.txt','w') as f:
    # Take the column arrays once instead of re-reading .values per row.
    for run, subrun in zip(df_rs['run'].values, df_rs['subrun'].values):
        f.write('%i %i\n' % (run, subrun))

In [None]:
def bless_scedr(row):
    """Return the 3D distance between the reco point and the parentSCE point.

    Computes ``sqrt(dx^2 + dy^2 + dz^2)`` where each component is
    ``row['<axis>reco'] - row['parentSCE<axis>']`` for the X, Y and Z axes.
    """
    axis_pairs = (
        ('Xreco', 'parentSCEX'),
        ('Yreco', 'parentSCEY'),
        ('Zreco', 'parentSCEZ'),
    )
    dx, dy, dz = (row[reco] - row[parent] for reco, parent in axis_pairs)
    return np.sqrt(dx*dx + dy*dy + dz*dz)


In [None]:
def bless_proton_dedx(row):
    """Return ``row['Proton_Edep']`` divided by ``row['Proton_TrackLength']``.

    The track length is converted to ``float`` before dividing.
    """
    deposited = row['Proton_Edep']
    track_length = float(row['Proton_TrackLength'])
    return deposited / track_length

def bless_proton_dedx_recombo(row):
    """Convert ``row['Proton_dQdx']`` to dE/dx with the constants below.

    Evaluates ``(exp(dQdx * k * Wion) - alpha) / k`` with
    ``k = betap / (Rho * Efield)``.
    """
    # Constants of the conversion.
    # NOTE(review): units are not stated in this file -- confirm before reuse.
    Rho = 1.383
    betap = 0.183592
    alpha = 0.921969
    Wion = 23.6e-6
    Efield = 0.273

    # Factor shared by the exponent and the denominator.
    k = betap/(Rho*Efield)
    exponent = row['Proton_dQdx']*k*Wion
    return (np.exp(exponent)-alpha)/k

In [None]:
# restrict to 5e19 run range
#df_nue_goodruns_pmtprecut = df_nue_goodruns_pmtprecut.query('run>=5119 and run<=5955')