In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import subprocess
import bz2
from shutil import copy2, rmtree

def pair(df, x):

    dfns = df.shift(-x)
    dfps = df.shift(+x)

    logicOS1 = (df.bx == dfns.bx - x) & (df.orbit == dfns.orbit) & (df.charge*dfns.charge < 0)
    OS1 = df[logicOS1].copy()

    logicOS2 = (df.bx == dfps.bx + x) & (df.orbit == dfps.orbit) & (df.charge*dfps.charge < 0)
    OS2 = df[logicOS2].copy()

    OS1 = OS1.reset_index()
    OS2 = OS2.reset_index()

    OS = OS1.merge(OS2, left_on=OS1.index, right_on=OS2.index, suffixes=('_in', '_out'))

    logicSS1 = (df.bx == dfns.bx - x) & (df.orbit == dfns.orbit) & (df.charge*dfns.charge > 0)
    SS1 = df[logicSS1].copy()

    logicSS2 = (df.bx == dfps.bx + x) & (df.orbit == dfps.orbit) & (df.charge*dfps.charge > 0)
    SS2 = df[logicSS2].copy()

    SS1 = SS1.reset_index()
    SS2 = SS2.reset_index()

    SS = SS1.merge(SS2, left_on=SS1.index, right_on=SS2.index, suffixes=('_in', '_out'))

    # slim the dataframes
    def slimDataFrame(df):
        df = df.drop(['run_in', 'index_in','index_out','key_0','orbit_out'], axis=1)
        df = df.rename(columns = {'run_out':'run','orbit_in':'orbit'})
        if False: df = df.set_index(['run','orbit','bx_in'])
        return df

    OS = slimDataFrame(OS)
    SS = slimDataFrame(SS)
    return (OS, SS)

In [None]:
# Build the output directory tree

removedDir = './data/RemovedFiles'
if not os.path.exists(removedDir):
    os.makedirs(removedDir)

writePath = [
    './data/allRuns/', './data/goodRuns', './data/badRuns',
    './data/2LegAll', './data/2LegGood', './data/2LegBad',
]
twolegPath = ['DBx1/SS/', 'DBx1/OS/', 'DBx2/SS/', 'DBx2/OS/', 'DBx3/SS/', 'DBx3/OS/']
runPath = ['monitor/', 'csv/']

for position, path in enumerate(writePath):
    # Every output root is recreated from scratch: an existing one is deleted first.
    if os.path.exists(path):
        rmtree(path)
    os.makedirs(path)

    # The first three roots receive the runPath subfolders,
    # the remaining three receive the twolegPath subfolders.
    subfolders = runPath if position < 3 else twolegPath
    for subPath in subfolders:
        endPath = os.path.join(path, subPath)
        os.makedirs(endPath)
    



# Clean data files
#
# Files that contain nothing but the header are queued for removal. When a run
# is split over several files, the later chunks are appended to the first chunk
# that holds data and are queued for removal as well.
# NOTE: chunks are appended in place, so running this cell twice on the same
# files duplicates the appended rows.
path = './data/hiion/'

# os.listdir() returns names in arbitrary order; sort them so that the chunks
# of one run are visited in file-name order.
files = sorted(f for f in os.listdir(path) if 'monitor' not in f)

# get the run number from a file name,
# e.g. int('/data/hiion/scout_326676_000000.monitor.txt'.split('_')[1]) == 326676
getRun = lambda x: int(x.split('_')[1])


default_line = 'orbit,bx,phi,eta,pt,charge'

print(os.path.abspath(path))  # print path for safety

files_to_remove = []
primary_file = {}  # run number -> first file of that run that contains data

for name in files:

    with open(os.path.join(path, name), 'r') as file:

        # A file is empty when it holds no line other than the header or blank lines.
        has_data = any(line.strip("\n") not in ('', default_line) for line in file)

        if not has_data:
            files_to_remove.append(name)
            print(name + ' is EMPTY')
            continue

        run = getRun(name)
        if run not in primary_file:
            # First chunk with data for this run: later chunks are appended to it.
            primary_file[run] = name
            continue

        # Later chunk of a known run: copy its rows (without the header) to the
        # end of the file that is kept for this run, never to a file that is
        # itself queued for removal.
        file.seek(0)
        with open(os.path.join(path, primary_file[run]), 'a') as oldfile:
            for line in file:
                if line.strip("\n") != default_line:
                    oldfile.write(line)

        files_to_remove.append(name)
        print(name + ' is a continuation of another file of the same run. ')

In [None]:
# Move every file queued for removal out of the data folder
destination = './data/RemovedFiles/'
for fname in files_to_remove:
    source = os.path.join(path, fname)

    if not os.path.exists(source):
        print("The "+ fname + "file does not exist")
        continue

    # A copy followed by a delete moves the file into the destination folder.
    copy2(source, destination)
    os.remove(source)
    print('Moving  '+ fname + '  to the RemovedFiles Folder')

In [None]:
# Write clean data to the tree

# Files queued for removal were either empty or already appended to another
# file of the same run, so they must not be read again (they may also have
# been deleted from the data folder in the meantime).
skipped = set(files_to_remove)

# list to hold dataframes
dfList = []
runIndex = []

for file in files:
    if file in skipped:
        continue
    df = pd.read_csv(os.path.join(path, file))

    # add a column with the run number
    run = getRun(file)
    df['run'] = run
    dfList.append(df)

    # keep the run numbers in the same order as dfList
    runIndex.append(run)


# Fix the runs where the orbit number resets (regard only muons after the reset)
dfs = []
for df in dfList:
    orbit = df['orbit'].values
    # steps_back[k] is True when row k + 1 has a smaller orbit than row k
    steps_back = orbit[1:] < orbit[:-1]
    # keep everything from the first row whose orbit is smaller than its predecessor's
    reset = int(steps_back.argmax()) + 1 if steps_back.any() else 0
    dfs.append(df.iloc[reset:].reset_index(drop=True))


for run, df in zip(runIndex, dfs):
    name = str(run) + ".txt"
    df.to_csv("./data/allRuns/csv/" + name, index=False)
    for dbx in range(1, 4):
        OS, SS = pair(df, dbx)
        OS.to_csv("./data/2LegAll/DBx{}/OS/".format(dbx) + name, index=False)
        SS.to_csv("./data/2LegAll/DBx{}/SS/".format(dbx) + name, index=False)