In [1]:
# General data manipulation
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import cv2
import os
from os import listdir
from os.path import isfile, join 
import sys
import time
from PIL import Image # pip install pillow
from keras.utils import np_utils

Using TensorFlow backend.


# Text Data

In [2]:
# get train file names and convert to dataframe
# you may need to change your working directory first:
# os.chdir('/your_path')

ER_file_names = os.listdir('idao_dataset/train/ER/')
NR_file_names = os.listdir('idao_dataset/train/NR/')

ER = pd.DataFrame([[y.replace(';1.png','').replace('ev','') for y in x.split('_')] for x in ER_file_names])
ER['path'] = ER_file_names
NR = pd.DataFrame([[y.replace(';1.png','').replace('ev','') for y in x.split('_')] for x in NR_file_names])
NR['path'] = NR_file_names

ER = ER[[5,6,0,15,16,'path']].rename(columns={5:'type',6:'energy',0:'num',15:'run',16:'ev'})
NR = NR[[6,7,0,17,18,'path']].rename(columns={6:'type',7:'energy',0:'num',17:'run',18:'ev'})
data = ER.append(NR, ignore_index = True)
# data

In [3]:
data

Unnamed: 0,type,energy,num,run,ev,path
0,ER,10,0.5041503310203552,run4,272,0.5041503310203552__CYGNO_60_40_ER_10_keV_930V...
1,ER,3,-0.13757403194904327,run3,449,-0.13757403194904327__CYGNO_60_40_ER_3_keV_930...
2,ER,10,0.5042710304260254,run5,882,0.5042710304260254__CYGNO_60_40_ER_10_keV_930V...
3,ER,30,-1.4947187900543213,run3,660,-1.4947187900543213__CYGNO_60_40_ER_30_keV_930...
4,ER,10,1.562488317489624,run2,820,1.562488317489624__CYGNO_60_40_ER_10_keV_930V_...
...,...,...,...,...,...,...
13399,NR,1,-2.8066000938415527,run5,207,-2.8066000938415527__CYGNO_60_40_He_NR_1_keV_9...
13400,NR,6,2.858799934387207,run1,748,2.858799934387207__CYGNO_60_40_He_NR_6_keV_930...
13401,NR,6,3.1136999130249023,run1,809,3.1136999130249023__CYGNO_60_40_He_NR_6_keV_93...
13402,NR,20,2.88070011138916,run5,723,2.88070011138916__CYGNO_60_40_He_NR_20_keV_930...


In [4]:
data['type_2_ER'] = (data.type == 'ER')*1

In [5]:
data

Unnamed: 0,type,energy,num,run,ev,path,type_2_ER
0,ER,10,0.5041503310203552,run4,272,0.5041503310203552__CYGNO_60_40_ER_10_keV_930V...,1
1,ER,3,-0.13757403194904327,run3,449,-0.13757403194904327__CYGNO_60_40_ER_3_keV_930...,1
2,ER,10,0.5042710304260254,run5,882,0.5042710304260254__CYGNO_60_40_ER_10_keV_930V...,1
3,ER,30,-1.4947187900543213,run3,660,-1.4947187900543213__CYGNO_60_40_ER_30_keV_930...,1
4,ER,10,1.562488317489624,run2,820,1.562488317489624__CYGNO_60_40_ER_10_keV_930V_...,1
...,...,...,...,...,...,...,...
13399,NR,1,-2.8066000938415527,run5,207,-2.8066000938415527__CYGNO_60_40_He_NR_1_keV_9...,0
13400,NR,6,2.858799934387207,run1,748,2.858799934387207__CYGNO_60_40_He_NR_6_keV_930...,0
13401,NR,6,3.1136999130249023,run1,809,3.1136999130249023__CYGNO_60_40_He_NR_6_keV_93...,0
13402,NR,20,2.88070011138916,run5,723,2.88070011138916__CYGNO_60_40_He_NR_20_keV_930...,0


In [6]:
n_ER = sum(data.type == 'ER')
n_ER

6758

In [7]:
n_NR = sum(data.type == 'NR')
n_NR

6646

In [8]:
## Number of data to process
n = n_NR

In [9]:
data.iloc[len(ER):len(ER)+n,]
data_tmp = data.iloc[0:n,].append( data.iloc[len(ER):len(ER)+n,])

In [10]:
data_tmp = data_tmp.reset_index()

## Convert image into array

In [11]:
# convert PIL.Image object to numpy.Array, for training
def img2arr(img):
    return np.asarray(img.getdata(), dtype=np.uint8).reshape(img.height, img.width, -1)

### ER

In [14]:
# Converting ER
cwd = os.getcwd() # get current directory
mypath = cwd+'/idao_dataset/train/ER'
file_list = [cwd+'/idao_dataset/train/ER/'+f for f in listdir(mypath) if isfile(join(mypath, f))]

In [15]:
img_arr_list = []

In [16]:
n= n_ER

In [17]:
%%time 
# img_arr_list = []
for each in file_list[0:n]:    
    img = Image.open(each).convert('LA')
    img_arr_list.append(img2arr(img)[:,:,0]) # ignore the alpha channel

CPU times: user 1h 4min 11s, sys: 2min 42s, total: 1h 6min 53s
Wall time: 1h 7min 11s


In [18]:
er_data = pd.DataFrame([img_arr_list]).transpose().rename(columns={0: "img_array"})
er_data['type_2_ER'] = data_tmp.type_2_ER[0:n]
er_data['energy'] = list(data_tmp.energy[0:n])
er_data.head()

Unnamed: 0,img_array,type_2_ER,energy
0,"[[104, 104, 101, 101, 101, 101, 97, 104, 96, 1...",1,10
1,"[[98, 97, 103, 104, 104, 98, 99, 102, 101, 97,...",1,3
2,"[[102, 97, 100, 96, 100, 97, 94, 110, 97, 100,...",1,10
3,"[[101, 96, 100, 99, 96, 108, 104, 102, 101, 99...",1,30
4,"[[97, 98, 100, 101, 98, 100, 102, 100, 99, 99,...",1,10


In [19]:
%%time
er_data.to_pickle('er_data(1-'+str(n)+').pkl')

CPU times: user 15.8 s, sys: 4.66 s, total: 20.5 s
Wall time: 28.3 s


### NR

In [12]:
%%time 
# Converting NR
cwd = os.getcwd() # get current directory
mypath = cwd+'/idao_dataset/train/NR'
file_list = [cwd+'/idao_dataset/train/NR/'+f for f in listdir(mypath) if isfile(join(mypath, f))]
img_arr_list = []

n= n_NR

for each in file_list[0:n]:    
    img = Image.open(each).convert('LA')
    img_arr_list.append(img2arr(img)[:,:,0]) # ignore the alpha channel

CPU times: user 1h 5min 26s, sys: 3min 31s, total: 1h 8min 57s
Wall time: 1h 9min 30s


In [13]:
nr_data = pd.DataFrame([img_arr_list]).transpose().rename(columns={0: "img_array"})
nr_data['type_2_ER'] = list(data_tmp.type_2_ER[n:])
nr_data['energy'] = list(data_tmp.energy[n:])
nr_data.head()

Unnamed: 0,img_array,type_2_ER,energy
0,"[[96, 103, 98, 100, 100, 100, 108, 96, 99, 100...",0,20
1,"[[96, 104, 104, 98, 98, 100, 97, 103, 100, 100...",0,1
2,"[[101, 98, 106, 99, 99, 97, 97, 100, 102, 104,...",0,6
3,"[[103, 100, 98, 98, 101, 100, 108, 102, 101, 9...",0,1
4,"[[102, 103, 101, 102, 102, 102, 105, 101, 104,...",0,20


In [14]:
%%time
nr_data.to_pickle('nr_data(1-'+str(n)+').pkl')

CPU times: user 11.6 s, sys: 3.76 s, total: 15.4 s
Wall time: 23.8 s
