In [1]:
# General data manipulation
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import cv2
import os
from os import listdir
from os.path import isfile, join 
import sys
import time
from PIL import Image # pip install pillow
from keras.utils import np_utils

Using TensorFlow backend.


# Training data
## Text Data

In [2]:
# get train file names and convert to dataframe
# you may need to change your working directory first:
# os.chdir('/your_path')

# List the raw training image files of each class.
ER_file_names = os.listdir('idao_dataset/train/ER/')
NR_file_names = os.listdir('idao_dataset/train/NR/')

# Split every file name on '_' and strip the ';1.png' suffix and the 'ev' marker
# from the pieces, which gives one row of name fragments per image.
ER = pd.DataFrame([[y.replace(';1.png','').replace('ev','') for y in x.split('_')] for x in ER_file_names])
ER['path'] = ER_file_names
NR = pd.DataFrame([[y.replace(';1.png','').replace('ev','') for y in x.split('_')] for x in NR_file_names])
NR['path'] = NR_file_names

# Keep only the fragments of interest and give them names. The positional column
# numbers differ between ER and NR because the two file-name layouts differ.
ER = ER[[5,6,0,15,16,'path']].rename(columns={5:'type',6:'energy',0:'num',15:'run',16:'ev'})
NR = NR[[6,7,0,17,18,'path']].rename(columns={6:'type',7:'energy',0:'num',17:'run',18:'ev'})

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same stacked frame (ER rows first, then NR rows) with a fresh 0..N-1 index.
data = pd.concat([ER, NR], ignore_index=True)
# data

In [3]:
# Inspect the combined ER + NR file-name metadata
data

Unnamed: 0,type,energy,num,run,ev,path
0,ER,10,0.5041503310203552,run4,272,0.5041503310203552__CYGNO_60_40_ER_10_keV_930V...
1,ER,3,-0.13757403194904327,run3,449,-0.13757403194904327__CYGNO_60_40_ER_3_keV_930...
2,ER,10,0.5042710304260254,run5,882,0.5042710304260254__CYGNO_60_40_ER_10_keV_930V...
3,ER,30,-1.4947187900543213,run3,660,-1.4947187900543213__CYGNO_60_40_ER_30_keV_930...
4,ER,10,1.562488317489624,run2,820,1.562488317489624__CYGNO_60_40_ER_10_keV_930V_...
...,...,...,...,...,...,...
13399,NR,1,-2.8066000938415527,run5,207,-2.8066000938415527__CYGNO_60_40_He_NR_1_keV_9...
13400,NR,6,2.858799934387207,run1,748,2.858799934387207__CYGNO_60_40_He_NR_6_keV_930...
13401,NR,6,3.1136999130249023,run1,809,3.1136999130249023__CYGNO_60_40_He_NR_6_keV_93...
13402,NR,20,2.88070011138916,run5,723,2.88070011138916__CYGNO_60_40_He_NR_20_keV_930...


In [4]:
# Binary target column: 1 where the row's type is 'ER', 0 otherwise
data['type_2_ER'] = 1 * data['type'].eq('ER')

In [5]:
# Re-display the metadata, now including the type_2_ER column
data

Unnamed: 0,type,energy,num,run,ev,path,type_2_ER
0,ER,10,0.5041503310203552,run4,272,0.5041503310203552__CYGNO_60_40_ER_10_keV_930V...,1
1,ER,3,-0.13757403194904327,run3,449,-0.13757403194904327__CYGNO_60_40_ER_3_keV_930...,1
2,ER,10,0.5042710304260254,run5,882,0.5042710304260254__CYGNO_60_40_ER_10_keV_930V...,1
3,ER,30,-1.4947187900543213,run3,660,-1.4947187900543213__CYGNO_60_40_ER_30_keV_930...,1
4,ER,10,1.562488317489624,run2,820,1.562488317489624__CYGNO_60_40_ER_10_keV_930V_...,1
...,...,...,...,...,...,...,...
13399,NR,1,-2.8066000938415527,run5,207,-2.8066000938415527__CYGNO_60_40_He_NR_1_keV_9...,0
13400,NR,6,2.858799934387207,run1,748,2.858799934387207__CYGNO_60_40_He_NR_6_keV_930...,0
13401,NR,6,3.1136999130249023,run1,809,3.1136999130249023__CYGNO_60_40_He_NR_6_keV_93...,0
13402,NR,20,2.88070011138916,run5,723,2.88070011138916__CYGNO_60_40_He_NR_20_keV_930...,0


In [6]:
# Number of rows whose type is 'ER'
er_mask = data['type'] == 'ER'
n_ER = int(er_mask.sum())
n_ER

6758

In [7]:
# Number of rows whose type is 'NR'
nr_mask = data['type'] == 'NR'
n_NR = int(nr_mask.sum())
n_NR

6646

In [8]:
## Number of rows to take from each class: the NR count
## (the smaller of the two class counts displayed above)
n = n_NR

In [9]:
# Subset with n rows per class: the first n ER rows followed by the first n NR rows.
# NR rows start at position len(ER) because `data` was built as ER rows then NR rows.
# (The original cell also evaluated the NR slice on its own line; that statement had
# no effect and is dropped. pd.concat replaces DataFrame.append, removed in pandas 2.0.)
data_tmp = pd.concat([data.iloc[0:n], data.iloc[len(ER):len(ER)+n]])

In [10]:
# Renumber the rows 0..len-1. Without drop=True the previous row labels are kept
# in a new 'index' column.
data_tmp = data_tmp.reset_index()

## Convert image into array

In [11]:
# convert PIL.Image object to numpy.Array, for training
def img2arr(img):
    """Return the pixel data of ``img`` as a uint8 array shaped (height, width, k).

    ``img`` must expose ``getdata()``, ``height`` and ``width`` (as PIL images do).
    The trailing axis length ``k`` is inferred by the reshape from the amount of
    data returned by ``getdata()``.
    """
    flat_pixels = np.asarray(img.getdata(), dtype=np.uint8)
    target_shape = (img.height, img.width, -1)
    return flat_pixels.reshape(target_shape)

### ER

In [14]:
# Converting ER: collect the absolute path of every regular file in the ER training folder
cwd = os.getcwd()  # directory the notebook is running from
mypath = cwd+'/idao_dataset/train/ER'
file_list = [
    cwd+'/idao_dataset/train/ER/'+entry
    for entry in listdir(mypath)
    if isfile(join(mypath, entry))
]

In [15]:
# Accumulator for the converted ER image arrays (filled by the conversion loop below)
img_arr_list = []

In [16]:
# Convert every ER file: n is the exclusive end of the file_list slice used below
n= n_ER

In [17]:
%%time 
# img_arr_list is initialised in its own cell above; this loop only appends to it.
for img_path in file_list[:n]:
    la_image = Image.open(img_path).convert('LA')
    first_band = img2arr(la_image)[:, :, 0]  # ignore the alpha channel
    img_arr_list.append(first_band)

CPU times: user 1h 4min 11s, sys: 2min 42s, total: 1h 6min 53s
Wall time: 1h 7min 11s


In [18]:
# One row per converted ER image; each cell of 'img_array' holds a whole 2-D array.
er_data = pd.DataFrame([img_arr_list]).transpose().rename(columns={0: "img_array"})

# Take the labels/energies from the ER rows of `data`, not from data_tmp:
# data_tmp contains only n_NR ER rows before its NR rows begin, so slicing it with
# n = n_ER would attach NR labels and energies to the last ER images.
er_meta = data[data.type == 'ER'].reset_index(drop=True)

# Assign as plain lists so the values are matched by position and a length
# mismatch raises instead of being silently padded with NaN.
er_data['type_2_ER'] = list(er_meta.type_2_ER[0:n])
er_data['energy'] = list(er_meta.energy[0:n])
er_data.head()

Unnamed: 0,img_array,type_2_ER,energy
0,"[[104, 104, 101, 101, 101, 101, 97, 104, 96, 1...",1,10
1,"[[98, 97, 103, 104, 104, 98, 99, 102, 101, 97,...",1,3
2,"[[102, 97, 100, 96, 100, 97, 94, 110, 97, 100,...",1,10
3,"[[101, 96, 100, 99, 96, 108, 104, 102, 101, 99...",1,30
4,"[[97, 98, 100, 101, 98, 100, 102, 100, 99, 99,...",1,10


In [19]:
%%time
# Persist the ER frame; the file name records the number of images it contains
er_data.to_pickle('er_data(1-{}).pkl'.format(n))

CPU times: user 15.8 s, sys: 4.66 s, total: 20.5 s
Wall time: 28.3 s


### NR

In [12]:
%%time 
# Converting NR: collect the absolute path of every regular file in the NR training folder
cwd = os.getcwd()  # directory the notebook is running from
mypath = cwd+'/idao_dataset/train/NR'
file_list = [
    cwd+'/idao_dataset/train/NR/'+entry
    for entry in listdir(mypath)
    if isfile(join(mypath, entry))
]

# Convert every NR file
n= n_NR

img_arr_list = []
for img_path in file_list[:n]:
    la_image = Image.open(img_path).convert('LA')
    first_band = img2arr(la_image)[:, :, 0]  # ignore the alpha channel
    img_arr_list.append(first_band)

CPU times: user 1h 5min 26s, sys: 3min 31s, total: 1h 8min 57s
Wall time: 1h 9min 30s


In [13]:
# One row per converted NR image; each cell of 'img_array' holds a whole 2-D array.
nr_data = pd.DataFrame([img_arr_list]).T.rename(columns={0: "img_array"})

# Rows from position n onwards in data_tmp supply the NR metadata; values are
# attached as plain lists, i.e. by position rather than by index label.
nr_labels = data_tmp['type_2_ER'][n:]
nr_energies = data_tmp['energy'][n:]
nr_data['type_2_ER'] = nr_labels.tolist()
nr_data['energy'] = nr_energies.tolist()
nr_data.head()

Unnamed: 0,img_array,type_2_ER,energy
0,"[[96, 103, 98, 100, 100, 100, 108, 96, 99, 100...",0,20
1,"[[96, 104, 104, 98, 98, 100, 97, 103, 100, 100...",0,1
2,"[[101, 98, 106, 99, 99, 97, 97, 100, 102, 104,...",0,6
3,"[[103, 100, 98, 98, 101, 100, 108, 102, 101, 9...",0,1
4,"[[102, 103, 101, 102, 102, 102, 105, 101, 104,...",0,20


In [14]:
%%time
# Persist the NR frame; the file name records the number of images it contains
nr_data.to_pickle('nr_data(1-{}).pkl'.format(n))

CPU times: user 11.6 s, sys: 3.76 s, total: 15.4 s
Wall time: 23.8 s


# Private test data

In [11]:
# Collect the private-test file names into a one-column dataframe ('path').
# You may need to change your working directory first:
# os.chdir('/your_path')

private_test_file_names = os.listdir('idao_dataset/private_test/')

PT_data = pd.DataFrame(list(private_test_file_names)).rename(columns={0: "path"})

PT_data.head(3)

Unnamed: 0,path
0,db4c165bce2379ccbaa8a1d5eb580b214a0c2888.png
1,920ef66abe7321b517824ac780226c424d238a05.png
2,9bdb427f3cadd75e689e17a8b407fe09231c2bd4.png


In [8]:
# convert PIL.Image object to numpy.Array, for training
def img2arr(img):
    """Return the pixel data of ``img`` as a uint8 array shaped (height, width, k).

    ``img`` must expose ``getdata()``, ``height`` and ``width`` (as PIL images do).
    The trailing axis length ``k`` is inferred by the reshape from the amount of
    data returned by ``getdata()``.
    """
    flat_pixels = np.asarray(img.getdata(), dtype=np.uint8)
    target_shape = (img.height, img.width, -1)
    return flat_pixels.reshape(target_shape)

In [9]:
# Converting the private test set: collect the absolute path of every regular file in its folder
cwd = os.getcwd()  # directory the notebook is running from
mypath = cwd+'/idao_dataset/private_test'
file_list = [
    cwd+'/idao_dataset/private_test/'+entry
    for entry in listdir(mypath)
    if isfile(join(mypath, entry))
]

In [10]:
# Accumulator for the converted private-test image arrays (filled by the loop below)
img_arr_list = []

In [15]:
# Bounds of the first private-test batch: n is the exclusive end index used to
# slice file_list, and both values appear in the pickle file name written later.
start = 0
n = 4000

In [13]:
%%time 
# img_arr_list is initialised in its own cell above; this loop only appends to it.
for img_path in file_list[:n]:
    la_image = Image.open(img_path).convert('LA')
    first_band = img2arr(la_image)[:, :, 0]  # ignore the alpha channel
    img_arr_list.append(first_band)

CPU times: user 37min 35s, sys: 1min 14s, total: 38min 49s
Wall time: 40min 58s


In [21]:
# One row per converted private-test image; each 'img_array' cell holds a whole 2-D array.
PT_df = pd.DataFrame([img_arr_list]).transpose().rename(columns={0: "img_array"})
# Attach the file names by position. Assigning the sliced Series directly aligns on
# index labels, which only matches PT_df's 0..len-1 index when start == 0; a plain
# list is correct for any start and raises if the lengths ever disagree.
PT_df['id'] = PT_data['path'][start:n].tolist()
PT_df.head()

Unnamed: 0,img_array,id
0,"[[103, 109, 99, 103, 101, 99, 102, 101, 101, 9...",db4c165bce2379ccbaa8a1d5eb580b214a0c2888.png
1,"[[99, 100, 100, 102, 99, 99, 99, 104, 103, 105...",920ef66abe7321b517824ac780226c424d238a05.png
2,"[[96, 100, 101, 100, 102, 101, 100, 97, 99, 10...",9bdb427f3cadd75e689e17a8b407fe09231c2bd4.png
3,"[[93, 97, 103, 103, 100, 102, 107, 98, 102, 10...",2d3950a3f7099544d78d8fbaecfaf1cbe1ad0c94.png
4,"[[102, 98, 101, 101, 99, 100, 102, 103, 99, 96...",58f5d7531a28974837c961f516bce9ea61df4703.png


In [22]:
%%time
# Persist this batch; the file name records the slice bounds it covers
PT_df.to_pickle('private_df({}-{}).pkl'.format(start, n))

CPU times: user 6.74 s, sys: 2.23 s, total: 8.97 s
Wall time: 10.7 s


In [6]:
# Total number of private-test file paths collected in file_list
len(file_list)

15062

In [17]:
%%time 

# Second private-test batch: convert file_list[start:n] and pickle it with its ids.
start = 4000
n = 8000
count = 0
img_arr_list = []
for each in file_list[start:n]:
    img = Image.open(each).convert('LA')
    img_arr_list.append(img2arr(img)[:,:,0]) # ignore the alpha channel
    count +=1
    if count%500 == 0:
        print(count)  # progress marker every 500 images

PT_df = pd.DataFrame([img_arr_list]).transpose().rename(columns={0: "img_array"})
# PT_df is indexed 0..len-1 while the slice below keeps labels start..n-1; assigning
# the Series directly aligns on those labels and fills 'id' with NaN. A plain list is
# attached by position instead (and raises if the lengths disagree).
PT_df['id'] = PT_data['path'][start:n].tolist()

PT_df.to_pickle('private_df('+str(start)+'-'+str(n)+').pkl')

500
1000
1500
2000
2500
3000
3500
4000
CPU times: user 31min 59s, sys: 1min 15s, total: 33min 15s
Wall time: 33min 25s


In [18]:
%%time 

# Third private-test batch: convert file_list[start:n] and pickle it with its ids.
start = 8000
n = 12000
count = 0
img_arr_list = []
for each in file_list[start:n]:
    img = Image.open(each).convert('LA')
    img_arr_list.append(img2arr(img)[:,:,0]) # ignore the alpha channel
    count +=1
    if count%500 == 0:
        print(count)  # progress marker every 500 images

PT_df = pd.DataFrame([img_arr_list]).transpose().rename(columns={0: "img_array"})
# PT_df is indexed 0..len-1 while the slice below keeps labels start..n-1; assigning
# the Series directly aligns on those labels and fills 'id' with NaN. A plain list is
# attached by position instead (and raises if the lengths disagree).
PT_df['id'] = PT_data['path'][start:n].tolist()

PT_df.to_pickle('private_df('+str(start)+'-'+str(n)+').pkl')

500
1000
1500
2000
2500
3000
3500
4000
CPU times: user 37min 42s, sys: 12.6 s, total: 37min 55s
Wall time: 38min 26s


In [19]:
%%time 

# Last private-test batch: convert file_list[start:n] and pickle it with its ids.
# NOTE(review): an earlier cell reported len(file_list) == 15062, so an end index of
# 15061 leaves the final entry unprocessed - confirm whether that is intentional.
# The bound is left unchanged because the resulting pickle name is read back later.
start = 12000
n = 15061
count = 0
img_arr_list = []
for each in file_list[start:n]:
    img = Image.open(each).convert('LA')
    img_arr_list.append(img2arr(img)[:,:,0]) # ignore the alpha channel
    count +=1
    if count%500 == 0:
        print(count)  # progress marker every 500 images

PT_df = pd.DataFrame([img_arr_list]).transpose().rename(columns={0: "img_array"})
# PT_df is indexed 0..len-1 while the slice below keeps labels start..n-1; assigning
# the Series directly aligns on those labels and fills 'id' with NaN. A plain list is
# attached by position instead (and raises if the lengths disagree).
PT_df['id'] = PT_data['path'][start:n].tolist()

PT_df.to_pickle('private_df('+str(start)+'-'+str(n)+').pkl')

500
1000
1500
2000
2500
3000
CPU times: user 35min 30s, sys: 5.04 s, total: 35min 35s
Wall time: 35min 59s


In [2]:
%time
private_1_df = pd.read_pickle('private_df(0-4000).pkl')
private_2_df = pd.read_pickle('private_df(4000-8000).pkl')
private_3_df = pd.read_pickle('private_df(8000-12000).pkl')
private_4_df = pd.read_pickle('private_df(12000-15061).pkl')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [3]:
# Preview the first three rows of the reloaded 0-4000 batch
private_1_df.head(3)

Unnamed: 0,img_array,id
0,"[[103, 109, 99, 103, 101, 99, 102, 101, 101, 9...",db4c165bce2379ccbaa8a1d5eb580b214a0c2888
1,"[[99, 100, 100, 102, 99, 99, 99, 104, 103, 105...",920ef66abe7321b517824ac780226c424d238a05
2,"[[96, 100, 101, 100, 102, 101, 100, 97, 99, 10...",9bdb427f3cadd75e689e17a8b407fe09231c2bd4


In [4]:
# Preview the first three rows of the reloaded 4000-8000 batch
private_2_df.head(3)

Unnamed: 0,img_array,id
0,"[[100, 107, 100, 99, 101, 101, 101, 100, 98, 9...",f956a847e0ed460691e42f7898dddb36b447246e
1,"[[101, 94, 100, 98, 99, 100, 97, 101, 99, 100,...",16dc7fd7ed590d09732d89e43f4af223251254a0
2,"[[102, 98, 102, 101, 102, 98, 99, 102, 100, 98...",4c69933ebe4898e6ddc8cdef22f9910352d7dd86


In [5]:
# Preview the first three rows of the reloaded 8000-12000 batch
private_3_df.head(3)

Unnamed: 0,img_array,id
0,"[[102, 100, 100, 100, 97, 101, 108, 93, 99, 10...",ec58e5b0992b3ceac494293c6eb5852f5ec841e5
1,"[[96, 102, 102, 97, 97, 99, 108, 101, 102, 100...",543697fed4ad67af609401ca1bdc6cfc8b8d98e2
2,"[[101, 98, 99, 102, 97, 96, 98, 98, 99, 99, 10...",f4adc32c489998d83ad8001db1bbdc6fbcfa26b3


In [6]:
# Preview the first three rows of the reloaded 12000-15061 batch
private_4_df.head(3)

Unnamed: 0,img_array,id
0,"[[101, 103, 99, 101, 103, 102, 100, 100, 99, 1...",80e8f379f501ee20992ad21976ea8e453ef6be7a
1,"[[101, 105, 101, 99, 100, 102, 101, 101, 102, ...",0e62cfba0f927045ad1bdcfc6789a945852ed70a
2,"[[103, 105, 98, 103, 97, 95, 97, 100, 102, 99,...",3447b8082f5eb23b1e695d95f21b4b0927479e83


In [7]:
# private_1_df['id'] = private_1_df.id.apply(lambda x: x[:-4])
# private_2_df['id'] = private_2_df.id.apply(lambda x: x[:-4])
# private_3_df['id'] = private_3_df.id.apply(lambda x: x[:-4])
# private_4_df['id'] = private_4_df.id.apply(lambda x: x[:-4])

In [13]:
# private_1_df.to_pickle('private_df(0-4000).pkl')
# private_2_df.to_pickle('private_df(4000-8000).pkl')
# private_3_df.to_pickle('private_df(8000-12000).pkl')
# private_4_df.to_pickle('private_df(12000-15061).pkl')

--------

In [26]:
# Stack the four batches in order. DataFrame.append was removed in pandas 2.0, and
# pd.concat builds the result in a single pass instead of three chained copies.
# reset_index() (without drop=True) still keeps each batch's old row label in an
# 'index' column, exactly as the chained append version did.
private_all_df = pd.concat([private_1_df, private_2_df, private_3_df, private_4_df]).reset_index()

In [33]:
# Drop the '.png' extension only when it is present. The unconditional x[:-4] cut four
# characters off ids that were already stripped (the reloaded batches above display ids
# without the extension) and did so again every time the cell was re-run.
private_all_df['id'] = private_all_df.id.apply(lambda x: x[:-4] if x.endswith('.png') else x)

In [36]:
# Keep only the id and image columns, in that order
private_all_df = private_all_df.loc[:, ['id', 'img_array']]

In [None]:
# Persist the merged private-test frame
private_all_df.to_pickle(path='private_df_all.pkl')

In [4]:
# Reload the merged private-test frame from disk.
# NOTE(review): the recorded run of this cell raised `EOFError: Ran out of input`,
# and the cell that writes this file shows no execution count - presumably the
# pickle on disk was empty or incomplete; re-run the write cell and confirm.
private_all_df = pd.read_pickle('private_df_all.pkl')

EOFError: Ran out of input

In [None]:
# Preview the first three rows of the merged private-test frame
private_all_df.head(3)