This notebook is to accomplish dataset foundation together. The two datasets would require same labels, and shuffle in the same order.

In [1]:
import os,sys
from PIL import Image
import numpy as np
import pickle
from tqdm import tqdm
import io
import re
from random import shuffle

# CSV reading/writing
import pandas as pd
import csv
import md5_batch_gen

In [2]:
os.listdir()

['NVSS',
 'Dataset_download_testing.ipynb',
 'FRDEEPv2_tutorial.ipynb',
 '.DS_Store',
 'md5_batch_gen.py',
 'Step2_FRDEEP_Astroquery_Download.ipynb',
 '3_Source_preprocessed_png_image',
 '4_DataPickle_Generation',
 'FIRST',
 'object_id',
 'Step4_Create_FRDEEPv2.ipynb',
 'NVSS_IMG',
 'Dataset_evaluation.ipynb',
 '__pycache__',
 'Step5_FRDEEPv2_loading_test.ipynb',
 'FIRST.tar.gz',
 '1_Source_catalogue_spreadsheets',
 'FRDEEPv2_foundation.py',
 '.ipynb_checkpoints',
 'FRDEEPv2.tar.gz',
 'Step3_FRDEEP_FITS_PNG_conversion.ipynb',
 'NVSS.tar.gz',
 'FIRST_IMG',
 '2_Source_raw_fits_image']

# 0. Functions necessary to deal with the dataset foundation

In [3]:
def get_file_list(datadir,file_format):
    """
        This function returns file names in selected format under given directory.

        Args:
        datadir: the directory where the selected format files are saved.
        file_format: customized file format waiting for seaching.

        Returns:
        file_list: a list of file names under the given directory.
        
    """
    # get list of FITS files:
    file_list = []
    # Iterate files over directory
    for filename in os.listdir(datadir):
        if filename.endswith("."+ file_format):
            name = os.path.join(filename)
            file_list.append(name)
    print('Number of file under the directory: ',len(file_list))
    return file_list

def make_dir(dir_to_make):
    "Make directory if it doesn't exist"
    if not os.path.isdir(dir_to_make):
        os.mkdir(dir_to_make)
        
def image_max_value(image_file,file_dir):
    "check if the selected image has non zero maximum value"
    img_max = 0
    with Image.open(file_dir+image_file) as img:
        img = np.asarray(img)
        if img.max() > 0:
            img_max = 1
        else:
            print(f"Image {image_file} is black.\n")
    return img_max

# ------------------------------------
# Index randomisation
def randomise_by_index(inputlist,idx_list):

    """
       Function to randomize an array of data
    """
    if len(inputlist)!=len(idx_list):
        print("These aren't the same length")
    outputlist = []
    for element in idx_list:
        outputlist.append(inputlist[element])

    return outputlist

# ------------------------------------
# Image normalization
def Image_normalize(restore_dir,filename):
    im = Image.open(restore_dir+filename)
    im = (np.array(im))
    # Normalize to [0,255]
    img_max, img_min = im.max(),im.min()
    if img_max != 0:
        im = (im - img_min) / (img_max - img_min)
        im *= 255.
        
    gray = im[:,:].flatten()
    filedata = np.array(list(gray),np.uint8)
    return filedata    

# 1. Data handling

Directories of **NVSS/FIRST** images

In [4]:
# NVSS/FIRST image folder dir
NVSS_dir = './3_Source_preprocessed_png_image/NVSS/'
FIRST_dir = './3_Source_preprocessed_png_image/FIRST/'
# Saving directory
save_dir = './4_DataPickle_Generation/'


Reading the file name under each directory

In [5]:
NVSS_file_list = get_file_list(NVSS_dir,'png')
FIRST_file_list = get_file_list(FIRST_dir,'png')

Number of file under the directory:  658
Number of file under the directory:  658


Split FR I/II according to their labels [NVSS and FIRST are the same]

In [6]:
# NVSS object file names
FRI_list = [element for element in NVSS_file_list if element[:-4].split('_')[1] == 'I']
FRII_list = [element for element in NVSS_file_list if element[:-4].split('_')[1] == 'II']
# Find the longest object id
longest_id_FRI = max(FRI_list, key=len)
longest_id_FRII = max(FRII_list, key=len)
max_id_len = len(max(longest_id_FRI,longest_id_FRII)) + 1 # + 1 is simply just in cases.
print(max_id_len)

31


Directory to save **FRDEEP v2** dataset pickle files.

In [7]:
# Directory definition
FIRST_save_dir = save_dir + "FIRST/" # FIRST image pickle
NVSS_save_dir = save_dir + "NVSS/" # NVSS image pickle
id_save_dir = save_dir + "object_id/" # object id pickle
# Make directory if not exist
make_dir(FIRST_save_dir)
make_dir(NVSS_save_dir)
make_dir(id_save_dir)

**Hyperparameters**

In [8]:
split_ratio = 0.7

# label names:
label_names = ['FR I','FR II']

# length of data arrays [npix x npix x rgb] 
nvis = 150 * 150 * 1

In [9]:
# FR Is
FRI_train = FRI_list[:int(split_ratio * len(FRI_list))]
FRI_test = FRI_list[int(split_ratio * len(FRI_list)):]
# FR IIs
FRII_train = FRII_list[:int(split_ratio * len(FRII_list))]
FRII_test = FRII_list[int(split_ratio * len(FRII_list)):]

Split train and test data

Now, loop through and fill the batches:

# 2. dataset file/folder generation

In [10]:
full_FIRST_name,full_NVSS_name = [], []
# ------------------------------------
# loop through and fill the batches:
for batch in tqdm(range(2), ascii=True, desc='Batch'):
    
    if (batch==1):
        # the last batch is the test batch:
        oname = "test_batch"
        batch_label = 'testing batch 1 of 1'
        FR1_list = FRI_test
        FR2_list = FRII_test
    else:
        # everything else is a training batch:
        oname = "train_batch"
        batch_label = 'training batch 1 of 1'
        FR1_list = FRI_train
        FR2_list = FRII_train
    # create empty arrays for the batches:
    labels=[]
    filedata_FIRST=[];data_FIRST=[];filenames_FIRST=[]
    filedata_NVSS=[];data_NVSS=[];filenames_NVSS=[]
    file_id = []
    
    # get FRI radio galaxies:
    for i in range(len(FR1_list)):

        # ------------------------------------------
        # FIRST data
        # Get file names in the specific batch
        filename_FIRST = FR1_list[i]
        # Save these filenames in a list in sequence.
        filenames_FIRST.append(filename_FIRST)
        full_FIRST_name.append(filename_FIRST)
        # Normalize the image to [0,255]
        filedata_FIRST = Image_normalize(FIRST_dir,filename_FIRST)
        # Save normalized image data to data list.
        data_FIRST.append(filedata_FIRST)
        # ------------------------------------------
        # NVSS data
        # Get file names in the specific batch
        filename_NVSS = FR1_list[i] # the filename of the same image (NVSS vs. FIRST)is the same.
        # Save these filenames in a list in sequence.
        filenames_NVSS.append(filename_NVSS)
        full_NVSS_name.append(filename_NVSS)
        # Normalize the image to [0,255]
        filedata_NVSS = Image_normalize(NVSS_dir,filename_NVSS)
        # Save normalized image data to data list.
        data_NVSS.append(filedata_NVSS)
        # ------------------------------------------
        # Source object id
        file_id.append("{:<31}".format(FR1_list[i][:-4].split('_')[0]))
        # ------------------------------------------
        # Label them as 0
        labels.append(0)

    # get GRG radio galaxies:
    for i in range(len(FR2_list)):
        # ------------------------------------------
        # FIRST data
        # Get file names in the specific batch
        filename_FIRST = FR2_list[i]
        # Save these filenames in a list in sequence.            
        filenames_FIRST.append(filename_FIRST)
        full_FIRST_name.append(filename_FIRST)
        # Normalize the image to [0,255]
        filedata_FIRST = Image_normalize(FIRST_dir,filename_FIRST)
        # Save normalized image data to data list.
        data_FIRST.append(filedata_FIRST)
        # ------------------------------------------
        # NVSS data
        # Get file names in the specific batch
        filename_NVSS = FR2_list[i]
        # Save these filenames in a list in sequence.
        filenames_NVSS.append(filename_NVSS)
        full_NVSS_name.append(filename_NVSS)
        # Normalize the image to [0,255]
        filedata_NVSS = Image_normalize(NVSS_dir,filename_NVSS)
        # Save normalized image data to data list.
        data_NVSS.append(filedata_NVSS)
        # ------------------------------------------
        # Source object id
        file_id.append("{:<31}".format(FR2_list[i][:-4].split('_')[0]))
        # ------------------------------------------
        # Label them as 1
        labels.append(1) 
    print(f'len of the current batch: {len(file_id)}')
    # randomise data in batch: (the truth is, it does not shuffle the data)
    idx_list = [i for i in range(0,len(file_id))]
    shuffle(idx_list)
    labels = randomise_by_index(labels,idx_list)
    # FIRST
    data_FIRST = randomise_by_index(data_FIRST,idx_list)
    filenames_FIRST = randomise_by_index(filenames_FIRST,idx_list)
    # NVSS
    data_NVSS = randomise_by_index(data_NVSS,idx_list)
    filenames_NVSS = randomise_by_index(filenames_NVSS,idx_list)
    # Source object id
    file_id = randomise_by_index(file_id,idx_list)
    
    # Print testing data names
    for jj in range(5):
        print(file_id[jj],filenames_FIRST[jj],filenames_NVSS[jj],labels[jj])
            
    # create dictionary of FIRST batch:
    dict_FIRST = {
            'batch_label':batch_label,
            'labels':labels,
            'data':data_FIRST,
            'filenames':filenames_FIRST
            }
    dict_NVSS = {
            'batch_label':batch_label,
            'labels':labels,
            'data':data_NVSS,
            'filenames':filenames_NVSS
            }
    # write pickled output for FRDEEP-F:
    with io.open(FIRST_save_dir+oname, 'wb') as f:
        pickle.dump(dict_FIRST, f)
    # write pickled output for FRDEEP-N:
    with io.open(NVSS_save_dir+oname, 'wb') as f:
        pickle.dump(dict_NVSS, f)
    # save object id
    print('# of source object id saved in this batch: ',len(file_id))
    file_id = np.asarray(file_id)
    np.save(id_save_dir+oname,file_id)
# end batch loop
# ------------------------------------
# now write the meta data file:
oname = 'batches.meta'

# ------------------------------------
# create dictionary of batch (FIRST):
dict_FIRST = {
        'label_names':label_names,
        'num_vis':nvis,
        }
# create dictionary of batch:
dict_NVSS = {
        'label_names':label_names,
        'num_vis':nvis,
        }

# ------------------------------------
# write pickled output (FRDEEP-F):
with io.open(FIRST_save_dir+oname, 'wb') as f:
    pickle.dump(dict_FIRST, f)
# write pickled output (FRDEEP-N):
with io.open(NVSS_save_dir+oname, 'wb') as f:
    pickle.dump(dict_NVSS, f)    

print('Dataset foundation finished:)')

Batch:  50%|#####     | 1/2 [00:05<00:05,  5.55s/it]

len of the current batch: 460
3C 207                          3C 207_II.png 3C 207_II.png 1
3C 240                          3C 240_II.png 3C 240_II.png 1
SDSS J121519.19+472142.4        SDSS J121519.19+472142.4_I.png SDSS J121519.19+472142.4_I.png 0
TXS 1409-030                    TXS 1409-030_II.png TXS 1409-030_II.png 1
SDSS J134745.19+503203.5        SDSS J134745.19+503203.5_I.png SDSS J134745.19+503203.5_I.png 0
# of source object id saved in this batch:  460


Batch: 100%|##########| 2/2 [00:07<00:00,  3.95s/it]

len of the current batch: 198
3C 334                          3C 334_II.png 3C 334_II.png 1
4C 10.40                        4C 10.40_II.png 4C 10.40_II.png 1
SDSS J150148.14+163345.6        SDSS J150148.14+163345.6_I.png SDSS J150148.14+163345.6_I.png 0
TXS 1536+144                    TXS 1536+144_II.png TXS 1536+144_II.png 1
BWE 1459+2451                   BWE 1459+2451_II.png BWE 1459+2451_II.png 1
# of source object id saved in this batch:  198
Dataset foundation finished:)





# 3.  tar gz files for selected folder

## 3.1 package import

In [11]:
import tarfile
import os.path

## 3.2 function definition

In [12]:
def make_tarfile(output_filename, source_dir):
    "https://stackoverflow.com/questions/2032403/how-to-create-full-compressed-tar-file-using-python"
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))

## 3.2 Define output file names/ source directories

In [13]:
# FIRST 
output_filename_FIRST = FIRST_save_dir[:-1] + '.tar.gz'
source_dir_FIRST = FIRST_save_dir
make_tarfile(output_filename_FIRST, source_dir_FIRST)
print(output_filename_FIRST)
# NVSS
output_filename_NVSS = NVSS_save_dir[:-1] + '.tar.gz'
source_dir_NVSS = NVSS_save_dir
make_tarfile(output_filename_NVSS, source_dir_NVSS)
print(output_filename_NVSS)
# Save the whole thing as a compressed file
output_filename_FRDEEPv2 = './FRDEEPv2.tar.gz'
make_tarfile(output_filename_FRDEEPv2, save_dir)
print(output_filename_FRDEEPv2)

./4_DataPickle_Generation/FIRST.tar.gz
./4_DataPickle_Generation/NVSS.tar.gz
./FRDEEPv2.tar.gz


# 4. Copy image files to the new upper directory

Directories to copy the images

In [14]:
NVSS_dir_target = save_dir + 'NVSS_IMG/'
FIRST_dir_target = save_dir + 'FIRST_IMG/'
# make directory (if not exist)
make_dir(NVSS_dir_target)
make_dir(FIRST_dir_target)

Image directories to be copied to (origin)

In [15]:
FIRST_file_dir_origin = [FIRST_dir + i for i in full_FIRST_name]
NVSS_file_dir_origin = [NVSS_dir + i for i in full_NVSS_name]

Image directories to get copied (target)

In [16]:
FIRST_file_dir_target = [FIRST_dir_target + i for i in full_FIRST_name]
NVSS_file_dir_target = [NVSS_dir_target + i for i in full_NVSS_name]

Do the copy

In [17]:
import shutil
for kk in range(len(FIRST_file_dir_origin)):
    shutil.copyfile(FIRST_file_dir_origin[kk], FIRST_file_dir_target[kk]) # FIRST image copy
    shutil.copyfile(NVSS_file_dir_origin[kk], NVSS_file_dir_target[kk]) # FIRST image copy

## Generate md5 code for data

### NVSS

In [22]:
meta_NVSS, data_batches_NVSS, test_batch_NVSS = md5_batch_gen.md5_data_batch_gen(NVSS_save_dir[:-1])
print(meta_NVSS, data_batches_NVSS, test_batch_NVSS)
NVSS_tgz_md5 = md5_batch_gen.md5_gen(output_filename_NVSS)
print(NVSS_tgz_md5)

['batches.meta', 'a8bb67d1caf2d0ca9fa501b337e39ea6'] [['train_batch', 'a7bd9f5f3f27f395f51e424a89e48db9']] ['test_batch', '9a46e93a9932f986efc94fddc7de0164']
3b6632b869370e0c678ab69dfe43d4c5


### FIRST

In [24]:
meta_FIRST, data_batches_FIRST, test_batch_FIRST = md5_batch_gen.md5_data_batch_gen(FIRST_save_dir[:-1])
print(meta_FIRST, data_batches_FIRST, test_batch_FIRST)
FIRST_tgz_md5 = md5_batch_gen.md5_gen(output_filename_FIRST)
print(FIRST_tgz_md5)

['batches.meta', 'a8bb67d1caf2d0ca9fa501b337e39ea6'] [['train_batch', '234b66460e95834c78cee7c7a73ed916']] ['test_batch', 'acb0277e9feb983ff8fa79c31c4b395a']
c57d884b8daba9aa5bbce8f383594291


### full

In [25]:
FULL_tgz_md5 = md5_batch_gen.md5_gen(output_filename_FRDEEPv2)
print(FULL_tgz_md5)

45bad6ed4e96ee4685306c013b0953f9
