## image-data-organiser.py

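1. Import the required modules
    
    The 'pandas' module is used to read in .csv files
    The 'os' module is used to access files
    The 'shuffle' function from the 'random' module is used to randomise the order of the samples
    The 'convert_bytes_to_image' module (imported as 'byte2img') and the 'Image' module from PIL are used to generate and save the images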

In [1]:
import pandas as pd
import os, subprocess
from random import shuffle
import convert_bytes_to_image as byte2img
from PIL import Image

2. Store the absolute path of 'trainLabels.csv', the file that contains the malware file labels. It is stored in the same directory as this interactive Python notebook

In [2]:
# Absolute path of the label file, resolved against the current working directory.
train_labels_file = os.path.abspath(os.path.join(os.curdir, 'trainLabels.csv'))

3. Read in the data stored in the csv file

In [None]:
# Load the label table from the csv file into a pandas DataFrame.
train_label_data = pd.read_csv(filepath_or_buffer=train_labels_file)

4. Iterate through the rows in 'trainLabels.csv'
    
    * Group the rows by label number: each row's sample is appended to the list of the malware family that its label identifies

In [4]:
# One list of sample values per malware family; each list is filled by the
# loop below according to the row's class label (1-9).
ramnit_train_set = []
lollipop_train_set = []
kelihos_ver3_set = []
vundo_set = []
simda_set = []
tracur_set = []
kelihos_ver1_set = []
obfuscator_acy_set = []
gatak_set = []

# Dispatch table: class label -> the list collecting that family's samples.
family_sets_by_label = {
    1: ramnit_train_set,
    2: lollipop_train_set,
    3: kelihos_ver3_set,
    4: vundo_set,
    5: simda_set,
    6: tracur_set,
    7: kelihos_ver1_set,
    8: obfuscator_acy_set,
    9: gatak_set,
}

# DataFrame.get_values() was removed in pandas 1.0; `.values` yields the same
# row-wise array and is available in every pandas release.
# Each row is unpacked as a (value, label) pair, so the csv is expected to
# have exactly two columns.
for value, label in train_label_data.values:
    family_set = family_sets_by_label.get(label)
    if family_set is None:
        # Report the offending sample. The original referenced an undefined
        # name here and would have raised NameError instead of printing.
        print('Does not belong to any class: ', value)
    else:
        family_set.append(value)

   * Print the number of samples available to train the neural network model for each malware family

In [5]:
# Report how many samples each malware family contributes, then the grand total.
family_counts = [
    ('ramnit_train_set:   --', len(ramnit_train_set)),
    ('lollipop_train_set: --', len(lollipop_train_set)),
    ('kelihos_ver3_set:   --', len(kelihos_ver3_set)),
    ('vundo_set:          --', len(vundo_set)),
    ('simda_set:          --', len(simda_set)),
    ('tracur_set:         --', len(tracur_set)),
    ('kelihos_ver1_set:   --', len(kelihos_ver1_set)),
    ('obfuscator_acy_set: --', len(obfuscator_acy_set)),
    ('gatak_set:          --', len(gatak_set)),
]
for caption, count in family_counts:
    print(caption, count)
print('\n')
print('Total:              --', str(sum(count for _, count in family_counts)))

ramnit_train_set:   -- 1541
lollipop_train_set: -- 2478
kelihos_ver3_set:   -- 2942
vundo_set:          -- 475
simda_set:          -- 42
tracur_set:         -- 751
kelihos_ver1_set:   -- 398
obfuscator_acy_set: -- 1228
gatak_set:          -- 1013


Total:              -- 10868


5. Copy the training sets before shuffling (shuffling modifies a list in place, so working on copies keeps the original lists unchanged)

In [6]:
# Shallow copies of every family list, so that reordering the copies leaves
# the original per-family lists untouched.
ramnit_train_shuffled = list(ramnit_train_set)
lollipop_train_shuffled = list(lollipop_train_set)
kelihos_ver3_shuffled = list(kelihos_ver3_set)
vundo_shuffled = list(vundo_set)
simda_shuffled = list(simda_set)
tracur_shuffled = list(tracur_set)
kelihos_ver1_shuffled = list(kelihos_ver1_set)
obfuscator_acy_shuffled = list(obfuscator_acy_set)
gatak_shuffled = list(gatak_set)

6. Shuffle the lists

In [7]:
# Randomise the order of every copied family list in place, one list at a
# time and in the same family order as the lists were created.
for family_samples in (
    ramnit_train_shuffled,
    lollipop_train_shuffled,
    kelihos_ver3_shuffled,
    vundo_shuffled,
    simda_shuffled,
    tracur_shuffled,
    kelihos_ver1_shuffled,
    obfuscator_acy_shuffled,
    gatak_shuffled,
):
    shuffle(family_samples)

7. Select the first 5 samples from each shuffled list

In [8]:
# Number of samples kept per malware family.
SAMPLES_PER_FAMILY = 5

# The lists are already shuffled, so taking the leading slice of each one
# yields a random selection for that family.
ramnit_train_rand = ramnit_train_shuffled[:SAMPLES_PER_FAMILY]
lollipop_train_rand = lollipop_train_shuffled[:SAMPLES_PER_FAMILY]
kelihos_ver3_rand = kelihos_ver3_shuffled[:SAMPLES_PER_FAMILY]
vundo_rand = vundo_shuffled[:SAMPLES_PER_FAMILY]
simda_rand = simda_shuffled[:SAMPLES_PER_FAMILY]
tracur_rand = tracur_shuffled[:SAMPLES_PER_FAMILY]
kelihos_ver1_rand = kelihos_ver1_shuffled[:SAMPLES_PER_FAMILY]
obfuscator_acy_rand = obfuscator_acy_shuffled[:SAMPLES_PER_FAMILY]
gatak_rand = gatak_shuffled[:SAMPLES_PER_FAMILY]

8. Store the path to the image directories and dataset directory
    
    * If the image directories do not exist, create them so that the generated images can be stored

In [9]:
# Root folder for the generated images, resolved against the current working
# directory.
image_dir = os.path.abspath('./images')
# makedirs(exist_ok=True) creates the folder only when it is missing, without
# the race between a separate existence check and the mkdir call.
os.makedirs(image_dir, exist_ok=True)

# Folder that holds the '<sample>.bytes' files read when generating images.
# NOTE(review): this is an absolute path from the filesystem root, unlike the
# relative './images' above -- confirm that '/data/train' is intended.
data_dir = os.path.abspath('/data/train')

In [None]:
# One output folder per malware family, named '<label>_<family>' and located
# under image_dir.
ramnit_image_dir = os.path.join(image_dir, '1_ramnit')
lollipop_image_dir = os.path.join(image_dir, '2_lollipop')
kelihos_ver3_image_dir = os.path.join(image_dir, '3_kelihos_ver3')
vundo_image_dir = os.path.join(image_dir, '4_vundo')
simda_image_dir = os.path.join(image_dir, '5_simda')
tracur_image_dir = os.path.join(image_dir, '6_tracur')
kelihos_ver1_image_dir = os.path.join(image_dir, '7_kelihos_ver1')
obfuscator_acy_image_dir = os.path.join(image_dir, '8_obfuscator_acy')
gatak_image_dir = os.path.join(image_dir, '9_gatak')

# Create any folder that is missing. makedirs(exist_ok=True) avoids the race
# between a separate existence check and mkdir, and also creates image_dir
# itself if it does not exist yet.
for family_image_dir in (
    ramnit_image_dir,
    lollipop_image_dir,
    kelihos_ver3_image_dir,
    vundo_image_dir,
    simda_image_dir,
    tracur_image_dir,
    kelihos_ver1_image_dir,
    obfuscator_acy_image_dir,
    gatak_image_dir,
):
    os.makedirs(family_image_dir, exist_ok=True)


9. Generate the images for each of the samples chosen and store them into their directories

In [10]:
# Convert every selected Ramnit sample's .bytes file into an image and save
# it as 'ramnit-<sample>.png' in the Ramnit image folder.
for sample_id in ramnit_train_rand:
    bytes_path = os.path.join(data_dir, sample_id + '.bytes')
    png_path = os.path.join(ramnit_image_dir, 'ramnit-' + sample_id + '.png')
    Image.fromarray(byte2img.convert(bytes_path)).save(png_path)
    

In [11]:
# Convert every selected Lollipop sample's .bytes file into an image and save
# it as 'lollipop-<sample>.png' in the Lollipop image folder.
for sample_id in lollipop_train_rand:
    bytes_path = os.path.join(data_dir, sample_id + '.bytes')
    png_path = os.path.join(lollipop_image_dir, 'lollipop-' + sample_id + '.png')
    Image.fromarray(byte2img.convert(bytes_path)).save(png_path)


In [12]:
# Convert every selected Kelihos ver3 sample's .bytes file into an image and
# save it as 'kelihos-ver3-<sample>.png' in the Kelihos ver3 image folder.
for sample_id in kelihos_ver3_rand:
    bytes_path = os.path.join(data_dir, sample_id + '.bytes')
    png_path = os.path.join(kelihos_ver3_image_dir, 'kelihos-ver3-' + sample_id + '.png')
    Image.fromarray(byte2img.convert(bytes_path)).save(png_path)
    

In [13]:
# Convert every selected Vundo sample's .bytes file into an image and save it
# as 'vundo-<sample>.png' in the Vundo image folder.
for sample_id in vundo_rand:
    bytes_path = os.path.join(data_dir, sample_id + '.bytes')
    png_path = os.path.join(vundo_image_dir, 'vundo-' + sample_id + '.png')
    Image.fromarray(byte2img.convert(bytes_path)).save(png_path)


In [14]:
# Convert every selected Simda sample's .bytes file into an image and save it
# as 'simda-<sample>.png' in the Simda image folder.
for sample_id in simda_rand:
    bytes_path = os.path.join(data_dir, sample_id + '.bytes')
    png_path = os.path.join(simda_image_dir, 'simda-' + sample_id + '.png')
    Image.fromarray(byte2img.convert(bytes_path)).save(png_path)


In [15]:
# Convert every selected Tracur sample's .bytes file into an image and save it
# as 'tracur-<sample>.png' in the Tracur image folder.
for sample_id in tracur_rand:
    bytes_path = os.path.join(data_dir, sample_id + '.bytes')
    png_path = os.path.join(tracur_image_dir, 'tracur-' + sample_id + '.png')
    Image.fromarray(byte2img.convert(bytes_path)).save(png_path)


In [16]:
# Convert every selected Kelihos ver1 sample's .bytes file into an image and
# save it as 'kelihos-ver1-<sample>.png' in the Kelihos ver1 image folder.
for sample_id in kelihos_ver1_rand:
    bytes_path = os.path.join(data_dir, sample_id + '.bytes')
    png_path = os.path.join(kelihos_ver1_image_dir, 'kelihos-ver1-' + sample_id + '.png')
    Image.fromarray(byte2img.convert(bytes_path)).save(png_path)


In [17]:
# Convert every selected Obfuscator.ACY sample's .bytes file into an image and
# save it as 'obfuscator-acy-<sample>.png' in the Obfuscator.ACY image folder.
for sample_id in obfuscator_acy_rand:
    bytes_path = os.path.join(data_dir, sample_id + '.bytes')
    png_path = os.path.join(obfuscator_acy_image_dir, 'obfuscator-acy-' + sample_id + '.png')
    Image.fromarray(byte2img.convert(bytes_path)).save(png_path)


In [18]:
# Convert every selected Gatak sample's .bytes file into an image and save it
# as 'gatak-<sample>.png' in the Gatak image folder.
for sample_id in gatak_rand:
    bytes_path = os.path.join(data_dir, sample_id + '.bytes')
    png_path = os.path.join(gatak_image_dir, 'gatak-' + sample_id + '.png')
    Image.fromarray(byte2img.convert(bytes_path)).save(png_path)
    