In [1]:
import numpy as np 
import pandas as pd 
import random

import cv2
import os


### 1. Preparing training and test csv files

In [3]:
root_dir = r'../'
images_dir = os.path.join(root_dir,'db','malevis_train')

data_dir_list = os.listdir(images_dir)
print ('the data list is: ',data_dir_list)


the data list is:  ['Vilsel', 'Androm', 'Snarasite', 'MultiPlug', 'Hlux', 'VBA', 'Regrun', 'Injector', 'BrowseFox', 'Fasong', 'Allaple', 'Agent', 'Amonetize', 'Other', 'VBKrypt', 'HackKMS', 'Dinwod', 'Adposhel', 'Neshta', 'Autorun', 'InstallCore', 'Sality', 'Neoreklami', 'Stantinko', 'Elex', 'Expiro']


In [4]:
# Assigning labels to each flower category
num_classes = len(data_dir_list)
labels_name = {}
# Preenchendo o dicionário com a classe como chave e o índice como valor
for idx, class_name in enumerate(data_dir_list):
    labels_name[class_name] = idx
    
# Imprimindo o número de classes e o dicionário de mapeamento
print("Número de classes: ", num_classes)
print("Classes mapeadas: ", labels_name)

Número de classes:  26
Classes mapeadas:  {'Vilsel': 0, 'Androm': 1, 'Snarasite': 2, 'MultiPlug': 3, 'Hlux': 4, 'VBA': 5, 'Regrun': 6, 'Injector': 7, 'BrowseFox': 8, 'Fasong': 9, 'Allaple': 10, 'Agent': 11, 'Amonetize': 12, 'Other': 13, 'VBKrypt': 14, 'HackKMS': 15, 'Dinwod': 16, 'Adposhel': 17, 'Neshta': 18, 'Autorun': 19, 'InstallCore': 20, 'Sality': 21, 'Neoreklami': 22, 'Stantinko': 23, 'Elex': 24, 'Expiro': 25}


In [5]:
# Criação dos DataFrames
train_data = []
test_data = []

# Número de imagens para o conjunto de teste de cada categoria de malware
num_images_for_test = 60

# Loop sobre cada categoria de malware
for dataset in data_dir_list:
    # Carregar a lista de nomes de imagem em cada categoria
    img_list = os.listdir(os.path.join(images_dir, dataset))
    print('Loading the images of dataset-' + '{}\n'.format(dataset))
    label = labels_name[dataset]
    num_img_files = len(img_list)
    num_corrupted_files = 0
    
    # Certifique-se de que não tente amostrar mais imagens do que existem
    num_images_for_test = min(num_images_for_test, num_img_files - 1)
    test_list_index = random.sample(range(num_img_files), num_images_for_test)

    # Ler cada arquivo
    for i in range(num_img_files):
        img_name = img_list[i]
        img_filename = os.path.join(images_dir, dataset, img_name)
        
        try:
            input_img = cv2.imread(img_filename)
            if input_img is None:
                raise ValueError("Image is None")  # Lança erro se a imagem não puder ser lida
            
            if i in test_list_index:
                test_data.append({'FileName': img_filename, 'Label': label, 'ClassName': dataset})
            else:
                train_data.append({'FileName': img_filename, 'Label': label, 'ClassName': dataset})
                
        except Exception as e:
            print(f'{img_filename} is corrupted or not readable: {e}\n')
            num_corrupted_files += 1
    
    print('Read {0} images out of {1} images from data dir {2}\n'.format(num_img_files - num_corrupted_files, num_img_files, dataset))

# Criação dos DataFrames finais
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

print('Completed reading all the image files and assigned labels accordingly')


Loading the images of dataset-Vilsel

Read 350 images out of 350 images from data dir Vilsel

Loading the images of dataset-Androm

Read 350 images out of 350 images from data dir Androm

Loading the images of dataset-Snarasite

Read 350 images out of 350 images from data dir Snarasite

Loading the images of dataset-MultiPlug

Read 350 images out of 350 images from data dir MultiPlug

Loading the images of dataset-Hlux

Read 350 images out of 350 images from data dir Hlux

Loading the images of dataset-VBA

Read 350 images out of 350 images from data dir VBA

Loading the images of dataset-Regrun

Read 350 images out of 350 images from data dir Regrun

Loading the images of dataset-Injector

Read 350 images out of 350 images from data dir Injector

Loading the images of dataset-BrowseFox

Read 350 images out of 350 images from data dir BrowseFox

Loading the images of dataset-Fasong

Read 350 images out of 350 images from data dir Fasong

Loading the images of dataset-Allaple

Read 350 

In [14]:
dest_path = os.path.join(os.getcwd(), 'dataset', 'annotations')

if not os.path.exists(dest_path):
    try:
        os.makedirs(dest_path, exist_ok=True)
        print(f"Diretório {dest_path} criado com sucesso!")
    except Exception as e:
        print(f"Erro ao criar diretório: {e}")
else:
    print(f"Diretório {dest_path} já existe.")

train_df.to_csv(os.path.join(dest_path,'malevis_recognition_train.csv'))
test_df.to_csv(os.path.join(dest_path,'malevis_recognition_test.csv'))
print('Os arquivos csv de treino e teste foram criados com sucesso.')

Diretório /home/ivo/data/Studys/Electric/TCC/Code/ssl/dataset/annotations criado com sucesso!
The train and test csv files are saved
