## 1 Pre-tratamiento datos

Realiza un filtrado previo de los datos de entrada:

1. Elimina los blobs de pequeñas dimensiones (menos de 100 píxeles, según el umbral `THRESHOLD_SIZE` del código)

2. Elimina los blobs que aparecen en más de una imagen

3. Agrupa los distintos datasets de un mismo material en un único dataset (*)

* **Datos origen:** C:\Users\jrosell\Hyperspectral\___PFM___\01_DATASET\01_DATASET_ORIGINAL
* **Datos destino:** C:\Users\jrosell\Hyperspectral\___PFM___\01_DATASET\02_DATASET_Pre_Treatment

(*) Como puede darse el caso de que dos datasets contengan un mismo identificador de imagen, al concatenarlos se incrementa el identificador de imagen en 1.000 por cada dataset sucesivo (1.000 para el segundo, 2.000 para el tercero, ...) para evitar identificadores duplicados.

In [2]:
from pathlib import Path
import pandas as pd

def filterData(fileData, showLogs = False, thresholdSize = 100):
    """Filter the blobs stored in one tab-separated data file.

    The file is read without a header row. Rows are grouped by columns 0 and
    1 (image identifier and blob identifier), so each group is one blob. A
    blob is fingerprinted by the pair (number of rows, mean of column 3).

    A blob is discarded when:
      * its fingerprint equals that of a blob kept in either of the two
        previously visited images (counted as "repeated"), or
      * it has fewer than ``thresholdSize`` rows (counted as "too small").

    The repetition check runs first, so a small blob that is also repeated
    is reported as repeated. Discarded blobs are not remembered, hence they
    never cause later blobs to be flagged as repeated.

    Args:
        fileData: path of the tab-separated input file.
        showLogs: when True, print the file name and the blob counters.
        thresholdSize: minimum number of rows a blob needs to be kept.

    Returns:
        A DataFrame with the rows of every kept blob, re-indexed from 0.
        When no blob is kept, an empty DataFrame with the same columns as
        the input is returned, so column-based operations by callers
        (e.g. ``df[0] += 1000``) still work.
    """
    blobTable = pd.read_csv(fileData, sep='\t', header=None)
    blobGroups = blobTable.groupby([0, 1])  # image + blob

    # Fingerprints of the blobs kept in the two previously visited images.
    previousImgs = [[], []]
    # Fingerprints of the blobs kept so far in the image being visited.
    currentImg = []
    currentImgId = -1

    nAllBlobs = blobGroups.ngroups
    nRepeatedBlobs = 0
    nTooSmallBlobs = 0

    # Kept blobs are collected and concatenated once at the end; growing a
    # DataFrame with pd.concat on every iteration re-copies all previous rows.
    keptParts = []

    for (imgId, _blobId), blobRows in blobGroups:
        if imgId != currentImgId:
            # A new image starts: slide the two-image history window.
            previousImgs = [previousImgs[1], currentImg]
            currentImg = []
            currentImgId = imgId

        fingerprint = (len(blobRows), blobRows[3].mean())
        if fingerprint in previousImgs[0] or fingerprint in previousImgs[1]:
            nRepeatedBlobs += 1
        elif len(blobRows) < thresholdSize:
            nTooSmallBlobs += 1
        else:
            currentImg.append(fingerprint)
            keptParts.append(blobRows)

    nRemainingBlobs = len(keptParts)

    if keptParts:
        dataOut = pd.concat(keptParts, ignore_index=True, sort=False)
    else:
        # Keep the input columns so callers can still address them.
        dataOut = blobTable.iloc[0:0].copy()

    if showLogs:
        print("Data File: ",Path(fileData).name)
        print("All Blobs: ",nAllBlobs,"  Repeated blobs:",nRepeatedBlobs,"  Too small blobs:",nTooSmallBlobs,"  Remaining blobs -->",nRemainingBlobs,end='\n\n')

    return dataOut

Llamadas para aplicar el filtrado al conjunto de datos iniciales y obtener un único dataset por clase:

In [3]:
import os 

# Source and destination folders. These are raw strings, so a single
# backslash is the path separator (the former "C:\\Users" put two literal
# backslashes into the path).
sourceData = r"C:\Users\jrosell\Hyperspectral\___PFM___\01_DATASET\01_DATASET_ORIGINAL"
destData = r"C:\Users\jrosell\Hyperspectral\___PFM___\01_DATASET\02_DATASET_Pre_Treatment"

# Separators printed between outputs (thin) and between material families (thick).
THIN_RULE = '---------------------------------------------------------------------------'
THICK_RULE = '======================================================================================='

# Image identifiers of the n-th input file (0-based) are shifted by
# n * IMAGE_ID_STEP so identifiers from different files cannot collide.
IMAGE_ID_STEP = 1000

# One entry per output file:
#   (output tag, input stems in concatenation order, separator printed afterwards or None)
# Input file name  = "blobs_Data_clase_" + stem + ".csv"
# Output file name = "blobs_001_" + tag + ".csv"
mergePlan = [
    # HDPE (high-density polyethylene)
    ("01a_HDPE", ["hdpe_01_bis", "hdpe_04", "hdpe_05"], THIN_RULE),
    ("01b_HDPE_Tricapa", ["hdpe_tricapa_01", "hdpe_tricapa_02", "hdpe_tricapa_02_bis",
                          "hdpe_tricapa_03"], THICK_RULE),

    # PET (polyethylene terephthalate)
    ("02a_PET_BandejaMonocapa", ["pet_bandeja_monocapa_01", "pet_bandeja_monocapa_01_bis"], THIN_RULE),
    ("02b_PET_BandejaMulticapa", ["pet_bandeja_multi_01", "pet_bandeja_multi_01_bis",
                                  "pet_bandeja_multi_02", "pet_bandeja_multi_03",
                                  "pet_bandeja_multi_03_bis"], THIN_RULE),
    ("02c_PET_BotellaAzulado", ["pet_botella_azulado_01", "pet_botella_azulado_01_bis",
                                "pet_botella_azulado_02", "pet_botella_azulado_02_bis",
                                "pet_botella_azulado_03"], THIN_RULE),
    ("02d_PET_BotellaColor", ["pet_botella_color_01", "pet_botella_color_01_bis",
                              "pet_botella_color_02", "pet_botella_color_02_bis"], THIN_RULE),
    ("02e_PET_BotellaLight", ["pet_botella_light_01", "pet_botella_light_02",
                              "pet_botella_light_02_bis", "pet_botella_light_03_bis"], THICK_RULE),

    # PP (polypropylene)
    ("03a_PP", ["pp_01", "pp_02", "pp_02_bis", "pp_03", "pp_03_bis", "pp_04"], THIN_RULE),
    ("03b_PP_Film", ["film_pp_01", "film_pp_02"], THICK_RULE),

    # PS (polystyrene)
    ("04a_PS", ["ps_01", "ps_01_bis"], THICK_RULE),

    # PVC (polyvinyl chloride)
    ("05a_PVC", ["pvc_01"], THICK_RULE),

    # Paper and cardboard
    ("06a_Cartoncillo", ["cartoncillo_01", "cartoncillo_01_bis"], THIN_RULE),
    ("06b_CartonColor", ["carton_color_01", "carton_color_01_bis"], THIN_RULE),
    ("06c_CartonMarron", ["carton_marron_01"], THIN_RULE),
    ("06d_Papel", ["papel_01", "papel_01_bis"], THICK_RULE),

    # Cans
    ("07a_Latas_MetalFerrico", ["latas_metal_ferrico_01", "latas_metal_ferrico_01_bis",
                                "latas_metal_ferrico_02"], THIN_RULE),
    ("07b_Latas_MetalNoFerrico", ["latas_metal_no_ferrico_01",
                                  "latas_metal_no_ferrico_01_bis"], THICK_RULE),

    # Improper material (no separator is printed after these)
    ("08a_Impropio_Ferrico", ["impropio_ferrico_01"], None),
    ("08b_Impropio_NoFerrico", ["impropio_no_ferrico_01"], None),
    ("08c_Impropio_Madera", ["impropio_madera_01"], None),
    ("08d_Impropio_OrganicoHojas", ["impropio_organico_hojas_01"], None),
    ("08e_Impropio_Textil", ["impropio_textil_algodon_01"], None),
]

for outTag, inStems, rule in mergePlan:
    filteredFrames = []
    for position, stem in enumerate(inStems):
        inPath = os.path.join(sourceData, "blobs_Data_clase_" + stem + ".csv")
        frame = filterData(inPath, showLogs=True)
        # The first file keeps its identifiers; an empty frame has nothing to
        # shift (and may not even have a column 0).
        if position and not frame.empty:
            frame[0] += IMAGE_ID_STEP * position
        filteredFrames.append(frame)

    dfData = pd.concat(filteredFrames)
    dfData.to_csv(os.path.join(destData, "blobs_001_" + outTag + ".csv"), sep='\t', header=None, index=False)
    if rule is not None:
        print(rule)

print("###################### Pre-treatment finished ######################")


Data File:  blobs_Data_clase_hdpe_01_bis.csv
All Blobs:  339   Repeated blobs: 97   Too small blobs: 60   Remaining blobs --> 182

Data File:  blobs_Data_clase_hdpe_04.csv
All Blobs:  812   Repeated blobs: 164   Too small blobs: 221   Remaining blobs --> 427

Data File:  blobs_Data_clase_hdpe_05.csv
All Blobs:  300   Repeated blobs: 86   Too small blobs: 49   Remaining blobs --> 165

---------------------------------------------------------------------------
Data File:  blobs_Data_clase_hdpe_tricapa_01.csv
All Blobs:  523   Repeated blobs: 153   Too small blobs: 63   Remaining blobs --> 307

Data File:  blobs_Data_clase_hdpe_tricapa_02.csv
All Blobs:  1293   Repeated blobs: 363   Too small blobs: 227   Remaining blobs --> 703

Data File:  blobs_Data_clase_hdpe_tricapa_02_bis.csv
All Blobs:  104   Repeated blobs: 36   Too small blobs: 14   Remaining blobs --> 54

Data File:  blobs_Data_clase_hdpe_tricapa_03.csv
All Blobs:  721   Repeated blobs: 199   Too small blobs: 172   Remaining blo

Data File:  blobs_Data_clase_impropio_ferrico_01.csv
All Blobs:  47   Repeated blobs: 10   Too small blobs: 25   Remaining blobs --> 12

Data File:  blobs_Data_clase_impropio_no_ferrico_01.csv
All Blobs:  268   Repeated blobs: 45   Too small blobs: 185   Remaining blobs --> 38

Data File:  blobs_Data_clase_impropio_madera_01.csv
All Blobs:  140   Repeated blobs: 37   Too small blobs: 43   Remaining blobs --> 60

Data File:  blobs_Data_clase_impropio_organico_hojas_01.csv
All Blobs:  723   Repeated blobs: 158   Too small blobs: 404   Remaining blobs --> 161

Data File:  blobs_Data_clase_impropio_textil_algodon_01.csv
All Blobs:  161   Repeated blobs: 6   Too small blobs: 14   Remaining blobs --> 141

###################### Pre-treatment finished ######################
