## 3 Limpiar datos

Se procede a analizar el conjunto de blobs para eliminar aquellos que puedan contener outliers que falseen la búsqueda de patrones. Por ello debemos eliminar:

* Los blobs que puedan contener más de un tipo de material (etiquetas, tapones, etc.)

Esta eliminación debe hacerse de forma totalmente automática. La limpieza de datos se ejecuta en una única pasada:

1. Eliminar los blobs que contengan más de un material

Para ello se procede a la detección de ***outliers*** con técnicas de análisis de **desviación estándar** y **clustering**


In [5]:
from tqdm.notebook import tqdm  #mostra la barra s'estat
from statsmodels.stats.diagnostic import lilliefors
from scipy import stats
from pathlib import Path
import pandas as pd
import numpy as np

def cleanData(fileData,showLogs=False):
    """Keep only the blobs whose '20_StDev_NOR' values look normally distributed.

    The tab-separated file is grouped by (image, blob). For every blob:

    1. Rows whose '20_StDev_NOR' lies outside the blob's own 5th-95th
       percentile range are dropped.
    2. The Shapiro-Wilk and Lilliefors statistics are computed on the
       remaining values.
    3. The blob is kept only when the Shapiro-Wilk statistic is above
       ``THRES_shapiro`` and the Lilliefors statistic is below
       ``THRES_lilliefors``.

    Blobs left with fewer than ``MIN_SAMPLES`` rows after trimming are
    discarded instead of being tested, because the normality tests cannot be
    evaluated on so few points.

    Args:
        fileData: Path of the tab-separated CSV file. It must contain the
            columns 'image', 'blob' and '20_StDev_NOR'.
        showLogs: When True, print the file name and the blob counts.

    Returns:
        A DataFrame with the trimmed rows of every kept blob, re-indexed from
        zero. An empty DataFrame when no blob passes the filter.
    """
    THRES_shapiro = 0.91
    THRES_lilliefors = 0.08
    # scipy.stats.shapiro needs at least 3 samples; the Lilliefors test in
    # statsmodels is understood to need at least 4 -- TODO confirm against the
    # installed statsmodels version.
    MIN_SAMPLES = 4
    COLUMN = '20_StDev_NOR'

    dataBlobs = pd.read_csv(fileData, sep='\t')

    groups = dataBlobs.groupby(["image", "blob"])  # image + blob
    nAllBlobs = groups.ngroups
    keptBlobs = []

    for _, dataBlob in tqdm(groups, total=nAllBlobs):
        values = dataBlob[COLUMN]

        # Trim the tails of the blob before testing for normality.
        limInf = np.percentile(values, 5)
        limSup = np.percentile(values, 95)
        outside = (values > limSup) | (values < limInf)
        dataBlob = dataBlob[~outside]
        values = dataBlob[COLUMN]

        if len(values) < MIN_SAMPLES:
            # Too few points for the tests: treat the blob as not representative.
            continue

        # Only the test statistics are used; the p-values are ignored.
        stat1, _ = stats.shapiro(values.to_numpy())
        stat2, _ = lilliefors(values)

        if stat1 > THRES_shapiro and stat2 < THRES_lilliefors:
            keptBlobs.append(dataBlob)

    nRepresentativeBlobs = len(keptBlobs)

    if showLogs:
        print("Data File: ",Path(fileData).name)
        print("All Blobs: ",nAllBlobs,"  Representative blobs:",nRepresentativeBlobs,"  (Discarded : "+str(nAllBlobs-nRepresentativeBlobs)+")",end='\n\n')

    if not keptBlobs:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    # A single concat at the end avoids re-copying the accumulated frame on
    # every kept blob.
    return pd.concat(keptBlobs, ignore_index=True, sort=False)

Llamadas para aplicar el filtrado al conjunto de datos iniciales:

In [6]:
import os 

sourceData = r"C:\\Users\jrosell\Hyperspectral\___PFM___\01_DATASET\03_DATASET_Normalization"
destData = r"C:\\Users\jrosell\Hyperspectral\___PFM___\01_DATASET\04_DATASET_OnlyOneMaterial"

# Suffix shared by each input file ("blobs_002_<suffix>.csv") and its filtered
# output ("blobs_003_<suffix>.csv"). Files are processed in this order.
materialNames = [
    "01a_HDPE",
    "01b_HDPE_Tricapa",
    "02a_PET_BandejaMonocapa",
    "02b_PET_BandejaMulticapa",
    "02c_PET_BotellaAzulado",
    "02d_PET_BotellaColor",
    "02e_PET_BotellaLight",
    "03a_PP",
    "03b_PP_Film",
    "04a_PS",
    "05a_PVC",
    "06a_Cartoncillo",
    "06b_CartonColor",
    "06c_CartonMarron",
    "06d_Papel",
    "07a_Latas_MetalFerrico",
    "07b_Latas_MetalNoFerrico",
    "08a_Impropio_Ferrico",
    "08b_Impropio_NoFerrico",
    "08c_Impropio_Madera",
    "08d_Impropio_OrganicoHojas",
    "08e_Impropio_Textil",
]

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(destData, exist_ok=True)

for materialName in materialNames:
    # Filter one material file and write the surviving blobs next to the others.
    dfData = cleanData(os.path.join(sourceData, "blobs_002_" + materialName + ".csv"), showLogs=True)
    dfData.to_csv(os.path.join(destData, "blobs_003_" + materialName + ".csv"), sep='\t', index=False)


  0%|          | 0/774 [00:00<?, ?it/s]

Data File:  blobs_002_01a_HDPE.csv
All Blobs:  774   Representative blobs: 258   (Discarded : 516)



  0%|          | 0/1414 [00:00<?, ?it/s]

Data File:  blobs_002_01b_HDPE_Tricapa.csv
All Blobs:  1414   Representative blobs: 618   (Discarded : 796)



  0%|          | 0/206 [00:00<?, ?it/s]

Data File:  blobs_002_02a_PET_BandejaMonocapa.csv
All Blobs:  206   Representative blobs: 105   (Discarded : 101)



  0%|          | 0/1080 [00:00<?, ?it/s]

Data File:  blobs_002_02b_PET_BandejaMulticapa.csv
All Blobs:  1080   Representative blobs: 386   (Discarded : 694)



  0%|          | 0/534 [00:00<?, ?it/s]

Data File:  blobs_002_02c_PET_BotellaAzulado.csv
All Blobs:  534   Representative blobs: 223   (Discarded : 311)



  0%|          | 0/629 [00:00<?, ?it/s]

Data File:  blobs_002_02d_PET_BotellaColor.csv
All Blobs:  629   Representative blobs: 252   (Discarded : 377)



  0%|          | 0/652 [00:00<?, ?it/s]

Data File:  blobs_002_02e_PET_BotellaLight.csv
All Blobs:  652   Representative blobs: 254   (Discarded : 398)



  0%|          | 0/1069 [00:00<?, ?it/s]

Data File:  blobs_002_03a_PP.csv
All Blobs:  1069   Representative blobs: 472   (Discarded : 597)



  0%|          | 0/444 [00:00<?, ?it/s]

Data File:  blobs_002_03b_PP_Film.csv
All Blobs:  444   Representative blobs: 247   (Discarded : 197)



  0%|          | 0/332 [00:00<?, ?it/s]

Data File:  blobs_002_04a_PS.csv
All Blobs:  332   Representative blobs: 132   (Discarded : 200)



  0%|          | 0/61 [00:00<?, ?it/s]

Data File:  blobs_002_05a_PVC.csv
All Blobs:  61   Representative blobs: 34   (Discarded : 27)



  0%|          | 0/329 [00:00<?, ?it/s]

Data File:  blobs_002_06a_Cartoncillo.csv
All Blobs:  329   Representative blobs: 105   (Discarded : 224)



  0%|          | 0/519 [00:00<?, ?it/s]

Data File:  blobs_002_06b_CartonColor.csv
All Blobs:  519   Representative blobs: 173   (Discarded : 346)



  0%|          | 0/528 [00:00<?, ?it/s]

Data File:  blobs_002_06c_CartonMarron.csv
All Blobs:  528   Representative blobs: 203   (Discarded : 325)



  0%|          | 0/823 [00:00<?, ?it/s]

Data File:  blobs_002_06d_Papel.csv
All Blobs:  823   Representative blobs: 269   (Discarded : 554)



  0%|          | 0/237 [00:00<?, ?it/s]

Data File:  blobs_002_07a_Latas_MetalFerrico.csv
All Blobs:  237   Representative blobs: 43   (Discarded : 194)



  0%|          | 0/255 [00:00<?, ?it/s]

Data File:  blobs_002_07b_Latas_MetalNoFerrico.csv
All Blobs:  255   Representative blobs: 137   (Discarded : 118)



  0%|          | 0/12 [00:00<?, ?it/s]

Data File:  blobs_002_08a_Impropio_Ferrico.csv
All Blobs:  12   Representative blobs: 4   (Discarded : 8)



  0%|          | 0/38 [00:00<?, ?it/s]

Data File:  blobs_002_08b_Impropio_NoFerrico.csv
All Blobs:  38   Representative blobs: 21   (Discarded : 17)



  0%|          | 0/60 [00:00<?, ?it/s]

Data File:  blobs_002_08c_Impropio_Madera.csv
All Blobs:  60   Representative blobs: 27   (Discarded : 33)



  0%|          | 0/161 [00:00<?, ?it/s]

Data File:  blobs_002_08d_Impropio_OrganicoHojas.csv
All Blobs:  161   Representative blobs: 67   (Discarded : 94)



  0%|          | 0/141 [00:00<?, ?it/s]



Data File:  blobs_002_08e_Impropio_Textil.csv
All Blobs:  141   Representative blobs: 95   (Discarded : 46)

