# 1. Bibliotecas

In [2]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pydicom
import os
import scipy.ndimage
import matplotlib.pyplot as plt

from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection


import sys
import logging
import six
from radiomics import featureextractor, getFeatureClasses
import radiomics
from radiomics import featureextractor

import SimpleITK as sitk
import pylidc as pl
from pylidc.utils import consensus


# 2. Features Pylidc

In [3]:
scans = pl.query(pl.Scan).all()
annotations = pl.query(pl.Annotation).all()
print(len(scans))
print(len(annotations))

1018
6859


--------------------------------------------------

##### Função que cria id para cada nódulo com base em clusters da função scan.cluster_annotations():

In [4]:
def id_nodulo(scans):
    #Criar o dataframe
    nodulos = pd.DataFrame(columns = ['Nodule_Id', 'Patient_id', 'Annotation_id', 'Scan_id'])
    i = 1
    j = 0
    for scan in scans:
        patient = scan.patient_id
        nods = scan.cluster_annotations()
        for anns in nods:
            for ann in anns:
                lista = []
                lista.append(i)
                lista.append(patient)
                lista.append(ann.id)
                lista.append(ann.scan_id)
                nodulos.loc[j] = lista
                j = j+1
            i = i+1
    return(nodulos)

##### Função que recebe uma anotação e transforma numa row para o dataframe:

In [5]:
def ann_row(ann):
    ids = np.array([ann.scan.patient_id, ann.id, ann.scan_id], dtype = '<U14')
    features = ann.feature_vals()
    return(ids, features)    

##### Função que recebe uma lista de anotações e cria um dataframe:

In [6]:
def anns_df(anotacoes):
    
    #Garantir que estamos a trabalhar com uma lista
    if  not isinstance(anotacoes, list):
        anotacoes = [anotacoes]
    
    #Criar 2 dataframes, uma para os ids e outro para as features:
    df_id = pd.DataFrame(columns = ["Patient_id", "Annotation_id", "Scan_id"])
    df_feat = pd.DataFrame(columns=['Subtlety', 'Internalstructure', 'Calcification','Sphericity','Margin','Lobulation',
                           'Spiculation','Texture','Malignancy'])
    
    for i, ann in enumerate(anotacoes):
        array_id, array_feat = ann_row(ann)
        df_id.loc[i] = array_id
        df_feat.loc[i] = array_feat

    df = pd.concat([df_id, df_feat], axis = 1)
    return(df)  

##### Função que junta dois dataframes (anotações + nódulos):

In [8]:
def juntar(df1, df2):
    #df1 = anotações 
    #df2 = nódulos
    df = df1.merge(df2[['Patient_id', 'Annotation_id', 'Scan_id', 'Nodule_Id']], 
                    on=['Patient_id', 'Annotation_id', 'Scan_id'], how='left')
    ordem_colunas = [df.columns[0], 'Nodule_Id'] + list(df.columns[1:-1]) #Reordenar as colunas
    df = df[ordem_colunas]
    return(df)

##### Função que recebe um dataframe (anotações + nódulos) e agrupa por nódulo:

In [28]:
def cluster(df):
    #Retirar as colunas 'Patient_id' e 'Annotation_id' e agrupar as anotações para cada nódulo (média)
    df1 = df.drop(['Patient_id', 'Annotation_id'], axis=1)
    group = df1.groupby('Nodule_Id').mean().reset_index()

    #Voltar a adicionar a coluna 'Patient_id':
    df2 = df[['Patient_id', 'Nodule_Id']] #Criar dataframe só com a coluna Patient_id e Nodule_id
    df2 = df2.sort_values(by='Nodule_Id', ascending=True, ignore_index=True) # Ordenar por ordem crescente Nodule_id (ordem do df group)
    df2 = df2.drop_duplicates(subset='Nodule_Id')  # Ficar com uma linha para cada nódulo
    df2 = df2.reset_index(drop=True) # Redefinir os índices para começar a partir de 0

    group['Patient_id'] = df2['Patient_id'] #Adicionar a coluna 
    ordem_colunas = ['Patient_id'] + list(group.columns[0:-1]) #Reordenar as colunas
    group = group[ordem_colunas]
    group['Scan_id'] = group['Scan_id'].astype(int) #Passar para int 

    return(group)

----------------------------------------------------------------------

## Exemplos

In [8]:
anotacoes = annotations[:144]
data = anns_df(anotacoes)
data

Unnamed: 0,Patient_id,Annotation_id,Scan_id,Subtlety,Internalstructure,Calcification,Sphericity,Margin,Lobulation,Spiculation,Texture,Malignancy
0,LIDC-IDRI-0078,1,1,5,1,6,3,4,1,1,5,3
1,LIDC-IDRI-0078,2,1,4,1,6,4,4,1,2,5,3
2,LIDC-IDRI-0078,3,1,5,1,4,3,5,2,3,5,4
3,LIDC-IDRI-0078,4,1,5,1,6,4,2,4,1,5,5
4,LIDC-IDRI-0078,5,1,4,1,6,4,2,3,1,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...
139,LIDC-IDRI-0010,140,21,5,1,6,3,4,3,2,5,4
140,LIDC-IDRI-0010,141,21,3,1,6,4,3,2,1,4,2
141,LIDC-IDRI-0010,142,21,5,1,6,4,5,3,1,5,2
142,LIDC-IDRI-0010,143,21,5,1,6,2,5,2,1,5,2


In [11]:
nodulos = id_nodulo(scans[:21])
nodulos

Unnamed: 0,Nodule_Id,Patient_id,Annotation_id,Scan_id
0,1,LIDC-IDRI-0078,2,1
1,1,LIDC-IDRI-0078,6,1
2,1,LIDC-IDRI-0078,10,1
3,1,LIDC-IDRI-0078,13,1
4,2,LIDC-IDRI-0078,1,1
...,...,...,...,...
139,54,LIDC-IDRI-0010,143,21
140,55,LIDC-IDRI-0010,138,21
141,55,LIDC-IDRI-0010,139,21
142,55,LIDC-IDRI-0010,140,21


In [10]:
anns_id = anns_nods(anotacoes, scans[:21])
anns_id

Unnamed: 0,Patient_id,Nodule_id,Annotation_id,Scan_id,Subtlety,Internalstructure,Calcification,Sphericity,Margin,Lobulation,Spiculation,Texture,Malignancy
0,LIDC-IDRI-0078,2,1,1,5,1,6,3,4,1,1,5,3
1,LIDC-IDRI-0078,1,2,1,4,1,6,4,4,1,2,5,3
2,LIDC-IDRI-0078,4,3,1,5,1,4,3,5,2,3,5,4
3,LIDC-IDRI-0078,4,4,1,5,1,6,4,2,4,1,5,5
4,LIDC-IDRI-0078,2,5,1,4,1,6,4,2,3,1,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,LIDC-IDRI-0010,55,140,21,5,1,6,3,4,3,2,5,4
140,LIDC-IDRI-0010,54,141,21,3,1,6,4,3,2,1,4,2
141,LIDC-IDRI-0010,55,142,21,5,1,6,4,5,3,1,5,2
142,LIDC-IDRI-0010,54,143,21,5,1,6,2,5,2,1,5,2


In [15]:
df =  pd.read_csv('/home/claudia/Lab/manifest-1600709154662/CSV/pylidc_nod.csv')
grouped = cluster(df)
grouped

Unnamed: 0,Patient_id,Nodule_id,Scan_id,Subtlety,Internalstructure,Calcification,Sphericity,Margin,Lobulation,Spiculation,Texture,Malignancy
0,LIDC-IDRI-0078,1,1,4.500000,1.0,6.0,4.000000,3.250000,2.250000,2.25,4.750000,3.75
1,LIDC-IDRI-0078,2,1,4.750000,1.0,6.0,4.000000,2.750000,3.000000,2.25,4.500000,3.75
2,LIDC-IDRI-0078,3,1,4.000000,1.0,5.0,5.000000,5.000000,1.000000,1.00,5.000000,1.00
3,LIDC-IDRI-0078,4,1,5.000000,1.0,5.0,3.750000,3.250000,3.250000,2.75,4.750000,4.25
4,LIDC-IDRI-0069,5,2,2.500000,1.0,6.0,4.500000,4.250000,4.000000,4.00,4.750000,3.25
...,...,...,...,...,...,...,...,...,...,...,...,...
2646,LIDC-IDRI-0639,2647,1016,4.333333,1.0,6.0,3.666667,2.333333,2.333333,3.00,3.333333,4.00
2647,LIDC-IDRI-0639,2648,1016,2.500000,1.0,6.0,4.500000,2.000000,1.000000,1.50,1.000000,3.50
2648,LIDC-IDRI-0638,2649,1017,3.000000,1.0,6.0,3.500000,3.500000,1.500000,1.00,5.000000,3.50
2649,LIDC-IDRI-0638,2650,1017,5.000000,1.0,6.0,4.000000,5.000000,1.000000,1.00,5.000000,2.00


---------------------------------------

## Criar csv

In [10]:
df1 = anns_df(annotations)
df1

Unnamed: 0,Patient_id,Annotation_id,Scan_id,Subtlety,Internalstructure,Calcification,Sphericity,Margin,Lobulation,Spiculation,Texture,Malignancy
0,LIDC-IDRI-0078,1,1,5,1,6,3,4,1,1,5,3
1,LIDC-IDRI-0078,2,1,4,1,6,4,4,1,2,5,3
2,LIDC-IDRI-0078,3,1,5,1,4,3,5,2,3,5,4
3,LIDC-IDRI-0078,4,1,5,1,6,4,2,4,1,5,5
4,LIDC-IDRI-0078,5,1,4,1,6,4,2,3,1,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...
6854,LIDC-IDRI-0638,6855,1017,5,1,6,4,5,1,1,5,2
6855,LIDC-IDRI-0638,6856,1017,2,1,6,4,3,1,1,5,3
6856,LIDC-IDRI-0638,6857,1017,4,1,6,3,4,2,1,5,4
6857,LIDC-IDRI-0127,6858,1018,5,1,2,5,5,4,5,5,2


In [32]:
print("Número de pacientes analisados: ",df1['Patient_id'].nunique())
print("Número de Anotações: ",df1['Annotation_id'].nunique())
print("Número de scans analisados: ", df1['Scan_id'].nunique())

Número de pacientes analisados:  875
Número de Anotações:  6859
Número de scans analisados:  883


--------------------------------------------------

In [19]:
# Guardar o df1 
nome_do_arquivo = 'pylidc.csv'

# Salve o DataFrame como um arquivo CSV
df1.to_csv(nome_do_arquivo, index=False)

# Confirme que o arquivo foi salvo com sucesso
print(f'O DataFrame foi salvo como "{nome_do_arquivo}"')

O DataFrame foi salvo como "pylidc.csv"


-----------------------------------------------------------

In [11]:
df2 = id_nodulo(scans)
df2

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.
Failed to reduce all groups to <= 4 Annotations.
Some n

Unnamed: 0,Nodule_Id,Patient_id,Annotation_id,Scan_id
0,1,LIDC-IDRI-0078,2,1
1,1,LIDC-IDRI-0078,6,1
2,1,LIDC-IDRI-0078,10,1
3,1,LIDC-IDRI-0078,13,1
4,2,LIDC-IDRI-0078,1,1
...,...,...,...,...
6854,2649,LIDC-IDRI-0638,6856,1017
6855,2649,LIDC-IDRI-0638,6857,1017
6856,2650,LIDC-IDRI-0638,6855,1017
6857,2651,LIDC-IDRI-0127,6858,1018


In [13]:
print("Número de pacientes analisados: ",df2['Patient_id'].nunique())
print("Número de nódulos: ",df2['Nodule_Id'].nunique())
print("Número de anotações: ",df2['Annotation_id'].nunique())
print("Número de scans: ", df2['Scan_id'].nunique())

Número de pacientes analisados:  875
Número de nódulos:  2651
Número de anotações:  6859
Número de scans:  883


----------------------------------------------------------------------

In [20]:
#Guardar o df2
nome_do_arquivo = 'noduleid.csv'

# Salve o DataFrame como um arquivo CSV
df2.to_csv(nome_do_arquivo, index=False)

# Confirme que o arquivo foi salvo com sucesso
print(f'O DataFrame foi salvo como "{nome_do_arquivo}"')

O DataFrame foi salvo como "noduleid.csv"


----------------------------------------------------------

In [18]:
lista1 = df1['Patient_id'].unique()
lista2 = df2['Patient_id'].unique()
elementos = [x for x in lista1 if x not in lista2]
len(elementos) 

0

--------------------------------------------------------------

In [21]:
d1 = pd.read_csv('/home/claudia/Lab/manifest-1600709154662/csv/pylidc.csv')
d2 = pd.read_csv('/home/claudia/Lab/manifest-1600709154662/csv/noduleid.csv')

In [18]:
df3 = juntar(d1,d2)
df3

Unnamed: 0,Patient_id,Nodule_Id,Annotation_id,Scan_id,Subtlety,Internalstructure,Calcification,Sphericity,Margin,Lobulation,Spiculation,Texture,Malignancy
0,LIDC-IDRI-0078,2,1,1,5,1,6,3,4,1,1,5,3
1,LIDC-IDRI-0078,1,2,1,4,1,6,4,4,1,2,5,3
2,LIDC-IDRI-0078,4,3,1,5,1,4,3,5,2,3,5,4
3,LIDC-IDRI-0078,4,4,1,5,1,6,4,2,4,1,5,5
4,LIDC-IDRI-0078,2,5,1,4,1,6,4,2,3,1,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6854,LIDC-IDRI-0638,2650,6855,1017,5,1,6,4,5,1,1,5,2
6855,LIDC-IDRI-0638,2649,6856,1017,2,1,6,4,3,1,1,5,3
6856,LIDC-IDRI-0638,2649,6857,1017,4,1,6,3,4,2,1,5,4
6857,LIDC-IDRI-0127,2651,6858,1018,5,1,2,5,5,4,5,5,2


In [19]:
print("Número de pacientes analisados: ",df3['Patient_id'].nunique())
print("Número de nódulos: ",df3['Nodule_Id'].nunique())
print("Número de anotações: ",df3['Annotation_id'].nunique())
print("Número de scans: ", df3['Scan_id'].nunique())

Número de pacientes analisados:  875
Número de nódulos:  2651
Número de anotações:  6859
Número de scans:  883


In [20]:
# Guardar o df1 = df1+df2
nome_do_arquivo = 'pylidc_nod.csv'

# Salve o DataFrame como um arquivo CSV
df3.to_csv(nome_do_arquivo, index=False)

# Confirme que o arquivo foi salvo com sucesso
print(f'O DataFrame foi salvo como "{nome_do_arquivo}"')

O DataFrame foi salvo como "pylidc_nod.csv"


-------------------------------------------------------------

In [29]:
d3 = pd.read_csv('/home/claudia/Lab/manifest-1600709154662/csv/pylidc_nod.csv')

In [31]:
df4 = cluster(d3)
df4

Unnamed: 0,Patient_id,Nodule_Id,Scan_id,Subtlety,Internalstructure,Calcification,Sphericity,Margin,Lobulation,Spiculation,Texture,Malignancy
0,LIDC-IDRI-0078,1,1,4.500000,1.0,6.0,4.000000,3.250000,2.250000,2.25,4.750000,3.75
1,LIDC-IDRI-0078,2,1,4.750000,1.0,6.0,4.000000,2.750000,3.000000,2.25,4.500000,3.75
2,LIDC-IDRI-0078,3,1,4.000000,1.0,5.0,5.000000,5.000000,1.000000,1.00,5.000000,1.00
3,LIDC-IDRI-0078,4,1,5.000000,1.0,5.0,3.750000,3.250000,3.250000,2.75,4.750000,4.25
4,LIDC-IDRI-0069,5,2,2.500000,1.0,6.0,4.500000,4.250000,4.000000,4.00,4.750000,3.25
...,...,...,...,...,...,...,...,...,...,...,...,...
2646,LIDC-IDRI-0639,2647,1016,4.333333,1.0,6.0,3.666667,2.333333,2.333333,3.00,3.333333,4.00
2647,LIDC-IDRI-0639,2648,1016,2.500000,1.0,6.0,4.500000,2.000000,1.000000,1.50,1.000000,3.50
2648,LIDC-IDRI-0638,2649,1017,3.000000,1.0,6.0,3.500000,3.500000,1.500000,1.00,5.000000,3.50
2649,LIDC-IDRI-0638,2650,1017,5.000000,1.0,6.0,4.000000,5.000000,1.000000,1.00,5.000000,2.00


-------------------------------------------------------

In [32]:
print("Número de pacientes analisados: ",df4['Patient_id'].nunique())
print("Número de nódulos: ",df4['Nodule_Id'].nunique())
print("Número de scans: ", df4['Scan_id'].nunique())

Número de pacientes analisados:  875
Número de nódulos:  2651
Número de scans:  883


---------------------------------------------------

In [34]:
# Guardar o df4
nome_do_arquivo = 'pylidc_grouped.csv'

# Salve o DataFrame como um arquivo CSV
df4.to_csv(nome_do_arquivo, index=False)

# Confirme que o arquivo foi salvo com sucesso
print(f'O DataFrame foi salvo como "{nome_do_arquivo}"')

O DataFrame foi salvo como "pylidc_grouped.csv"


# 2. Features Pyradiomics

### Setting up Logging

In [19]:
# Get the PyRadiomics logger (default log-level = INFO)
logger = radiomics.logger
logger.setLevel(logging.DEBUG)  # set level to DEBUG to include debug log messages in log file

# Write out all log entries to a file
handler = logging.FileHandler(filename='testLog.txt', mode='w')
formatter = logging.Formatter('%(levelname)s:%(name)s: %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

### Ativar Feature Extractor

In [20]:
#Alternative: use hardcoded settings (separate for settings, input image types and enabled features)
settings = {}
settings['binWidth'] = 25
settings['resampledPixelSpacing'] = None
# settings['resampledPixelSpacing'] = [3, 3, 3]  # This is an example for defining resampling (voxels with size 3x3x3mm)
settings['interpolator'] = 'sitkBSpline'
settings['verbose'] = True

extractor = featureextractor.RadiomicsFeatureExtractor(**settings)

## Extrair Features

In [21]:
# Getting the docstrings of the active feature
featureClasses = getFeatureClasses()

print('Active features:')
for cls, features in six.iteritems(extractor.enabledFeatures):
    if len(features) == 0:
        features = [f for f, deprecated in six.iteritems(featureClasses[cls].getFeatureNames()) if not deprecated]
    for f in features:
        print(f)
        print(getattr(featureClasses[cls], 'get%sFeatureValue' % f).__doc__)

Active features:
10Percentile

    **5. 10th percentile**

    The 10\ :sup:`th` percentile of :math:`\textbf{X}`
    
90Percentile

    **6. 90th percentile**

    The 90\ :sup:`th` percentile of :math:`\textbf{X}`
    
Energy

    **1. Energy**

    .. math::
      \textit{energy} = \displaystyle\sum^{N_p}_{i=1}{(\textbf{X}(i) + c)^2}

    Here, :math:`c` is optional value, defined by ``voxelArrayShift``, which shifts the intensities to prevent negative
    values in :math:`\textbf{X}`. This ensures that voxels with the lowest gray values contribute the least to Energy,
    instead of voxels with gray level intensity closest to 0.

    Energy is a measure of the magnitude of voxel values in an image. A larger values implies a greater sum of the
    squares of these values.

    .. note::
      This feature is volume-confounded, a larger value of :math:`c` increases the effect of volume-confounding.
    
Entropy

    **3. Entropy**

    .. math::
      \textit{entropy} = -\displaystyl

---------------------------------------------------------------------

##### Função para extrair as features de todos os nódulos de todos os scans e passar para dataframe:

In [23]:
def features_to_dataframe(scans):
    
    listafeatures = []
    df1 = pd.DataFrame(columns = ['Patient_id'])
    i=0
    
    for scan in scans:
        nods = scan.cluster_annotations()
        for anns in nods:

            df1.loc[i]=scan.patient_id
        
            # Realize a consolidação de consenso e nível de acordo de 50%
            cmask, cbbox, masks = consensus(anns, clevel=0.5, pad=[(20, 20), (20, 20), (0, 0)])
    
            # Converter a matriz de pixels para uma imagem SimpleITK
            mask = sitk.GetImageFromArray(cmask.astype(float))

            #Extrair features pra o nódulo
            features = extractor.execute(mask, mask)
            listafeatures.append(features)

            i = i+1

    df1['Nodule_Id'] = range(1, len(df1) + 1)
    df2 = pd.DataFrame(listafeatures)
    df = pd.concat([df1, df2], axis = 1)
    return(df)

## Exemplo

In [28]:
scans1 = scans[:5]
df_ex = features_to_dataframe(scans1)
df_ex

GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Average, only 1 needs to be calculated
GLCM is symmetrical, therefore Sum Average = 2 * Joint Avera

Unnamed: 0,Patient_id,Nodule_Id,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,diagnostics_Image-original_Hash,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,LIDC-IDRI-0078,1,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},966db6c8d814983bbfe7125907b2ae5008c7994e,...,2.7613561461850897e-07,2.7613561461850897e-07,-3.203426503814917e-16,0.000525486074619,0.0,0.0,1000000.0,0.0,0.0,0.0
1,LIDC-IDRI-0078,2,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},d750f93365c734f6f1972e806b046bb1a203f47f,...,3.299153008448141e-07,3.299153008448141e-07,-3.203426503814917e-16,0.0005743825387708,0.0,0.0,1000000.0,0.0,0.0,0.0
2,LIDC-IDRI-0078,3,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},58d9ee5073ad7215556c1e295a67e9204a623b2d,...,0.0008650519031141,0.0008650519031141,-3.203426503814917e-16,0.0294117647058823,0.0,0.0,1000000.0,0.0,0.0,0.0
3,LIDC-IDRI-0078,4,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},312913e3e22bc6f2e6cffc8abd1120da5fa413fd,...,0.6666666917694576,0.6666666917694576,0.9182958340544888,0.0008228195282501,2949210.888888889,0.0,1000000.0,0.0,0.0,0.0
4,LIDC-IDRI-0069,5,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},08706c40effcb9f7489dc9ce0139a265b8dc6048,...,3.684041835979089e-06,3.684041835979089e-06,-3.203426503814917e-16,0.0019193857965451,0.0,0.0,1000000.0,0.0,0.0,0.0
5,LIDC-IDRI-0069,6,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},0004fd824c3ef47dd24a75dcdf32e63cb34608d0,...,0.0044444444444444,0.0044444444444444,-3.203426503814917e-16,0.0666666666666666,0.0,0.0,1000000.0,0.0,0.0,0.0
6,LIDC-IDRI-0069,7,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},74d8d417fd2ee4347f75aeb3552368a80e022243,...,3.768407256294277e-07,3.768407256294277e-07,-3.203426503814917e-16,0.0006138735420503,0.0,0.0,1000000.0,0.0,0.0,0.0
7,LIDC-IDRI-0079,8,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},a8d045cecf7c3b21ec73d42e3f7b6cde49c3560f,...,1.374364528201273e-06,1.374364528201273e-06,-3.203426503814917e-16,0.0011723329425556,0.0,0.0,1000000.0,0.0,0.0,0.0
8,LIDC-IDRI-0101,9,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},41c68d6e929711daf42ec210cf23475488543c23,...,7.181844297615628e-05,7.181844297615628e-05,-3.203426503814917e-16,0.0084745762711864,0.0,0.0,1000000.0,0.0,0.0,0.0
9,LIDC-IDRI-0110,10,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},c1507e3aa4f94670103bbd8f0bbfe507f8718cbc,...,1.841993774061044e-05,1.841993774061044e-05,-3.203426503814917e-16,0.0042918454935622,0.0,0.0,1000000.0,0.0,0.0,0.0


## Criar CSV

In [150]:
df5 = features_to_dataframe(scans)
df5

NameError: name 'features_to_dataframe' is not defined

--------------------------------

In [None]:
nome_do_arquivo = 'pyradiomics.csv'

# Salve o DataFrame como um arquivo CSV
df5.to_csv(nome_do_arquivo, index=False)

# Confirme que o arquivo foi salvo com sucesso
print(f'O DataFrame foi salvo como "{nome_do_arquivo}"')

--------------------------------

#### Juntar as features pylidc + pyradiomics:

In [35]:
d5 = pd.read_csv('/home/claudia/Lab/manifest-1600709154662/csv/pyradiomics.csv')
d4 = pd.read_csv('/home/claudia/Lab/manifest-1600709154662/csv/pylidc_grouped.csv')

In [38]:
df6 = d5.merge(d4, on=['Patient_id','Nodule_Id'], how='left')

In [39]:
df6

Unnamed: 0,Patient_id,Nodule_Id,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,diagnostics_Image-original_Hash,...,Scan_id,Subtlety,Internalstructure,Calcification,Sphericity,Margin,Lobulation,Spiculation,Texture,Malignancy
0,LIDC-IDRI-0078,1,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},966db6c8d814983bbfe7125907b2ae5008c7994e,...,1,4.500000,1.0,6.0,4.000000,3.250000,2.250000,2.25,4.750000,3.75
1,LIDC-IDRI-0078,2,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},d750f93365c734f6f1972e806b046bb1a203f47f,...,1,4.750000,1.0,6.0,4.000000,2.750000,3.000000,2.25,4.500000,3.75
2,LIDC-IDRI-0078,3,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},58d9ee5073ad7215556c1e295a67e9204a623b2d,...,1,4.000000,1.0,5.0,5.000000,5.000000,1.000000,1.00,5.000000,1.00
3,LIDC-IDRI-0078,4,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},312913e3e22bc6f2e6cffc8abd1120da5fa413fd,...,1,5.000000,1.0,5.0,3.750000,3.250000,3.250000,2.75,4.750000,4.25
4,LIDC-IDRI-0069,5,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},08706c40effcb9f7489dc9ce0139a265b8dc6048,...,2,2.500000,1.0,6.0,4.500000,4.250000,4.000000,4.00,4.750000,3.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2646,LIDC-IDRI-0639,2647,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},db18dcbfdf1ebd1c1390db9dd1fbfd85cce4724b,...,1016,4.333333,1.0,6.0,3.666667,2.333333,2.333333,3.00,3.333333,4.00
2647,LIDC-IDRI-0639,2648,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},fe63df501e056c7f875ab771151194481c70a0b1,...,1016,2.500000,1.0,6.0,4.500000,2.000000,1.000000,1.50,1.000000,3.50
2648,LIDC-IDRI-0638,2649,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},e25d0da08cbc45ea3587c9426cc102940176a4de,...,1017,3.000000,1.0,6.0,3.500000,3.500000,1.500000,1.00,5.000000,3.50
2649,LIDC-IDRI-0638,2650,v3.0.1,1.19.5,2.3.0,1.4.1,3.9.18,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},5888efc13966913742433819826d00f4d79f0c24,...,1017,5.000000,1.0,6.0,4.000000,5.000000,1.000000,1.00,5.000000,2.00


In [40]:
df6.columns

Index(['Patient_id', 'Nodule_Id', 'diagnostics_Versions_PyRadiomics',
       'diagnostics_Versions_Numpy', 'diagnostics_Versions_SimpleITK',
       'diagnostics_Versions_PyWavelet', 'diagnostics_Versions_Python',
       'diagnostics_Configuration_Settings',
       'diagnostics_Configuration_EnabledImageTypes',
       'diagnostics_Image-original_Hash',
       ...
       'Scan_id', 'Subtlety', 'Internalstructure', 'Calcification',
       'Sphericity', 'Margin', 'Lobulation', 'Spiculation', 'Texture',
       'Malignancy'],
      dtype='object', length=141)

--------------------------------

In [41]:
nome_do_arquivo = 'dataset.csv'

# Salve o DataFrame como um arquivo CSV
df6.to_csv(nome_do_arquivo, index=False)

# Confirme que o arquivo foi salvo com sucesso
print(f'O DataFrame foi salvo como "{nome_do_arquivo}"')

O DataFrame foi salvo como "dataset.csv"
