<a href="https://colab.research.google.com/github/vkt1414/CloudSegmentator/blob/feat-convert-raw-radiomics-to-dataframe/workflows/TotalSegmentator/Notebooks/postProcessingRadiomicsJsonToDataFrame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **This notebook converts raw radiomics features from JSON into a pandas DataFrame. It takes the raw radiomics archives in lz4 format as input, decompresses them, flattens the JSON into a single DataFrame, and writes the result to a Parquet file (`raw_radiomics.parquet`).**

### **Installing Packages**

In [1]:
%%capture
import sys
if 'google.colab' in sys.modules:
    !sudo apt-get update \
    && apt-get install -y --no-install-recommends \
    lz4

### **Importing Packages**

In [6]:
import os
import sys
import subprocess
import json
import pandas as pd
from pandas import json_normalize

### **Parameters for papermill**

In [7]:
# Colab default input: download an archive from the repository's `test`
# release and use it as the only entry of `rawJsonRadiomicsFiles`.
# NOTE(review): outside Colab this cell defines nothing, so
# `rawJsonRadiomicsFiles` is presumably injected by papermill — confirm.
if 'google.colab' in sys.modules:
    !wget -q https://github.com/vkt1414/CloudSegmentator/releases/download/test/pyradiomicsRadiomicsFeatures.tar.lz4
    rawJsonRadiomicsFiles=["pyradiomicsRadiomicsFeatures.tar.lz4"]


### **This cell runs outside Colab (on the cloud), where the input file paths are passed to the notebook as a single comma-separated string and must be split into a list**

In [None]:
# Outside Colab the input paths arrive as one comma-separated string; turn
# that string into a list of paths for the processing loop.
if 'google.colab' not in sys.modules:
    # The isinstance guard keeps the cell idempotent: re-running it after the
    # string has already been split must not call `.split` on a list.
    if isinstance(rawJsonRadiomicsFiles, str):
        # Strip whitespace around each entry and drop empty entries so that
        # inputs like "a.tar.lz4, b.tar.lz4," still yield clean paths.
        rawJsonRadiomicsFiles = [
            path.strip()
            for path in rawJsonRadiomicsFiles.split(',')
            if path.strip()
        ]

In [4]:
def flatten_json(seriesInstanceUID, radiomics_file_path):
    """Flatten one raw radiomics JSON file into a DataFrame.

    The file is read as a JSON object; each top-level key is treated as an
    organ name and its value is flattened with ``json_normalize``. Two extra
    columns, ``seriesInstanceUID`` and ``organ``, are added to every row.

    Parameters
    ----------
    seriesInstanceUID : str
        Value written to the ``seriesInstanceUID`` column of every row.
    radiomics_file_path : str
        Path to the (already decompressed) JSON file.

    Returns
    -------
    pandas.DataFrame or None
        The flattened rows of all organs, or ``None`` when the file could not
        be decoded as JSON (its path is then appended to ``error_file.txt``)
        or when it contains no organ entries.
    """
    try:
        # Deliberately lenient parsing: errors='ignore' drops undecodable
        # bytes and strict=False allows control characters inside strings.
        with open(radiomics_file_path, 'r', encoding='utf-8', errors='ignore') as f:
            data = json.load(f, strict=False)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        # Record the offending file so it can be inspected later
        with open('error_file.txt', 'a') as error_file:
            error_file.write(f"{radiomics_file_path}\n")
        return None

    # One DataFrame per organ, combined once at the end
    organ_frames = []
    for organ, properties in data.items():
        # Normalize the nested dictionary into flat, dot-separated columns
        organ_df = json_normalize(properties)
        organ_df['seriesInstanceUID'] = seriesInstanceUID
        organ_df['organ'] = organ
        organ_frames.append(organ_df)

    # pd.concat raises ValueError on an empty list, so an empty JSON object
    # is reported as "nothing to flatten" instead of crashing the run.
    if not organ_frames:
        return None

    return pd.concat(organ_frames, ignore_index=True)


### **Convert the radiomics features from JSON to a DataFrame and save it as a Parquet file**

In [None]:
# Main script to decompress files and flatten JSON
all_dataframes = []  # List to store all DataFrames

for rawJsonRadiomicsFile in rawJsonRadiomicsFiles:
  # Decompress the LZ4 file and extract the tar file
  !lz4 -d --rm $rawJsonRadiomicsFile -c | tar  -xvf -
  # Assuming 'radiomics' is a directory in the current working directory
  for series_folder in os.listdir('radiomics'):
    # The directory name is the seriesInstanceUID
    seriesInstanceUID = series_folder
    for file in os.listdir(os.path.join('radiomics',series_folder)):
        if file.endswith('_raw.json.lz4'):
            # Construct the full file path
            file_path = os.path.join('radiomics',series_folder, file)
            # Decompress the file using the lz4 command
            !lz4 -d --rm $file_path
            # Truncate to get the JSON filename
            json_file_path=file_path[:-4]
            # Flatten the JSON file into a DataFrame
            df = flatten_json(seriesInstanceUID, json_file_path)
            # Add the DataFrame to the list
            all_dataframes.append(df)
  !rm -r radiomics

# Concatenate all DataFrames in the list
final_df = pd.concat(all_dataframes, ignore_index=True)

# Save the final DataFrame to a CSV file
final_df.to_parquet('raw_radiomics.parquet', index=False)

