In [1]:
import pandas as pd
import json
import os

# Location of the biospecimen JSON for project TCGA-COAD.
# NOTE(review): this is an absolute, machine-specific Windows path; consider a
# configurable data directory so the notebook can run elsewhere.
directory_path = "C:\\Jayesh Personal"
json_filename = "biospecimen.project-tcga-coad.json"
full_file_path = os.path.join(directory_path, json_filename)

# NOTE(review): every handler below only prints, so after a failed load `data`
# stays undefined and the cells that iterate over it fail with a NameError.
try:
    # JSON text is UTF-8 by specification. Without an explicit encoding,
    # open() falls back to the platform locale encoding, which can mis-decode
    # non-ASCII characters or fail outright.
    with open(full_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Parsing is complete, so the preview runs after the file handle is closed.
    print("JSON data loaded successfully:")
    # Preview the first record when the file is a list of records, otherwise
    # the top-level object itself.
    first_entry = data[0] if isinstance(data, list) else data
    print("Top-level keys:", list(first_entry.keys()))
    print("Preview of first entry:")
    print(json.dumps(first_entry, indent=2))

except FileNotFoundError:
    print(f"Error: The file '{full_file_path}' was not found.")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from '{full_file_path}'. Check file format.")
except Exception as e:
    # Catch-all, e.g. an empty list (IndexError) or a decoding error.
    print(f"An unexpected error occurred: {e}")

JSON data loaded successfully:
Top-level keys: ['case_id', 'project', 'submitter_id', 'samples']
Preview of first entry:
{
  "case_id": "01240896-3f3f-4bf9-9799-55c87bfacf36",
  "project": {
    "project_id": "TCGA-COAD"
  },
  "submitter_id": "TCGA-F4-6854",
  "samples": [
    {
      "sample_type_id": "01",
      "tumor_descriptor": "Primary",
      "sample_id": "358bd002-2a75-4a4c-a583-c692c630750b",
      "submitter_id": "TCGA-F4-6854-01Z",
      "sample_type": "Primary Tumor",
      "created_datetime": "2018-05-17T12:15:41.728837-05:00",
      "oct_embedded": "No",
      "specimen_type": "Solid Tissue",
      "days_to_sample_procurement": 0,
      "updated_datetime": "2023-11-08T11:17:24.814426-06:00",
      "state": "released",
      "is_ffpe": "true",
      "preservation_method": "FFPE",
      "tissue_type": "Tumor",
      "portions": [
        {
          "analytes": [],
          "portion_id": "72d3cda9-1fa1-53a7-a560-bae805015c7b",
          "slides": [
            {
        

In [2]:
# One flat record per entry of `data`. A dict-valued field is expanded one
# level into "<field>.<child>" keys; every other value (lists included) is
# copied through unchanged so it can be unpacked in later steps.
flat_data = []

for case_entry in data:
    row = {}

    for field, content in case_entry.items():
        if not isinstance(content, dict):
            # Scalars and lists are stored under their original key.
            row[field] = content
            continue

        # e.g. {'project': {'project_id': ...}} -> 'project.project_id'
        row.update(
            {f"{field}.{child}": child_value for child, child_value in content.items()}
        )

    flat_data.append(row)

# Build the DataFrame from the list of flat records
biospec_df = pd.DataFrame(flat_data)

In [3]:
# Display the flattened frame: one row per entry of `data`, with the nested
# `samples` list still stored as-is in its own column.
biospec_df

Unnamed: 0,case_id,project.project_id,submitter_id,samples
0,01240896-3f3f-4bf9-9799-55c87bfacf36,TCGA-COAD,TCGA-F4-6854,"[{'sample_type_id': '01', 'tumor_descriptor': ..."
1,01ad5016-f691-4bca-82a0-910429d8d25b,TCGA-COAD,TCGA-AA-3561,"[{'intermediate_dimension': 0.6, 'sample_type_..."
2,01f493d4-229d-47a6-baa8-32a342c65d01,TCGA-COAD,TCGA-AA-A00O,"[{'sample_type_id': '01', 'tumor_descriptor': ..."
3,022f39e9-57ee-4b2b-8b3a-8929e3d69a37,TCGA-COAD,TCGA-DM-A28F,"[{'sample_type_id': '01', 'tumor_descriptor': ..."
4,02f9668c-71e6-485f-88b1-b37dc8bdd2ab,TCGA-COAD,TCGA-AA-3866,"[{'sample_type_id': '10', 'tumor_descriptor': ..."
...,...,...,...,...
456,fce1fd5a-54d2-4260-b187-4eb7035e96e9,TCGA-COAD,TCGA-CM-6675,"[{'sample_type_id': '01', 'tumor_descriptor': ..."
457,fd16b634-2e04-44a3-862d-fb03cd73c057,TCGA-COAD,TCGA-AA-3556,"[{'intermediate_dimension': 0.7, 'sample_type_..."
458,fdffda5f-72b2-4153-b7f1-d7043b7ca898,TCGA-COAD,TCGA-AA-3818,"[{'intermediate_dimension': 0.8, 'sample_type_..."
459,ff1407c6-9174-4bae-a19b-d34ca71b898c,TCGA-COAD,TCGA-A6-2680,"[{'intermediate_dimension': 0.7, 'sample_type_..."


In [4]:
# Summarize column dtypes and non-null counts of the flattened frame.
biospec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 461 entries, 0 to 460
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   case_id             461 non-null    object
 1   project.project_id  461 non-null    object
 2   submitter_id        461 non-null    object
 3   samples             461 non-null    object
dtypes: object(4)
memory usage: 14.5+ KB


In [5]:
# Write the flattened frame to 'bio_spec.csv' without the index column.
# NOTE(review): `samples` still holds nested Python lists at this point, so it
# is presumably written as its string representation; confirm that is intended.
biospec_df.to_csv('bio_spec.csv', index=False)

In [6]:
# Show the full nested `samples` structure of the first case (row 0).
# The value is read from the frame rather than kept as a hand-pasted literal,
# so the inspected structure cannot drift from the loaded data.
biospec_df['samples'].iloc[0]

[{'sample_type_id': '01',
  'tumor_descriptor': 'Primary',
  'sample_id': '358bd002-2a75-4a4c-a583-c692c630750b',
  'submitter_id': 'TCGA-F4-6854-01Z',
  'sample_type': 'Primary Tumor',
  'created_datetime': '2018-05-17T12:15:41.728837-05:00',
  'oct_embedded': 'No',
  'specimen_type': 'Solid Tissue',
  'days_to_sample_procurement': 0,
  'updated_datetime': '2023-11-08T11:17:24.814426-06:00',
  'state': 'released',
  'is_ffpe': 'true',
  'preservation_method': 'FFPE',
  'tissue_type': 'Tumor',
  'portions': [{'analytes': [],
    'portion_id': '72d3cda9-1fa1-53a7-a560-bae805015c7b',
    'slides': [{'updated_datetime': '2018-08-23T19:24:29.874737-05:00',
      'submitter_id': 'TCGA-F4-6854-01Z-00-DX1',
      'section_location': 'Not Reported',
      'state': 'released',
      'slide_id': 'e76d04bf-1417-4359-928b-6e0ea8a1fa17',
      'created_datetime': '2018-05-17T13:38:19.166901-05:00'}]}]},
 {'intermediate_dimension': 0.8,
  'sample_type_id': '01',
  'tumor_descriptor': 'Primary',
  's

In [7]:
def _leading_sample_type(sample_list):
    """Return ``sample_type`` of the first sample, or None without a non-empty list."""
    if isinstance(sample_list, list) and sample_list:
        return sample_list[0].get('sample_type')
    return None


# Only the first sample of each case is consulted.
biospec_df['sample.type'] = biospec_df['samples'].apply(_leading_sample_type)

In [8]:
# Re-display the frame to check the newly added `sample.type` column.
biospec_df

Unnamed: 0,case_id,project.project_id,submitter_id,samples,sample.type
0,01240896-3f3f-4bf9-9799-55c87bfacf36,TCGA-COAD,TCGA-F4-6854,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor
1,01ad5016-f691-4bca-82a0-910429d8d25b,TCGA-COAD,TCGA-AA-3561,"[{'intermediate_dimension': 0.6, 'sample_type_...",Primary Tumor
2,01f493d4-229d-47a6-baa8-32a342c65d01,TCGA-COAD,TCGA-AA-A00O,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor
3,022f39e9-57ee-4b2b-8b3a-8929e3d69a37,TCGA-COAD,TCGA-DM-A28F,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor
4,02f9668c-71e6-485f-88b1-b37dc8bdd2ab,TCGA-COAD,TCGA-AA-3866,"[{'sample_type_id': '10', 'tumor_descriptor': ...",Blood Derived Normal
...,...,...,...,...,...
456,fce1fd5a-54d2-4260-b187-4eb7035e96e9,TCGA-COAD,TCGA-CM-6675,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor
457,fd16b634-2e04-44a3-862d-fb03cd73c057,TCGA-COAD,TCGA-AA-3556,"[{'intermediate_dimension': 0.7, 'sample_type_...",Primary Tumor
458,fdffda5f-72b2-4153-b7f1-d7043b7ca898,TCGA-COAD,TCGA-AA-3818,"[{'intermediate_dimension': 0.8, 'sample_type_...",Primary Tumor
459,ff1407c6-9174-4bae-a19b-d34ca71b898c,TCGA-COAD,TCGA-A6-2680,"[{'intermediate_dimension': 0.7, 'sample_type_...",Solid Tissue Normal


In [9]:
def _first_nested(samples, *path):
    """Read one field by always following the first element at each level.

    Every key in ``path`` except the last names a list-valued child
    (``'portions'``, ``'slides'``, ``'analytes'``, ``'aliquots'``); the walk
    descends into element ``[0]`` of each such list. The last key is read with
    ``dict.get`` from the final first element.

    Returns None when any level is missing, empty, or not shaped as a list of
    dicts (for example a NaN in place of the samples list).
    """
    *containers, field = path
    node = samples
    for key in containers:
        if not isinstance(node, list) or not node:
            return None
        head = node[0]
        if not isinstance(head, dict) or key not in head:
            return None
        node = head[key]
    if not isinstance(node, list) or not node or not isinstance(node[0], dict):
        return None
    return node[0].get(field)


def _first_aliquot_center(samples):
    """Return ``short_name`` of the first aliquot's ``center`` dict, or None."""
    center = _first_nested(samples, 'portions', 'analytes', 'aliquots', 'center')
    return center.get('short_name') if isinstance(center, dict) else None


# Output column -> key path below each case's `samples` list.
# NOTE(review): only element [0] is read at every level, so values held by
# later samples, portions, slides, analytes or aliquots are ignored.
_FIRST_ELEMENT_COLUMNS = {
    # From portion
    'portion.weight': ('portions', 'weight'),
    'portion.is_ffpe': ('portions', 'is_ffpe'),
    'portion.number': ('portions', 'portion_number'),
    # From slides
    'slide.percent_tumor_cells': ('portions', 'slides', 'percent_tumor_cells'),
    'slide.percent_necrosis': ('portions', 'slides', 'percent_necrosis'),
    'slide.percent_normal_cells': ('portions', 'slides', 'percent_normal_cells'),
    'slide.percent_tumor_nuclei': ('portions', 'slides', 'percent_tumor_nuclei'),
    # From analyte
    'analyte.type': ('portions', 'analytes', 'analyte_type'),
    'analyte.rna_integrity': ('portions', 'analytes', 'rna_integrity_number'),
    'analyte.a260_a280': ('portions', 'analytes', 'a260_a280_ratio'),
    # From aliquots
    'aliquot.quantity': ('portions', 'analytes', 'aliquots', 'aliquot_quantity'),
    'aliquot.volume': ('portions', 'analytes', 'aliquots', 'aliquot_volume'),
}

for _column, _path in _FIRST_ELEMENT_COLUMNS.items():
    # `args` is forwarded to the helper after the cell value itself.
    biospec_df[_column] = biospec_df['samples'].apply(_first_nested, args=_path)

# `center` is a nested dict rather than a list, so it gets its own accessor.
biospec_df['aliquot.center'] = biospec_df['samples'].apply(_first_aliquot_center)

In [10]:
# Re-display the frame to inspect the portion, slide, analyte and aliquot
# columns derived from `samples`.
biospec_df

Unnamed: 0,case_id,project.project_id,submitter_id,samples,sample.type,portion.weight,portion.is_ffpe,portion.number,slide.percent_tumor_cells,slide.percent_necrosis,slide.percent_normal_cells,slide.percent_tumor_nuclei,analyte.type,analyte.rna_integrity,analyte.a260_a280,aliquot.quantity,aliquot.volume,aliquot.center
0,01240896-3f3f-4bf9-9799-55c87bfacf36,TCGA-COAD,TCGA-F4-6854,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor,,,,,,,,,,,,,
1,01ad5016-f691-4bca-82a0-910429d8d25b,TCGA-COAD,TCGA-AA-3561,"[{'intermediate_dimension': 0.6, 'sample_type_...",Primary Tumor,57.0,false,22,,,,,,,,,,
2,01f493d4-229d-47a6-baa8-32a342c65d01,TCGA-COAD,TCGA-AA-A00O,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor,50.0,false,13,,,,,,,,,,
3,022f39e9-57ee-4b2b-8b3a-8929e3d69a37,TCGA-COAD,TCGA-DM-A28F,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor,30.0,false,11,80.0,3.0,0.0,80.0,RNA,8.5,1.68,2.34,13.0,UNC
4,02f9668c-71e6-485f-88b1-b37dc8bdd2ab,TCGA-COAD,TCGA-AA-3866,"[{'sample_type_id': '10', 'tumor_descriptor': ...",Blood Derived Normal,79.0,false,1,,,,,DNA,,,2.00,50.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,fce1fd5a-54d2-4260-b187-4eb7035e96e9,TCGA-COAD,TCGA-CM-6675,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor,,,,,,,,,,,,,
457,fd16b634-2e04-44a3-862d-fb03cd73c057,TCGA-COAD,TCGA-AA-3556,"[{'intermediate_dimension': 0.7, 'sample_type_...",Primary Tumor,145.0,false,01,40.0,0.0,0.0,60.0,Total RNA,,,3.00,30.0,BCGSC
458,fdffda5f-72b2-4153-b7f1-d7043b7ca898,TCGA-COAD,TCGA-AA-3818,"[{'intermediate_dimension': 0.8, 'sample_type_...",Primary Tumor,,,,,,,,,,,,,
459,ff1407c6-9174-4bae-a19b-d34ca71b898c,TCGA-COAD,TCGA-A6-2680,"[{'intermediate_dimension': 0.7, 'sample_type_...",Solid Tissue Normal,69.0,false,01,,,,,RNA,7.4,1.80,1.73,13.3,UNC


In [11]:
def _first_sample_field(samples, field):
    """Return ``field`` from the first sample of a case.

    Returns None when ``samples`` is not a non-empty list, when its first
    element is not a dict, or when the first sample lacks ``field``.
    """
    if not isinstance(samples, list) or not samples:
        return None
    first = samples[0]
    return first.get(field) if isinstance(first, dict) else None


# Sample-level keys to expose; each becomes a 'sample.<key>' column, added in
# this order.
_SAMPLE_FIELDS = (
    'intermediate_dimension',
    'shortest_dimension',
    'longest_dimension',
    'composition',
    'specimen_type',
    'preservation_method',
    'is_ffpe',
    'tissue_type',
)

for _field in _SAMPLE_FIELDS:
    # Only the first sample of each case is consulted.
    biospec_df[f'sample.{_field}'] = biospec_df['samples'].apply(
        _first_sample_field, args=(_field,)
    )

In [12]:
# Re-display the frame to inspect the newly added `sample.*` columns.
biospec_df

Unnamed: 0,case_id,project.project_id,submitter_id,samples,sample.type,portion.weight,portion.is_ffpe,portion.number,slide.percent_tumor_cells,slide.percent_necrosis,...,aliquot.volume,aliquot.center,sample.intermediate_dimension,sample.shortest_dimension,sample.longest_dimension,sample.composition,sample.specimen_type,sample.preservation_method,sample.is_ffpe,sample.tissue_type
0,01240896-3f3f-4bf9-9799-55c87bfacf36,TCGA-COAD,TCGA-F4-6854,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor,,,,,,...,,,,,,,Solid Tissue,FFPE,true,Tumor
1,01ad5016-f691-4bca-82a0-910429d8d25b,TCGA-COAD,TCGA-AA-3561,"[{'intermediate_dimension': 0.6, 'sample_type_...",Primary Tumor,57.0,false,22,,,...,,,0.6,0.5,1.1,Not Reported,Unknown,Unknown,false,Tumor
2,01f493d4-229d-47a6-baa8-32a342c65d01,TCGA-COAD,TCGA-AA-A00O,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor,50.0,false,13,,,...,,,,,,Not Reported,Solid Tissue,Unknown,false,Tumor
3,022f39e9-57ee-4b2b-8b3a-8929e3d69a37,TCGA-COAD,TCGA-DM-A28F,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor,30.0,false,11,80.0,3.0,...,13.0,UNC,,,,Not Reported,Solid Tissue,Unknown,false,Tumor
4,02f9668c-71e6-485f-88b1-b37dc8bdd2ab,TCGA-COAD,TCGA-AA-3866,"[{'sample_type_id': '10', 'tumor_descriptor': ...",Blood Derived Normal,79.0,false,1,,,...,50.0,,,,,Not Reported,Peripheral Blood NOS,Unknown,false,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,fce1fd5a-54d2-4260-b187-4eb7035e96e9,TCGA-COAD,TCGA-CM-6675,"[{'sample_type_id': '01', 'tumor_descriptor': ...",Primary Tumor,,,,,,...,,,,,,,Solid Tissue,FFPE,true,Tumor
457,fd16b634-2e04-44a3-862d-fb03cd73c057,TCGA-COAD,TCGA-AA-3556,"[{'intermediate_dimension': 0.7, 'sample_type_...",Primary Tumor,145.0,false,01,40.0,0.0,...,30.0,BCGSC,0.7,0.3,0.7,Not Reported,Unknown,Unknown,false,Tumor
458,fdffda5f-72b2-4153-b7f1-d7043b7ca898,TCGA-COAD,TCGA-AA-3818,"[{'intermediate_dimension': 0.8, 'sample_type_...",Primary Tumor,,,,,,...,,,0.8,0.8,1.0,Not Reported,Unknown,Unknown,false,Tumor
459,ff1407c6-9174-4bae-a19b-d34ca71b898c,TCGA-COAD,TCGA-A6-2680,"[{'intermediate_dimension': 0.7, 'sample_type_...",Solid Tissue Normal,69.0,false,01,,,...,13.3,UNC,0.7,0.6,1.0,Not Reported,Solid Tissue,Unknown,false,Normal


In [13]:
# Write the enriched frame (raw `samples` column included) as CSV text,
# without the index column.
# NOTE(review): the target name "biospec_df_join" has no ".csv" extension,
# unlike 'bio_spec.csv' written earlier; confirm whether another step reads
# this exact name before renaming it.
biospec_df.to_csv("biospec_df_join", index=False)

In [15]:
# Summarize dtypes and non-null counts now that the derived columns exist.
# NOTE(review): execution counts skip 14 (In [13] is followed by In [15]);
# confirm the notebook still reproduces under Restart Kernel -> Run All.
biospec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 461 entries, 0 to 460
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   case_id                        461 non-null    object 
 1   project.project_id             461 non-null    object 
 2   submitter_id                   461 non-null    object 
 3   samples                        461 non-null    object 
 4   sample.type                    461 non-null    object 
 5   portion.weight                 175 non-null    float64
 6   portion.is_ffpe                276 non-null    object 
 7   portion.number                 276 non-null    object 
 8   slide.percent_tumor_cells      76 non-null     float64
 9   slide.percent_necrosis         75 non-null     float64
 10  slide.percent_normal_cells     77 non-null     float64
 11  slide.percent_tumor_nuclei     75 non-null     float64
 12  analyte.type                   233 non-null    obj