In [1]:
import numpy as np
import pandas as pd

# Raise pandas' display limits so long file paths and wide tables are shown
# without truncation in cell outputs.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
pd.set_option("display.max_colwidth", 200)
# `IPython.display` is the public import location; importing these names from
# `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Inject CSS that stretches elements with the `.container` class to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    """Print a per-column summary of ``df`` and return its first rows.

    Prints the frame's shape, then a ``psql``-formatted table (rendered with
    ``tabulate``) holding, for every column, its name, number of null values,
    number of unique values and dtype.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        ``df.head()``.
    """
    from tabulate import tabulate

    print("\n***** Shape: ", df.shape, " *****\n")

    # One row per column of `df`; values are taken positionally so the four
    # lists line up in column order.
    summary = pd.DataFrame(
        {
            "Name": df.columns.values.tolist(),
            "Null": df.isnull().sum().values.tolist(),
            "Unique": df.nunique().values.tolist(),
            "Dtypes": df.dtypes.tolist(),
        },
        columns=["Name", "Null", "Unique", "Dtypes"],
    )
    rendered = tabulate(summary, headers="keys", tablefmt="psql")
    print(rendered)
    return df.head()

In [2]:
import os


def list_files(dir):
    """Recursively list every file below ``dir``.

    The tree is traversed once with ``os.walk``; each file name is joined to
    the directory that contains it. Directories themselves are not included
    in the result.

    Args:
        dir: Root directory to scan. (The name shadows the ``dir`` builtin but
            is kept so existing keyword callers keep working.)

    Returns:
        A list of file paths in ``os.walk`` order. Empty when ``dir`` contains
        no files or ``os.walk`` yields nothing for it.
    """
    # A single walk already yields the file names of every directory, so there
    # is no need to start a second walk per sub-directory.
    return [
        os.path.join(subdir, file)
        for subdir, _dirnames, files in os.walk(dir)
        for file in files
    ]

In [3]:
# Root of the export. The columns below assume files are laid out as
# <DATA_ROOT>/<patient_id>/<StudyInstanceUID>/<dicom file>.
DATA_ROOT = "/media/data1/ravram/DeepOCT_Abbott"

r = list_files(DATA_ROOT)
# A single formatted line instead of display(len(r), " files found"), which
# rendered the count and the text as two separate outputs.
print(f"{len(r)} files found")

df = pd.DataFrame({"path": r})

# Split the part of each path that follows DATA_ROOT, so the column positions
# do not depend on how many directories precede the root. reindex() guarantees
# the three columns exist even when every path is shallower than expected;
# missing components are left as NaN/None (e.g. files sitting directly in the
# root get their file name as patient_id and nothing else).
path_parts = (
    df["path"]
    .str[len(DATA_ROOT):]
    .str.lstrip("/")
    .str.split("/", n=2, expand=True)
    .reindex(columns=range(3))
)

df["StudyInstanceUID"] = path_parts[1]
df["patient_id"] = path_parts[0]
df["dicom_id"] = path_parts[2]
display(df.head(n=15))

293

' files found'

Unnamed: 0,path,StudyInstanceUID,patient_id,dicom_id
0,/media/data1/ravram/DeepOCT_Abbott/requests.whole,,requests.whole,
1,/media/data1/ravram/DeepOCT_Abbott/requests.completed,,requests.completed,
2,/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072511311760100000156.dcm,2.16.124.113611.1.118.1.1.6202367,199385,1.3.12.2.1107.5.4.5.135214.30000022072511311760100000156.dcm
3,/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003820.dcm,2.16.124.113611.1.118.1.1.6202367,199385,1.3.12.2.1107.5.4.5.135214.30000022072510371002300003820.dcm
4,/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003824.dcm,2.16.124.113611.1.118.1.1.6202367,199385,1.3.12.2.1107.5.4.5.135214.30000022072510371002300003824.dcm
5,/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003827.dcm,2.16.124.113611.1.118.1.1.6202367,199385,1.3.12.2.1107.5.4.5.135214.30000022072510371002300003827.dcm
6,/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003833.dcm,2.16.124.113611.1.118.1.1.6202367,199385,1.3.12.2.1107.5.4.5.135214.30000022072510371002300003833.dcm
7,/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003836.dcm,2.16.124.113611.1.118.1.1.6202367,199385,1.3.12.2.1107.5.4.5.135214.30000022072510371002300003836.dcm
8,/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003858.dcm,2.16.124.113611.1.118.1.1.6202367,199385,1.3.12.2.1107.5.4.5.135214.30000022072510371002300003858.dcm
9,/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003872.dcm,2.16.124.113611.1.118.1.1.6202367,199385,1.3.12.2.1107.5.4.5.135214.30000022072510371002300003872.dcm


In [4]:
# Build the file inventory from the directory listing `r`.
# (An earlier pd.read_csv("data/Abbott_OCT/database-c-find_ABBOTT.csv") was
# removed here: its result was overwritten by the next statement and never used.)
df_extracted = pd.DataFrame({"path": r})
# "FileType" is whatever follows the last "." in the path.
df_extracted["FileType"] = df_extracted.path.apply(lambda x: x.split(".")[-1])
display("Total files", df_extracted.FileType.value_counts())

# Keep DICOM files only. .copy() makes the filtered frame independent so the
# column assignments below do not write into a slice of the original frame.
df_extracted = df_extracted.loc[df_extracted["FileType"] == "dcm"].copy()
# Path components by absolute position: 5 -> mrn, 6 -> study, 7 -> file name.
# reindex() keeps those columns addressable even if no row survived the filter.
new_2 = (
    df_extracted["path"]
    .str.split("/", n=7, expand=True)
    .reindex(columns=range(8))
)

df_extracted["mrn"] = new_2[5]
df_extracted["StudyInstanceUID"] = new_2[6]
# Remove only the literal ".dcm" suffix. str.rstrip(".dcm") strips any trailing
# run of the characters ".", "d", "c", "m" and would eat into ids ending in
# one of those characters.
df_extracted["dicom_id"] = new_2[7].str.replace(r"\.dcm$", "", regex=True)
# One row per (mrn, StudyInstanceUID): the first file seen for each study.
df_extracted_study_level = (
    df_extracted.groupby(["mrn", "StudyInstanceUID"]).first().reset_index()
)

'Total files'

dcm          291
completed      1
whole          1
Name: FileType, dtype: int64

In [5]:
# Attach the study-level columns that remain after dropping the per-file ones
# to every row of `df` sharing the same StudyInstanceUID (inner join).
study_level_keys = df_extracted_study_level.drop(columns=["dicom_id", "path"])
df_extracted_m = study_level_keys.merge(
    df, how="inner", on=["StudyInstanceUID"]
)

# Spot-check one specific DICOM file in the merged table.
probe_dicom_id = "1.3.12.2.1107.5.4.5.135214.30000022072511311760100000156.dcm"
is_probe_row = df_extracted_m["dicom_id"] == probe_dicom_id
display(df_extracted_m.loc[is_probe_row])

# Persist the merged table without the index column.
df_extracted_m.to_csv("data/Abbott_OCT/df_extracted_m_ABBOTT.csv", index=False)

Unnamed: 0,mrn,StudyInstanceUID,FileType,path,patient_id,dicom_id
0,199385,2.16.124.113611.1.118.1.1.6202367,dcm,/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072511311760100000156.dcm,199385,1.3.12.2.1107.5.4.5.135214.30000022072511311760100000156.dcm


In [6]:
# Reload the merged table from disk and show every row of a single study.
df_extracted_m = pd.read_csv("data/Abbott_OCT/df_extracted_m_ABBOTT.csv")
study_uid = "2.16.124.113611.1.118.1.1.5884039"
in_study = df_extracted_m["StudyInstanceUID"].eq(study_uid)
display(df_extracted_m.loc[in_study])

Unnamed: 0,mrn,StudyInstanceUID,FileType,path,patient_id,dicom_id
194,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522494813600000000.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522494813600000000.dcm
195,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522432726100000031.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522432726100000031.dcm
196,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522432726100000034.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522432726100000034.dcm
197,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522432726100000039.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522432726100000039.dcm
198,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522432726100000042.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522432726100000042.dcm
199,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522432726100000063.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522432726100000063.dcm
200,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522432726100000071.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522432726100000071.dcm
201,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522432726100000073.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522432726100000073.dcm
202,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522432726100000078.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522432726100000078.dcm
203,595183,2.16.124.113611.1.118.1.1.5884039,dcm,/media/data1/ravram/DeepOCT_Abbott/595183/2.16.124.113611.1.118.1.1.5884039/1.3.12.2.1107.5.4.5.135214.30000021090522432726100000081.dcm,595183,1.3.12.2.1107.5.4.5.135214.30000021090522432726100000081.dcm


### Extract AVI Metadata




In [7]:
from downloadAvi import extract_avi_metadata as avi_meta

# Run the project's AVI/metadata extraction on the merged file table CSV.
# The call is left as the cell's last expression so its result is displayed.
merged_csv_path = "data/Abbott_OCT/df_extracted_m_ABBOTT.csv"
avi_meta.extract_avi_and_metadata(merged_csv_path)

291it [02:37,  1.85it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata["FPS"] = metadata["FPS"].fillna(15.0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata["StudyInstanceUID"] = metadata["StudyInstanceUID"].str.replace("'", "")


Unnamed: 0,brand,sex,FPS,NumberOfFrames,date,study_time,series_time,birthdate,color_format,mrn,StudyID,StudyInstanceUID,SeriesInstanceUID,dicom_path,FileName,Split
0,Siemens,F,10.0,44.0,20220730,231044.0,234643.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003817.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072511311760100000156',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072511311760100000156.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072511311760100000156.dcm.avi,inference
1,Siemens,F,10.0,45.0,20220730,231044.0,234703.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003819.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003820',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003820.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072510371002300003820.dcm.avi,inference
2,Siemens,F,10.0,55.0,20220730,231044.0,234733.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003823.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003824',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003824.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072510371002300003824.dcm.avi,inference
3,Siemens,F,10.0,35.0,20220730,231044.0,234758.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003826.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003827',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003827.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072510371002300003827.dcm.avi,inference
4,Siemens,F,10.0,53.0,20220730,231044.0,234938.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003832.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003833',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003833.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072510371002300003833.dcm.avi,inference
5,Siemens,F,10.0,42.0,20220730,231044.0,234953.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003835.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003836',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003836.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072510371002300003836.dcm.avi,inference
6,Siemens,F,10.0,53.0,20220730,231044.0,35.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003859.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003858',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003858.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072510371002300003858.dcm.avi,inference
7,Siemens,F,10.0,30.0,20220730,231044.0,302.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003873.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003872',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003872.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072510371002300003872.dcm.avi,inference
8,Siemens,F,10.0,257.0,20220730,231044.0,524.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003879.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003878',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003878.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072510371002300003878.dcm.avi,inference
9,Siemens,F,10.0,7.0,20220730,231044.0,556.0,19581128,MONOCHROME2,199385,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003880.512',2.16.124.113611.1.118.1.1.6202367,'1.3.12.2.1107.5.4.5.135214.30000022072510371002300003881',/media/data1/ravram/DeepOCT_Abbott/199385/2.16.124.113611.1.118.1.1.6202367/1.3.12.2.1107.5.4.5.135214.30000022072510371002300003881.dcm,dicom_avi_extracted/2.16.124.113611.1.118.1.1.6202367_1.3.12.2.1107.5.4.5.135214.30000022072510371002300003881.dcm.avi,inference
