tabular 

In [45]:
import pandas as pd

In [46]:
def general_data_clearing(df):
    """#+
    This function performs data cleaning and preprocessing on a given DataFrame.#+
#+
    Parameters:#+
    df (pandas.DataFrame): The input DataFrame containing the data.#+
#+
    Returns:#+
    pandas.DataFrame: The cleaned and preprocessed DataFrame.#+
#+
    The function performs the following steps:#+
    1. Drops rows with missing values from the DataFrame.#+
    2. Splits image paths into separate columns for easier access to corresponding image files.#+
    3. Replaces spaces in column names with underscores.#+
    4. Returns the cleaned and preprocessed DataFrame.#+
    """
    df.dropna(inplace=True)

    # spilting image paths into separate columns so I can find the corresponding image file easily
    df = path_spilting(df, 'image file path','full')
    df = path_spilting(df, 'cropped image file path','cropped')
    df = path_spilting(df, 'ROI mask file path','mask')

    # replacing spaces in column names to underscores
    df.columns = df.columns.str.replace(' ', '_')

    df['pathology'] = df['pathology'].replace('BENIGN_WITHOUT_CALLBACK', 'BENIGN')

    return df


def path_spilting(df, col_name,image_type):
    new = df[col_name].str.split('/',n =3,expand=True)
    df[f'image_folder_{image_type}']=new[2]
    if image_type == 'cropped':
        df[f'image_name_{image_type}']=new[3]
        df[f'image_name_{image_type}'] = df[f'image_name_{image_type}'].str.replace(r'.dcm\n', '')

    df.drop(columns=[col_name], inplace=True)
    return df

In [47]:
calc_df = pd.read_csv('data\csv\calc_case_description.csv')
calc_df.columns

Index(['patient_id', 'breast density', 'left or right breast', 'image view',
       'abnormality id', 'abnormality type', 'calc type', 'calc distribution',
       'assessment', 'pathology', 'subtlety', 'image file path',
       'cropped image file path', 'ROI mask file path'],
      dtype='object')

In [48]:
calc_df = pd.read_csv('data\csv\calc_case_description.csv')
df = general_data_clearing(calc_df)

df.head()

Unnamed: 0,patient_id,breast_density,left_or_right_breast,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,subtlety,image_folder_full,image_folder_cropped,image_name_cropped,image_folder_mask
0,P_00038,2,LEFT,CC,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,1.3.6.1.4.1.9590.100.1.2.374115997511889073021...,1.3.6.1.4.1.9590.100.1.2.419081637812053404913...,000001.dcm\n,1.3.6.1.4.1.9590.100.1.2.419081637812053404913...
1,P_00038,2,LEFT,MLO,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,1.3.6.1.4.1.9590.100.1.2.174390361112646747718...,1.3.6.1.4.1.9590.100.1.2.188613955710170417803...,000001.dcm\n,1.3.6.1.4.1.9590.100.1.2.188613955710170417803...
14,P_00100,4,RIGHT,CC,1,calcification,PLEOMORPHIC,CLUSTERED,4,BENIGN,4,1.3.6.1.4.1.9590.100.1.2.153667468610757963111...,1.3.6.1.4.1.9590.100.1.2.416175242512972515029...,000001.dcm\n,1.3.6.1.4.1.9590.100.1.2.416175242512972515029...
15,P_00100,4,RIGHT,MLO,1,calcification,PLEOMORPHIC,CLUSTERED,4,BENIGN,4,1.3.6.1.4.1.9590.100.1.2.159953246125132978053...,1.3.6.1.4.1.9590.100.1.2.304205501811264755442...,000001.dcm\n,1.3.6.1.4.1.9590.100.1.2.304205501811264755442...
16,P_00127,2,RIGHT,CC,1,calcification,PLEOMORPHIC,CLUSTERED,4,MALIGNANT,3,1.3.6.1.4.1.9590.100.1.2.613475814104326682401...,1.3.6.1.4.1.9590.100.1.2.284207981147845672484...,000001.dcm\n,1.3.6.1.4.1.9590.100.1.2.284207981147845672484...


In [49]:
df['pathology'].value_counts()

pathology
BENIGN       739
MALIGNANT    670
Name: count, dtype: int64

In [52]:
df = df.drop(columns="patient_id")

In [53]:
df.to_pickle('data/df.pkl')