In [5]:
import pandas as pd
import os
import glob

# Bare expression: the notebook displays the entries of the current working
# directory, which is a quick way to confirm where the kernel is running.
os.listdir()

['.git',
 '.gitignore',
 '.idea',
 'config',
 'data',
 'dataloader.py',
 'jupyter',
 'lightning_logs',
 'main.py',
 'models',
 'nvidia_dali.py',
 'output',
 'README.md',
 'requirements.txt',
 'test.py',
 'utils',
 'wandb',
 '__pycache__']

In [None]:
# The cells below use paths relative to the directory that contains `data/`.
# Step up one level only when that directory is not already the working
# directory, so that re-running this cell does not keep climbing the tree.
if not os.path.isdir("data"):
    os.chdir("../")

## Extract the actual age, estimated bone age, gender and radiologist initials from each report text file.

In [58]:
import re


def calculate_total_months(duration):
    """Convert a free-text duration such as ``"10 y 6 m"`` into whole months.

    The first integer that is followed (after optional whitespace) by a ``y``
    is read as years, and the first integer followed by an ``m`` as months.
    Matching is case-insensitive, so ``"10 Years 6 Months"`` and
    ``"18 months"`` are covered by the same two patterns; no separate
    "months only" pattern is needed.

    Args:
        duration: Text containing the duration.

    Returns:
        int: ``years * 12 + months``. A component that is not found counts as
        0, so text without any recognisable duration yields 0.
    """
    # Only the first occurrence of each unit is taken into account.
    years_match = re.search(r'(\d+)\s*y', duration, re.IGNORECASE)
    months_match = re.search(r'(\d+)\s*m', duration, re.IGNORECASE)

    total_months = 0
    if years_match:
        total_months += int(years_match.group(1)) * 12
    if months_match:
        total_months += int(months_match.group(1))

    return total_months


def check_gender(data):
    """Read the gender letter that follows a ``GENDER:`` label.

    The label is matched case-insensitively and only the first letter after
    it (``M`` or ``F``) is captured.

    Args:
        data: Text expected to contain ``GENDER: <value>``.

    Returns:
        ``"M"`` or ``"F"`` (always upper-case), or ``None`` when the label is
        missing or is not followed by an ``M``/``F``.
    """
    found = re.search(r'(?i)GENDER:\s*(M|F)', data)
    return found.group(1).upper() if found else None


def clean_values(data):
    """Parse the non-empty lines of one report into the fields of a table row.

    Layout consumed by this function:
      * ``data[0]`` - actual age as a duration string,
      * ``data[1]`` - gender; ``"MALE"`` (compared case-insensitively) maps to
        0 and every other value to 1,
      * ``data[2]`` / ``data[3]`` - the bone-age line and a
        ``"<label>: <initials>"`` radiologist line, in either order.

    Args:
        data: Sequence of at least four stripped, non-empty report lines.

    Returns:
        tuple: ``(age, gender, bage, radiologist)`` where ``age`` and ``bage``
        are in months, ``gender`` is 0 or 1 and ``radiologist`` is the
        upper-cased text after the first ``":"`` of the radiologist line.

    Raises:
        IndexError: If fewer than four lines are given or the radiologist
            line contains no ``":"``.
    """
    age = calculate_total_months(data[0])
    # Normalise case/whitespace so "Male" or "male" is not counted as female.
    gender = 0 if data[1].strip().upper() == "MALE" else 1

    # Of data[2] and data[3], the line containing a digit is taken to be the
    # bone age; the other one is the radiologist line.
    if re.search(r'\d', data[2]):
        bage_line, radiologist_line = data[2], data[3]
    else:
        bage_line, radiologist_line = data[3], data[2]

    bage = calculate_total_months(bage_line)
    radiologist = radiologist_line.split(":")[1].strip().upper()

    return age, gender, bage, radiologist

# age, bage, gender, radiologist= [], [], [], []
# for dir in glob.glob("./data/Mex_sample_data/*"):
#     for file in glob.glob(dir + "/*.txt"):
#         with open(file, "r") as f:
#             data = f.readlines()
#             data = [line.strip() for line in data if line.strip()]
#
#             cleaned = clean_values(data)
#             age.append(cleaned[0])
#             gender.append(cleaned[1])
#             bage.append(cleaned[2])
#             radiologist.append(cleaned[3])
# mexico_data = pd.DataFrame(data=zip(age, gender, bage, radiologist), columns=['age', 'gender', 'bage', 'radiologist'])

In [137]:
# Parallel column lists: position i of every list describes the same report.
age, bage, gender, radiologist, year_entry, id_, paths = [], [], [], [], [], [], []
years = [2019]
for year in years:
    # `study_dir` instead of `dir`, which would shadow the builtin.
    for study_dir in glob.glob(f"data/Mexico_private_dataset/{year}/*"):
        for report_path in glob.glob(study_dir + "/*.txt"):
            with open(report_path, "r", encoding="utf8") as f:
                # Keep only non-blank lines, stripped of surrounding whitespace.
                data = [line.strip() for line in f if line.strip()]

            dicom_files = glob.glob(study_dir + "/*.dcm")
            if not dicom_files:
                raise FileNotFoundError(f"No .dcm file found next to {report_path}")

            # Parse first and append afterwards: if a report is malformed the
            # exception is raised before any list has grown, so the lists can
            # never end up with different lengths.
            report_age, report_gender, report_bage, report_radiologist = clean_values(data)

            id_.append(len(id_) + 1)  # running 1-based id across all years
            paths.append(dicom_files[0])
            age.append(report_age)
            gender.append(report_gender)
            bage.append(report_bage)
            radiologist.append(report_radiologist)
            year_entry.append(year)


In [138]:
# Maps the letter returned by check_gender to the numeric code stored in the
# table; an unrecognised gender (None) stays None.
gender_codes = {"M": 0, "F": 1}

for year in [2020, 2021]:
    # `study_dir` instead of `dir`, which would shadow the builtin.
    for study_dir in glob.glob(f"data/Mexico_private_dataset/{year}/*"):
        for report_path in glob.glob(study_dir + "/*.txt"):
            with open(report_path, "r", encoding="utf8") as f:
                # Keep only non-blank lines, stripped of surrounding whitespace.
                data = [line.strip() for line in f if line.strip()]

            dicom_files = glob.glob(study_dir + "/*.dcm")
            if not dicom_files:
                raise FileNotFoundError(f"No .dcm file found next to {report_path}")

            # Parse first and append afterwards: if a report is malformed the
            # exception is raised before any list has grown, so the lists can
            # never end up with different lengths.
            report_age = calculate_total_months(data[0])
            report_gender = gender_codes.get(check_gender(data[1]))
            # Bone age and radiologist are read from the text after the first ":".
            report_bage = calculate_total_months(data[2].split(":")[1])
            report_radiologist = data[3].split(":")[1].strip().upper()

            id_.append(len(id_) + 1)  # continues the running 1-based id
            paths.append(dicom_files[0])
            age.append(report_age)
            gender.append(report_gender)
            bage.append(report_bage)
            radiologist.append(report_radiologist)
            year_entry.append(year)


In [139]:
# Building the frame from a column mapping (rather than zip) makes pandas
# raise a ValueError when the lists have different lengths, instead of
# silently truncating every column to the shortest list.
mexico_data = pd.DataFrame({
    'id': id_,
    'age': age,
    'gender': gender,
    'boneage': bage,
    'radiologist': radiologist,
    'year_entry': year_entry,
    'path': paths,
})

In [140]:
# Drop every row that contains a missing value (e.g. a gender of None for a
# report whose gender could not be recognised), then store gender as int.
# One chained expression builds a new frame instead of mutating in place.
mexico_data = mexico_data.dropna().astype({'gender': int})

In [141]:
# Number of rows left after dropping incomplete entries.
mexico_data.shape[0]

343

In [142]:
# Write the table to CSV; index=False keeps the row index out of the file.
raw_csv_path = "./data/Mexico_private_dataset/mexico_raw_dataset.csv"
mexico_data.to_csv(raw_csv_path, index=False)

In [143]:
def _preprocessed_png_path(sample_id):
    """Return the location of the preprocessed PNG named after ``sample_id``."""
    # os.path.join uses the path separator of the OS running the notebook.
    return os.path.join("data",
                        "Mexico_private_dataset",
                        'preprocessed',
                        '{}.png'.format(sample_id))


# Replace the `path` column with the PNG location derived from each row's id.
mexico_data['path'] = mexico_data['id'].map(_preprocessed_png_path)

In [144]:
# Bare expression: the notebook renders the DataFrame as a table.
mexico_data

Unnamed: 0,id,age,gender,boneage,radiologist,year_entry,path
0,1,192,0,204,JSA,2019,data\Mexico_private_dataset\preprocessed\1.png
1,2,173,0,192,JSA,2019,data\Mexico_private_dataset\preprocessed\2.png
2,3,48,0,36,JSA,2019,data\Mexico_private_dataset\preprocessed\3.png
3,4,48,1,60,JSA,2019,data\Mexico_private_dataset\preprocessed\4.png
4,5,163,0,162,JSA,2019,data\Mexico_private_dataset\preprocessed\5.png
...,...,...,...,...,...,...,...
340,341,122,1,108,JSA,2021,data\Mexico_private_dataset\preprocessed\341.png
341,342,182,0,186,JSA,2021,data\Mexico_private_dataset\preprocessed\342.png
342,343,138,1,132,JSA,2021,data\Mexico_private_dataset\preprocessed\343.png
343,344,116,0,96,JSA,2021,data\Mexico_private_dataset\preprocessed\344.png


In [145]:
# Number of rows per gender code.
mexico_data['gender'].value_counts()

gender
0    174
1    169
Name: count, dtype: int64

In [146]:
# Flag, for every row, whether the file referenced by `path` is on disk.
image_found = mexico_data['path'].map(os.path.exists)
mexico_data['exists'] = image_found
print(image_found.sum(), 'images found of', len(mexico_data), 'total')
# Keep only the rows whose image file was found.
mexico_data = mexico_data.loc[image_found]

337 images found of 343 total


In [147]:
# Write the filtered table to CSV; index=False keeps the row index out of the file.
preprocessed_csv_path = "./data/Mexico_private_dataset/mexico_preprocessed_dataset.csv"
mexico_data.to_csv(preprocessed_csv_path, index=False)
