In [None]:
# Imports

# For deep learning stuff
import tensorflow as tf

# For data
import pandas as pd

# For plotting
import matplotlib.pyplot as plt

# For file paths
import os
import random

In [2]:

# GPU check, this is very important!
#
# Ask TensorFlow which physical GPU devices it can see and report the result,
# so a missing or misconfigured GPU is noticed before any later cell runs.
gpus = tf.config.list_physical_devices('GPU')

if not gpus:
    # Empty list: TensorFlow did not detect any GPU device.
    print("No GPU found!!! The model will run on the CPU, which will be a lot slower. Make sure you have installed TensorFlow and the correct CUDA drivers before proceeding with the next steps!")
else:
    print("Great! Found the GPU:")
    # One line per detected device.
    for gpu in gpus:
        print(f"- {gpu}")

Great! Found the GPU:
- PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [3]:
# Load our datasets CSV file
#
# Reads train.csv and valid.csv from `data_path` into `train_df` and
# `valid_df`, then prints a summary of the training frame and the first rows
# of the validation frame.
#
# Raises:
#     FileNotFoundError: if either CSV file is missing. The cell stops here on
#         purpose: the following cells need `train_df` / `valid_df`, so merely
#         printing a warning would only postpone the failure to a confusing
#         NameError further down.

# The path to your data folder, this is mine, change it to yours!
# This should be the only line you need to change.
data_path = r'C:\Users\Leo\Desktop\Group_Pneumonia_Detection\data'

# Make the full paths to the csv files
train_csv_path = os.path.join(data_path, 'train.csv')
valid_csv_path = os.path.join(data_path, 'valid.csv')

# Check both files before reading anything, so the error names exactly which
# file(s) could not be found instead of only the first one that fails.
missing_csv_paths = [
    csv_path
    for csv_path in (train_csv_path, valid_csv_path)
    if not os.path.isfile(csv_path)
]
if missing_csv_paths:
    raise FileNotFoundError(
        "Error! Make sure you have changed data_path to your actual path! "
        f"Missing file(s): {missing_csv_paths}"
    )

# Load the csv files
train_df = pd.read_csv(train_csv_path)
valid_df = pd.read_csv(valid_csv_path)

print("\nTraining data info:")
train_df.info()

print("\nValidation data head:")
print(valid_df.head())


Training data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223414 entries, 0 to 223413
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Path                        223414 non-null  object 
 1   Sex                         223414 non-null  object 
 2   Age                         223414 non-null  int64  
 3   Frontal/Lateral             223414 non-null  object 
 4   AP/PA                       191027 non-null  object 
 5   No Finding                  22381 non-null   float64
 6   Enlarged Cardiomediastinum  44839 non-null   float64
 7   Cardiomegaly                46203 non-null   float64
 8   Lung Opacity                117778 non-null  float64
 9   Lung Lesion                 11944 non-null   float64
 10  Edema                       85956 non-null   float64
 11  Consolidation               70622 non-null   float64
 12  Pneumonia                   27608 non-null   float64

In [14]:
# Analyze the labels in our data
#
# Prints a legend for the 'Pneumonia' label values, then the number of rows
# carrying each value (missing values included) for the training and the
# validation frames. The counts stay available as `train_counts` and
# `valid_counts`.

label_column = 'Pneumonia'

print("--- Analyzing the Labels in Our Data ---")

# Label meaning in the CheXpert dataset, one legend line per value.
label_legend = (
    " 1.0: Positive finding.",
    " 0.0: Negative finding.",
    "-1.0: Uncertain finding. The report is unclear about the presence of pneumonia.",
    " NaN(empty): Not mentioned. The report does not mention pneumonia.",
)
print("\nLabel Meanings for 'Pneumonia':")
print(*label_legend, sep="\n")

# Training split: dropna=False keeps the NaN ("not mentioned") rows in the tally.
print("\n--- Training Data Label Counts ---")
train_counts = train_df[label_column].value_counts(dropna=False)
print(train_counts)

# Validation split, counted the same way.
print("\n--- Validation Data Label Counts ---")
valid_counts = valid_df[label_column].value_counts(dropna=False)
print(valid_counts)

--- Analyzing the Labels in Our Data ---

Label Meanings for 'Pneumonia':
 1.0: Positive finding.
 0.0: Negative finding.
-1.0: Uncertain finding. The report is unclear about the presence of pneumonia.
 NaN(empty): Not mentioned. The report does not mention pneumonia.

--- Training Data Label Counts ---
 NaN    195806
-1.0     18770
 1.0      6039
 0.0      2799
Name: Pneumonia, dtype: int64

--- Validation Data Label Counts ---
0.0    226
1.0      8
Name: Pneumonia, dtype: int64
