# Install and Import Required Libraries

In [None]:
!pip install mlflow dagshub

In [1]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras import layers, models, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, roc_auc_score, roc_curve, auc
import numpy as np
import zipfile
import os
from tqdm import tqdm
import mlflow
import mlflow.tensorflow
import dagshub
import seaborn as sns
import matplotlib.pyplot as plt
from dagshub import upload_files
from dagshub.data_engine import datasources
import requests
from sklearn.model_selection import KFold

# Initial Configuration (Parameters)

In [2]:
# Constants for DagsHub
REPO_OWNER = 'Jesteban247'
REPO_NAME = 'COVID-19_CT_Scan_Classification_with_Transfer_Learning'

In [3]:
# Constants for Model Training
EPOCHS = 25
BATCH_SIZE = 64
LEARNING_RATE = 0.001
IMAGE_SIZE = (150, 150)
CLASS_NAMES = ['1NonCOVID', '2COVID', '3CAP']

In [4]:
# Constants for Data Directories
DATA_DIR = '/content/large-covid19-ct-slice-dataset/curated_data/curated_data'
PREPROCESSED_DIR = '/content/preprocessed_data'
ARTIFACTS_DIR = '/content/artifacts'

In [5]:
# Ensure the artifacts directory exists
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

In [6]:
# DagsHub Setup
def setup_dagshub():
    """Initialize DagsHub for MLflow logging if not already initialized."""
    if os.getenv('MLFLOW_TRACKING_URI'):
        print("MLflow tracking is already set up. Skipping initialization.")
    else:
        print("Initializing DagsHub for MLflow logging...")
        dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)
        print("DagsHub initialization completed.")

# Data Download and Preprocessing

To download the data from Kaggle, use the following function.

You can find the dataset at this link:
[Kaggle Large COVID-19 CT Slice Dataset.](https://www.kaggle.com/datasets/maedemaftouni/large-covid19-ct-slice-dataset/data)

In [None]:
def download_data():
    """Download and unzip the dataset from Kaggle."""
    print("Starting dataset download...")
    !kaggle datasets download -d maedemaftouni/large-covid19-ct-slice-dataset
    zip_file_path = '/content/large-covid19-ct-slice-dataset.zip'
    extract_dir = '/content/large-covid19-ct-slice-dataset/'

    print("Extracting dataset...")
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print("Data downloaded and extracted successfully! \n")

This function handles the complete pipeline for preparing the dataset. It loads the data, processes it by splitting into training and test sets, resizes images, and one-hot encodes labels. The preprocessed data is then saved for later use, streamlining the workflow and ensuring the data is ready for model training and evaluation.

In [None]:
def preprocess_data():
    """Load, preprocess, and save the dataset."""
    print("Loading and preprocessing dataset...")
    dataset = tf.keras.preprocessing.image_dataset_from_directory(
        directory=DATA_DIR,
        labels='inferred',
        label_mode='int',
        class_names=CLASS_NAMES,
        image_size=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        shuffle=True
    )

    # Splitting dataset into training and test sets
    num_batches = tf.data.experimental.cardinality(dataset).numpy()
    split_index = int(num_batches * 0.7)
    train_ds = dataset.take(split_index)
    test_ds = dataset.skip(split_index)

    def dataset_to_numpy(dataset):
        """Convert TensorFlow dataset to NumPy arrays."""
        images, labels = [], []
        for img_batch, lbl_batch in tqdm(dataset, desc='Processing Dataset', unit='batch'):
            images.append(img_batch.numpy())
            labels.append(lbl_batch.numpy())
        return tf.concat(images, axis=0), tf.concat(labels, axis=0)

    # Converting datasets to NumPy arrays
    train_images, train_labels = dataset_to_numpy(train_ds)
    test_images, test_labels = dataset_to_numpy(test_ds)

    # Resizing images and one-hot encoding labels
    train_images = tf.image.resize(train_images, IMAGE_SIZE)
    test_images = tf.image.resize(test_images, IMAGE_SIZE)
    train_labels = to_categorical(train_labels, num_classes=len(CLASS_NAMES))
    test_labels = to_categorical(test_labels, num_classes=len(CLASS_NAMES))

    # Saving preprocessed data
    os.makedirs(PREPROCESSED_DIR, exist_ok=True)
    np.save(os.path.join(PREPROCESSED_DIR, 'train_images.npy'), train_images)
    np.save(os.path.join(PREPROCESSED_DIR, 'train_labels.npy'), train_labels)
    np.save(os.path.join(PREPROCESSED_DIR, 'test_images.npy'), test_images)
    np.save(os.path.join(PREPROCESSED_DIR, 'test_labels.npy'), test_labels)
    print("Data preprocessing and saving completed! \n")

# Alternative (Download Data from DagsHub)

Since I've already run the project and uploaded the preprocessed data to DagsHub, you can avoid downloading and preprocessing the data from Kaggle again. Instead, you can directly download the already processed data from DagsHub.

In [8]:
def download_data_from_dagshub(data_source_name, save_dir=PREPROCESSED_DIR):
    """Download the preprocessed data from DagsHub and save it to the specified directory."""
    os.makedirs(save_dir, exist_ok=True)

    # Access the data source from DagsHub
    ds = datasources.get(f'{REPO_OWNER}/{REPO_NAME}', data_source_name)

    # Query for all data points with size larger than 1 byte
    query = ds["size"] > 1
    data_points = query.all().dataframe

    # Iterate through each file in the dataframe and download it
    for index, row in data_points.iterrows():
        file_name = row['path']
        file_url = row['dagshub_download_url']
        file_size = int(row['size'])

        # Prepare file path
        file_path = os.path.join(save_dir, file_name)

        # Download with a single progress bar per file
        with requests.get(file_url, stream=True) as response, open(file_path, 'wb') as file:
            total_size_in_bytes = int(response.headers.get('content-length', file_size))
            block_size = 1024  # 1 Kibibyte
            progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True, desc=f"Downloading {file_name}")

            for data in response.iter_content(block_size):
                file.write(data)
                progress_bar.update(len(data))

            progress_bar.close()

        print(f"Downloaded: {file_name} ({file_size} bytes)")

    print("Data download complete!")

# Load Data to variables

This function loads preprocessed dataset files with progress indicators. It reads the training and test images and labels from the specified directory, providing visual feedback on the loading process. Once all files are loaded, the function returns the datasets for further use.



In [7]:
def load_data():
    """Load preprocessed data with progress indicators."""
    print("Loading preprocessed data...")
    with tqdm(total=4, desc="Loading Data", unit="file") as pbar:
        train_images = np.load(os.path.join(PREPROCESSED_DIR, 'train_images.npy'))
        pbar.update(1)
        train_labels = np.load(os.path.join(PREPROCESSED_DIR, 'train_labels.npy'))
        pbar.update(1)
        test_images = np.load(os.path.join(PREPROCESSED_DIR, 'test_images.npy'))
        pbar.update(1)
        test_labels = np.load(os.path.join(PREPROCESSED_DIR, 'test_labels.npy'))
        pbar.update(1)
    print("\nData loaded successfully! \n")
    return train_images, train_labels, test_images, test_labels

# Model Building and Training



The `build_model` function constructs a Convolutional Neural Network (CNN) using VGG16 as a base model, applying transfer learning techniques. Here’s a detailed explanation of the process:

1. **Transfer Learning with VGG16:**
   - **VGG16 Overview:** VGG16 is a well-established deep learning model trained on the ImageNet dataset. It features 16 layers and excels in image classification tasks due to its effective feature extraction capabilities. For more on VGG16 and its use in transfer learning, refer to this [article](https://towardsdatascience.com/transfer-learning-with-vgg16-and-keras-50ea161580b4).
   - **Feature Extraction:** In this approach, we use VGG16 as a feature extractor. Instead of training the model from scratch, we leverage the features learned by VGG16, which were derived from a large and diverse dataset.

2. **Building the Model:**
   - **Input Layer:** The model is designed to accept images of size 150x150 pixels with 3 color channels (RGB).
   - **Base Model:** VGG16 is loaded with pre-trained weights from ImageNet, but without its final classification layers (`include_top=False`). This allows us to add our own custom classifier on top.
   - **Freezing the Base Model:** We set `base_model.trainable = False` to prevent the weights of VGG16 from being updated during training. This ensures that the feature extraction capabilities of VGG16 remain unchanged while we train the new layers.
   - **Custom Layers:**
     - **Flatten:** Converts the 3D feature maps output by VGG16 into a 1D vector.
     - **Dense Layers:** Two fully connected layers with ReLU activation functions are added to further process the extracted features. Dropout is applied to these layers to mitigate overfitting.
     - **Output Layer:** A final dense layer with a softmax activation function produces class probabilities for three classes (adjustable depending on the specific classification task).

3. **Compiling the Model:**
   - **Optimizer:** The Adam optimizer is used with a defined learning rate to update model weights during training.
   - **Loss Function:** Categorical crossentropy is employed as the loss function for multi-class classification.
   - **Metrics:** Accuracy is monitored to evaluate model performance.

4. **Saving Model Summary:**
   - The architecture of the model, including layer details and parameters, is saved to a text file (`model_summary.txt`) in the `ARTIFACTS_DIR`. This file provides a comprehensive overview of the model structure.

By utilizing VGG16 as a pre-trained base model, this function benefits from its robust feature extraction capabilities, while the additional layers tailor the model to the specific classification task at hand.

In [8]:
def build_model():
    """Build the CNN model using VGG16 as the base and save the model summary."""
    print("Building CNN model with VGG16 as base...")

    # Define input and base model
    inputs = Input(shape=(150, 150, 3))
    base_model = VGG16(weights="imagenet", include_top=False, input_tensor=inputs)
    base_model.trainable = False

    # Add custom layers on top of the base model
    x = base_model.output
    x = layers.Flatten()(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(128, activation='relu')(x)
    predictions = layers.Dense(3, activation='softmax')(x)  # Assuming 3 classes

    # Create the full model
    model = models.Model(inputs=inputs, outputs=predictions)

    # Compile the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Save the model summary to a text file in the artifacts folder
    summary_file = os.path.join(ARTIFACTS_DIR, 'model_summary.txt')
    with open(summary_file, 'w') as f:
        model.summary(print_fn=lambda x: f.write(x + '\n'))

    print("Model built and summary saved. \n")
    return model

The `train_model` function trains the provided CNN model with early stopping and returns the training history. Here’s an in-depth look at the function:

1. **Early Stopping:**
   - **Purpose:** To prevent overfitting and ensure that training stops when the model performance on the validation set no longer improves.
   - **Configuration:** `EarlyStopping` is configured to monitor the validation accuracy (`val_accuracy`). It will stop training if no improvement is observed for 5 consecutive epochs (`patience=5`). Additionally, it restores the model weights from the epoch with the highest validation accuracy to ensure the best model is retained.

2. **Model Training:**
   - **Data Preparation:** The `preprocess_input` function is applied to the training images to ensure they are appropriately scaled for the VGG16 base model.
   - **Training Process:**
     - **Epochs:** The model is trained for a defined number of epochs (`EPOCHS`).
     - **Validation Split:** 20% of the training data is set aside for validation to monitor model performance during training.
     - **Batch Size:** The model processes data in batches of size `BATCH_SIZE` for efficiency.
     - **Callbacks:** The `EarlyStopping` callback is used to manage training interruptions based on validation performance.

3. **Completion and Return:**
   - **Completion Message:** Prints a message indicating that model training is complete.
   - **Return Value:** The function returns the `history` object, which contains detailed information about the training process, including metrics and losses for each epoch.

This function ensures efficient and effective training of the model, leveraging early stopping to optimize performance and prevent overfitting.

In [9]:
def train_model(model, train_images, train_labels):
    """Train the model without early stopping and return the training history."""
    print("Training model...")
    history = model.fit(
        preprocess_input(train_images),
        train_labels,
        epochs=EPOCHS,
        validation_split=0.2,
        batch_size=BATCH_SIZE
    )
    print("Model training completed. \n")
    return history

In [10]:
def save_model(model):
    """Save the trained model to the artifacts directory."""
    model_path = os.path.join(ARTIFACTS_DIR, "covid19_ct_model.keras")
    model.save(model_path)
    print("Model saved to artifacts directory.\n")

# Model Evaluation

In [11]:
def plot_confusion_matrix(cm, cm_path):
    """Plot and save the confusion matrix to a file."""
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.savefig(cm_path)
    plt.close()
    print(f"Confusion matrix saved to {cm_path}\n")

The `evaluate_model` function assesses the performance of the trained model on the test dataset. It calculates key metrics such as accuracy, loss, precision, recall, F1 score, and ROC-AUC. It also generates and saves a confusion matrix, which visualizes the model’s performance across different classes. Here’s a brief overview:

1. **Model Predictions:** The function makes predictions on the test dataset and converts these into class labels.
2. **Metric Calculation:** It computes precision, recall, F1 score, accuracy, and loss to evaluate model performance.
3. **Confusion Matrix:** A confusion matrix is plotted and saved, providing a visual representation of classification results.
4. **ROC-AUC Score:** This metric evaluates the model’s ability to distinguish between classes.

In [12]:
def evaluate_model(model, test_images, test_labels):
    """Evaluate the model and save confusion matrix and other metrics."""
    print("Evaluating model...")
    y_pred = model.predict(preprocess_input(test_images))
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(test_labels, axis=1)

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(y_true_classes, y_pred_classes, average='weighted')
    test_accuracy = np.mean(y_pred_classes == y_true_classes)
    test_loss = model.evaluate(preprocess_input(test_images), test_labels, verbose=0)[0]

    # Confusion Matrix
    cm = confusion_matrix(y_true_classes, y_pred_classes)
    cm_path = os.path.join(ARTIFACTS_DIR, 'confusion_matrix.png')
    plot_confusion_matrix(cm, cm_path)

    # ROC-AUC
    roc_auc = roc_auc_score(test_labels, y_pred, average='weighted', multi_class='ovr')

    print(f"Test Accuracy: {test_accuracy}, Test Loss: {test_loss}, F1 Score: {f1}, ROC-AUC: {roc_auc}")
    print("Model evaluation completed. \n")
    return test_accuracy, test_loss, precision, recall, f1, roc_auc

# MLflow Integration

The `mlflow_tracking` function logs experiment details, including hyperparameters, key metrics, artifacts, and the trained model, to MLflow. This helps in tracking and managing machine learning experiments. It starts an MLflow run, records relevant data, and saves the model and artifacts for future reference.

In [13]:
def mlflow_tracking(train_history, test_metrics, model):
    """Log metrics, artifacts, and model to MLflow."""
    print("Starting MLflow tracking...")

    with mlflow.start_run() as run:
        # Log hyperparameters
        mlflow.log_param("learning_rate", LEARNING_RATE)
        mlflow.log_param("epochs", EPOCHS)
        mlflow.log_param("batch_size", BATCH_SIZE)

        # Log important metrics
        mlflow.log_metric("train_accuracy", train_history.history['accuracy'][-1])
        mlflow.log_metric("test_accuracy", test_metrics[0])
        mlflow.log_metric("test_loss", test_metrics[1])
        mlflow.log_metric("test_precision", test_metrics[2])
        mlflow.log_metric("test_recall", test_metrics[3])
        mlflow.log_metric("test_f1_score", test_metrics[4])
        mlflow.log_metric("test_roc_auc", test_metrics[5])

        # Log artifacts
        mlflow.log_artifacts(ARTIFACTS_DIR)

        # Log the model
        mlflow.keras.log_model(
            model,
            artifact_path="model",
            registered_model_name="covid19_ct_model"
        )

        print("MLflow tracking completed. \n")

# Extra (Upload Data to DagsHub)

This is how to uploa the preprocessed data to DagsHub

In [None]:
def upload_data_to_dagshub():
    """Upload preprocessed data to DagsHub."""
    print("Uploading preprocessed data to DagsHub...")
    repo_path = f'{REPO_OWNER}/{REPO_NAME}'
    data_path = PREPROCESSED_DIR
    upload_files(repo_path, data_path)
    print("Data uploaded to DagsHub successfully!")

# How to Run the Code


## Explanation

To execute the entire workflow, follow these organized steps. Ensure you have the required functions defined before running the main script.

1. **Set Up DagsHub:**
   ```python
   setup_dagshub()
   ```
   - Initializes the DagsHub environment for data versioning and management.

2. **Download and Preprocess Data:**
   ```python
   download_data()
   preprocess_data()
   ```
   - Downloads the raw data and preprocesses it for training.

3. **Download Preprocessed Data from DagsHub:**
   ```python
   download_data_from_dagshub(data_source_name='preprocessed_data')
   ```
   - Retrieves preprocessed data if it has already been processed and uploaded to DagsHub.

4. **Load Data:**
   ```python
   train_images, train_labels, test_images, test_labels = load_data()
   ```
   - Loads the preprocessed training and test data from local files.

5. **Build and Train Model:**
   ```python
   model = build_model()
   train_history = train_model(model, train_images, train_labels)
   save_model(model)
   ```
   - Constructs the CNN model using VGG16, trains it with the training data, and saves the trained model.

6. **Evaluate Model:**
   ```python
   test_metrics = evaluate_model(model, test_images, test_labels)
   ```
   - Assesses the model’s performance on the test data and calculates metrics.

7. **Track with MLflow:**
   ```python
   mlflow_tracking(train_history, test_metrics, model)
   ```
   - Logs metrics, artifacts, and the model to MLflow for tracking and management.

8. **Upload Data to DagsHub:**
   ```python
   upload_data_to_dagshub()
   ```
   - Uploads the processed data and model artifacts to DagsHub.



## Code

In [None]:
setup_dagshub()

In [None]:
download_data()
preprocess_data()

In [None]:
download_data_from_dagshub(data_source_name='preprocessed_data')

In [14]:
train_images, train_labels, test_images, test_labels = load_data()

Loading preprocessed data...


Loading Data: 100%|██████████| 4/4 [00:18<00:00,  4.67s/file]


Data loaded successfully! 






In [15]:
model = build_model()
train_history = train_model(model, train_images, train_labels)
save_model(model)

Building CNN model with VGG16 as base...


Model built and summary saved. 

Training model...
Epoch 1/25
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 287ms/step - accuracy: 0.6492 - loss: 4.0077 - val_accuracy: 0.8438 - val_loss: 0.3976
Epoch 2/25
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 163ms/step - accuracy: 0.8087 - loss: 0.4839 - val_accuracy: 0.8571 - val_loss: 0.3484
Epoch 3/25
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 164ms/step - accuracy: 0.8457 - loss: 0.4026 - val_accuracy: 0.8789 - val_loss: 0.3041
Epoch 4/25
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 164ms/step - accuracy: 0.8496 - loss: 0.3803 - val_accuracy: 0.9018 - val_loss: 0.2440
Epoch 5/25
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 163ms/step - accuracy: 0.8755 - loss: 0.3091 - val_accuracy: 0.9031 - val_loss: 0.2757
Epoch 6/25
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 164ms/step - accuracy: 0.8801 - loss: 0.3116 -

In [16]:
test_metrics = evaluate_model(model, test_images, test_labels)

Evaluating model...
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 92ms/step
Confusion matrix saved to /content/artifacts/confusion_matrix.png

Test Accuracy: 0.9524922118380063, Test Loss: 0.36015427112579346, F1 Score: 0.9525053629110724, ROC-AUC: 0.9895720547519103
Model evaluation completed. 



In [17]:
mlflow_tracking(train_history, test_metrics, model)

Starting MLflow tracking...


Successfully registered model 'covid19_ct_model'.
Created version '1' of model 'covid19_ct_model'.


MLflow tracking completed. 



In [None]:
upload_data_to_dagshub()