# **Data Collection**

## Objectives

* Fetch data from Kaggle for further processing

## Inputs

* Kaggle JSON file - the authentication token.

## Outputs

* Generate Dataset: inputs/brain-tumor-mri-dataset

## Additional Comments

* No comments 


---

# Change working directory

* The notebooks are stored in a subfolder, therefore when running the notebook in the editor, we will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
# Record the directory the notebook is currently running from.
import os
current_dir = os.getcwd()
# Bare expression as the last line of the cell so the notebook displays the value.
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Make the parent of the directory stored in current_dir the working directory.
# NOTE: re-running this cell without refreshing current_dir repeats the same
# chdir target; it does not climb a further level.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Refresh current_dir after the chdir and display it to confirm the change.
current_dir = os.getcwd()
current_dir

# Install the Kaggle package

In [None]:
# installing kaggle packages
# The leading "!" hands the line to the system shell, so pip runs outside Python.
!pip install kaggle

---

Change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON.

In [9]:
# Point KAGGLE_CONFIG_DIR at the current working directory.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Shell escape: restrict kaggle.json to owner read/write (mode 600).
! chmod 600 kaggle.json

Set the Kaggle Dataset and Download it.

In [None]:
# Kaggle dataset identifier passed to the CLI's -d option.
KaggleDatasetPath = "masoudnickparvar/brain-tumor-mri-dataset"
# Local folder passed to the CLI's -p option; later cells read the archive from here.
DestinationFolder = "inputs/brain-tumor-mri-dataset"
# Shell escape: the {..} placeholders are filled from the Python variables above.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

In [11]:
import zipfile
# Unpack the downloaded archive into the destination folder, then delete the
# zip so only the extracted files remain.
archive_path = DestinationFolder + '/brain-tumor-mri-dataset.zip'
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=DestinationFolder)

os.remove(archive_path)

# Data Preparation

* Combine the Training and Testing datasets and delete the old folders
* Load Dataset
* Resize the Images
* Normalize the Pixel Values
* Convert Image Modes
* Split into Training and Testing Sets
* Save Preprocessed Dataset

In [None]:
import os
import shutil

# Define the paths for the training and testing directories
# (both live inside the folder the Kaggle archive was extracted into)
train_dir = 'inputs/brain-tumor-mri-dataset/Training'
test_dir = 'inputs/brain-tumor-mri-dataset/Testing'
combined_dir = 'inputs/brain-tumor-mri-dataset/mri-images'  # Combined directory name

# Create the combined directory if it doesn't exist
# (exist_ok=True keeps a re-run of this cell from raising)
os.makedirs(combined_dir, exist_ok=True)

# Define the categories
# These names are used as sub-directory names under both source folders and
# under the combined folder.
categories = ['glioma', 'meningioma', 'notumor', 'pituitary']

# Function to copy images from source to destination
def copy_images(src_dir, dest_dir):
    """
    Copy images from the source directory to the destination directory.

    For every name in the module-level ``categories`` list, the matching
    sub-directory of ``src_dir`` is scanned and each image file is copied
    into the same-named sub-directory of ``dest_dir`` (created on demand).
    A file already present at the destination under the same name is
    overwritten. One "Copied ..." line is printed per file.

    Parameters:
    - src_dir: The source directory containing category subdirectories.
    - dest_dir: The destination directory to copy images to.

    Raises:
    - FileNotFoundError: if a category sub-directory is missing in src_dir.
    """
    # Dotted suffixes compared against the lower-cased name, so files such as
    # "scan.JPG" are copied too instead of being silently left behind (and
    # later lost when the source directories are deleted).
    image_extensions = ('.jpg', '.jpeg', '.png')

    for category in categories:
        src_category_dir = os.path.join(src_dir, category)
        dest_category_dir = os.path.join(dest_dir, category)

        # Create category directory in combined dataset
        os.makedirs(dest_category_dir, exist_ok=True)

        # Copy each image from the source category to the destination category
        for img_file in os.listdir(src_category_dir):
            if not img_file.lower().endswith(image_extensions):
                continue

            src_file_path = os.path.join(src_category_dir, img_file)
            dest_file_path = os.path.join(dest_category_dir, img_file)

            shutil.copy(src_file_path, dest_file_path)
            print(f"Copied {src_file_path} to {dest_file_path}")

# Copy images from the Training directory
copy_images(train_dir, combined_dir)

# Copy images from the Testing directory
# (same destination as above, so both sources are merged per category)
copy_images(test_dir, combined_dir)

print("Combining of datasets is complete.")

# Function to remove a directory and its contents
def remove_directory(dir_path):
    """
    Delete a directory tree, reporting the outcome on stdout.

    Parameters:
    - dir_path: The path of the directory to be removed.

    Nothing is raised when the path is absent; a message is printed instead.
    """
    # Guard clause: nothing to delete, just report and leave.
    if not os.path.exists(dir_path):
        print(f"Directory does not exist: {dir_path}")
        return

    shutil.rmtree(dir_path)
    print(f"Removed directory: {dir_path}")

# Remove the old training and testing directories
# (their images were copied into combined_dir by the copy_images calls above)
remove_directory(train_dir)
remove_directory(test_dir)

print("Old training and testing data have been removed.")


In [None]:
# Function to count and display the number of images in each category
def count_images_in_combined_dataset(combined_dir):
    """
    Print how many image files each category folder of the combined dataset holds.

    Parameters:
    - combined_dir: The path to the combined dataset directory.

    Categories come from the module-level ``categories`` list; a category whose
    folder is missing is reported instead of counted.
    """
    image_suffixes = ('jpg', 'jpeg', 'png')

    for category in categories:
        category_dir = os.path.join(combined_dir, category)

        # Report and move on when the category folder is absent.
        if not os.path.exists(category_dir):
            print(f"{category} directory does not exist.")
            continue

        num_images = sum(
            1 for entry in os.listdir(category_dir) if entry.endswith(image_suffixes)
        )
        print(f"Number of images in {category}: {num_images}")

# Print the per-category image counts for the combined dataset folder.
count_images_in_combined_dataset(combined_dir)

## Data cleaning

In [16]:
def remove_non_image_files(my_data_dir):
    """
    Remove non-image files from the specified directory and log the results.

    Every sub-directory of ``my_data_dir`` is scanned. An entry whose
    lower-cased name does not end in .png, .jpg or .jpeg is deleted with
    os.remove. For each sub-directory two lines are printed: the number of
    image files kept and the number of non-image files removed.

    Parameters:
    - my_data_dir: The path to the directory containing subdirectories of images.
    """
    # '.jpeg' was previously misspelled as '.gpeg', which caused every .jpeg
    # file to be treated as a non-image and deleted.
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)

        # Stray files next to the class folders are not scanned; calling
        # os.listdir on them would raise.
        if not os.path.isdir(folder_path):
            continue

        image_count = 0
        removed_count = 0
        for given_file in os.listdir(folder_path):
            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                os.remove(os.path.join(folder_path, given_file))
                removed_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", removed_count)

In [20]:
# Clean the combined image folder; the function prints kept/removed counts per class folder.
remove_non_image_files(my_data_dir="inputs/brain-tumor-mri-dataset/mri-images")

Folder: glioma - has image file 1621
Folder: glioma - has non-image file 0
Folder: meningioma - has image file 1645
Folder: meningioma - has non-image file 0
Folder: notumor - has image file 2000
Folder: notumor - has non-image file 0
Folder: pituitary - has image file 1757
Folder: pituitary - has non-image file 0


## Split train validation test set

In [21]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """
    Splits images into training, validation, and test sets.

    Each label sub-directory of ``my_data_dir`` is shuffled and its files are
    moved into ``train/<label>``, ``validation/<label>`` and ``test/<label>``
    under ``my_data_dir``; the emptied label directory is then removed.
    The train and validation counts are truncated with int(); all remaining
    files go to the test set.

    If a ``test`` directory already exists the data is assumed to be split
    and nothing is done. If the ratios do not sum to 1.0 a message is printed
    and nothing is done.

    Parameters:
    - my_data_dir: Path to the directory containing subdirectories of labeled images.
    - train_set_ratio: Ratio of images to be used for training.
    - validation_set_ratio: Ratio of images to be used for validation.
    - test_set_ratio: Ratio of images to be used for testing.
    """
    import math  # local import so the function does not rely on other cells

    # Compare with a tolerance: exact float equality rejects valid inputs,
    # e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999.
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(ratio_sum, 1.0):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    # Class labels are the sub-directory names; plain files are ignored.
    labels = [
        entry for entry in os.listdir(my_data_dir)
        if os.path.isdir(os.path.join(my_data_dir, entry))
    ]
    if 'test' in labels:
        # Already split on a previous run.
        return

    # Create train, validation and test folders with a sub-folder per label
    for folder in ('train', 'validation', 'test'):
        for label in labels:
            os.makedirs(os.path.join(my_data_dir, folder, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = os.listdir(label_dir)
        random.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        for index, file_name in enumerate(files):
            if index < train_end:
                subset = 'train'
            elif index < validation_end:
                subset = 'validation'
            else:
                subset = 'test'

            shutil.move(os.path.join(label_dir, file_name),
                        os.path.join(my_data_dir, subset, label, file_name))

        # Every file has been moved out, so the label directory is empty.
        os.rmdir(label_dir)

In [22]:
# Split the combined image folder: 70% train, 10% validation, 20% test.
split_train_validation_test_images(my_data_dir=f"inputs/brain-tumor-mri-dataset/mri-images",
                                   train_set_ratio=0.7,
                                   validation_set_ratio=0.1,
                                   test_set_ratio=0.2
                                   )