In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path
from ydata_profiling import ProfileReport

In [2]:
# Root folder that is searched for images; the path is relative, so it
# resolves against the notebook's current working directory.
image_dir: Path = Path("../data/Fish_Dataset")

## Create DataFrame

In [3]:
# Collect every PNG below image_dir, searching sub-folders recursively.
# Path.glob yields entries in whatever order the filesystem reports them,
# so the paths are sorted to keep the row order reproducible across machines.
images = sorted(image_dir.glob('**/*.png'))

# Label each image with the name of the folder that directly contains it.
labels = [image_path.parent.name for image_path in images]

In [4]:
# Turn the collected paths into plain strings and name both columns.
images = pd.Series(images).astype(str).rename('Image')
labels = pd.Series(labels).rename('Label')

# Place the two Series side by side as the columns of one frame.
df = pd.concat((images, labels), axis='columns')

In [5]:
# Show the combined image-path / label frame as the cell output.
df

Unnamed: 0,Image,Label
0,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
1,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
2,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
3,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
4,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
...,...,...
17995,..\data\Fish_Dataset\Trout\Trout GT\00996.png,Trout GT
17996,..\data\Fish_Dataset\Trout\Trout GT\00997.png,Trout GT
17997,..\data\Fish_Dataset\Trout\Trout GT\00998.png,Trout GT
17998,..\data\Fish_Dataset\Trout\Trout GT\00999.png,Trout GT


## GT Image Folder Cleanup

In [6]:
# Blank out the label of every row whose label ends in "GT" so that those
# rows can be removed by a later dropna(). Nothing is dropped in this cell.
#
# `np.NaN` was removed in NumPy 2.0, so the missing-value marker is inserted
# by Series.mask instead, and the vectorised `.str.endswith` replaces the
# per-row lambda. na=False treats already-missing labels as "not GT", which
# keeps the cell safe to run more than once.
is_gt_label = df['Label'].str.endswith('GT', na=False)
df['Label'] = df['Label'].mask(is_gt_label)

## Preprocessing

In [7]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis=0, how='any')

In [8]:
# Show the frame again after the rows with missing values were removed.
df

Unnamed: 0,Image,Label
0,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
1,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
2,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
3,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
4,..\data\Fish_Dataset\Black Sea Sprat\Black Sea...,Black Sea Sprat
...,...,...
16995,..\data\Fish_Dataset\Trout\Trout\00996.png,Trout
16996,..\data\Fish_Dataset\Trout\Trout\00997.png,Trout
16997,..\data\Fish_Dataset\Trout\Trout\00998.png,Trout
16998,..\data\Fish_Dataset\Trout\Trout\00999.png,Trout


## Statistics Report

In [9]:
# Summary statistics of the cleaned frame, collected in display order.
report = {}
# Number of rows in the frame.
report['Total Samples'] = df.shape[0]
# Number of distinct label values.
report['Unique Labels'] = df['Label'].nunique()
# Mapping of each label to the number of rows carrying it.
report['Label Distribution'] = df['Label'].value_counts().to_dict()

In [10]:
# Print a heading followed by one "name: value" line per collected statistic.
print("Statistics Report:")
for stat_name, stat_value in report.items():
    print(stat_name, stat_value, sep=": ")

Statistics Report:
Total Samples: 9000
Unique Labels: 9
Label Distribution: {'Black Sea Sprat': 1000, 'Gilt-Head Bream': 1000, 'Hourse Mackerel': 1000, 'Red Mullet': 1000, 'Red Sea Bream': 1000, 'Sea Bass': 1000, 'Shrimp': 1000, 'Striped Red Mullet': 1000, 'Trout': 1000}


## Generate Profiling Report

In [11]:
# Build a ydata_profiling ProfileReport from the cleaned frame.
# NOTE(review): this rebinds `report`, which the statistics cell earlier
# bound to a plain dict. Re-running the "Statistics Report" print loop after
# this cell would call `.items()` on the ProfileReport instead of the dict;
# consider a distinct name for one of the two.
report = ProfileReport(df)

In [12]:
# Uncomment the next line to display the profiling report as this cell's output.
# report

## Create Dataset

In [13]:
# Write the cleaned image/label table to disk; the row index is left out of the file.
df.to_csv(path_or_buf='../data/preprocessed_dataset.csv', index=False)