In [1]:
import os

In [2]:
# Show the kernel's current working directory; the relative paths used by
# later cells are resolved against it.
%pwd

'd:\\Project\\P\\Mlops-Project\\research'

In [4]:
# Run the rest of the notebook from the project root so that relative paths
# such as "artifacts/..." resolve. The guard keeps this cell idempotent:
# re-running it no longer climbs one more directory level on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")
# Echo the resulting working directory as the cell output.
os.getcwd()

'd:\\Project\\P\\Mlops-Project'

In [12]:
import tensorflow_data_validation as tfdv

[2023-12-19 23:45:07,257: INFO: native_type_compatibility: Using Any for unsupported type: typing.Sequence[~T]]
]
]
]


In [13]:
import pandas as pd
# Load the label table from the data_validation artifacts folder. The path is
# relative, so it depends on the working directory set earlier in the notebook.
data = pd.read_csv("artifacts/data_validation/label.csv")
# Preview the first rows of the table.
data.head()

Unnamed: 0,ImageID,XMin,XMax,YMin,YMax
0,a72ac22f5228e450,0.865782,0.992625,0.442478,0.935841
1,d71c1399a5e52c93,0.0,1.0,0.15708,1.0
2,90576c0143252086,0.696903,0.70354,0.942478,0.949853
3,447a3db76e3b25aa,0.421829,0.458702,0.699115,0.811947
4,5a175741a2a390f3,0.51875,0.672917,0.175,0.996875


In [14]:
# Allow-list every column of the label table for statistics generation.
approved_cols = list(data.columns)
stats_options = tfdv.StatsOptions(feature_allowlist=approved_cols)
# Echo the allow-list to confirm which features TFDV will consider.
stats_options.feature_allowlist


['ImageID', 'XMin', 'XMax', 'YMin', 'YMax']

In [15]:
# Compute TFDV statistics for the label DataFrame, restricted to the
# allow-listed features configured in `stats_options`.
train_stats = tfdv.generate_statistics_from_dataframe(data, stats_options)
# Report how many features the first (only inspected) dataset entry covers ...
print(f"Number of features used: {len(train_stats.datasets[0].features)}")

# ... and how many examples (rows) went into the statistics.
print(f"Number of examples used: {train_stats.datasets[0].num_examples}")



Number of features used: 5
Number of examples used: 23854


In [16]:
# Render the generated statistics with TFDV's visualisation helper.
tfdv.visualize_statistics(train_stats)

In [18]:
# Infer a schema from the statistics computed above.
schema = tfdv.infer_schema(train_stats)
# Display the inferred schema as a table (feature name, type, presence, ...).
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'ImageID',BYTES,required,,-
'XMin',FLOAT,required,,-
'XMax',FLOAT,required,,-
'YMin',FLOAT,required,,-
'YMax',FLOAT,required,,-


In [19]:
# Validate the statistics against the schema. The schema was inferred from
# these same statistics, so this only checks them against themselves.
anomalies = tfdv.validate_statistics(train_stats, schema)
# Display whatever anomalies were found.
tfdv.display_anomalies(anomalies)

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of the paths used by the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Folder that receives train.csv, test.csv and the generated label files.
    root_dir: Path
    # CSV with the processed labels that is read and split into train/test.
    label_process_data: Path
    # Destination folder for the images of the training split.
    image_train_data: Path
    # Destination folder for the images of the test split.
    image_test_data: Path
    # Source folder the images are copied from.
    image_root_data: Path

In [6]:
from MLOps_project.constant import *
from MLOps_project.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Ensure the top-level artifacts folder exists before any stage runs.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            A DataTransformationConfig filled from the ``data_transformation``
            section of the main configuration.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            label_process_data=section.label_process_data,
            image_train_data=section.image_train_data,
            image_test_data=section.image_test_data,
            image_root_data=section.image_root_data,
        )

In [7]:
import os
from MLOps_project import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import shutil

In [19]:
class DataTransformation:
    """Split the label table into train/test sets and materialise the matching
    per-image label files and image folders.

    All locations are taken from the DataTransformationConfig passed in.
    """

    def __init__(self, config: DataTransformationConfig) :
        """Store the stage configuration.

        Args:
            config: Paths for the input label CSV, the output root and the
                image source/target folders.
        """
        self.config = config

    def train_test_spliting(self):
        """Split the label CSV 75/25 into train.csv and test.csv and write label files.

        The split is row-wise with a fixed random seed, then one label file
        per image is generated for each split under ``<root_dir>/labels``.

        NOTE(review): rows are split independently of ``ImageID``, so boxes of
        the same image can end up in both splits; confirm this is intended.
        """
        data = pd.read_csv(self.config.label_process_data)
        train, test = train_test_split(data, random_state=48, test_size=0.25, shuffle=True)
        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        self.create_labels(train, activation="train/")
        self.create_labels(test, activation="test/")

    def create_labels(self, data, activation: str):
        """Write one ``<ImageID>.txt`` annotation file per image.

        Each line of a file is ``class x_center y_center width height`` for
        one bounding box, derived from the XMin/XMax/YMin/YMax columns.

        Args:
            data: DataFrame with columns ImageID, XMin, XMax, YMin, YMax.
            activation: Sub-folder of ``<root_dir>/labels`` to write into
                (e.g. ``"train/"`` or ``"test/"``).
        """
        # Folder creation does not depend on the row, so do it once up front.
        labels_dir = os.path.join(self.config.root_dir, "labels")
        self.create_folder(labels_dir)
        activation_dir = os.path.join(labels_dir, activation)
        self.create_folder(activation_dir)

        class_id = "0"  # single-class dataset: every box gets class 0

        # Group rows by image so that every box of an image lands in the same
        # file. Opening the file in 'w' mode once per row would keep only the
        # last box of each image.
        for image_file, boxes in data.groupby("ImageID", sort=False):
            lines = []
            for x_min, x_max, y_min, y_max in zip(
                boxes["XMin"], boxes["XMax"], boxes["YMin"], boxes["YMax"]
            ):
                width = x_max - x_min
                height = y_max - y_min
                x_center = x_min + (width / 2)
                y_center = y_min + (height / 2)
                lines.append(f"{class_id} {x_center} {y_center} {width} {height}\n")

            annotation_file = os.path.join(activation_dir, image_file + '.txt')
            # 'w' keeps re-runs idempotent: the file is rebuilt from scratch.
            with open(annotation_file, 'w') as ann_file:
                ann_file.writelines(lines)

        logger.info("Created label folders finished")

    def image_splitting(self):
        """Copy the images referenced by train.csv/test.csv into their split folders.

        Reads the CSVs written by ``train_test_spliting`` from ``root_dir`` and
        logs how many files each target folder contains afterwards.
        """
        train_data = pd.read_csv(os.path.join(self.config.root_dir, "train.csv"))
        test_data = pd.read_csv(os.path.join(self.config.root_dir, "test.csv"))

        self.create_folder(self.config.image_train_data)
        self.create_folder(self.config.image_test_data)

        self.copy_images(train_data, self.config.image_train_data)
        self.copy_images(test_data, self.config.image_test_data)

        train_image_count = self.count_img_in_dir(self.config.image_train_data)
        logger.info(f"number image in train image directory: {train_image_count}")
        test_image_count = self.count_img_in_dir(self.config.image_test_data)
        # This count belongs to the test folder, so the message says "test".
        logger.info(f"number image in test image directory: {test_image_count}")

    def create_folder(self, folder_path):
        """Create ``folder_path`` (including parents) if it does not exist yet.

        A log line is emitted only when the folder was actually missing.
        """
        if not os.path.exists(folder_path):
            # exist_ok guards against the folder appearing between check and creation.
            os.makedirs(folder_path, exist_ok=True)
            logger.info(f"Created folder: {folder_path}")

    def count_img_in_dir(self, directory):
        """Return the number of files found under ``directory`` (recursively).

        Every file is counted, whatever its extension.
        """
        return sum(len(files) for _, _, files in os.walk(directory))

    def copy_images(self, data, target_folder):
        """Copy every ``<ImageID>.jpg`` listed in ``data`` to ``target_folder``.

        Only files that exist in the configured image source folder are
        copied; IDs without a matching file are skipped silently.

        Args:
            data: DataFrame with an ``ImageID`` column.
            target_folder: Existing folder to copy the images into.
        """
        source_folder = self.config.image_root_data
        # A set gives O(1) membership checks and removes the duplicate IDs of
        # images that have several boxes.
        wanted = {image_id + ".jpg" for image_id in data["ImageID"]}

        for file in os.listdir(source_folder):
            if file in wanted:
                source_path = os.path.join(source_folder, file)
                target_path = os.path.join(target_folder, file)
                shutil.copy(source_path, target_path)

In [20]:
# Run the data-transformation stage end to end: load the configuration,
# split the labels, then copy the images into their split folders.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_spliting()
    data_transformation.image_splitting()
except Exception as e:
    # Record the failure with its traceback in the project log, then re-raise
    # with a bare `raise` so the original traceback is preserved unchanged.
    logger.exception(e)
    raise

[2023-12-20 16:11:17,294: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-20 16:11:17,304: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-20 16:11:17,308: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-12-20 16:11:17,311: INFO: common: created directory at: artifacts]
[2023-12-20 16:11:17,312: INFO: common: created directory at: artifacts/data_transformation/]


[2023-12-20 16:11:17,495: INFO: 580632499: Splited data into training and test sets]
[2023-12-20 16:11:17,495: INFO: 580632499: (17890, 5)]
[2023-12-20 16:11:17,495: INFO: 580632499: (5964, 5)]
[2023-12-20 16:11:17,499: INFO: 580632499: Created folder: artifacts/data_transformation/labels]
[2023-12-20 16:11:17,499: INFO: 580632499: Created folder: artifacts/data_transformation/labels\train/]
[2023-12-20 16:11:34,749: INFO: 580632499: Created label folders finished]
[2023-12-20 16:11:34,749: INFO: 580632499: Created folder: artifacts/data_transformation/labels\test/]
[2023-12-20 16:11:40,802: INFO: 580632499: Created label folders finished]
[2023-12-20 16:11:40,910: INFO: 580632499: Created folder: artifacts/data_transformation/image/train/]
[2023-12-20 16:11:40,913: INFO: 580632499: Created folder: artifacts/data_transformation/image/test/]
[2023-12-20 16:12:07,534: INFO: 580632499: number image in train image directory: 6515]
[2023-12-20 16:12:07,553: INFO: 580632499: number image in 