# Data Validation and Transformation


In [19]:
import os

In [20]:
# Raw string: "\[" and "\D" are not valid escape sequences, so a normal
# literal only works by accident and triggers a warning on recent Python
# versions. The resulting path is character-for-character the same.
os.chdir(r"E:\[Portfolio Project]\Digit Recognizor")

In [21]:
%pwd

'E:\\[Portfolio Project]\\Digit Recognizor'

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [23]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable set of filesystem paths used by the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded configuration and
    consumed by ``DataTransformation``.
    """

    # Stage directory; created when the config object is built.
    root_dir: Path
    # CSV file read by DataTransformation.split_data_file; must contain a "label" column.
    input_data_file: Path
    # Despite the name, used as a DIRECTORY: "labels.csv" and "pictures.csv" are
    # written into it and read back from it.
    output_data_file: Path
    # Directory that receives "x_train.csv" and "y_train.csv".
    training_dir: Path
    # Directory that receives "x_val.csv" and "y_val.csv".
    valid_dir: Path


In [24]:
from cnnDigitReco.constants import *
from cnnDigitReco.utils.common import read_yaml, create_directories

In [25]:
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects.

    Attributes:
        config: Result of ``read_yaml`` on the main configuration file.
        params: Result of ``read_yaml`` on the parameters file.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the path bundle for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect, then wraps each
        configured location in a ``Path``.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the loaded configuration.
        """
        stage = self.config.data_transformation
        create_directories([stage.root_dir])

        root_dir = Path(stage.root_dir)
        input_data_file = Path(stage.input_data_file)
        output_data_file = Path(stage.output_data_file)
        training_dir = Path(stage.training_dir)
        valid_dir = Path(stage.valid_dir)

        return DataTransformationConfig(
            root_dir=root_dir,
            input_data_file=input_data_file,
            output_data_file=output_data_file,
            training_dir=training_dir,
            valid_dir=valid_dir,
        )




In [None]:
from sklearn.model_selection import train_test_split

class DataTransformation:
    """Split the raw CSV into label/feature files and a train/validation split.

    All locations come from the ``DataTransformationConfig`` passed to the
    constructor. Note that ``config.output_data_file`` is used as a directory.
    """

    def __init__(self, config: DataTransformationConfig):
        """Keep the path configuration this stage reads from and writes to."""
        self.config = config

    def split_data_file(self):
        """Separate the ``label`` column from the other columns and save both.

        Reads ``config.input_data_file`` as CSV, then writes ``labels.csv``
        (the ``label`` column) and ``pictures.csv`` (every remaining column)
        into ``config.output_data_file``, both without the index.

        Raises:
            KeyError: If the input CSV has no ``label`` column.
        """
        data = pd.read_csv(self.config.input_data_file)
        labels = data["label"]
        pictures = data.drop("label", axis=1)

        output_dir = self.config.output_data_file
        # to_csv does not create missing parent directories, so make sure the
        # target exists instead of relying on it coinciding with a directory
        # that some earlier step happened to create.
        output_dir.mkdir(parents=True, exist_ok=True)

        labels.to_csv(output_dir / "labels.csv", index=False)
        pictures.to_csv(output_dir / "pictures.csv", index=False)

    def train_test_split(self, test_size=0.2, random_state=42):
        """Split the saved pictures/labels into training and validation CSVs.

        Reads ``pictures.csv`` and ``labels.csv`` from
        ``config.output_data_file`` (as written by ``split_data_file``), splits
        them, and writes ``x_train.csv``/``y_train.csv`` into
        ``config.training_dir`` and ``x_val.csv``/``y_val.csv`` into
        ``config.valid_dir``, all without the index.

        Args:
            test_size: Forwarded to sklearn's ``train_test_split``; the
                default of 0.2 matches the previously hard-coded value.
            random_state: Forwarded to sklearn's ``train_test_split``; the
                default of 42 matches the previously hard-coded value.
        """
        source_dir = self.config.output_data_file
        picture_file = pd.read_csv(source_dir / "pictures.csv")
        labels_file = pd.read_csv(source_dir / "labels.csv")

        # The bare name below is the function imported from
        # sklearn.model_selection, not this method: class attributes are not
        # part of a method body's name-lookup scope.
        x_train, x_val, y_train, y_val = train_test_split(
            picture_file,
            labels_file,
            test_size=test_size,
            random_state=random_state,
        )

        # Create training and validation directories
        create_directories([self.config.training_dir, self.config.valid_dir])

        x_train.to_csv(self.config.training_dir / "x_train.csv", index=False)
        y_train.to_csv(self.config.training_dir / "y_train.csv", index=False)
        x_val.to_csv(self.config.valid_dir / "x_val.csv", index=False)
        y_val.to_csv(self.config.valid_dir / "y_val.csv", index=False)

    # def split_data_file(self):
    #     data = pd.read_csv(self.config.input_data_file)
    #     labels = data["label"]
    #     pictures = data.drop("label", axis=1)

    #     output_dir = self.config.output_data_file

    #     # Reshape pictures to (28, 28, 1) and save as .npy
    #     pictures_reshaped = pictures.values.reshape(-1, 28, 28, 1)
    #     np.save(output_dir / "pictures_reshaped.npy", pictures_reshaped)

    #     # Save labels as CSV
    #     labels.to_csv(output_dir / "labels.csv", index=False)


    # def train_test_split(self):
    #     # Load reshaped pictures if saved as .npy
    #     picture_file = np.load(self.config.output_data_file / "pictures_reshaped.npy")
    #     labels_file = pd.read_csv(self.config.output_data_file / "labels.csv")

    #     # Flatten labels to match dimensions
    #     labels_file = labels_file.values.ravel()

    #     x_train, x_val, y_train, y_val = train_test_split(
    #         picture_file, labels_file, test_size=0.2, random_state=42
    #     )

    #     # Create training and validation directories
    #     create_directories([self.config.training_dir, self.config.valid_dir])

    #     # Save data
    #     np.save(self.config.training_dir / "x_train.npy", x_train)
    #     np.save(self.config.training_dir / "y_train.npy", y_train)
    #     np.save(self.config.valid_dir / "x_val.npy", x_val)
    #     np.save(self.config.valid_dir / "y_val.npy", y_val)


In [27]:
# Run the data-transformation stage end to end: load the configuration, split
# the raw CSV into labels/pictures, then produce the train/validation files.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.split_data_file()
    data_transformation.train_test_split()

except Exception as e:
    print(f"An error occurred: {e}")
    # Bare ``raise`` re-raises the active exception as-is, without adding an
    # extra traceback frame for this line the way ``raise e`` does.
    raise


[2024-11-22 16:40:05,747: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-11-22 16:40:05,755: INFO: common: yaml file: params.yaml loaded successfully]
[2024-11-22 16:40:05,755: INFO: common: created directory at: artifacts]
[2024-11-22 16:40:05,755: INFO: common: created directory at: artifacts/data_transformation]


[2024-11-22 16:40:33,310: INFO: common: created directory at: artifacts\data_transformation\training]
[2024-11-22 16:40:33,325: INFO: common: created directory at: artifacts\data_transformation\validation]
