In [1]:
import os
# Step one directory up so that the relative paths used by the later cells
# (e.g. "research/data/Data.csv") are resolved against the parent directory.
# NOTE: this is a relative move, so it is not idempotent - re-running this
# cell in the same kernel climbs one more level each time. Run it once per
# kernel session and check the %pwd output below.
os.chdir("../")
%pwd

'd:\\projects\\MLOps\\Sign-Language-Recognition'

In [2]:
from dataclasses import dataclass
from pathlib import Path

In [3]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of filesystem locations used by the data-ingestion step.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """
    # Directory created for the data-ingestion step before the config is built.
    root_dir: Path
    # Destination CSV for the training split.
    train_data_path: Path
    # Destination CSV for the test split.
    test_data_path: Path
    # Destination CSV for the copy of the raw dataset.
    raw_data_path: Path

In [4]:
from Sign_Language.constants import *
from Sign_Language.utils.common import create_directories, read_yaml

In [5]:
class ConfigurationManager:
    """Load the project's YAML settings and build typed config objects from them."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion config from the ``data_ingestion`` section.

        The step's root directory is created as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose fields are real ``Path`` objects.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # The YAML values arrive as plain strings; convert them so the
        # dataclass actually holds the Path type its annotations promise.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            train_data_path=Path(section.train_data_path),
            test_data_path=Path(section.test_data_path),
            raw_data_path=Path(section.raw_data_path),
        )

In [6]:
import os
import pandas as pd
from Sign_Language import logger
from Sign_Language.utils.common import get_size
from sklearn.model_selection import train_test_split

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [7]:
class DataIngestion:
    """Copy the raw dataset into the artifacts area and split it into train/test CSVs."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the path configuration this step works with.

        Args:
            config: Locations of the raw, train and test CSV files.
        """
        self.config = config

    def load_file(self, source_path="research/data/Data.csv"):
        """Copy the source CSV to ``config.raw_data_path`` unless it already exists.

        Args:
            source_path: CSV file to read the raw dataset from. Defaults to the
                location the notebook originally hard-coded.
        """
        raw_path = self.config.raw_data_path

        if os.path.exists(raw_path):
            logger.info(f"File already exists of size: {get_size(Path(raw_path))}")
            return

        df = pd.read_csv(source_path)
        # Report the file that was actually read, not the destination.
        logger.info(f"Read the Raw dataset as DataFrame: {source_path}")

        df.to_csv(raw_path, index=False)
        logger.info(f"loaded the raw dataset: {raw_path}")

    def create_train_test_data(self, test_size=0.2, random_state=12):
        """Split the raw CSV into train and test CSVs unless both already exist.

        Args:
            test_size: Fraction (or count) of rows for the test split, passed
                straight to ``train_test_split``.
            random_state: Seed passed to ``train_test_split`` so the split is
                reproducible.
        """
        train_path = self.config.train_data_path
        test_path = self.config.test_data_path

        # Both outputs must be present to skip the work; if only one is
        # missing, the split is regenerated so the pair stays consistent.
        if os.path.exists(train_path) and os.path.exists(test_path):
            logger.info(
                "File already exists of size: \n"
                f" train: {get_size(Path(train_path))} \n"
                f" test: {get_size(Path(test_path))}"
            )
            return

        df = pd.read_csv(self.config.raw_data_path)
        traindf, testdf = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

        traindf.to_csv(train_path, index=False)
        testdf.to_csv(test_path, index=False)

        logger.info("Created the training and test data")

In [8]:
# Run the ingestion stage end to end: load settings, copy the raw dataset,
# then write the train/test split.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.load_file()
    data_ingestion.create_train_test_data()
except Exception as e:
    # Record the failure (with traceback) in the project log, then re-raise
    # with a bare `raise` so the original traceback is left untouched.
    logger.exception(e)
    raise

[2024-02-09 02:11:39,930: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-09 02:11:39,951: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-09 02:11:39,956: INFO: common: created directory at: artifacts]
[2024-02-09 02:11:39,960: INFO: common: created directory at: artifacts/data-ingestion]


[2024-02-09 02:11:41,546: INFO: 1709612983: Read the Raw dataset as DataFrame: artifacts/data-ingestion/data.csv]
[2024-02-09 02:11:50,369: INFO: 1709612983: loaded the raw dataset: artifacts/data-ingestion/data.csv]
[2024-02-09 02:11:57,007: INFO: 1709612983: Created the training and test data]
