In [1]:
import os
# Move the process working directory up one level so that relative paths used
# by later cells resolve from the parent directory (presumably the project
# root -- confirm against the repository layout).
# NOTE: this is not idempotent -- every re-run of this cell climbs one more
# directory. Restart the kernel before re-running the notebook.
os.chdir('..')

In [2]:
from dataclasses import dataclass
from pathlib import Path
from MLOps import logger

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Directory the stage writes its output CSV files into.
        data_path: Location of the input CSV that the stage reads.
    """

    root_dir: Path
    data_path: Path

In [3]:
from MLOps.constants import *
from MLOps.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's settings files and build per-stage config objects.

    The three files are parsed with ``read_yaml`` and kept on the instance as
    ``config``, ``params`` and ``schema``.
    """

    def __init__(self, config_file_path=CONFIG_FILE_PATH, params_file_path=PARAMS_FILE_PATH, schema_file_path=SCHEMA_FILE_PATH):
        """Parse the settings files and ensure the artifacts root exists.

        Args:
            config_file_path: Path of the main configuration file.
            params_file_path: Path of the parameters file.
            schema_file_path: Path of the schema file.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)
        create_directories(list_of_directories=[self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Ensures the stage's ``root_dir`` exists, then returns the
        ``data_transformation`` section of the main configuration as a
        ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig whose fields are ``pathlib.Path`` objects,
            matching the types declared on the dataclass.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])
        # Coerce to Path so the returned object honours its declared field
        # types regardless of what the settings loader hands back.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )

In [15]:
from sklearn.model_selection import train_test_split
import pandas as pd

class DataTransformation:
    """Normalise the raw dataset and persist a tail hold-out split.

    Reads the CSV at ``config.data_path`` and writes ``full.csv``,
    ``train.csv`` and ``test.csv`` into ``config.root_dir``.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration (needs ``root_dir`` and ``data_path``)."""
        self.config = config

    def train_test_split_and_date_transformation(self, holdout_rows: int = 30):
        """Parse dates, rescale ``today`` and split off the trailing rows.

        Steps:
          1. Read the input CSV and convert the ``date`` column to datetime.
          2. Divide ``today`` by ``dollar_rate`` (presumably to express the
             value in dollars -- confirm with the data owner).
          3. Use the last ``holdout_rows`` rows as the test set and everything
             before them as the training set. Rows are taken in file order;
             the frame is NOT sorted here, so the input is assumed to already
             be in chronological order -- TODO confirm.
          4. In the test set only, multiply ``today`` back by ``dollar_rate``
             so it is stored in the original scale.
          5. Write ``full.csv`` (all rows, rescaled), ``train.csv`` and
             ``test.csv`` to ``config.root_dir``.

        Args:
            holdout_rows: Number of trailing rows reserved for the test set.
                Defaults to 30. If the data has fewer rows than this, the
                training set is empty and the test set holds every row.

        Returns:
            ``(train_set, test_set)`` -- exactly the frames written to
            ``train.csv`` and ``test.csv``; the two never share a row.

        Raises:
            ValueError: If ``holdout_rows`` is negative.
        """
        if holdout_rows < 0:
            raise ValueError("holdout_rows must be non-negative")

        data = pd.read_csv(self.config.data_path)
        data['date'] = pd.to_datetime(data['date'])
        data['today'] = data['today'] / data['dollar_rate']

        # An explicit split position keeps holdout_rows=0 meaningful
        # (a bare data[:-0] would yield an empty training set).
        split_at = max(len(data) - holdout_rows, 0)
        train_set = data.iloc[:split_at].copy()
        test_set = data.iloc[split_at:].copy()
        # Undo the rescaling for the hold-out so it stays in the original scale.
        test_set['today'] = test_set['today'] * test_set['dollar_rate']

        out_dir = self.config.root_dir
        # full.csv keeps every row (rescaled) for later accuracy improvements.
        data.to_csv(os.path.join(out_dir, 'full.csv'), index=False)
        train_set.to_csv(os.path.join(out_dir, 'train.csv'), index=False)
        test_set.to_csv(os.path.join(out_dir, 'test.csv'), index=False)

        return train_set, test_set

In [16]:
# Run the data-transformation stage end to end: load the configuration, build
# the transformer and write the full/train/test CSV files.
try:
    logger.info(">>>>>>>>> stage Data Transformation Stage started <<<<<<<<")
    config_manager = ConfigurationManager()
    data_transformation_config = config_manager.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.train_test_split_and_date_transformation()
    logger.info(">>>>>>>>> stage Data Transformation Stage completed <<<<<<<<")
except Exception:
    # Record the failure with its traceback, then re-raise unchanged so the
    # notebook still stops on error.
    logger.exception("Data Transformation stage failed")
    raise

2025-07-07 00:51:56,176 - MLOpsLogger - INFO - >>>>>>>>> stage Data T Stage started <<<<<<<<
Directory already exists: artifacts
Directory already exists: artifacts\data_transformation
2025-07-07 00:51:56,261 - MLOpsLogger - INFO - >>>>>>>>> stage Data Validation Stage completed <<<<<<<<
