In [2]:
import os

In [3]:
# Make the parent directory the working directory (presumably the project
# root, since later cells rely on root-relative paths -- confirm).
# NOTE: the move is relative to the *current* directory, so re-running this
# cell climbs one more level each time; run it exactly once per kernel session.
os.chdir(os.pardir)

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of filesystem locations used by the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory created for this stage; the stacked train/test arrays are saved here.
    root_dir: Path
    # CSV file that the stage reads as its input.
    data_path: Path
    # Destination file for the scaled copy of the input data.
    scaled_data_file: Path

In [5]:
from src.StockSeer.constants import *
from src.StockSeer.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    On construction the config, params and schema files are read (in that
    order) and the artifacts root directory named in the config is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and ensure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        yaml_sources = (config_filepath, params_filepath, schema_filepath)
        # map() is consumed left to right, so the files are read in the listed order.
        self.config, self.params, self.schema = map(read_yaml, yaml_sources)

        # Every stage writes below this directory, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the data-transformation settings.

        Returns:
            A ``DataTransformationConfig`` whose fields are passed through
            unchanged from the ``data_transformation`` section of the config.
        """
        stage = self.config.data_transformation

        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=stage.root_dir,
            data_path=stage.data_path,
            scaled_data_file=stage.scaled_data_file,
        )

In [6]:
from sklearn.preprocessing import StandardScaler
import pandas as pd 
from src.StockSeer.logging import logger
import numpy as np

class DataTransformation:
    """Turn the input CSV into scaled, windowed NumPy arrays.

    The entry point ``DataTransformation()`` reads the CSV, standard-scales it,
    splits it by row position into a training part and a test part, and saves
    sliding-window inputs plus targets as ``.npy`` files under
    ``config.root_dir``.
    """

    # Number of consecutive rows that make up one model input window.
    WINDOW_SIZE = 100
    # Share of rows reserved for the test split, and the cap on that row count.
    TEST_FRACTION = 0.2
    MAX_TEST_ROWS = 200

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input path and output locations)."""
        self.config = config

    def _split_index(self, training_data_len, minimum):
        """Return ``training_data_len`` as an ``int``, rejecting values below ``minimum``.

        Raises:
            ValueError: if the split leaves fewer than ``minimum`` training rows.
        """
        # Slice bounds must be integers; callers may hand in a NumPy float.
        split = int(training_data_len)
        if split < minimum:
            raise ValueError(
                f"training_data_len={split} is too small for a window of "
                f"{self.WINDOW_SIZE} rows (need at least {minimum})"
            )
        return split

    def TestDataStacking(self,scaled_data,training_data_len,df):
        """Build and save the test windows and the matching test targets.

        Args:
            scaled_data: 2-D array of scaled values; only column 0 is windowed.
            training_data_len: Row index at which the test part begins.
            df: Unscaled data; its rows from ``training_data_len`` onward are
                saved as ``y_test.npy``.

        Raises:
            ValueError: if ``training_data_len`` is smaller than the window, in
                which case the look-back slice would start at a negative index
                and silently wrap around to the end of the array.
        """
        window = self.WINDOW_SIZE
        split = self._split_index(training_data_len, window)

        # Include the last `window` training rows so the first test row has a
        # full window of history in front of it.
        test_data = scaled_data[split - window: , :]
        y_test = df[split:]

        X_test = np.array(
            [test_data[i - window:i, 0] for i in range(window, len(test_data))]
        )
        # Add a trailing axis: (samples, window) -> (samples, window, 1).
        X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1 ))

        np.save(os.path.join(self.config.root_dir,"X_test.npy"),X_test)
        np.save(os.path.join(self.config.root_dir,"y_test.npy"),y_test)
        logger.info("test data stacking completed")

    def TrainDataStacking(self,scaled_data,training_data_len):
        """Build and save the training windows and their next-row targets.

        Each sample is ``WINDOW_SIZE`` consecutive values of column 0; its
        target is the value in the row immediately after the window.

        Args:
            scaled_data: 2-D array of scaled values; only column 0 is used.
            training_data_len: Number of leading rows that form the training part.

        Raises:
            ValueError: if the training part is too short to yield one window
                plus a target.
        """
        window = self.WINDOW_SIZE
        split = self._split_index(training_data_len, window + 1)

        train_data = scaled_data[0:split, :]

        X_train = []
        y_train = []
        for i in range(window, len(train_data)):
            X_train.append(train_data[i - window:i, 0])
            y_train.append(train_data[i, 0])

        X_train, y_train = np.array(X_train), np.array(y_train)

        # Add a trailing axis: (samples, window) -> (samples, window, 1).
        X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

        np.save(os.path.join(self.config.root_dir,"X_train.npy"),X_train)
        np.save(os.path.join(self.config.root_dir,"y_train.npy"),y_train)
        logger.info("train data stacking completed")

    def StandardScaling(self,data):
        """Standard-scale ``data``, save the result, and return it.

        The scaled array is written to ``config.scaled_data_file``.

        NOTE(review): the scaler is a local object that is neither saved nor
        returned, and the entry point calls this on the full dataset before the
        train/test split, so test rows influence the scaling statistics.
        Confirm this is intended before relying on the test metrics.
        """
        logger.info("data scaling started")
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(data)
        logger.info(f"data scaling completed and shape of data : {scaled_data.shape}")
        np.save(self.config.scaled_data_file,scaled_data)
        logger.info(f"scaled data stored at {self.config.scaled_data_file}")
        return scaled_data

    def DataTransformation(self):
        """Run the whole stage: read, scale, split, and save the windowed arrays.

        The test part is ``TEST_FRACTION`` of the rows (rounded up), capped at
        ``MAX_TEST_ROWS``; everything before it is the training part.

        Raises:
            ValueError: if the training part is too short for one window plus
                a target (raised by ``TrainDataStacking``).
        """
        data = pd.read_csv(self.config.data_path,index_col='Date')

        scaled_data = self.StandardScaling(data)

        total_rows = len(scaled_data)
        # np.ceil returns a float; convert to int so the lengths are valid
        # slice bounds even when the cap below does not apply.
        test_data_len = min(
            int(np.ceil(total_rows * self.TEST_FRACTION)), self.MAX_TEST_ROWS
        )
        train_data_len = total_rows - test_data_len

        self.TrainDataStacking(scaled_data,train_data_len)
        self.TestDataStacking(scaled_data,train_data_len,data)

        


In [7]:
# Run the data-transformation stage end to end: load the YAML configuration,
# build the stage config, then scale the data and write the train/test arrays.
# Errors are left to propagate as-is; wrapping the calls in a try/except that
# only re-raises would add a traceback frame without handling anything.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.DataTransformation()

[2024-03-04 23:45:25,867: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-03-04 23:45:25,885: INFO: common: yaml file: params.yaml loaded successfully]
[2024-03-04 23:45:25,898: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-03-04 23:45:25,899: INFO: common: created directory at: artifacts]
[2024-03-04 23:45:25,901: INFO: common: created directory at: artifacts/data_transformation]


[2024-03-04 23:45:25,954: INFO: 2376358783: data scaling started]
[2024-03-04 23:45:25,964: INFO: 2376358783: data scaling completed and shape of data : (10895, 1)]
[2024-03-04 23:45:25,967: INFO: 2376358783: scaled data stored at artifacts/data_transformation/scaled_data.npy]
[2024-03-04 23:45:25,992: INFO: 2376358783: train data stacking completed]
[2024-03-04 23:45:25,994: INFO: 2376358783: test data stacking completed]
