In [1]:
# Show the kernel's current working directory before any path changes are made.
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow\\research'

In [2]:
import os

os.chdir('../')
%pwd

'c:\\Users\\User\\Documents\\EndToEndMLProjects\\End-To-End-Machine-Learning-Project-with-MlFlow'

# 1. Update config.yaml

# 2. Update the entity

In [3]:
from dataclasses import dataclass
from pathlib import Path

In [4]:
@dataclass(frozen=True)
class DataSplitConfig:
    """Immutable bundle of the paths used by the data-split stage.

    Instances are frozen, so fields cannot be reassigned after construction.

    Attributes:
        root_dir: Directory belonging to the data-split stage.
        data_path: Location of the dataset that gets split.
        X_train_data_path: Destination file for the training features.
        X_test_data_path: Destination file for the test features.
        y_train_data_path: Destination file for the training target.
        y_test_data_path: Destination file for the test target.
    """
    # --- stage directory ---
    root_dir: Path
    # --- input ---
    data_path: str
    # --- outputs: features ---
    X_train_data_path: str
    X_test_data_path: str
    # --- outputs: target ---
    y_train_data_path: str
    y_test_data_path: str

# 3. Update the configuration manager in src config

In [5]:
from mlproject.constants import *
from mlproject.utils.common import create_directories, read_yaml

In [6]:
class ConfigurationManager:
    """
    Manages the configuration and setup for the project.

    This class is responsible for reading configuration files, creating required directories,
    and providing specific configuration objects needed for various components of the project.

    Attributes:
        config: Parsed content of the main configuration file, as returned by `read_yaml`.
            It is accessed with attribute syntax (e.g. `config.artifacts_root`).
        params: Parsed content of the parameters file, as returned by `read_yaml`.
    """
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
        ) -> None:
        """
        Initializes the ConfigurationManager.

        Reads the YAML files for the main configuration and the parameters, then
        ensures that the root artifacts directory named in the configuration is created.

        Args:
            config_filepath: Path to the main configuration YAML file. Default is `CONFIG_FILE_PATH`.
            params_filepath: Path to the parameters YAML file. Default is `PARAMS_FILE_PATH`.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_split_config(self) -> DataSplitConfig:
        """
        Builds the configuration object for the data-split stage.

        Reads the `data_split` section of the main configuration, creates that
        section's `root_dir`, and copies the section's path entries into a
        `DataSplitConfig`.

        Returns:
            DataSplitConfig: The paths of the input dataset and of the four output files.

        Any exception raised while reading the section or creating the directory
        propagates to the caller unchanged.
        """
        split_section = self.config.data_split
        create_directories([split_section.root_dir])

        return DataSplitConfig(
            root_dir=split_section.root_dir,
            data_path=split_section.data_path,
            X_train_data_path=split_section.X_train_data_path,
            X_test_data_path=split_section.X_test_data_path,
            y_train_data_path=split_section.y_train_data_path,
            y_test_data_path=split_section.y_test_data_path
        )

# 4. Update the components

In [7]:
from typing import Tuple
import pandas as pd

from mlproject import logger
from sklearn.model_selection import train_test_split

In [8]:
class DataSplit:
    """Splits the configured dataset into train/test sets and saves them as CSV files."""

    def __init__(self, config: DataSplitConfig) -> None:
        """
        Stores the configuration used by `split_data`.

        Args:
            config: Paths of the input dataset and of the four output CSV files.
        """
        self.config = config

    def split_data(
        self,
        target_column: str = 'Status',
        test_size: float = 0.2,
        random_state: int = 42,
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """
        Reads the dataset, performs a stratified train/test split, and saves the parts.

        The dataset at `config.data_path` is separated into features (every column
        except `target_column`) and target (`target_column`). The split is stratified
        on the target, and each of the four parts is written without its index to the
        matching path in the configuration.

        Args:
            target_column: Name of the column to predict. Default is 'Status'.
            test_size: Passed to `train_test_split` as `test_size`. Default is 0.2.
            random_state: Seed passed to `train_test_split`. Default is 42.

        Returns:
            Tuple of (X_train, X_test, y_train, y_test).

        Raises:
            KeyError: If `target_column` is not a column of the dataset.
        """
        df = pd.read_csv(self.config.data_path)

        # Fail early with a message that names both the column and the file.
        if target_column not in df.columns:
            raise KeyError(
                f"Target column {target_column!r} not found in {self.config.data_path}"
            )

        X = df.drop(columns=[target_column])
        y = df[target_column]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        # index=False: the row index is not written to the files.
        X_train.to_csv(self.config.X_train_data_path, index=False)
        X_test.to_csv(self.config.X_test_data_path, index=False)
        y_train.to_csv(self.config.y_train_data_path, index=False)
        y_test.to_csv(self.config.y_test_data_path, index=False)

        return X_train, X_test, y_train, y_test

# 5. Update pipeline

In [9]:
# Run the data-split stage end to end: load the configuration, build the
# component, and write the four train/test CSV files.
# Exceptions propagate unchanged, so no try/except wrapper is needed here.
config = ConfigurationManager()
data_split_config = config.get_data_split_config()
spliter = DataSplit(data_split_config)
X_train, X_test, y_train, y_test = spliter.split_data()

[2024-11-21 11:15:07,561: 35 mlprojectLogger: INFO: common: .yaml file: config\config.yaml loaded successfully.]
[2024-11-21 11:15:07,570: 35 mlprojectLogger: INFO: common: .yaml file: params.yaml loaded successfully.]
[2024-11-21 11:15:07,573: 54 mlprojectLogger: INFO: common: Created directory at artifacts]
[2024-11-21 11:15:07,575: 54 mlprojectLogger: INFO: common: Created directory at artifacts/data_split]


In [None]:
# Reload the four files from the same configured paths that `split_data` wrote to,
# so this check cannot drift away from the configuration.
X_train_saved = pd.read_csv(data_split_config.X_train_data_path)
X_test_saved = pd.read_csv(data_split_config.X_test_data_path)

# The target files are read back as DataFrames; take the first column as a Series.
y_train_saved = pd.read_csv(data_split_config.y_train_data_path).iloc[:, 0]
y_test_saved = pd.read_csv(data_split_config.y_test_data_path).iloc[:, 0]

# The files on disk must have the same shapes as the in-memory split.
assert X_train_saved.shape == X_train.shape
assert X_test_saved.shape == X_test.shape
assert y_train_saved.shape == y_train.shape
assert y_test_saved.shape == y_test.shape

print("X Training set size: ", X_train_saved.shape)
print("X Testing set size: ", X_test_saved.shape)
print("y Training set size: ", y_train_saved.shape)
print("y Testing set size: ", y_test_saved.shape)

X Training set size:  (77530, 33)
X Testing set size:  (19383, 33)
y Training set size:  (77530,)
y Testing set size:  (19383,)


In [15]:

# Report the shape of each in-memory split, one line per split.
split_shapes = [
    ("X Training set size: ", X_train.shape),
    ("X Testing set size: ", X_test.shape),
    ("y Training set size: ", y_train.shape),
    ("y Testing set size: ", y_test.shape),
]
for heading, shape in split_shapes:
    print(heading, shape)

X Training set size:  (77530, 33)
X Testing set size:  (19383, 33)
y Training set size:  (77530,)
y Testing set size:  (19383,)
