In [1]:
import os
import pandas as pd 
from mlproject import logger

[2025-01-30 19:36:42,357 : INFO : __init__ : Logger has been set up successfully!]


In [2]:
%pwd

'f:\\Files\\DS&ML\\Wine-Quality-Prediction\\research'

In [3]:
os.chdir('../')
%pwd

'f:\\Files\\DS&ML\\Wine-Quality-Prediction'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir:Path
    data_path: Path
    target_column: str
    preprocessor_path: Path

In [5]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml,create_directories

In [6]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        create_directories([config.root_dir])
        data_transformation_config = DataTransformationConfig(
        root_dir=config.root_dir,
        data_path=config.data_path,
        target_column=config.target_column,
        preprocessor_path=config.preprocessor_path
        )
    
        return data_transformation_config

In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import joblib
from mlproject import logger


class DataTransformation:
    def __init__(self, config):
        self.config = config
        self.label_encoder = LabelEncoder()

    def train_test_spliting(self):
        data = pd.read_csv(self.config.data_path)
        
        # Separate features and target before splitting
        X = data.drop(columns=[self.config.target_column])
        y = data[self.config.target_column]
        
        # Encode target labels
        y = self.label_encoder.fit_transform(y)
        
        # Apply SMOTE before train-test split
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X, y)
        
        # Convert back to DataFrame to maintain column names
        X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
        
        # Combine features and target for saving
        resampled_data = X_resampled.copy()
        resampled_data[self.config.target_column] = y_resampled
        
        # Perform train-test split on resampled data
        train, test = train_test_split(resampled_data, test_size=0.25, random_state=42)

        train_path = os.path.join(self.config.root_dir, "train.csv")
        test_path = os.path.join(self.config.root_dir, "test.csv")
        train.to_csv(train_path, index=False)
        test.to_csv(test_path, index=False)

        logger.info("Applied SMOTE and split data into training and test sets")
        logger.info(f"Original data shape: {data.shape}")
        logger.info(f"Resampled data shape: {resampled_data.shape}")
        logger.info(f"Training data shape: {train.shape}")
        logger.info(f"Test data shape: {test.shape}")

        return train, test
    
    def preprocess_features(self, train, test):
        # Identify numerical columns
        numerical_columns = train.select_dtypes(include=["int64", "float64"]).columns

        # Exclude the target column from numerical columns
        if self.config.target_column in numerical_columns:
            numerical_columns = numerical_columns.drop(self.config.target_column)

        logger.info(f"Numerical columns: {list(numerical_columns)}")

        # Preprocessing pipelines
        num_pipeline = Pipeline(steps=[
            ("scaler", StandardScaler())
        ])
        
        preprocessor = ColumnTransformer(
            transformers=[
                ("num", num_pipeline, numerical_columns),
            ],
            remainder="passthrough"
        )

        # Separate features and target
        train_x = train.drop(columns=[self.config.target_column])
        test_x = test.drop(columns=[self.config.target_column])
        train_y = train[self.config.target_column]
        test_y = test[self.config.target_column]

        # Fit preprocessor and transform features
        train_processed = preprocessor.fit_transform(train_x)
        test_processed = preprocessor.transform(test_x)

        # Ensure target is 2D array
        train_y = train_y.values.reshape(-1, 1)
        test_y = test_y.values.reshape(-1, 1)

        # Combine processed features with target
        train_combined = np.hstack((train_processed, train_y))
        test_combined = np.hstack((test_processed, test_y))

        # Save preprocessor and label encoder
        joblib.dump(preprocessor, self.config.preprocessor_path)
        label_encoder_path = os.path.join(self.config.root_dir, "label_encoder.pkl")
        joblib.dump(self.label_encoder, label_encoder_path)
        
        logger.info(f"Preprocessor saved at {self.config.preprocessor_path}")
        logger.info(f"Label encoder saved at {label_encoder_path}")

        # Save processed data
        np.save(os.path.join(self.config.root_dir, "train_processed.npy"), train_combined)
        np.save(os.path.join(self.config.root_dir, "test_processed.npy"), test_combined)

        logger.info("Preprocessed train and test data saved successfully.")
        return train_processed, test_processed

In [8]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    train,test = data_transformation.train_test_spliting()
    train_processed, test_processed = data_transformation.preprocess_features(train, test)

except FileNotFoundError as e:
    logger.error(f"File not found: {e}")
except KeyError as e:
    logger.error(f"Missing key in configuration: {e}")
except Exception as e:
    logger.error(f"Unexpected error: {e}")

[2025-01-30 19:36:44,659 : INFO : common : yaml file: config\config.yaml loaded successfully]
[2025-01-30 19:36:44,663 : INFO : common : yaml file: params.yaml loaded successfully]
[2025-01-30 19:36:44,668 : INFO : common : yaml file: schema.yaml loaded successfully]
[2025-01-30 19:36:44,670 : INFO : common : created directory at: artifacts]
[2025-01-30 19:36:44,671 : INFO : common : created directory at: artifacts/data_transformation]
[2025-01-30 19:36:44,985 : INFO : 459171045 : Applied SMOTE and split data into training and test sets]
[2025-01-30 19:36:44,986 : INFO : 459171045 : Original data shape: (1599, 12)]
[2025-01-30 19:36:44,987 : INFO : 459171045 : Resampled data shape: (4086, 12)]
[2025-01-30 19:36:44,988 : INFO : 459171045 : Training data shape: (3064, 12)]
[2025-01-30 19:36:44,989 : INFO : 459171045 : Test data shape: (1022, 12)]
[2025-01-30 19:36:44,991 : INFO : 459171045 : Numerical columns: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlori