In [1]:
import os
import pandas as pd 
from mlproject import logger

[2025-02-20 22:59:34,437 : INFO : __init__ : Logger has been set up successfully!]


In [2]:
%pwd

'f:\\Files\\DS&ML\\Wine-Quality-Prediction\\research'

In [3]:
os.chdir('../')
%pwd

'f:\\Files\\DS&ML\\Wine-Quality-Prediction'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    target_column: str
    label_encoder: Path
    feature_encoder: Path  # Add this field
    preprocessor_path: Path

In [5]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml,create_directories

In [None]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    # Then in ConfigurationManager.get_data_transformation_config:
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        create_directories([config.root_dir])
        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            target_column=config.target_column,
            preprocessor_path=config.preprocessor_path,
            label_encoder=config.label_encoder,
            feature_encoder=config.feature_encoder  # Add this line
        )
        return data_transformation_config

In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
import joblib
from mlproject import logger

In [8]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.label_encoders = {}
        
        # Define column types
        self.num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
        self.cat_cols_le = [
            'gender', 'SeniorCitizen', 'Partner', 'Dependents',
            'PhoneService', 'PaperlessBilling', 'InternetService',
            'Contract', 'PaymentMethod', 'MultipleLines',
            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
            'TechSupport', 'StreamingTV', 'StreamingMovies'
        ]
        self.cols_to_drop = ['customerID']

    def preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
        try:
            data = data.copy()
            data = data.drop(columns=self.cols_to_drop, errors='ignore')
            
            # Handle TotalCharges column
            data['TotalCharges'] = pd.to_numeric(data['TotalCharges'].str.strip(), errors='coerce')
            data[self.num_cols] = data[self.num_cols].fillna(data[self.num_cols].mean())
            
            # Label encode categorical columns
            for column in self.cat_cols_le:
                if column in data.columns:
                    le = LabelEncoder()
                    data[column] = le.fit_transform(data[column].astype(str))
                    self.label_encoders[column] = le
            
            # Encode target column if categorical
            if self.config.target_column in data.columns and data[self.config.target_column].dtype == 'object':
                le = LabelEncoder()
                data[self.config.target_column] = le.fit_transform(data[self.config.target_column])
                self.label_encoders[self.config.target_column] = le
            
            # Save label encoders
            os.makedirs(os.path.dirname(self.config.label_encoder), exist_ok=True)
            joblib.dump(self.label_encoders, self.config.label_encoder)
            
            return data
            
        except Exception as e:
            logger.error(f"Error in preprocess_data: {str(e)}")
            raise e

    def train_test_spliting(self) -> tuple:
        try:
            data = pd.read_csv(self.config.data_path)
            data = self.preprocess_data(data)
            
            X = data.drop(columns=[self.config.target_column])
            y = data[self.config.target_column]
            
            # Apply SMOTE
            smote = SMOTE(random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X, y)
            
            X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
            resampled_data = X_resampled.copy()
            resampled_data[self.config.target_column] = y_resampled
            
            # Train-test split
            train, test = train_test_split(resampled_data, test_size=0.25, random_state=42)
            
            # Save splits
            train_path = os.path.join(self.config.root_dir, "train.csv")
            test_path = os.path.join(self.config.root_dir, "test.csv")
            train.to_csv(train_path, index=False)
            test.to_csv(test_path, index=False)
            
            logger.info(f"Original data shape: {data.shape}")
            logger.info(f"Resampled data shape: {resampled_data.shape}")
            logger.info(f"Training data shape: {train.shape}")
            logger.info(f"Test data shape: {test.shape}")
            
            return train, test
            
        except Exception as e:
            logger.error(f"Error in train_test_spliting: {str(e)}")
            raise e

    def preprocess_features(self, train: pd.DataFrame, test: pd.DataFrame) -> tuple:
        try:
            numeric_transformer = Pipeline(steps=[
                ('scaler', StandardScaler())
            ])
            
            preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_transformer, self.num_cols)
                ],
                remainder='passthrough'
            )

            # Split features and target
            train_x = train.drop(columns=[self.config.target_column])
            test_x = test.drop(columns=[self.config.target_column])
            train_y = train[self.config.target_column]
            test_y = test[self.config.target_column]

            # Apply preprocessing
            train_processed = preprocessor.fit_transform(train_x)
            test_processed = preprocessor.transform(test_x)

            # Reshape targets
            train_y = train_y.values.reshape(-1, 1)
            test_y = test_y.values.reshape(-1, 1)

            # Combine processed features with targets
            train_combined = np.hstack((train_processed, train_y))
            test_combined = np.hstack((test_processed, test_y))

            # Save preprocessor and processed data
            os.makedirs(os.path.dirname(self.config.preprocessor_path), exist_ok=True)
            joblib.dump(preprocessor, self.config.preprocessor_path)
            
            np.save(os.path.join(self.config.root_dir, "train_processed.npy"), train_combined)
            np.save(os.path.join(self.config.root_dir, "test_processed.npy"), test_combined)

            logger.info(f"Preprocessor saved at: {self.config.preprocessor_path}")
            logger.info(f"Training data shape: {train_processed.shape}")
            logger.info(f"Testing data shape: {test_processed.shape}")
            
            return train_processed, test_processed
            
        except Exception as e:
            logger.error(f"Error in preprocess_features: {str(e)}")
            raise e

In [9]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    train,test = data_transformation.train_test_spliting()
    train_processed, test_processed = data_transformation.preprocess_features(train, test)

except FileNotFoundError as e:
    logger.error(f"File not found: {e}")
except KeyError as e:
    logger.error(f"Missing key in configuration: {e}")
except Exception as e:
    logger.error(f"Unexpected error: {e}")

[2025-02-20 22:59:36,638 : INFO : common : yaml file: config\config.yaml loaded successfully]
[2025-02-20 22:59:36,641 : INFO : common : yaml file: params.yaml loaded successfully]
[2025-02-20 22:59:36,647 : INFO : common : yaml file: schema.yaml loaded successfully]
[2025-02-20 22:59:36,649 : INFO : common : created directory at: artifacts]
[2025-02-20 22:59:36,651 : INFO : common : created directory at: artifacts/data_transformation]
[2025-02-20 22:59:37,071 : INFO : 2035099503 : Original data shape: (7043, 20)]
[2025-02-20 22:59:37,073 : INFO : 2035099503 : Resampled data shape: (10348, 20)]
[2025-02-20 22:59:37,074 : INFO : 2035099503 : Training data shape: (7761, 20)]
[2025-02-20 22:59:37,075 : INFO : 2035099503 : Test data shape: (2587, 20)]
[2025-02-20 22:59:37,096 : INFO : 2035099503 : Preprocessor saved at: artifacts/data_transformation/preprocessor.pkl]
[2025-02-20 22:59:37,097 : INFO : 2035099503 : Training data shape: (7761, 19)]
[2025-02-20 22:59:37,098 : INFO : 2035099503