In [1]:
import os

In [2]:
%pwd

'f:\\Files\\DS&ML\\Flight-Fare-Price-Prediction\\Exp'

In [3]:
os.chdir('../')
%pwd

'f:\\Files\\DS&ML\\Flight-Fare-Price-Prediction'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    target_column: str
    preprocessor_path: Path
    label_encoder: Path
    categorical_columns: list
    numerical_columns: list

In [5]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml,create_directories

In [6]:
    class ConfigurationManager:
        def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH,
            schema_filepath = SCHEMA_FILE_PATH):

            self.config = read_yaml(config_filepath)
            self.params = read_yaml(params_filepath)
            self.schema = read_yaml(schema_filepath)

            create_directories([self.config.artifacts_root])

        def get_data_transformation_config(self) -> DataTransformationConfig:
            config = self.config.data_transformation
            schema = self.schema
            create_directories([config.root_dir])
            
            data_transformation_config = DataTransformationConfig(
                root_dir=config.root_dir,
                data_path=config.data_path,
                target_column=config.target_column,
                preprocessor_path=config.preprocessor_path,
                label_encoder=config.label_encoder,
                categorical_columns=schema.categorical_columns,
                numerical_columns=schema.numeric_columns
            )
            
            return data_transformation_config

In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from mlproject import logger

class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.target_column = config.target_column
        self.label_encoders = {}
        
        self.categorical_columns = config.categorical_columns
        self.numerical_columns = config.numerical_columns
        

    def preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
        try:
            data = data.copy()
            
                        
            if self.target_column not in data.columns:
                if 'Total Fare (BDT)' in data.columns:
                    logger.info(f"Renaming 'Total Fare (BDT)' to '{self.target_column}'")
                    data.rename(columns={'Total Fare (BDT)': self.target_column}, inplace=True)
                else:
                    raise ValueError(f"Target column '{self.target_column}' not found in data")
            
            for column in self.categorical_columns:
                if column in data.columns:
                    le = LabelEncoder()
                    data[column] = le.fit_transform(data[column].astype(str))
                    self.label_encoders[column] = le
            
            os.makedirs(os.path.dirname(self.config.label_encoder), exist_ok=True)
            joblib.dump(self.label_encoders, self.config.label_encoder)
            logger.info(f"Saved label encoders to {self.config.label_encoder}")
            
            return data
            
        except Exception as e:
            logger.error(f"Error in preprocess_data: {str(e)}")
            raise e

    def train_test_splitting(self):
        try:
            logger.info(f"Loading data from {self.config.data_path}")
            data = pd.read_csv(self.config.data_path)
            
            data = self.preprocess_data(data)
            
            train, test = train_test_split(data, test_size=0.25, random_state=42)

            os.makedirs(self.config.root_dir, exist_ok=True)
            
            train_path = Path(self.config.root_dir) / "train.csv"
            test_path = Path(self.config.root_dir) / "test.csv"
            train.to_csv(train_path, index=False)
            test.to_csv(test_path, index=False)

            logger.info("Split data into training and test sets")
            logger.info(f"Training data shape: {train.shape}")
            logger.info(f"Test data shape: {test.shape}")

            return train, test
            
        except Exception as e:
            logger.error(f"Error in train_test_splitting: {e}")
            raise e
    
    def preprocess_features(self, train, test):
        try:
            # Identify numerical and categorical columns
            numerical_columns = train.select_dtypes(include=["int64", "float64"]).columns
            categorical_columns = train.select_dtypes(include=["object", "category"]).columns

            if self.target_column in numerical_columns:
                numerical_columns = numerical_columns.drop(self.target_column)

            logger.info(f"Numerical columns: {list(numerical_columns)}")
            logger.info(f"Categorical columns: {list(categorical_columns)}")

            # Preprocessing pipelines
            num_pipeline = Pipeline(steps=[
                ("scaler", StandardScaler())
            ])
            
            preprocessor = ColumnTransformer(
                transformers=[
                    ("num", num_pipeline, numerical_columns)
                ],
                remainder="passthrough"
            )

            # Separate features and target
            train_x = train.drop(columns=[self.target_column])
            test_x = test.drop(columns=[self.target_column])
            train_y = train[self.target_column]
            test_y = test[self.target_column]

            train_processed = preprocessor.fit_transform(train_x)
            test_processed = preprocessor.transform(test_x)

            # Ensure target is 2D array
            train_y = train_y.values.reshape(-1, 1)
            test_y = test_y.values.reshape(-1, 1)

            train_combined = np.hstack((train_processed, train_y))
            test_combined = np.hstack((test_processed, test_y))

            # Save preprocessor using Path
            joblib.dump(preprocessor, self.config.preprocessor_path)
            logger.info(f"Preprocessor saved at {self.config.preprocessor_path}")

            # Save processed data using Path
            np.save(Path(self.config.root_dir) / "train_processed.npy", train_combined)
            np.save(Path(self.config.root_dir) / "test_processed.npy", test_combined)

            logger.info("Preprocessed train and test data saved successfully.")
            return train_processed, test_processed

        except Exception as e:
            logger.error(f"Error in preprocess_features: {e}")
            raise e

In [8]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    train, test = data_transformation.train_test_splitting()
    train_processed, test_processed = data_transformation.preprocess_features(train, test)
        
except FileNotFoundError as e:
    logger.error(f"File not found: {e}")
except KeyError as e:
    logger.error(f"Missing key in configuration: {e}")
except Exception as e:
    logger.error(f"Unexpected error: {e}")

[2025-04-21 22:57:34,992: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-04-21 22:57:34,996: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-21 22:57:35,023: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-04-21 22:57:35,023: INFO: common: created directory at: artifacts]
[2025-04-21 22:57:35,023: INFO: common: created directory at: artifacts/data_transformation]
[2025-04-21 22:57:35,023: INFO: 3660953481: Loading data from artifacts/data_cleaning/cleaned_flight_data.csv]
[2025-04-21 22:57:35,306: INFO: 3660953481: Saved label encoders to artifacts/data_transformation/label_encoders.pkl]
[2025-04-21 22:57:35,513: INFO: 3660953481: Split data into training and test sets]
[2025-04-21 22:57:35,522: INFO: 3660953481: Training data shape: (42750, 10)]
[2025-04-21 22:57:35,523: INFO: 3660953481: Test data shape: (14250, 10)]
[2025-04-21 22:57:35,527: INFO: 3660953481: Numerical columns: ['Airline', 'Source', 'Destination', 'Stopovers