In [1]:
import os

In [2]:
pwd%

'f:\\Files\\DS&ML\\Flight-Fare-Price-Prediction\\Exp'

In [3]:
# Move the working directory up one level so the relative paths used by the
# cells below are resolved from the parent folder.
# NOTE(review): the move is relative to the current directory, so running this
# cell more than once climbs one level further each time.
os.chdir("../")

In [4]:
pwd%

'f:\\Files\\DS&ML\\Flight-Fare-Price-Prediction'

In [5]:
import pandas as pd
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import logging
import datetime as dt
import os

In [6]:
from mlproject import logger

In [7]:
from mlproject.constants import *
from mlproject.utils.common import *

In [8]:
@dataclass(frozen=True)
class DataCleaningConfig:
    """Immutable settings consumed by the data-cleaning stage.

    The field order is part of the interface (positional construction), so it
    is kept exactly as declared.
    """

    # Directory created for this stage's artifacts.
    root_dir: Path
    # CSV file read as the cleaning input.
    input_data_path: Path
    # Destination CSV for the cleaned data.
    cleaned_file: Path
    # Column names removed at the end of cleaning (missing names are ignored).
    columns_to_drop: list
    # Path of the JSON status file that is opened to read the
    # "Validation status" flag; it is a file path, not a mapping.
    file_status: Path
    # Column names that should be parsed as datetimes.
    datetime_columns: list
    # Mapping passed to DataFrame.rename(columns=...) to rename the target.
    target_column_mapping: dict

In [9]:
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files, then create the
        artifacts root directory named in the config.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_cleaning_config(self) -> DataCleaningConfig:
        """Build the DataCleaningConfig from the ``data_cleaning`` sections.

        Paths come from the config file; column lists and the target-column
        mapping come from the schema file. The stage's root directory is
        created before the config object is assembled.
        """
        stage_cfg = self.config.data_cleaning
        stage_schema = self.schema.data_cleaning

        create_directories([stage_cfg.root_dir])

        return DataCleaningConfig(
            root_dir=stage_cfg.root_dir,
            input_data_path=stage_cfg.input_data,
            file_status=stage_cfg.file_status,
            cleaned_file=stage_cfg.cleaned_file,
            columns_to_drop=stage_schema.columns_to_drop,
            datetime_columns=stage_schema.datetime_columns,
            target_column_mapping=stage_schema.target_column_mapping,
        )

In [None]:
class DataCleaning:
    """Data-cleaning stage: parse datetimes, derive time-of-day categories,
    rename and log-transform the target, drop unwanted columns, write a CSV.

    Every transformation method returns a new DataFrame and leaves its input
    untouched.
    """

    def __init__(self, config: "DataCleaningConfig"):
        """Store the stage configuration.

        The annotation is quoted so defining this class does not require
        ``DataCleaningConfig`` to be defined first.
        """
        self.config = config

    def drop_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` without ``config.columns_to_drop``.

        Names that are not present in ``df`` are ignored.
        """
        logger.info(f"Dropping columns: {self.config.columns_to_drop}")
        return df.drop(columns=self.config.columns_to_drop, errors='ignore')

    def convert_datetime_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with ``config.datetime_columns`` parsed.

        Columns that are absent are skipped. A column that cannot be parsed is
        left as-is and a warning is logged (best effort, never raises).
        """
        df_copy = df.copy()

        for col in self.config.datetime_columns:
            if col in df_copy.columns:
                try:
                    df_copy[col] = pd.to_datetime(df_copy[col])
                except Exception as e:
                    logger.warning(f"Could not convert {col} to datetime: {e}")

        return df_copy

    def extract_time_categories(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with time-of-day category columns added.

        For each source column that is present, the hour of day is bucketed
        into ``Morning`` [6, 12), ``Afternoon`` [12, 18), ``Evening`` [18, 24)
        or ``Night`` [0, 6). Rows whose hour is missing get ``Unknown``.
        A source column that cannot be parsed as datetime is skipped with a
        warning.
        """
        df_copy = df.copy()
        time_mappings = {
            "Departure Date & Time": "Departure Time",
            "Arrival Date & Time": "Arrival Time"
        }
        choices = ['Morning', 'Afternoon', 'Evening', 'Night']

        for original_col, new_col in time_mappings.items():
            if original_col not in df_copy.columns:
                continue

            # The *any* variant also recognises tz-aware datetime columns, so
            # they are not needlessly re-parsed.
            if not pd.api.types.is_datetime64_any_dtype(df_copy[original_col]):
                try:
                    df_copy[original_col] = pd.to_datetime(df_copy[original_col])
                except Exception as e:
                    logger.warning(f"Could not convert {original_col} to datetime: {e}")
                    continue

            # Keep the hours in a local Series instead of adding and then
            # dropping a temporary column on the frame.
            hours = df_copy[original_col].dt.hour

            conditions = [
                (hours >= 6) & (hours < 12),
                (hours >= 12) & (hours < 18),
                (hours >= 18) & (hours < 24),
                (hours >= 0) & (hours < 6)
            ]

            # Missing hours (NaT) satisfy no condition and fall to the default.
            df_copy[new_col] = pd.Series(
                np.select(conditions, choices, default='Unknown'),
                index=df_copy.index
            )

        return df_copy

    def rename_target_column(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` with columns renamed per ``config.target_column_mapping``."""
        return df.rename(columns=self.config.target_column_mapping)

    def log_transform_target(self, df: pd.DataFrame, target_column: str = "Total Fare") -> pd.DataFrame:
        """Return a copy of ``df`` with ``log1p`` applied to the target column.

        Args:
            df: Frame containing the (already renamed) target column.
            target_column: Name of the column to transform. Defaults to
                ``"Total Fare"``, the name this method previously hard-coded.

        Raises:
            ValueError: If ``target_column`` is not a column of ``df``.
        """
        logger.info(f"Applying log transformation to target column: {target_column}")

        df_transformed = df.copy()

        if target_column not in df_transformed.columns:
            available_cols = df_transformed.columns.tolist()
            logger.error(f"Target column '{target_column}' not found. Available columns: {available_cols}")
            raise ValueError(f"Target column '{target_column}' not found in dataframe")

        df_transformed[target_column] = np.log1p(df_transformed[target_column])
        logger.info(f"Log transformation applied to {target_column}")

        return df_transformed

    def check_status(self):
        """Read the validation flag from the JSON file at ``config.file_status``.

        Returns:
            The value stored under ``"Validation status"``; ``False`` when the
            key is missing or the file cannot be read or parsed (the error is
            logged, not raised).
        """
        # Imported locally so this method does not rely on ``json`` leaking
        # into the namespace through a star import; without it the NameError
        # would be swallowed below and reported as a failed validation.
        import json

        try:
            with open(self.config.file_status, 'r') as f:
                status_data = json.load(f)
            validation_status = status_data.get("Validation status", False)
            logger.info(f"Data validation status: {validation_status}")
            return validation_status
        except Exception as e:
            logger.error(f"Error reading validation status: {e}")
            return False

    def clean_data(self):
        """Run the full cleaning pipeline and write the cleaned CSV.

        Nothing is read or written when the validation status is falsy: the
        method logs an error and returns.

        Raises:
            ValueError: If the input CSV has no rows, or the target column is
                missing after renaming.
        """
        validation_status = self.check_status()

        if not validation_status:
            logger.error("Data validation failed. Skipping data cleaning.")
            # Actually skip: previously execution fell through and cleaned
            # data that had not passed validation.
            return

        logger.info("Data validation passed. Proceeding with data cleaning.")
        logger.info(f"Reading data from {self.config.input_data_path}")

        df = pd.read_csv(self.config.input_data_path)

        if df.empty:
            logger.error("Input data is empty or None")
            raise ValueError("Input data is empty or None")

        logger.info(f"Original DataFrame shape: {df.shape}")

        df = self.convert_datetime_columns(df)
        df = self.extract_time_categories(df)
        df = self.rename_target_column(df)
        df = self.log_transform_target(df)
        # Dropped last so the datetime source columns are still available to
        # the steps above.
        df = self.drop_columns(df)

        # A bare file name has an empty dirname, which os.makedirs rejects.
        output_dir = os.path.dirname(self.config.cleaned_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(self.config.cleaned_file, index=False)
        logger.info("Data cleaning completed successfully")

In [11]:
# Driver cell: build the configuration, construct the cleaning stage and run it.
# The three names bound here stay in the notebook's global namespace.
try:
    config_manager = ConfigurationManager()
    data_cleaning_config = config_manager.get_data_cleaning_config()
    data_cleaning = DataCleaning(config=data_cleaning_config)
    data_cleaning.clean_data()

except Exception as e:
    # Any failure is logged and not re-raised, so this cell finishes without a
    # traceback even when cleaning did not complete; only the message is logged.
    logger.error(f"Data cleaning failed: {e}")

[2025-04-29 17:30:50,850: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-04-29 17:30:50,854: INFO: common: yaml file: params.yaml loaded successfully]


[2025-04-29 17:30:50,879: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-04-29 17:30:50,882: INFO: common: created directory at: artifacts]
[2025-04-29 17:30:50,886: INFO: common: created directory at: artifacts/data_cleaning]
[2025-04-29 17:30:50,888: INFO: 959760956: Data validation status: True]
[2025-04-29 17:30:50,888: INFO: 959760956: Data validation passed. Proceeding with data cleaning.]
[2025-04-29 17:30:50,888: INFO: 959760956: Reading data from artifacts/data_ingestion/flight-fare-data.csv]
[2025-04-29 17:30:51,444: INFO: 959760956: Original DataFrame shape: (57000, 17)]
[2025-04-29 17:30:51,962: INFO: 959760956: Applying log transformation to target column: Total Fare]
[2025-04-29 17:30:51,978: INFO: 959760956: Log transformation applied to Total Fare]
[2025-04-29 17:30:52,000: INFO: 959760956: Dropping columns: ['Source Name', 'Destination Name', 'Duration (hrs)', 'Aircraft Type', 'Base Fare (BDT)', 'Tax & Surcharge (BDT)', 'Departure Date & Time', 'Arriva