In [None]:
import os
# NOTE(review): absolute, machine-specific path; this cell only succeeds where
# this exact directory exists. The later cells import from `src.formula_one`
# and read "config/config.yaml" by relative path, so they presumably rely on
# this working directory - confirm before changing it.
os.chdir("/Users/naveenkumar/Desktop/formula-1-bot")
# IPython magic: displays the current working directory as the cell output.
%pwd

'/Users/naveenkumar/Desktop/formula-1-bot'

In [12]:
from src.formula_one.logging import logger
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional
from pathlib import Path
import psycopg2
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

In [13]:
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, List, Dict, Any

@dataclass
class DataTransformationConfig:
    """Configuration for data transformation.

    The three directory fields are required. Every other field has a default.
    The four list fields default to ``None`` and are replaced with the
    built-in lists below in ``__post_init__``; passing an explicit list keeps
    that list untouched.
    """
    root_dir: Path
    transformed_data_dir: Path
    cleaned_data_dir: Path

    # Data cleaning settings
    missing_value_strategy: str = "impute"
    outlier_strategy: str = "context_aware"
    outlier_threshold: float = 3.0

    # Feature engineering settings
    create_tire_features: bool = True
    create_lap_features: bool = True
    create_weather_features: bool = True
    create_driver_features: bool = True

    # Data type settings (None means "use the defaults from __post_init__")
    numeric_columns: Optional[List[str]] = None
    categorical_columns: Optional[List[str]] = None
    datetime_columns: Optional[List[str]] = None

    # Tables to transform (None means "use the defaults from __post_init__")
    tables_to_transform: Optional[List[str]] = None

    def __post_init__(self):
        """Fill in the default lists for any list field left as ``None``."""
        if self.tables_to_transform is None:
            self.tables_to_transform = [
                "sessions", "drivers", "laps",
                "pit_stops", "stints", "positions", "intervals",
                "weather"
            ]

        if self.numeric_columns is None:
            self.numeric_columns = [
                "lap_duration", "duration_sector_1", "duration_sector_2", "duration_sector_3",
                "pit_duration", "lap_start", "lap_end", "tyre_age_at_start",
                "position", "gap_to_leader", "interval",
                "air_temperature", "track_temperature", "humidity"
            ]

        if self.categorical_columns is None:
            self.categorical_columns = [
                "compound", "flag", "category", "scope"
            ]

        if self.datetime_columns is None:
            self.datetime_columns = [
                "date_start", "date_end", "date"
            ]

In [14]:
from src.formula_one.constants import *
from src.formula_one.utils.common import read_yaml, create_directories

In [15]:
class ConfigurationManager:
    """Manages configuration loading from YAML files"""

    def __init__(self, config_file_path: str = "config/config.yaml"):
        """Read the YAML file at ``config_file_path`` and keep the parsed result."""
        self.config_file_path = Path(config_file_path)
        self.config = read_yaml(self.config_file_path)

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build a ``DataTransformationConfig`` from the ``data_transformation`` section.

        Every key is optional; a missing key falls back to the literal default
        written next to it below.

        Returns:
            A populated ``DataTransformationConfig``.
        """
        # A YAML key that is present but empty (``data_transformation:``) parses
        # to None rather than being absent, so ``.get(..., {})`` alone is not
        # enough; coerce any falsy section to an empty mapping.
        config_data = self.config.get('data_transformation', {}) or {}

        return DataTransformationConfig(
            root_dir=Path(config_data.get('root_dir', 'artifacts/data_transformation')),
            transformed_data_dir=Path(config_data.get('transformed_data_dir', 'artifacts/data_transformation/transformed')),
            cleaned_data_dir=Path(config_data.get('cleaned_data_dir', 'artifacts/data_transformation/cleaned')),
            missing_value_strategy=config_data.get('missing_value_strategy', 'impute'),
            # NOTE(review): this fallback is 'cap', whereas the dataclass default
            # for outlier_strategy is "context_aware". Kept as-is to preserve
            # current behaviour - confirm which default is intended.
            outlier_strategy=config_data.get('outlier_strategy', 'cap'),
            outlier_threshold=config_data.get('outlier_threshold', 3.0),
            create_tire_features=config_data.get('create_tire_features', True),
            create_lap_features=config_data.get('create_lap_features', True),
            create_weather_features=config_data.get('create_weather_features', True),
            create_driver_features=config_data.get('create_driver_features', True)
        )

In [16]:
from src.formula_one.entity.config_entity import DatabaseConfig
from src.formula_one.components.data_ingestion import DatabaseIngestion

class DataTransformation:
    """Handles data transformation for F1 data - Database-First Approach"""
    
    def __init__(self, transformation_config: DataTransformationConfig, db_config: DatabaseConfig):
        """Store the configuration objects and start with empty transformer state.

        Args:
            transformation_config: Settings that drive the transformation steps.
            db_config: Database settings, forwarded to ``DatabaseIngestion``.
        """
        # Collaborators
        self.config, self.db_config = transformation_config, db_config
        self.logger = logger
        # Only the database configuration is supplied; the other two
        # DatabaseIngestion arguments are passed as None.
        self.db_ingestion = DatabaseIngestion(None, db_config, None)

        # Transformers; the two dicts are filled as columns get processed
        self.scaler = StandardScaler()
        self.label_encoders, self.imputers = dict(), dict()

        # Results
        self.transformed_data = dict()
        self.feature_columns = list()
    
    def transform_all_data(self) -> Dict[str, pd.DataFrame]:
        """Transform all tables and update database"""
        self.logger.info("Starting comprehensive data transformation (Database-First)")
        
        # Create transformed tables in database
        self._create_transformed_tables()

        self._debug_table_structure('stints')
        
        # Transform and update each table
        for table in self.config.tables_to_transform:
            self.logger.info(f"Transforming table: {table}")
            try:
                df = self._load_table_data(table)
                if df is not None and not df.empty:
                    transformed_df = self._transform_table(df, table)
                    self._update_database_table(table, transformed_df)
                    self.logger.info(f"Successfully transformed and updated {table}")
                else:
                    self.logger.warning(f"Table {table} is empty or could not be loaded")
            except Exception as e:
                self.logger.error(f"Error transforming table {table}: {str(e)}")
                # Continue with other tables instead of stopping
                continue
        
        # Load all transformed data for return
        self.transformed_data = self._load_all_transformed_data()
        
        return self.transformed_data
    
    def _debug_table_structure(self, table_name: str):
        """Debug table structure to understand what columns exist"""
        conn = self.db_ingestion.connect_to_db()
        cursor = conn.cursor()
        
        try:
            cursor.execute(f"""
                SELECT column_name, data_type, is_nullable 
                FROM information_schema.columns 
                WHERE table_name = '{table_name}'
                ORDER BY ordinal_position
            """)
            columns = cursor.fetchall()
            
            self.logger.info(f"Table structure for {table_name}:")
            for col in columns:
                self.logger.info(f"  {col[0]}: {col[1]} (nullable: {col[2]})")
                
        except Exception as e:
            self.logger.error(f"Error getting table structure for {table_name}: {e}")
        finally:
            cursor.close()
            conn.close()
    
    def _create_transformed_tables(self):
        """Create transformed tables in database"""
        conn = self.db_ingestion.connect_to_db()
        cursor = conn.cursor()
        
        try:
            # Create transformed tables for each original table
            tables_config = {
                'laps': [
                    "lap_time_std FLOAT",
                    "lap_time_mean FLOAT", 
                    "lap_time_deviation FLOAT",
                    "total_sector_time FLOAT",
                    "sector_consistency FLOAT",
                    "had_incident BOOLEAN",
                    "safety_car_lap BOOLEAN",
                    "is_outlier BOOLEAN",
                ],
                'pit_stops': [
                    "pit_stop_count INTEGER",
                    "pit_stop_timing INTEGER",
                    "normal_pit_stop BOOLEAN",
                    "long_pit_stop BOOLEAN", 
                    "penalty_pit_stop BOOLEAN",
                    "is_outlier BOOLEAN",
                ],
                'stints': [
                    "stint_duration INTEGER",
                    "tire_age_progression INTEGER",
                    "is_outlier BOOLEAN",
                ],
                'positions': [
                    "position_change INTEGER",
                    "position_std FLOAT",
                    "is_leader BOOLEAN",
                    "position_improved BOOLEAN",
                    "position_declined BOOLEAN",
                    "is_outlier BOOLEAN",
                ],
                'intervals': [
                    "is_leader BOOLEAN",
                    "is_lapped BOOLEAN",
                    "is_outlier BOOLEAN"
                ],
                'weather': [
                    "temperature_delta FLOAT",
                    "weather_severity FLOAT",
                    "extreme_weather BOOLEAN",
                ],
                'drivers': [
                    "team_name_encoded INTEGER",
                ],
                'sessions': [
                    "session_type_encoded INTEGER",
                ],
            }
            
            for table_name, new_columns in tables_config.items():
                # Create transformed table
                cursor.execute(f"""
                    CREATE TABLE IF NOT EXISTS {table_name}_transformed AS 
                    SELECT * FROM {table_name}
                """)
                
                # Add new columns for transformed features
                for column_def in new_columns:
                    try:
                        cursor.execute(f"ALTER TABLE {table_name}_transformed ADD COLUMN IF NOT EXISTS {column_def}")
                    except Exception as e:
                        self.logger.warning(f"Could not add column {column_def} to {table_name}_transformed: {e}")
            
            conn.commit()
            self.logger.info("Created transformed tables in database")
            
        except Exception as e:
            conn.rollback()
            self.logger.error(f"Error creating transformed tables: {e}")
            raise
        finally:
            cursor.close()
            conn.close()
    
    def _load_table_data(self, table_name: str) -> Optional[pd.DataFrame]:
        """Load data from database table"""
        conn = self.db_ingestion.connect_to_db()
        cursor = conn.cursor()
        
        try:
            cursor.execute(f"SELECT * FROM {table_name}")
            columns = [desc[0] for desc in cursor.description]
            data = cursor.fetchall()
            
            if data:
                df = pd.DataFrame(data, columns=columns)
                return df
            else:
                return None
                
        except Exception as e:
            self.logger.error(f"Error loading table {table_name}: {e}")
            return None
        finally:
            cursor.close()
            conn.close()
    
    def _transform_table(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Transform a specific table"""
        self.logger.info(f"Transforming {table_name} with {len(df)} rows")
        
        try:
            # Step 1: Handle missing values
            df = self._handle_missing_values(df, table_name)
            
            # Step 2: Fix data types
            df = self._fix_data_types(df, table_name)
            
            # Step 3: Handle outliers
            df = self._handle_outliers(df, table_name)
            
            # Step 4: Encode categorical variables
            df = self._encode_categorical_variables(df, table_name)
            
            # Step 5: Create table-specific features
            df = self._create_table_specific_features(df, table_name)
            
            # Debug the final result
            self._debug_table_data(df, table_name)
            
            return df
            
        except Exception as e:
            self.logger.error(f"Error in _transform_table for {table_name}: {e}")
            # Return the original dataframe with basic transformations
            # Only add is_outlier for tables that have it in schema
            tables_with_outliers = ['laps', 'pit_stops', 'stints', 'positions', 'intervals']
            if table_name in tables_with_outliers:
                df['is_outlier'] = False
                df['outlier_type'] = 'normal'
            return df
    
    def _handle_missing_values(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Handle missing values based on strategy"""
        self.logger.info(f"Handling missing values in {table_name}")
        
        try:
            if self.config.missing_value_strategy == "impute":
                # Use different strategies for different column types
                for column in df.columns:
                    if column in self.config.numeric_columns and df[column].dtype in ['float64', 'int64']:
                        # Use median for numeric columns
                        try:
                            imputer = SimpleImputer(strategy='median')
                            df[column] = imputer.fit_transform(df[[column]])
                            self.imputers[f"{table_name}_{column}"] = imputer
                        except Exception as e:
                            self.logger.warning(f"Could not impute {column} in {table_name}: {e}")
                            # Fallback to forward fill then backward fill
                            df[column] = df[column].ffill().bfill().fillna(0)
                            
                    elif column in self.config.categorical_columns:
                        # Use most frequent for categorical columns
                        try:
                            imputer = SimpleImputer(strategy='most_frequent')
                            df[column] = imputer.fit_transform(df[[column]])
                            self.imputers[f"{table_name}_{column}"] = imputer
                        except Exception as e:
                            self.logger.warning(f"Could not impute {column} in {table_name}: {e}")
                            # Fallback to 'Unknown'
                            df[column] = df[column].fillna('Unknown')
                            
                    elif column in self.config.datetime_columns:
                        # Forward fill for datetime columns
                        df[column] = df[column].ffill().bfill()

            
            elif self.config.missing_value_strategy == "drop":
                df = df.dropna()
            
            elif self.config.missing_value_strategy == "interpolate":
                # Interpolate numeric columns
                try:
                    numeric_cols = df.select_dtypes(include=[np.number]).columns
                    df[numeric_cols] = df[numeric_cols].interpolate(method='linear')
                except Exception as e:
                    self.logger.warning(f"Could not interpolate numeric columns in {table_name}: {e}")
                    # Fallback to forward fill
                    numeric_cols = df.select_dtypes(include=[np.number]).columns
                    df[numeric_cols] = df[numeric_cols].ffill().bfill().fillna(0)
            
            return df
        
        except Exception as e:
            self.logger.error(f"Error in _handle_missing_values for {table_name}: {e}")
            # Return dataframe with basic missing value handling
            for column in df.columns:
                if df[column].dtype in ['float64', 'int64']:
                    df[column] = df[column].fillna(0)
                elif df[column].dtype == 'object':
                    df[column] = df[column].fillna('Unknown')
            return df
    
    def _fix_data_types(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Fix data types for each column"""
        self.logger.info(f"Fixing data types in {table_name}")
        
        for column in df.columns:
            if column in self.config.numeric_columns:
                try:
                    df[column] = pd.to_numeric(df[column], errors='coerce')
                except:
                    self.logger.warning(f"Could not convert {column} to numeric in {table_name}")
            
            elif column in self.config.datetime_columns:
                try:
                    df[column] = pd.to_datetime(df[column], errors='coerce')
                except:
                    self.logger.warning(f"Could not convert {column} to datetime in {table_name}")
            
            elif column in self.config.categorical_columns:
                df[column] = df[column].astype('category')
        
        return df
    
    def _handle_outliers(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Handle outliers with F1-specific context"""
        if self.config.outlier_strategy == "ignore":
            return df
        
        self.logger.info(f"Handling outliers in {table_name} with F1 context")

        tables_with_outliers = ['laps', 'pit_stops', 'stints', 'positions', 'intervals']
        
        # Detect F1-specific outliers
        df = self._detect_f1_outliers(df, table_name)
        
        # Handle based on strategy
        if self.config.outlier_strategy == "context_aware":
            # Flag outliers but keep all data
            df = self._flag_outliers_context_aware(df, table_name)
            
        elif self.config.outlier_strategy == "remove_system_errors":
            # Only remove obvious system errors
            df = self._remove_only_system_errors(df, table_name)
            
        elif self.config.outlier_strategy == "cap":
            # Cap outliers to reasonable bounds
            df = self._cap_outliers_f1_context(df, table_name)
        
        return df
    
    def _detect_f1_outliers(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Detect outliers with F1-specific rules"""
        
        if table_name == "laps":
            # Lap time outliers
            df['is_outlier'] = (
                (df['lap_duration'] < 0) | 
                (df['lap_duration'] > 9999) |
                (df['lap_duration'] == 0) |
                (df['lap_duration'] > 300)
            )
            
        elif table_name == "pit_stops":
            # Pit stop outliers
            df['is_outlier'] = (
                (df['pit_duration'] < 0) |
                (df['pit_duration'] > 1000) | 
                (df['pit_duration'] == 0)
            )
            
        elif table_name == "positions":
            # Position outliers
            df['is_outlier'] = (
                (df['position'] > 50) |
                (df['position'] < -1)
            )
            
        elif table_name == "intervals":
            # Interval outliers
            df['is_outlier'] = (
                (df['gap_to_leader'] < -1000) |  # System errors
                (df['gap_to_leader'] > 10000) |  # System errors
                (df['interval'] < -1000) |  # System errors
                (df['interval'] > 10000)  # System errors
            )
            
        else:
            # For other tables, use standard outlier detection
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            df['is_outlier'] = False
            
            for column in numeric_cols:
                if column in ['id', 'created_at', 'is_outlier']:  # Skip metadata columns
                    continue
                    
                Q1 = df[column].quantile(0.25)
                Q3 = df[column].quantile(0.75)
                IQR = Q3 - Q1
                
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                
                column_outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
                df['is_outlier'] = df['is_outlier'] | column_outliers
        
        return df
    
    def _flag_outliers_context_aware(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Flag outliers but keep all data - create features from outliers"""
        
        if 'is_outlier' not in df.columns:
            df['is_outlier'] = False
        
        # Classify outlier types
        df['outlier_type'] = self._classify_outlier_type(df, table_name)
        
        # Create features from outliers
        if table_name == "laps":
            df['had_incident'] = ((df['lap_duration'] > 120) & (df['lap_duration'] <= 300)).astype(bool)
            df['safety_car_lap'] = ((df['lap_duration'] > 90) & (df['lap_duration'] <= 120)).astype(bool)
            
        elif table_name == "pit_stops":
            df['normal_pit_stop'] = ((df['pit_duration'] >= 0.5) & (df['pit_duration'] <= 5)).astype(bool)
            df['long_pit_stop'] = ((df['pit_duration'] > 5) & (df['pit_duration'] <= 60)).astype(bool)
            df['penalty_pit_stop'] = ((df['pit_duration'] > 60) & (df['pit_duration'] <= 300)).astype(bool)
            
        elif table_name == "positions":
            df['is_leader'] = (df['position'] == 0).astype(bool)
            df['is_retired'] = (df['position'] == -1).astype(bool)
            df['is_lapped'] = (df['position'] > 20).astype(bool)
            
        elif table_name == "intervals":
            df['is_leader'] = (df['gap_to_leader'] == 0).astype(bool)
            df['is_lapped'] = ((df['gap_to_leader'] == -999) | (df['interval'] == -999)).astype(bool)

        elif table_name == "weather":
            df['extreme_weather'] = (
                ((df['air_temperature'] < -20) | (df['air_temperature'] > 50)) |
                ((df['track_temperature'] < -10) | (df['track_temperature'] > 80)) |
                (df['humidity'] > 90)
            ).astype(bool)

        self.logger.info(f"Flagged {df['is_outlier'].sum()} outliers in {table_name}")
        return df
    
    def _remove_only_system_errors(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Only remove obvious system errors, keep all race events"""
        
        if table_name == "laps":
            # Remove only system errors
            df = df[df['lap_duration'] > 0]  # Remove negative times
            df = df[df['lap_duration'] < 9999]  # Remove system errors
            self.logger.info(f"Removed system errors from {table_name}")
            
        elif table_name == "pit_stops":
            # Remove only system errors
            df = df[df['pit_duration'] >= 0]  # Remove negative durations
            df = df[df['pit_duration'] < 1000]  # Remove system errors
            self.logger.info(f"Removed system errors from {table_name}")
            
        elif table_name == "positions":
            # Remove only system errors
            df = df[df['position'] <= 50]  # Remove impossible positions
            df = df[df['position'] >= -1]  # Remove invalid negatives
            self.logger.info(f"Removed system errors from {table_name}")
            
        elif table_name == "intervals":
            # Remove only system errors
            df = df[df['gap_to_leader'] >= -1000]  # Remove system errors
            df = df[df['gap_to_leader'] <= 10000]  # Remove system errors
            df = df[df['interval'] >= -1000]  # Remove system errors
            df = df[df['interval'] <= 10000]  # Remove system errors
            self.logger.info(f"Removed system errors from {table_name}")
            
        elif table_name == "weather":
            # Remove only system errors
            df = df[df['air_temperature'] >= -50]  # Remove impossible cold
            df = df[df['air_temperature'] <= 100]  # Remove impossible hot
            df = df[df['humidity'] >= 0]  # Remove impossible humidity
            df = df[df['humidity'] <= 100]  # Remove impossible humidity
            self.logger.info(f"Removed system errors from {table_name}")
        
        return df
    
    def _cap_outliers_f1_context(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Cap outliers to reasonable F1 bounds"""
        
        if table_name == "laps":
            # Cap lap times to reasonable bounds
            df['lap_duration'] = df['lap_duration'].clip(lower=0, upper=300)
            
        elif table_name == "pit_stops":
            # Cap pit durations to reasonable bounds
            df['pit_duration'] = df['pit_duration'].clip(lower=0, upper=300)
            
        elif table_name == "positions":
            # Cap positions to reasonable bounds
            df['position'] = df['position'].clip(lower=-1, upper=50)
            
        elif table_name == "intervals":
            # Cap intervals to reasonable bounds
            df['gap_to_leader'] = df['gap_to_leader'].clip(lower=-999, upper=999)
            df['interval'] = df['interval'].clip(lower=-999, upper=999)
            
        elif table_name == "weather":
            # Cap weather to reasonable bounds
            df['air_temperature'] = df['air_temperature'].clip(lower=-50, upper=100)
            df['track_temperature'] = df['track_temperature'].clip(lower=-50, upper=150)
            df['humidity'] = df['humidity'].clip(lower=0, upper=100)
        
        return df
    
    def _classify_outlier_type(self, df: pd.DataFrame, table_name: str) -> pd.Series:
        """Classify outliers by type"""
        
        outlier_types = []
        
        for idx, row in df.iterrows():
            if not row.get('is_outlier', False):
                outlier_types.append('normal')
                continue
                
            if table_name == "laps":
                if row['lap_duration'] < 0 or row['lap_duration'] > 9999:
                    pass
                elif row['lap_duration'] > 120:
                    outlier_types.append('incident_safety_car')
                elif row['lap_duration'] < 15:
                    outlier_types.append('qualifying_slipstream')
                else:
                    outlier_types.append('unknown')
                    
            elif table_name == "pit_stops":
                if row['pit_duration'] < 0 or row['pit_duration'] > 1000:
                    pass
                elif row['pit_duration'] > 60:
                    outlier_types.append('penalty_repair')
                elif row['pit_duration'] < 0.5:
                    outlier_types.append('very_fast_stop')
                else:
                    outlier_types.append('unknown')
                    
            elif table_name == "positions":
                if row['position'] > 50 or row['position'] < -1:
                    pass
                elif row['position'] == -1:
                    outlier_types.append('retired')
                elif row['position'] > 20:
                    outlier_types.append('lapped')
                else:
                    outlier_types.append('unknown')
                    
            else:
                outlier_types.append('unknown')
        
        return pd.Series(outlier_types)
    
    def _encode_categorical_variables(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Encode categorical variables"""
        self.logger.info(f"Encoding categorical variables in {table_name}")
        
        # Define which columns to encode for each table
        encoding_config = {
            'sessions': ['session_type'],
            'drivers': ['team_name'],
        }
        
        if table_name in encoding_config:
            for column in encoding_config[table_name]:
                if column in df.columns:
                    # Handle missing values first - use a different approach for categorical
                    if df[column].dtype.name == 'category':
                        # If it's already categorical, add 'Unknown' to categories first
                        current_categories = df[column].cat.categories.tolist()
                        if 'Unknown' not in current_categories:
                            df[column] = df[column].cat.add_categories(['Unknown'])
                        df[column] = df[column].fillna('Unknown')
                    else:
                        # Convert to string and fill missing values
                        df[column] = df[column].astype(str).fillna('Unknown')
                    
                    # Create label encoder
                    le = LabelEncoder()
                    try:
                        # Fit and transform
                        encoded_values = le.fit_transform(df[column].astype(str))
                        df[f"{column}_encoded"] = encoded_values
                        self.label_encoders[f"{table_name}_{column}"] = le
                        
                        self.logger.info(f"Encoded {column} in {table_name} with {len(le.classes_)} unique values")
                    except Exception as e:
                        self.logger.warning(f"Could not encode {column} in {table_name}: {e}")
                        df[f"{column}_encoded"] = 0  # Default value
                else:
                    self.logger.warning(f"Column {column} not found in {table_name}")
                    df[f"{column}_encoded"] = 0  # Default value
        
        return df
    
    def _create_table_specific_features(self, df: pd.DataFrame, table_name: str) -> pd.DataFrame:
        """Create table-specific features"""
        try:
            if table_name == "laps":
                df = self._create_lap_features(df)
            elif table_name == "stints":
                df = self._create_stint_features(df)
            elif table_name == "pit_stops":
                df = self._create_pit_stop_features(df)
            elif table_name == "positions":
                df = self._create_position_features(df)
            elif table_name == "intervals":
                df = self._create_interval_features(df)
            elif table_name == "weather":
                df = self._create_weather_features(df)
            
            return df
        
        except Exception as e:
            self.logger.error(f"Error in _create_table_specific_features for {table_name}: {e}")
            return df
    
    def _create_lap_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create lap-specific features"""
        # Lap time consistency
        df['lap_time_std'] = df.groupby('driver_number')['lap_duration'].transform('std')
        df['lap_time_mean'] = df.groupby('driver_number')['lap_duration'].transform('mean')
        df['lap_time_deviation'] = df['lap_duration'] - df['lap_time_mean']
        
        # Sector analysis
        df['total_sector_time'] = df['duration_sector_1'] + df['duration_sector_2'] + df['duration_sector_3']
        df['sector_consistency'] = df[['duration_sector_1', 'duration_sector_2', 'duration_sector_3']].std(axis=1)
        
        # Boolean features for laps
        df['had_incident'] = ((df['lap_duration'] > 120) & (df['lap_duration'] <= 300)).astype(bool)
        df['safety_car_lap'] = ((df['lap_duration'] > 90) & (df['lap_duration'] <= 120)).astype(bool)

        return df
    
    def _create_stint_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create stint-specific features"""
        try:
            # Ensure we have the required columns
            required_cols = ['lap_start', 'lap_end', 'tyre_age_at_start', 'session_key', 'driver_number']
            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                self.logger.warning(f"Missing columns in stints: {missing_cols}")
                # Add default values for missing columns
                for col in missing_cols:
                    df[col] = 0
            
            # Stint duration - handle missing values and invalid data
            df['stint_duration'] = df['lap_end'] - df['lap_start']
            # Handle cases where lap_end < lap_start (invalid data)
            df['stint_duration'] = df['stint_duration'].clip(lower=0, upper=100)
            df['stint_duration'] = df['stint_duration'].fillna(0)
            
            # Tire age progression - handle missing values
            try:
                df['tire_age_progression'] = df.groupby(['session_key', 'driver_number'])['tyre_age_at_start'].diff()
                df['tire_age_progression'] = df['tire_age_progression'].fillna(0).clip(lower=0, upper=50)
            except Exception as e:
                self.logger.warning(f"Could not calculate tire_age_progression: {e}")
                df['tire_age_progression'] = 0
            
            return df
            
        except Exception as e:
            self.logger.error(f"Error in _create_stint_features: {e}")
            # Return dataframe with default values
            df['stint_duration'] = 0
            df['tire_age_progression'] = 0
            return df
    
    def _create_pit_stop_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create pit stop-specific features with ALL features included"""

        df['pit_stop_count'] = df.groupby(['session_key', 'driver_number']).cumcount() + 1
        df['pit_stop_count'] = df['pit_stop_count'].clip(upper=20)  # Cap at 20 pit stops (more reasonable)
        
        df['pit_stop_timing'] = df.groupby(['session_key', 'driver_number'])['lap_number'].diff()
        df['pit_stop_timing'] = df['pit_stop_timing'].clip(lower=0, upper=50)  # Cap at 50 laps

        df['normal_pit_stop'] = (df['pit_duration'] >= 0.5) & (df['pit_duration'] <= 5)
        df['long_pit_stop'] = (df['pit_duration'] > 5) & (df['pit_duration'] <= 60)
        df['penalty_pit_stop'] = (df['pit_duration'] > 60) & (df['pit_duration'] <= 300)
        
        return df

    def _create_position_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create position-specific features with ALL features included"""

        df['position_change'] = df.groupby(['session_key', 'driver_number'])['position'].diff()
        df['position_change'] = df['position_change'].clip(lower=-20, upper=20) 
        
        df['position_std'] = df.groupby(['session_key', 'driver_number'])['position'].transform('std')
        df['position_std'] = df['position_std'].clip(upper=20) 
        
        df['is_leader'] = (df['position'] == 1).astype(bool) 

        # Position improvement/decline
        df['position_improved'] = (df['position_change'] < 0).astype(bool)
        df['position_declined'] = (df['position_change'] > 0).astype(bool)
        
        return df

    def _create_interval_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create interval-specific features with ALL features included"""
        # Boolean features (these should be created here, not in outlier handling)
        df['is_leader'] = df['gap_to_leader'] == 0
        df['is_lapped'] = (df['gap_to_leader'] == -999) | (df['interval'] == -999)
        
        return df
    
    def _create_weather_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create weather-specific features"""
        # Weather impact features
        df['temperature_delta'] = df['track_temperature'] - df['air_temperature']
        df['weather_severity'] = df['humidity'] * df['rainfall'].astype(int)
        
        # Boolean features for weather
        df['extreme_weather'] = (
            ((df['air_temperature'] < -20) | (df['air_temperature'] > 50)) |
            ((df['track_temperature'] < -10) | (df['track_temperature'] > 80)) |
            (df['humidity'] > 90)
        ).astype(bool)
        
        return df
    
    def _update_database_table(self, table_name: str, df: pd.DataFrame):
        """Update transformed table in database"""
        conn = self.db_ingestion.connect_to_db()
        cursor = conn.cursor()
        
        try:
            # First, get the actual columns that exist in the database table
            cursor.execute(f"""
                SELECT column_name 
                FROM information_schema.columns 
                WHERE table_name = '{table_name}_transformed'
                ORDER BY ordinal_position
            """)
            db_columns = [row[0] for row in cursor.fetchall()]
            
            # Filter DataFrame to only include columns that exist in the database
            df_clean = df[db_columns].copy()
            
            # Convert numeric columns to appropriate types
            for col in df_clean.columns:
                if 'count' in col or 'timing' in col or 'duration' in col or 'change' in col or 'progression' in col:
                    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce').fillna(0).astype('int32')
                elif 'std' in col or 'mean' in col or 'deviation' in col or 'consistency' in col:
                    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce').fillna(0.0).astype('float32')
                elif col in ['is_outlier', 'had_incident', 'safety_car_lap', 
                            'normal_pit_stop', 'long_pit_stop', 'penalty_pit_stop', 'is_leader', 'is_retired', 
                            'is_lapped', 'extreme_weather']:
                    df_clean[col] = df_clean[col].fillna(False).astype('bool')
                elif col.endswith('_encoded'):
                    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce').fillna(0).astype('int32')
                elif col == 'outlier_type':
                    df_clean[col] = df_clean[col].fillna('normal')
            
            # Clear existing data
            cursor.execute(f"DELETE FROM {table_name}_transformed")
            
            # Insert transformed data
            for _, row in df_clean.iterrows():
                placeholders = ', '.join(['%s'] * len(row))
                columns = ', '.join(row.index)
                query = f"INSERT INTO {table_name}_transformed ({columns}) VALUES ({placeholders})"
                cursor.execute(query, tuple(row.values))
            
            conn.commit()
            self.logger.info(f"Updated {table_name}_transformed with {len(df)} rows")
            
        except Exception as e:
            conn.rollback()
            self.logger.error(f"Error updating {table_name}_transformed: {e}")
            raise
        finally:
            cursor.close()
            conn.close()
    
    def _load_all_transformed_data(self) -> Dict[str, pd.DataFrame]:
        """Load all transformed data from database"""
        transformed_data = {}
        
        for table in self.config.tables_to_transform:
            df = self._load_table_data(f"{table}_transformed")
            if df is not None:
                transformed_data[table] = df
        
        return transformed_data
    
    def get_feature_columns(self) -> List[str]:
        """Return the de-duplicated feature column names across all transformed tables.

        Metadata columns (``id``, ``created_at``, ``meeting_key``,
        ``session_key``) are excluded. Names are returned in first-seen order:
        tables in the iteration order of ``self.transformed_data``, columns in
        each table's own order. The order is therefore stable from run to run,
        which a plain ``set`` does not guarantee for strings.

        Returns:
            List of unique column names; empty if no tables are loaded.
        """
        # Exclude metadata columns
        exclude_cols = {'id', 'created_at', 'meeting_key', 'session_key'}

        # A dict preserves insertion order, so it works as an ordered set.
        seen: Dict[str, None] = {}
        for df in self.transformed_data.values():
            for col in df.columns:
                if col not in exclude_cols:
                    seen.setdefault(col, None)
        return list(seen)
    
    def _debug_table_data(self, df: pd.DataFrame, table_name: str):
        """Debug method to see what's happening with the data"""
        self.logger.info(f"DEBUG {table_name}:")
        self.logger.info(f"  Shape: {df.shape}")
        self.logger.info(f"  Columns: {list(df.columns)}")
        self.logger.info(f"  Missing values: {df.isnull().sum().sum()}")
        
        # Check for encoded columns
        encoded_cols = [col for col in df.columns if col.endswith('_encoded')]
        if encoded_cols:
            self.logger.info(f"  Encoded columns: {encoded_cols}")
            for col in encoded_cols:
                self.logger.info(f"    {col}: {df[col].value_counts().to_dict()}")
        
        # Check for boolean columns
        tables_with_outliers = ['laps', 'pit_stops', 'stints', 'positions', 'intervals']
        if table_name in tables_with_outliers:
            boolean_cols = df.select_dtypes(include=['bool']).columns.tolist()
            if boolean_cols:
                self.logger.info(f"  Boolean columns: {boolean_cols}")
                for col in boolean_cols:
                    value_counts = df[col].value_counts().to_dict()
                    self.logger.info(f"    {col}: {value_counts}")

In [17]:
# ADD THIS CELL BEFORE RUNNING TRANSFORMATION
# Drops every *_transformed table so the transformation run recreates them
# with the correct column types. All drops share one transaction: on any
# error the transaction is rolled back and the error is printed, not raised.

from src.formula_one.entity.config_entity import DatabaseConfig
from src.formula_one.components.data_ingestion import DatabaseIngestion

# Open a connection through the project's ingestion helper
db_config = DatabaseConfig()
db_ingestion = DatabaseIngestion(None, db_config, None)
conn = db_ingestion.connect_to_db()
cursor = conn.cursor()

# One transformed table per source table
tables_to_drop = [
    f"{source}_transformed"
    for source in (
        "sessions", "drivers",
        "laps", "pit_stops", "stints",
        "positions", "intervals", "weather",
    )
]

try:
    for table in tables_to_drop:
        # CASCADE also removes objects that depend on the table.
        cursor.execute(f"DROP TABLE IF EXISTS {table} CASCADE")
        print(f"Dropped {table}")

    conn.commit()
    print("All transformed tables dropped successfully!")

except Exception as exc:
    conn.rollback()
    print(f"Error dropping tables: {exc}")
finally:
    # Release the cursor and connection whether or not the drops succeeded.
    cursor.close()
    conn.close()

[2025-07-03 19:29:12,766: INFO: data_ingestion: Successfully connected to PostgreSQL database]
Dropped sessions_transformed
Dropped drivers_transformed
Dropped laps_transformed
Dropped pit_stops_transformed
Dropped stints_transformed
Dropped positions_transformed
Dropped intervals_transformed
Dropped weather_transformed
All transformed tables dropped successfully!


In [18]:
# Run the full transformation and print a per-table summary.
# Relies on ConfigurationManager, DataTransformation and DatabaseConfig
# already being defined/imported by earlier cells in this notebook.

# Load configuration
config_manager = ConfigurationManager()
transformation_config = config_manager.get_data_transformation_config()
db_config = DatabaseConfig()

# Create transformation instance
data_transformation = DataTransformation(transformation_config, db_config)

# Run transformation (updates database)
transformed_data = data_transformation.transform_all_data()

# Print results
print("=== DATA TRANSFORMATION RESULTS (Database-First) ===")
print(f"Transformed Tables: {len(transformed_data)}")

for table_name, df in transformed_data.items():
    print(f"\n{table_name.upper()}:")
    print(f"  Rows: {len(df)}")
    print(f"  Columns: {len(df.columns)}")
    print(f"  Missing Values: {df.isnull().sum().sum()}")
    
    # Show new features (matched by substring of the column name)
    new_features = [col for col in df.columns if any(suffix in col for suffix in ['_std', '_mean', '_deviation', 'had_incident', 'safety_car_lap', 'is_leader', 'is_retired', '_encoded'])]
    if new_features:
        print(f"  New Features: {len(new_features)}")
        print(f"    {new_features[:5]}...")  # Show first 5
    
    # Show outlier information
    if 'is_outlier' in df.columns:
        outlier_count = df['is_outlier'].sum()
        # An empty table would otherwise divide by zero; report 0.0% instead.
        outlier_pct = (outlier_count / len(df) * 100) if len(df) else 0.0
        print(f"  Outliers: {outlier_count} ({outlier_pct:.1f}%)")

# Get feature columns for ML
feature_columns = data_transformation.get_feature_columns()
print(f"\nTotal Feature Columns for ML: {len(feature_columns)}")

[2025-07-03 19:29:12,776: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-07-03 19:29:12,777: INFO: 946721898: Starting comprehensive data transformation (Database-First)]
[2025-07-03 19:29:12,781: INFO: data_ingestion: Successfully connected to PostgreSQL database]
[2025-07-03 19:29:13,311: INFO: 946721898: Created transformed tables in database]
[2025-07-03 19:29:13,315: INFO: data_ingestion: Successfully connected to PostgreSQL database]
[2025-07-03 19:29:13,318: INFO: 946721898: Table structure for stints:]
[2025-07-03 19:29:13,318: INFO: 946721898:   id: integer (nullable: NO)]
[2025-07-03 19:29:13,319: INFO: 946721898:   session_key: integer (nullable: YES)]
[2025-07-03 19:29:13,319: INFO: 946721898:   meeting_key: integer (nullable: YES)]
[2025-07-03 19:29:13,319: INFO: 946721898:   driver_number: integer (nullable: YES)]
[2025-07-03 19:29:13,320: INFO: 946721898:   compound: character varying (nullable: YES)]
[2025-07-03 19:29:13,320: INFO: 946721898:   l

In [20]:
# Print first 5 rows of each transformed table, then a row-count summary.
# Each table is read in its own try/except so one failure does not stop the
# preview. After a failed statement PostgreSQL keeps the transaction in an
# aborted state and rejects every following command until a rollback, so each
# handler rolls back before moving on to the next table.
from src.formula_one.entity.config_entity import DatabaseConfig
from src.formula_one.components.data_ingestion import DatabaseIngestion
import pandas as pd

# Create database connection
db_config = DatabaseConfig()
db_ingestion = DatabaseIngestion(None, db_config, None)
conn = db_ingestion.connect_to_db()
cursor = conn.cursor()

try:
    # Start from a clean transaction state
    conn.rollback()
    
    # List of transformed tables
    transformed_tables = [
        "sessions_transformed", "drivers_transformed",
        "laps_transformed", "pit_stops_transformed", "stints_transformed", 
        "positions_transformed", "intervals_transformed", "weather_transformed",
    ]
    
    for table in transformed_tables:
        print(f"\n{'='*60}")
        print(f"TABLE: {table.upper()}")
        print(f"{'='*60}")
        
        try:
            # Get first 5 rows
            cursor.execute(f"SELECT * FROM {table} LIMIT 5")
            columns = [desc[0] for desc in cursor.description]
            data = cursor.fetchall()
            
            if data:
                df = pd.DataFrame(data, columns=columns)
                print(f"Shape: {df.shape}")
                print(f"Columns: {list(df.columns)}")
                print("\nFirst 5 rows:")
                print(df.to_string(index=False, max_cols=10))
                
                # Show feature columns specifically (matched by substring of the column name)
                feature_cols = [col for col in df.columns if any(suffix in col for suffix in 
                    ['_std', '_mean', '_deviation', 'had_incident', 'safety_car_lap', 
                     'is_leader', 'is_retired', '_encoded', '_count', '_timing', 
                     '_change', 'normal_', 'long_', 'penalty_', 'system_error'])]
                
                if feature_cols:
                    print(f"\nFeature columns: {feature_cols}")
                    print("Feature values (first 5 rows):")
                    print(df[feature_cols].head().to_string(index=False))
            else:
                print("Table is empty")
                
        except Exception as e:
            print(f"Error reading {table}: {e}")
            # Clear the aborted transaction so the remaining tables can still be read.
            conn.rollback()
    
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    
    # Summary of all tables
    for table in transformed_tables:
        try:
            cursor.execute(f"SELECT COUNT(*) FROM {table}")
            count = cursor.fetchone()[0]
            print(f"{table}: {count:,} rows")
        except Exception as e:
            print(f"{table}: Error - {e}")
            # Same reason as above: recover the connection for the next count.
            conn.rollback()

except Exception as e:
    print(f"Error: {e}")
finally:
    cursor.close()
    conn.close()

[2025-07-04 13:50:05,305: INFO: data_ingestion: Successfully connected to PostgreSQL database]

TABLE: SESSIONS_TRANSFORMED
Shape: (5, 8)
Columns: ['session_key', 'meeting_key', 'session_name', 'session_type', 'date_start', 'date_end', 'created_at', 'session_type_encoded']

First 5 rows:
 session_key  meeting_key session_name session_type          date_start            date_end                 created_at  session_type_encoded
        9974         1261   Practice 3     Practice 2025-05-24 10:30:00 2025-05-24 11:30:00 2025-07-04 11:51:27.876505                     0
        9975         1261   Qualifying   Qualifying 2025-05-24 14:00:00 2025-05-24 15:00:00 2025-07-04 11:51:27.876505                     1
        9979         1261         Race         Race 2025-05-25 13:00:00 2025-05-25 15:00:00 2025-07-04 11:51:27.876505                     2
        9964         1262   Practice 1     Practice 2025-05-30 11:30:00 2025-05-30 12:30:00 2025-07-04 11:52:09.977048                     0
      