# In [None]:
"""
QUY TR√åNH X·ª¨ L√ù D·ªÆ LI·ªÜU - PART 2
==================================
Data Cleaning & Advanced Preprocessing
"""


from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from part_2 import DataExplorationPipeline


# ============================================================================
# CONTINUATION OF CLASS DataExplorationPipeline
# ============================================================================


class DataCleaningPipeline(DataExplorationPipeline):
    """
    Extends the class with cleaning and preprocessing functionality.
    """

    def __init__(self, data_path=None, df=None):
        """Initialise the parent pipeline and the cleaning bookkeeping.

        Parameters
        ----------
        data_path : optional
            Forwarded unchanged to the parent constructor.
        df : optional
            Forwarded unchanged to the parent constructor.
        """
        super().__init__(data_path, df)

        # Working copy of the data; the cleaning steps create it from
        # ``self.df`` the first time they need it.
        self.df_cleaned = None

        # Per-step records, filled in as the cleaning steps run.
        for record_name in (
            'imputation_strategies',
            'scaling_strategies',
            'encoding_strategies',
        ):
            setattr(self, record_name, {})

    

    # ========================================================================
    # 1. MISSING VALUES HANDLING
    # ========================================================================

    def handle_missing_values(self, strategies=None, threshold=0.7):
        """
        Handle missing values with a per-column strategy.

        Starts from a fresh copy of ``self.df`` (stored in
        ``self.df_cleaned``), drops every column whose missing ratio exceeds
        ``threshold`` and then applies the requested strategy to each
        remaining column that still contains missing values.

        Parameters
        ----------
        strategies : dict, optional
            Mapping ``{column_name: strategy}``, for example
            ``{'column_name': 'mean', 'other_col': 'knn'}``. When ``None``
            the mapping returned by ``_auto_imputation_strategy`` is used.
        threshold : float
            Columns whose fraction of missing values is strictly greater
            than this value are dropped.

        Strategies available:
        - 'drop': drop the rows where the column is missing
        - 'mean': fill with the column mean
        - 'median': fill with the column median
        - 'mode': fill with the most frequent value
        - 'constant': fill with 0 (numeric column) or 'Unknown' (otherwise)
        - 'ffill': forward fill
        - 'bfill': backward fill
        - 'knn': delegate to ``_apply_knn_imputer``
        - 'iterative': delegate to ``_apply_iterative_imputer``

        Returns
        -------
        pandas.DataFrame
            ``self.df_cleaned`` after the strategies were applied.

        Notes
        -----
        An error in one column (including an unsupported strategy name) is
        printed and does not stop the remaining columns. Only strategies
        that ran without error are recorded in ``self.imputation_strategies``.
        """

        print("\n" + "="*80)
        print("üîß X·ª¨ L√ù GI√Å TR·ªä B·ªä THI·∫æU")
        print("="*80)

        self.df_cleaned = self.df.copy()

        # Step 1: drop the columns that have too many missing values.
        missing_pct = self.df_cleaned.isnull().mean()
        cols_to_drop = missing_pct[missing_pct > threshold].index.tolist()

        if cols_to_drop:
            # Report the threshold actually in use rather than a fixed 70%.
            print(f"\nüóëÔ∏è  X√≥a {len(cols_to_drop)} c·ªôt c√≥ >{threshold*100:g}% gi√° tr·ªã thi·∫øu:")
            for col in cols_to_drop:
                print(f"  ‚Ä¢ {col}: {missing_pct[col]*100:.1f}% thi·∫øu")

            self.df_cleaned.drop(columns=cols_to_drop, inplace=True)

        # Step 2: apply the strategies.
        if strategies is None:
            strategies = self._auto_imputation_strategy()

        print("\nüìã Strategies ƒë∆∞·ª£c √°p d·ª•ng:")

        for col, strategy in strategies.items():
            if col not in self.df_cleaned.columns:
                continue

            missing_count = self.df_cleaned[col].isnull().sum()
            if missing_count == 0:
                continue

            print(f"\n  ‚Üí {col} ({missing_count} gi√° tr·ªã thi·∫øu):")
            print(f"     Strategy: {strategy}")

            try:
                column = self.df_cleaned[col]

                # Results are assigned back to the frame explicitly: calling
                # ``fillna(inplace=True)`` on ``df[col]`` acts on a temporary
                # object and is not guaranteed to update the frame.
                if strategy == 'drop':
                    self.df_cleaned.dropna(subset=[col], inplace=True)

                elif strategy == 'mean':
                    self.df_cleaned[col] = column.fillna(column.mean())

                elif strategy == 'median':
                    self.df_cleaned[col] = column.fillna(column.median())

                elif strategy == 'mode':
                    mode_val = column.mode()
                    # ``mode()`` is empty when the column has no observed value.
                    if len(mode_val) > 0:
                        self.df_cleaned[col] = column.fillna(mode_val.iloc[0])

                elif strategy == 'constant':
                    fill_val = 0 if pd.api.types.is_numeric_dtype(column) else 'Unknown'
                    self.df_cleaned[col] = column.fillna(fill_val)

                elif strategy == 'ffill':
                    self.df_cleaned[col] = column.ffill()

                elif strategy == 'bfill':
                    self.df_cleaned[col] = column.bfill()

                elif strategy == 'knn':
                    self._apply_knn_imputer([col])

                elif strategy == 'iterative':
                    self._apply_iterative_imputer([col])

                else:
                    # Surface typos instead of reporting them as completed.
                    raise ValueError(f"Unsupported imputation strategy: {strategy!r}")

                self.imputation_strategies[col] = strategy
                print(f"     ‚úì Ho√†n th√†nh")

            except Exception as e:
                # Best effort per column: report the failure and carry on.
                print(f"     ‚úó L·ªói: {str(e)}")

        # Report the outcome.
        remaining_missing = self.df_cleaned.isnull().sum().sum()
        print(f"\n‚úÖ K·∫øt qu·∫£:")
        print(f"  ‚Ä¢ Gi√° tr·ªã thi·∫øu c√≤n l·∫°i: {remaining_missing}")
        print(f"  ‚Ä¢ S·ªë h√†ng: {len(self.df)} ‚Üí {len(self.df_cleaned)}")
        print(f"  ‚Ä¢ S·ªë c·ªôt: {len(self.df.columns)} ‚Üí {len(self.df_cleaned.columns)}")

        return self.df_cleaned
    


    def _auto_imputation_strategy(self):
        """Choose an imputation strategy for each column of ``self.df`` that has gaps.

        Returns
        -------
        dict
            ``{column: strategy}`` for the columns with missing values that
            appear in ``self.numeric_cols`` ('median' when the absolute
            skewness is above 1, otherwise 'mean'), ``self.categorical_cols``
            ('mode') or ``self.datetime_cols`` ('ffill'). Columns in none of
            these lists get no entry.
        """
        chosen = {}

        incomplete = (
            name for name in self.df.columns
            if self.df[name].isnull().sum() != 0
        )

        for name in incomplete:
            if name in self.numeric_cols:
                # Strongly skewed data: the median is less affected than the mean.
                if abs(self.df[name].skew()) > 1:
                    chosen[name] = 'median'
                else:
                    chosen[name] = 'mean'
                continue

            if name in self.categorical_cols:
                chosen[name] = 'mode'
                continue

            if name in self.datetime_cols:
                chosen[name] = 'ffill'

        return chosen
    


    def _apply_knn_imputer(self, columns, n_neighbors=5):
        """Impute numeric columns of ``self.df_cleaned`` with ``KNNImputer``.

        The imputer is fitted on every usable numeric column so that
        neighbours are found from the other features. Fitting on the target
        column alone leaves no defined distance between rows, and the result
        degenerates to the column mean. Only the requested columns are
        written back.

        Parameters
        ----------
        columns : list
            Columns to impute. Names that are not usable numeric columns of
            ``self.df_cleaned`` are ignored.
        n_neighbors : int
            Forwarded to ``KNNImputer``.

        Notes
        -----
        Distances are computed on the raw column values, so features on a
        large scale dominate the neighbour search.
        """
        frame = self.df_cleaned

        # Columns without any observed value are excluded: the imputer drops
        # them from its output, which would misalign the result.
        features = [
            c for c in self.numeric_cols
            if c in frame.columns
            and pd.api.types.is_numeric_dtype(frame[c])
            and frame[c].notna().any()
        ]
        targets = [c for c in columns if c in features]
        if not targets:
            return

        imputer = KNNImputer(n_neighbors=n_neighbors)
        imputed = imputer.fit_transform(frame[features])

        # Write back by position so that no index alignment is involved.
        positions = [features.index(c) for c in targets]
        frame[targets] = imputed[:, positions]



    def _apply_iterative_imputer(self, columns, max_iter=10):
        """Impute numeric columns of ``self.df_cleaned`` with ``IterativeImputer`` (MICE).

        The imputer is fitted on every usable numeric column so that each
        target is modelled from the other features; with a single feature
        there is nothing to regress on. Only the requested columns are
        written back.

        Parameters
        ----------
        columns : list
            Columns to impute. Names that are not usable numeric columns of
            ``self.df_cleaned`` are ignored.
        max_iter : int
            Forwarded to ``IterativeImputer``.
        """
        frame = self.df_cleaned

        # Columns without any observed value are excluded: the imputer drops
        # them from its output, which would misalign the result.
        features = [
            c for c in self.numeric_cols
            if c in frame.columns
            and pd.api.types.is_numeric_dtype(frame[c])
            and frame[c].notna().any()
        ]
        targets = [c for c in columns if c in features]
        if not targets:
            return

        # Fixed seed keeps the imputation reproducible between runs.
        imputer = IterativeImputer(max_iter=max_iter, random_state=42)
        imputed = imputer.fit_transform(frame[features])

        # Write back by position so that no index alignment is involved.
        positions = [features.index(c) for c in targets]
        frame[targets] = imputed[:, positions]



    # ========================================================================
    # 2. OUTLIER DETECTION & HANDLING
    # ========================================================================
    

    def detect_outliers(self, methods=('iqr', 'zscore'), visualize=True):
        """
        Detect outliers in the numeric columns with one or more methods.

        Parameters
        ----------
        methods : sequence of str
            Methods to run. 'iqr' (values outside ``Q1 - 1.5*IQR`` ..
            ``Q3 + 1.5*IQR``) and 'zscore' (``|z| > 3``) are implemented;
            any other name is ignored.
        visualize : bool
            Whether to draw box plots (first 12 numeric columns at most).

        Returns
        -------
        dict
            ``{column: {method: {'count', 'percentage', ...}}}``; the 'iqr'
            entry also carries 'lower_bound' and 'upper_bound'. The same
            dict is stored in ``self.report['outliers']``.
        """
        print("\n" + "="*80)
        print("üîç PH√ÅT HI·ªÜN NGO·∫†I LAI (OUTLIERS)")
        print("="*80)

        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        outlier_info = {}

        for col in self.numeric_cols:
            if col not in self.df_cleaned.columns:
                continue

            outliers = {}
            data = self.df_cleaned[col].dropna()
            n_obs = len(data)

            # IQR method
            if 'iqr' in methods:
                Q1 = data.quantile(0.25)
                Q3 = data.quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                iqr_outliers = ((data < lower_bound) | (data > upper_bound)).sum()
                outliers['iqr'] = {
                    'count': iqr_outliers,
                    # A column with no observed value has no outliers.
                    'percentage': iqr_outliers / n_obs * 100 if n_obs else 0.0,
                    'lower_bound': lower_bound,
                    'upper_bound': upper_bound
                }

            # Z-score method
            if 'zscore' in methods:
                if n_obs:
                    z_scores = np.abs(stats.zscore(data))
                    zscore_outliers = (z_scores > 3).sum()
                else:
                    # Nothing to standardise.
                    zscore_outliers = 0
                outliers['zscore'] = {
                    'count': zscore_outliers,
                    'percentage': zscore_outliers / n_obs * 100 if n_obs else 0.0
                }

            outlier_info[col] = outliers

        # Print the report.
        print("\nüìä B√°o c√°o Outliers:")
        for col, methods_result in outlier_info.items():
            print(f"\n  {col}:")
            for method, result in methods_result.items():
                print(f"    ‚Ä¢ {method.upper()}: {result['count']} outliers ({result['percentage']:.2f}%)")
                if 'lower_bound' in result:
                    print(f"      Range: [{result['lower_bound']:.2f}, {result['upper_bound']:.2f}]")

        # Visualization
        if visualize:
            cols_to_plot = [c for c in self.numeric_cols[:12] if c in self.df_cleaned.columns]

            # ``plt.subplots`` rejects a grid with zero rows, so only plot
            # when at least one column is left.
            if cols_to_plot:
                n_cols = 3
                n_rows = (len(cols_to_plot) + n_cols - 1) // n_cols

                # ``squeeze=False`` always yields a 2-D array of axes, so a
                # single-row grid flattens the same way as a multi-row one.
                _, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows), squeeze=False)
                axes = axes.flatten()

                for idx, col in enumerate(cols_to_plot):
                    self.df_cleaned.boxplot(column=col, ax=axes[idx])
                    axes[idx].set_title(f'{col}', fontsize=10)
                    axes[idx].set_ylabel('')
                    axes[idx].grid(True, alpha=0.3)

                # Hide the unused cells of the grid.
                for idx in range(len(cols_to_plot), len(axes)):
                    axes[idx].axis('off')

                plt.suptitle('Box Plots - Ph√°t hi·ªán Outliers', fontsize=14, fontweight='bold', y=1.00)
                plt.tight_layout()
                plt.show()

        self.report['outliers'] = outlier_info
        return outlier_info
    

    def handle_outliers(self, strategy='cap', columns=None, iqr_multiplier=1.5):
        """
        Handle outliers in ``self.df_cleaned``.

        Parameters
        ----------
        strategy : str
            'cap': clip values to the IQR fence (Winsorization)
            'remove': drop the rows whose value lies outside the IQR fence
            'log': apply ``np.log1p``
            'boxcox': apply the Box-Cox transform
        columns : list, optional
            Columns to process (``None`` = ``self.numeric_cols``). Names not
            present in ``self.df_cleaned`` are ignored.
        iqr_multiplier : float
            Multiplier applied to the IQR to build the fence.

        Returns
        -------
        pandas.DataFrame
            ``self.df_cleaned`` after processing.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the four supported names.

        Notes
        -----
        Missing values are never treated as outliers: they are left in place
        by every strategy, and 'log' / 'boxcox' decide on the observed
        values only.
        """
        if strategy not in ('cap', 'remove', 'log', 'boxcox'):
            raise ValueError(f"Invalid strategy: {strategy!r}")

        print("\n" + "="*80)
        print("üîß X·ª¨ L√ù NGO·∫†I LAI")
        print("="*80)

        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        if columns is None:
            columns = self.numeric_cols

        columns = [c for c in columns if c in self.df_cleaned.columns]

        print(f"\nStrategy: {strategy}")
        print(f"S·ªë c·ªôt x·ª≠ l√Ω: {len(columns)}\n")

        for col in columns:
            data = self.df_cleaned[col]
            observed = data.dropna()

            if strategy in ('cap', 'remove'):
                Q1 = data.quantile(0.25)
                Q3 = data.quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - iqr_multiplier * IQR
                upper_bound = Q3 + iqr_multiplier * IQR

                # Comparisons with NaN are False, so missing values are
                # never flagged here.
                is_outlier = (data < lower_bound) | (data > upper_bound)

                if strategy == 'cap':
                    # Winsorization
                    self.df_cleaned[col] = data.clip(lower=lower_bound, upper=upper_bound)
                    changed = is_outlier.sum()
                    print(f"  ‚Ä¢ {col}: {changed} gi√° tr·ªã b·ªã cap")
                else:
                    rows_before = len(self.df_cleaned)
                    # ``.copy()`` makes the filtered frame independent of the
                    # one it was sliced from.
                    self.df_cleaned = self.df_cleaned.loc[~is_outlier].copy()
                    removed = rows_before - len(self.df_cleaned)
                    print(f"  ‚Ä¢ {col}: {removed} h√†ng b·ªã lo·∫°i b·ªè")

            elif strategy == 'log':
                # Log transformation
                if (observed > 0).all():
                    self.df_cleaned[col] = np.log1p(data)
                    print(f"  ‚Ä¢ {col}: ƒê√£ √°p d·ª•ng log transform")
                else:
                    print(f"  ‚Ä¢ {col}: B·ªè qua (c√≥ gi√° tr·ªã <= 0)")

            elif strategy == 'boxcox':
                # Box-Cox transformation
                if not (observed > 0).all():
                    print(f"  ‚Ä¢ {col}: B·ªè qua (c√≥ gi√° tr·ªã <= 0)")
                    continue

                try:
                    if observed.empty:
                        raise ValueError("no observed values")
                    # Box-Cox rejects e.g. constant data with ValueError.
                    transformed, lambda_param = stats.boxcox(observed.to_numpy(dtype=float))
                except ValueError as exc:
                    # Skip this column instead of aborting the whole run.
                    print(f"  ‚Ä¢ {col}: B·ªè qua ({exc})")
                    continue

                # Put the transformed values back at the observed positions
                # and keep NaN everywhere else.
                result = pd.Series(np.nan, index=data.index, dtype=float)
                result[data.notna().to_numpy()] = transformed
                self.df_cleaned[col] = result
                print(f"  ‚Ä¢ {col}: ƒê√£ √°p d·ª•ng Box-Cox (Œª={lambda_param:.3f})")

        print(f"\n‚úÖ K·∫øt qu·∫£: {len(self.df)} ‚Üí {len(self.df_cleaned)} h√†ng")
        return self.df_cleaned
    


    # ========================================================================
    # 3. CATEGORICAL DATA HANDLING (CATEGORICAL ENCODING)
    # ========================================================================

    def encode_categorical(self, encoding_map=None, max_categories=10):
        """
        Encode the categorical variables of ``self.df_cleaned``.

        Parameters
        ----------
        encoding_map : dict, optional
            Mapping ``{column: method}``, for example
            ``{'col1': 'onehot', 'col2': 'label', 'col3': 'ordinal'}``.
            When ``None`` it is built by ``_auto_encoding_strategy``.
        max_categories : int
            Maximum number of categories for one-hot encoding (only used
            when ``encoding_map`` is ``None``).

        Methods:
        - 'label': ``LabelEncoder`` on the values cast to ``str``
        - 'onehot': ``pd.get_dummies(..., drop_first=True)``; the source
          column is removed
        - 'frequency': adds ``<col>_freq`` with the relative frequency of
          each value; the source column is kept
        - 'ordinal': not implemented; a warning is printed and the column is
          left untouched

        Returns
        -------
        pandas.DataFrame
            ``self.df_cleaned`` after encoding.

        Notes
        -----
        An error in one column (including an unsupported method name) is
        printed and does not stop the remaining columns.
        """

        print("\n" + "="*80)
        print("üî§ ENCODING D·ªÆ LI·ªÜU PH√ÇN LO·∫†I")
        print("="*80)

        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        if encoding_map is None:
            encoding_map = self._auto_encoding_strategy(max_categories)

        print("\nüìã Strategies:")

        for col, method in encoding_map.items():
            if col not in self.df_cleaned.columns:
                continue

            n_unique = self.df_cleaned[col].nunique()
            print(f"\n  ‚Üí {col} ({n_unique} categories):")
            print(f"     Method: {method}")

            try:
                if method == 'label':
                    # Label Encoding
                    le = LabelEncoder()
                    self.df_cleaned[col] = le.fit_transform(self.df_cleaned[col].astype(str))
                    self.encoding_strategies[col] = {'method': 'label', 'encoder': le}

                elif method == 'onehot':
                    # One-Hot Encoding
                    dummies = pd.get_dummies(self.df_cleaned[col], prefix=col, drop_first=True)
                    self.df_cleaned = pd.concat([self.df_cleaned, dummies], axis=1)
                    self.df_cleaned.drop(columns=[col], inplace=True)
                    self.encoding_strategies[col] = {'method': 'onehot', 'columns': dummies.columns.tolist()}
                    print(f"     ‚úì T·∫°o {len(dummies.columns)} c·ªôt m·ªõi")

                elif method == 'ordinal':
                    # Ordinal encoding needs a category order that this
                    # method has no way to receive, so nothing is encoded.
                    print(f"     ‚ö†Ô∏è  C·∫ßn cung c·∫•p th·ª© t·ª± cho ordinal encoding")
                    # Do not report the column as completed.
                    continue

                elif method == 'frequency':
                    # Frequency Encoding
                    freq_map = self.df_cleaned[col].value_counts(normalize=True).to_dict()
                    self.df_cleaned[col + '_freq'] = self.df_cleaned[col].map(freq_map)
                    self.encoding_strategies[col] = {'method': 'frequency', 'map': freq_map}
                    print(f"     ‚úì T·∫°o c·ªôt {col}_freq")

                else:
                    # Surface typos instead of reporting them as completed.
                    raise ValueError(f"Unsupported encoding method: {method!r}")

                print(f"     ‚úì Ho√†n th√†nh")

            except Exception as e:
                # Best effort per column: report the failure and carry on.
                print(f"     ‚úó L·ªói: {str(e)}")

        print(f"\n‚úÖ Shape sau encoding: {self.df_cleaned.shape}")
        return self.df_cleaned


    def _auto_encoding_strategy(self, max_categories):
        """Pick an encoding method for every categorical column of ``self.df``.

        Parameters
        ----------
        max_categories : int
            Largest number of distinct values still encoded one-hot.

        Returns
        -------
        dict
            ``{column: method}`` where the method is 'label' for exactly two
            distinct values, 'onehot' for any other count up to
            ``max_categories`` and 'frequency' above that.
        """
        def pick(cardinality):
            if cardinality == 2:
                return 'label'
            return 'onehot' if cardinality <= max_categories else 'frequency'

        return {
            name: pick(self.df[name].nunique())
            for name in self.categorical_cols
            if name in self.df.columns
        }


    # ========================================================================
    # 4. DATA NORMALISATION (FEATURE SCALING)
    # ========================================================================

    def scale_features(self, method='standard', columns=None):
        """
        Scale the numeric features of ``self.df_cleaned``.

        Parameters
        ----------
        method : str
            'standard': StandardScaler (z-score)
            'minmax': MinMaxScaler (0-1)
            'robust': RobustScaler (median based, robust to outliers)
        columns : list, optional
            Columns to scale (``None`` = every column of ``self.df_cleaned``
            with a numeric dtype). Names not present in the frame are
            ignored.

        Returns
        -------
        pandas.DataFrame
            ``self.df_cleaned`` after scaling. The fitted scaler, the method
            and the column list are stored in ``self.scaling_strategies``.

        Raises
        ------
        ValueError
            If ``method`` is not one of the three supported names.
        """

        print("\n" + "="*80)
        print("üìè CHU·∫®N H√ìA D·ªÆ LI·ªÜU")
        print("="*80)

        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        # Determine the numeric columns.
        if columns is None:
            columns = [c for c in self.df_cleaned.columns
                       if pd.api.types.is_numeric_dtype(self.df_cleaned[c])]

        columns = [c for c in columns if c in self.df_cleaned.columns]

        print(f"\nMethod: {method}")
        print(f"S·ªë c·ªôt: {len(columns)}")

        # Choose the scaler.
        if method == 'standard':
            scaler = StandardScaler()
        elif method == 'minmax':
            scaler = MinMaxScaler()
        elif method == 'robust':
            scaler = RobustScaler()
        else:
            raise ValueError(f"Method kh√¥ng h·ª£p l·ªá: {method}")

        # The scalers reject an input without features, so there is nothing
        # to do (and nothing to record) when no column is selected.
        if not columns:
            return self.df_cleaned

        # Apply the scaling.
        self.df_cleaned[columns] = scaler.fit_transform(self.df_cleaned[columns])
        self.scaling_strategies = {'method': method, 'columns': columns, 'scaler': scaler}

        print(f"\n‚úÖ ƒê√£ chu·∫©n h√≥a {len(columns)} c·ªôt b·∫±ng {method}")

        # Show the statistics after scaling.
        print("\nüìä Th·ªëng k√™ sau scaling (5 c·ªôt ƒë·∫ßu):")
        print(self.df_cleaned[columns[:5]].describe())

        return self.df_cleaned


    # ========================================================================
    # 5. HANDLING DUPLICATE AND INCONSISTENT DATA
    # ========================================================================
    
    def clean_data_inconsistencies(self):
        """
        Remove duplicated rows and normalise whitespace in text columns.

        Exact duplicate rows of ``self.df_cleaned`` are dropped first. Then,
        for every column of ``self.categorical_cols`` that has ``object``
        dtype, leading/trailing whitespace is stripped and runs of
        whitespace are collapsed to a single space. Missing values are left
        untouched.

        Returns
        -------
        pandas.DataFrame
            ``self.df_cleaned`` after cleaning.
        """

        print("\n" + "="*80)
        print("üßπ L√ÄM S·∫†CH D·ªÆ LI·ªÜU KH√îNG NH·∫§T QU√ÅN")
        print("="*80)

        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        # 1. Handle duplicates.
        # NOTE(review): rows that differ only in whitespace are not removed,
        # because the text normalisation below runs after this step.
        duplicates_before = self.df_cleaned.duplicated().sum()
        print(f"\nüîÑ D·ªØ li·ªáu tr√πng l·∫∑p:")
        print(f"  ‚Ä¢ Tr∆∞·ªõc: {duplicates_before} h√†ng")

        self.df_cleaned.drop_duplicates(inplace=True)

        duplicates_after = self.df_cleaned.duplicated().sum()
        print(f"  ‚Ä¢ Sau: {duplicates_after} h√†ng")
        print(f"  ‚Ä¢ ƒê√£ lo·∫°i b·ªè: {duplicates_before - duplicates_after} h√†ng")

        # 2. Normalise the string columns.
        print(f"\nüìù Chu·∫©n h√≥a c·ªôt text:")
        string_cols = [c for c in self.categorical_cols if c in self.df_cleaned.columns]

        for col in string_cols:
            series = self.df_cleaned[col]
            if series.dtype != 'object':
                continue

            # Strip the ends and collapse internal runs of whitespace.
            normalized = (
                series.astype(str)
                .str.strip()
                .str.replace(r'\s+', ' ', regex=True)
            )

            # ``astype(str)`` turns missing values into the text 'nan';
            # restore the original missing markers so they stay detectable.
            self.df_cleaned[col] = normalized.where(series.notna(), series)

            print(f"  ‚úì {col}")

        print(f"\n‚úÖ Shape sau cleaning: {self.df_cleaned.shape}")
        return self.df_cleaned

    # ========================================================================
    # 6. GENERATING THE SUMMARY REPORT
    # ========================================================================

    def generate_cleaning_report(self):
        """
        Print a summary of the cleaning process.

        Compares ``self.df`` with ``self.df_cleaned`` (row/column counts and
        total missing values) and lists how many columns were imputed,
        encoded and scaled according to the recorded strategies.

        Returns
        -------
        pandas.DataFrame
            ``self.df_cleaned``. When no cleaning step has run yet it is
            initialised as a copy of ``self.df``, as the other steps do.
        """
        print("\n" + "="*80)
        print("üìä B√ÅO C√ÅO T·ªîNG K·∫æT QU√Å TR√åNH CLEANING")
        print("="*80)

        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        rows_before = len(self.df)
        rows_after = len(self.df_cleaned)
        # An empty input has no meaningful relative change; report 0%.
        rows_change_pct = (rows_after - rows_before) / rows_before * 100 if rows_before else 0.0

        print("\nüî¢ Thay ƒë·ªïi v·ªÅ k√≠ch th∆∞·ªõc:")
        print(f"  ‚Ä¢ H√†ng: {rows_before:,} ‚Üí {rows_after:,} "
              f"({rows_change_pct:+.1f}%)")
        print(f"  ‚Ä¢ C·ªôt: {len(self.df.columns)} ‚Üí {len(self.df_cleaned.columns)} "
              f"({len(self.df_cleaned.columns)-len(self.df.columns):+d})")

        print("\nüï≥Ô∏è  Missing values:")
        missing_before = self.df.isnull().sum().sum()
        missing_after = self.df_cleaned.isnull().sum().sum()
        print(f"  ‚Ä¢ Tr∆∞·ªõc: {missing_before:,}")
        print(f"  ‚Ä¢ Sau: {missing_after:,}")
        print(f"  ‚Ä¢ Gi·∫£m: {missing_before - missing_after:,}")

        print("\nüîß Strategies ƒë√£ √°p d·ª•ng:")
        print(f"  ‚Ä¢ Imputation: {len(self.imputation_strategies)} c·ªôt")
        print(f"  ‚Ä¢ Encoding: {len(self.encoding_strategies)} c·ªôt")
        if self.scaling_strategies:
            print(f"  ‚Ä¢ Scaling: {len(self.scaling_strategies.get('columns', []))} c·ªôt")

        # Tell the user how to get hold of the cleaned data.
        print("\nüíæ L∆∞u d·ªØ li·ªáu ƒë√£ x·ª≠ l√Ω:")
        print("  df_cleaned = pipeline.df_cleaned")

        return self.df_cleaned


# ============================================================================
# USAGE GUIDE FOR PART 2
# ============================================================================

"""
C√ÅCH S·ª¨ D·ª§NG:

# Kh·ªüi t·∫°o (ti·∫øp t·ª•c t·ª´ Part 1)
pipeline = DataCleaningPipeline(df=your_dataframe)

# 1. X·ª≠ l√Ω missing values
pipeline.handle_missing_values(
    strategies={'column1': 'mean', 'column2': 'mode'},
    threshold=0.7
)

# 2. Ph√°t hi·ªán v√† x·ª≠ l√Ω outliers
pipeline.detect_outliers(methods=['iqr', 'zscore'], visualize=True)
pipeline.handle_outliers(strategy='cap', iqr_multiplier=1.5)

# 3. Encode categorical variables
pipeline.encode_categorical(max_categories=10)

# 4. Scale features
pipeline.scale_features(method='standard')

# 5. Clean inconsistencies
pipeline.clean_data_inconsistencies()

# 6. B√°o c√°o t·ªïng k·∫øt
cleaned_df = pipeline.generate_cleaning_report()

# 7. L∆∞u k·∫øt qu·∫£
cleaned_df.to_csv('cleaned_data.csv', index=False)
"""

# Completion banner, printed once when the module is executed or imported.
print(
    "\n‚úÖ ƒê√É HO√ÄN TH√ÄNH PART 2: DATA CLEANING & PREPROCESSING",
    "üìå Ti·∫øp theo: Part 3 - Relationship Analysis & Feature Selection",
    sep="\n",
)
        
