In [None]:
"""
DATA PROCESSING PIPELINE - PART 3
==================================
Relationship Analysis & Pre-Modeling Preparation
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, f_oneway, pearsonr, spearmanr
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')


# ============================================================================
# CLASS CONTINUED - RELATIONSHIP ANALYSIS
# ============================================================================

class AdvancedAnalysisPipeline(DataCleaningPipeline):
    """
    Extends DataCleaningPipeline with relationship analysis and feature selection.
    """
    
    def __init__(self, data_path=None, df=None, target_column=None):
        """
        Initialise the parent pipeline and the analysis-specific state.

        Parameters:
        -----------
        data_path : optional
            Forwarded unchanged to the parent constructor.
        df : optional
            Forwarded unchanged to the parent constructor.
        target_column : str or None
            Default target column used by the analysis methods when they
            are called without an explicit ``target``.
        """
        super().__init__(data_path, df)

        # Result holders, populated later by the analysis methods.
        self.pca_results = dict()
        self.feature_importance = dict()
        self.correlation_matrix = None

        self.target_column = target_column
    
    
    # ========================================================================
    # 1. CORRELATION ANALYSIS
    # ========================================================================
    
    def analyze_correlations(self, method='pearson', threshold=0.8, visualize=True):
        """
        Compute pairwise correlations between the numeric columns and flag
        highly correlated pairs.

        Parameters:
        -----------
        method : str
            'pearson': linear correlation
            'spearman': rank correlation
            'kendall': Kendall Tau correlation
        threshold : float
            A pair is reported as a multicollinearity warning when |r| is
            strictly greater than this value
        visualize : bool
            Whether to draw the heatmap and its companion plot

        Returns:
        --------
        pandas.DataFrame or None
            The correlation matrix (also stored on ``self.correlation_matrix``),
            or None when fewer than two numeric columns exist. The method and
            the flagged pairs are recorded in ``self.report['correlation']``.
        """
        print("\n" + "="*80)
        print("üîó PH√ÇN T√çCH T∆Ø∆†NG QUAN")
        print("="*80)

        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        # Only numeric columns take part in the correlation matrix
        numeric_data = self.df_cleaned.select_dtypes(include=[np.number])

        if len(numeric_data.columns) < 2:
            print("‚ö†Ô∏è  Kh√¥ng ƒë·ªß c·ªôt s·ªë ƒë·ªÉ ph√¢n t√≠ch t∆∞∆°ng quan")
            return None

        # Compute the correlation matrix
        print(f"\nüìä Method: {method.upper()}")
        self.correlation_matrix = numeric_data.corr(method=method)
        corr_matrix = self.correlation_matrix
        col_names = corr_matrix.columns

        # Find highly correlated pairs by walking the strict upper triangle
        # (row-major order, so each unordered pair is visited exactly once)
        print(f"\nüî¥ C·∫∑p bi·∫øn c√≥ t∆∞∆°ng quan cao (|r| > {threshold}):")
        high_corr_pairs = []
        upper_rows, upper_cols = np.triu_indices_from(corr_matrix.values, k=1)

        for i, j in zip(upper_rows, upper_cols):
            corr_val = corr_matrix.iloc[i, j]
            # NaN (e.g. from a constant column) never compares greater, so it is skipped
            if abs(corr_val) > threshold:
                col1, col2 = col_names[i], col_names[j]
                high_corr_pairs.append((col1, col2, corr_val))
                print(f"  ‚Ä¢ {col1} ‚Üî {col2}: {corr_val:.3f}")

        if not high_corr_pairs:
            print("  ‚úì Kh√¥ng ph√°t hi·ªán multicollinearity nghi√™m tr·ªçng")
        else:
            print(f"\n‚ö†Ô∏è  T√¨m th·∫•y {len(high_corr_pairs)} c·∫∑p c√≥ multicollinearity!")
            print("  ‚Üí C√¢n nh·∫Øc lo·∫°i b·ªè m·ªôt trong c√°c bi·∫øn n√†y")

        # Visualization
        if visualize:
            fig, axes = plt.subplots(1, 2, figsize=(18, 7))

            # Full heatmap
            sns.heatmap(
                corr_matrix,
                annot=False,
                cmap='coolwarm',
                center=0,
                vmin=-1, vmax=1,
                square=True,
                ax=axes[0],
                cbar_kws={'label': 'Correlation'}
            )
            axes[0].set_title(f'Ma tr·∫≠n T∆∞∆°ng quan ({method.capitalize()})',
                              fontsize=12, fontweight='bold')

            # Strongest correlations with the target (when it is a numeric column)
            if self.target_column and self.target_column in col_names:
                # Undefined (NaN) correlations are dropped so they cannot
                # occupy the "most negative" end of the ranking
                target_corr = (
                    corr_matrix[self.target_column]
                    .drop(self.target_column)
                    .dropna()
                    .sort_values(ascending=False)
                )

                # Show everything when it fits; otherwise the top and bottom
                # halves. Slicing head/tail unconditionally would drop the
                # middle feature for odd counts and show nothing for one feature.
                max_bars = 20
                if len(target_corr) <= max_bars:
                    target_corr_plot = target_corr
                else:
                    half = max_bars // 2
                    target_corr_plot = pd.concat([
                        target_corr.head(half),
                        target_corr.tail(half)
                    ])

                positions = range(len(target_corr_plot))
                colors = ['green' if x > 0 else 'red' for x in target_corr_plot.values]
                axes[1].barh(positions, target_corr_plot.values, color=colors, alpha=0.7)
                axes[1].set_yticks(positions)
                axes[1].set_yticklabels(target_corr_plot.index, fontsize=9)
                axes[1].set_xlabel('Correlation with Target')
                axes[1].set_title(f'Top Features t∆∞∆°ng quan v·ªõi {self.target_column}',
                                  fontsize=12, fontweight='bold')
                axes[1].axvline(x=0, color='black', linestyle='--', linewidth=0.8)
                axes[1].grid(True, alpha=0.3, axis='x')
            else:
                # Distribution of all pairwise correlations; NaN entries are
                # removed explicitly rather than relying on the plotting
                # backend's handling of them
                corr_values = corr_matrix.values[upper_rows, upper_cols]
                corr_values = corr_values[~np.isnan(corr_values)]
                axes[1].hist(corr_values, bins=50, color='steelblue', alpha=0.7, edgecolor='black')
                axes[1].set_xlabel('Correlation Coefficient')
                axes[1].set_ylabel('Frequency')
                axes[1].set_title('Ph√¢n ph·ªëi c√°c gi√° tr·ªã T∆∞∆°ng quan',
                                  fontsize=12, fontweight='bold')
                axes[1].axvline(x=0, color='red', linestyle='--', linewidth=1.5)
                axes[1].grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

        self.report['correlation'] = {
            'method': method,
            'high_corr_pairs': high_corr_pairs
        }

        return self.correlation_matrix
    
    
    # ========================================================================
    # 2. FEATURE-TARGET RELATIONSHIP ANALYSIS
    # ========================================================================
    
    def analyze_feature_target_relationship(self, target=None, task='auto'):
        """
        Analyse how the numeric features relate to the target.

        Runs a per-feature statistical test (ANOVA F-test for classification,
        Pearson correlation for regression) on the first 20 numeric features,
        computes mutual information for all numeric features, and plots the
        six features with the highest mutual information against the target.

        Parameters:
        -----------
        target : str
            Name of the target column; defaults to ``self.target_column``
        task : str
            'classification', 'regression', or 'auto'. With 'auto' the task is
            classification when the target has fewer than 20 distinct values.

        Returns:
        --------
        pandas.DataFrame or None
            Mutual-information scores sorted in descending order (also stored
            in ``self.feature_importance['mutual_information']``), or None
            when the target is invalid or there is nothing to analyse.
        """
        print("\n" + "="*80)
        print("üéØ PH√ÇN T√çCH M·ªêI QUAN H·ªÜ FEATURE-TARGET")
        print("="*80)

        # Same lazy initialisation as the other analysis methods
        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        if target is None:
            target = self.target_column

        if target is None or target not in self.df_cleaned.columns:
            print("‚ö†Ô∏è  C·∫ßn ch·ªâ ƒë·ªãnh target column h·ª£p l·ªá")
            return

        # Rows without a target value cannot be used by any test below and
        # would make the mutual-information estimators and polyfit fail
        data = self.df_cleaned[self.df_cleaned[target].notna()]
        if data.empty:
            print("‚ö†Ô∏è  Target column has no non-missing values")
            return

        # Auto detect task type
        if task == 'auto':
            n_unique = data[target].nunique()
            task = 'classification' if n_unique < 20 else 'regression'

        print(f"\nüìã Target: {target}")
        print(f"üìã Task: {task}")

        # Collect the numeric features
        numeric_features = [col for col in data.columns
                            if col != target
                            and pd.api.types.is_numeric_dtype(data[col])]

        if not numeric_features:
            print("‚ö†Ô∏è  No numeric features available for analysis")
            return

        # 1. Statistical Tests
        print(f"\nüî¨ STATISTICAL TESTS:")

        if task == 'classification':
            # ANOVA for numeric features
            print("\n  üìä ANOVA F-test (Numeric features):")
            anova_results = []

            # Class membership masks do not depend on the feature: build them once
            class_masks = [data[target] == cat for cat in data[target].unique()]

            for col in numeric_features[:20]:  # first 20 numeric features
                groups = [data.loc[mask, col].dropna() for mask in class_masks]
                groups = [grp for grp in groups if len(grp) > 0]
                if len(groups) < 2:
                    continue  # ANOVA needs at least two non-empty groups

                try:
                    f_stat, p_value = f_oneway(*groups)
                except (ValueError, TypeError) as exc:
                    print(f"    (skipped {col}: {exc})")
                    continue

                anova_results.append({
                    'Feature': col,
                    'F-statistic': f_stat,
                    'p-value': p_value,
                    'Significant': '‚úì' if p_value < 0.05 else '‚úó'
                })

            # An empty result list has no 'F-statistic' column to sort on
            if anova_results:
                anova_df = pd.DataFrame(anova_results).sort_values('F-statistic', ascending=False)
                print(anova_df.head(10).to_string(index=False))
            else:
                print("    (no feature could be tested)")

        else:  # regression
            # Correlation with target
            print("\n  üìä Correlation with Target:")
            correlations = []

            for col in numeric_features[:20]:
                # Keep only rows where both the feature and the target are present
                pair = data[[col, target]].dropna()

                try:
                    corr, p_value = pearsonr(pair[col], pair[target])
                except (ValueError, TypeError) as exc:
                    print(f"    (skipped {col}: {exc})")
                    continue

                correlations.append({
                    'Feature': col,
                    'Correlation': corr,
                    'p-value': p_value,
                    'Significant': '‚úì' if p_value < 0.05 else '‚úó'
                })

            if correlations:
                corr_df = pd.DataFrame(correlations).sort_values('Correlation',
                                                                key=abs, ascending=False)
                print(corr_df.head(10).to_string(index=False))
            else:
                print("    (no feature could be tested)")

        # 2. Mutual Information
        print(f"\nüîç MUTUAL INFORMATION:")
        # The estimators reject NaN, so missing feature values are imputed with 0
        X = data[numeric_features].fillna(0)
        y = data[target]

        if task == 'classification':
            mi_scores = mutual_info_classif(X, y, random_state=42)
        else:
            mi_scores = mutual_info_regression(X, y, random_state=42)

        mi_df = pd.DataFrame({
            'Feature': numeric_features,
            'MI_Score': mi_scores
        }).sort_values('MI_Score', ascending=False)

        print(mi_df.head(10).to_string(index=False))

        # 3. Visualization - Feature vs Target
        print(f"\nüìä Visualizing top features...")

        top_features = mi_df.head(6)['Feature'].tolist()

        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.flatten()

        for ax, feature in zip(axes, top_features):
            if task == 'classification':
                # Box plot for classification
                data.boxplot(column=feature, by=target, ax=ax)
                ax.set_title(f'{feature} vs {target}')
                ax.set_xlabel('')
            else:
                # Scatter plot for regression
                ax.scatter(data[feature], data[target], alpha=0.5, s=20)
                ax.set_xlabel(feature)
                ax.set_ylabel(target)
                ax.set_title(f'{feature} vs {target}')

                # Trend line fitted on observed points only (no zero-imputation,
                # which would bias the slope); skipped when a line is undefined
                observed = data[[feature, target]].dropna()
                if len(observed) >= 2 and observed[feature].nunique() > 1:
                    slope, intercept = np.polyfit(observed[feature], observed[target], 1)
                    x_ends = np.array([observed[feature].min(), observed[feature].max()])
                    ax.plot(x_ends, slope * x_ends + intercept,
                            "r--", alpha=0.8, linewidth=2)

            ax.grid(True, alpha=0.3)

        # Hide the panels left over when fewer than six features exist
        for ax in axes[len(top_features):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

        self.feature_importance['mutual_information'] = mi_df
        return mi_df
    
    
    # ========================================================================
    # 3. DIMENSIONALITY REDUCTION - PCA
    # ========================================================================
    
    def perform_pca(self, n_components=None, variance_threshold=0.95, visualize=True):
        """
        Run Principal Component Analysis on the standardized numeric columns.

        Numeric columns containing any NaN are excluded. When ``n_components``
        is None, the smallest number of components whose cumulative explained
        variance reaches ``variance_threshold`` is used.

        Parameters:
        -----------
        n_components : int or None
            Number of components to keep; clamped to the valid range
        variance_threshold : float
            Fraction of variance to retain when ``n_components`` is None
        visualize : bool
            Whether to draw the diagnostic plots

        Returns:
        --------
        (pandas.DataFrame, pandas.DataFrame) or None
            ``(components, loadings)``; ``components`` shares the row index of
            the data it was computed from. None when there are fewer than two
            usable numeric columns or fewer than two rows. Results are also
            stored in ``self.pca_results``.
        """
        print("\n" + "="*80)
        print("üî¨ PRINCIPAL COMPONENT ANALYSIS (PCA)")
        print("="*80)

        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        # Numeric columns only, and only those without any NaN
        # NOTE(review): a numeric target column is not excluded here, so it
        # contributes to the components — confirm this is intended.
        numeric_data = self.df_cleaned.select_dtypes(include=[np.number]).dropna(axis=1)

        if len(numeric_data.columns) < 2:
            print("‚ö†Ô∏è  Kh√¥ng ƒë·ªß features s·ªë ƒë·ªÉ th·ª±c hi·ªán PCA")
            return None

        if len(numeric_data) < 2:
            print("‚ö†Ô∏è  PCA needs at least 2 rows")
            return None

        print(f"\nüìä S·ªë features: {len(numeric_data.columns)}")

        # Standardize data
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(numeric_data)

        # Fit PCA with all components to obtain the full variance profile
        pca_full = PCA()
        pca_full.fit(data_scaled)

        # Explained variance
        cumsum_variance = np.cumsum(pca_full.explained_variance_ratio_)
        max_components = len(cumsum_variance)

        # Determine n_components
        if n_components is None:
            # First position at which the threshold is reached. If it is never
            # reached (e.g. threshold 1.0 and rounding), keep every component
            # instead of silently falling back to a single one.
            reached = np.flatnonzero(cumsum_variance >= variance_threshold)
            n_components = int(reached[0]) + 1 if reached.size else max_components
        else:
            requested = int(n_components)
            n_components = min(max(requested, 1), max_components)
            if n_components != requested:
                print(f"‚ö†Ô∏è  n_components={requested} is out of range; using {n_components}")

        print(f"\nüéØ S·ªë components ƒë∆∞·ª£c ch·ªçn: {n_components}")
        print(f"   (Gi·∫£i th√≠ch {cumsum_variance[n_components-1]*100:.2f}% variance)")

        # Fit PCA with the chosen number of components
        pca = PCA(n_components=n_components)
        components = pca.fit_transform(data_scaled)
        pc_names = [f'PC{i+1}' for i in range(n_components)]

        # Keep the source row index so the components stay aligned with
        # df_cleaned when they are concatenated to it later
        pca_df = pd.DataFrame(
            components,
            columns=pc_names,
            index=numeric_data.index
        )

        # Component loadings
        loadings = pd.DataFrame(
            pca.components_.T,
            columns=pc_names,
            index=numeric_data.columns
        )

        print(f"\nüìã Top 5 features cho m·ªói PC:")
        for i in range(min(3, n_components)):
            pc_name = f'PC{i+1}'
            # Rank by magnitude but print the signed loading, so the direction
            # of each feature's contribution is not lost
            top_features = loadings[pc_name].sort_values(key=abs, ascending=False).head(5)
            print(f"\n  {pc_name} (Variance: {pca.explained_variance_ratio_[i]*100:.2f}%):")
            for feat, loading in top_features.items():
                print(f"    ‚Ä¢ {feat}: {loading:.3f}")

        # Visualization
        if visualize:
            fig, axes = plt.subplots(2, 2, figsize=(15, 12))

            # 1. Scree plot
            axes[0, 0].bar(range(1, len(pca_full.explained_variance_ratio_)+1),
                           pca_full.explained_variance_ratio_,
                           alpha=0.7, color='steelblue')
            axes[0, 0].set_xlabel('Principal Component')
            axes[0, 0].set_ylabel('Explained Variance Ratio')
            axes[0, 0].set_title('Scree Plot', fontsize=12, fontweight='bold')
            axes[0, 0].grid(True, alpha=0.3)

            # 2. Cumulative variance
            axes[0, 1].plot(range(1, len(cumsum_variance)+1),
                            cumsum_variance,
                            marker='o', linewidth=2, color='green')
            axes[0, 1].axhline(y=variance_threshold, color='r',
                               linestyle='--', label=f'{variance_threshold*100}% threshold')
            axes[0, 1].axvline(x=n_components, color='orange',
                               linestyle='--', label=f'{n_components} components')
            axes[0, 1].set_xlabel('Number of Components')
            axes[0, 1].set_ylabel('Cumulative Explained Variance')
            axes[0, 1].set_title('Cumulative Variance Explained',
                                 fontsize=12, fontweight='bold')
            axes[0, 1].legend()
            axes[0, 1].grid(True, alpha=0.3)

            # 3. Biplot (PC1 vs PC2)
            if n_components >= 2:
                axes[1, 0].scatter(components[:, 0], components[:, 1],
                                   alpha=0.5, s=30)
                axes[1, 0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)')
                axes[1, 0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)')
                axes[1, 0].set_title('PCA Biplot (PC1 vs PC2)',
                                     fontsize=12, fontweight='bold')
                axes[1, 0].grid(True, alpha=0.3)

                # Add loading vectors (top 10 features)
                top_loadings = loadings[['PC1', 'PC2']].abs().sum(axis=1).sort_values(ascending=False).head(10)
                scale = 3
                for feature in top_loadings.index:
                    axes[1, 0].arrow(0, 0,
                                     loadings.loc[feature, 'PC1']*scale,
                                     loadings.loc[feature, 'PC2']*scale,
                                     head_width=0.1, head_length=0.1,
                                     fc='red', ec='red', alpha=0.6)
                    axes[1, 0].text(loadings.loc[feature, 'PC1']*scale*1.1,
                                    loadings.loc[feature, 'PC2']*scale*1.1,
                                    feature, fontsize=8, ha='center')

            # 4. Heatmap of loadings
            top_n = min(15, len(loadings))
            top_features_idx = loadings.abs().sum(axis=1).sort_values(ascending=False).head(top_n).index

            sns.heatmap(loadings.loc[top_features_idx].T,
                        cmap='coolwarm', center=0,
                        annot=True, fmt='.2f',
                        ax=axes[1, 1],
                        cbar_kws={'label': 'Loading'})
            axes[1, 1].set_title(f'Component Loadings (Top {top_n} Features)',
                                 fontsize=12, fontweight='bold')

            plt.tight_layout()
            plt.show()

        # Save results
        self.pca_results = {
            'pca_model': pca,
            'components': pca_df,
            'loadings': loadings,
            'explained_variance_ratio': pca.explained_variance_ratio_,
            'n_components': n_components
        }

        return pca_df, loadings
    
    
    # ========================================================================
    # 4. FEATURE SELECTION
    # ========================================================================
    
    def select_features(self, target=None, method='all', k=20):
        """
        Select features using one or several methods.

        Only numeric feature columns are considered, and only rows where all
        of them and the target are present. The task is treated as
        classification when the target has fewer than 20 distinct values,
        otherwise as regression.

        Parameters:
        -----------
        target : str
            Name of the target column; defaults to ``self.target_column``
        method : str
            'univariate': SelectKBest
            'rfe': Recursive Feature Elimination
            'importance': Tree-based feature importance
            'all': run all three and combine them by voting
        k : int
            Number of features to select

        Returns:
        --------
        dict or None
            Maps each method that was run (plus 'ensemble' for 'all') to a
            list of feature names; also stored in
            ``self.feature_importance['selected_features']``. None when the
            target is invalid or no usable data remains.

        Raises:
        -------
        ValueError
            If ``method`` is not one of the supported names.
        """
        valid_methods = ('univariate', 'rfe', 'importance', 'all')
        if method not in valid_methods:
            # Previously an unknown method silently produced an empty result
            raise ValueError(
                f"Unknown method {method!r}; expected one of {valid_methods}"
            )

        print("\n" + "="*80)
        print("üéØ FEATURE SELECTION")
        print("="*80)

        # Same lazy initialisation as the other analysis methods
        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        if target is None:
            target = self.target_column

        if target is None or target not in self.df_cleaned.columns:
            print("‚ö†Ô∏è  C·∫ßn ch·ªâ ƒë·ªãnh target column h·ª£p l·ªá")
            return None

        # Prepare data
        X = self.df_cleaned.drop(columns=[target]).select_dtypes(include=[np.number])
        y = self.df_cleaned[target]

        # Keep only fully observed rows (no imputation is needed afterwards)
        valid_idx = X.notna().all(axis=1) & y.notna()
        X = X[valid_idx]
        y = y[valid_idx]

        if X.shape[1] == 0 or X.shape[0] == 0:
            print("‚ö†Ô∏è  No numeric features or no complete rows left for feature selection")
            return None

        # Determine task type
        is_classification = y.nunique() < 20
        task = 'classification' if is_classification else 'regression'
        print(f"\nüìã Task: {task}")
        print(f"üìã Features: {len(X.columns)}")
        print(f"üìã Target: {target}")

        n_select = min(k, len(X.columns))
        selected_features = {}

        # 1. Univariate Selection
        if method in ['univariate', 'all']:
            print(f"\nüîπ Method 1: Univariate Selection (SelectKBest)")

            score_func = f_classif if is_classification else f_regression
            selector = SelectKBest(score_func=score_func, k=n_select)
            selector.fit(X, y)

            scores_df = pd.DataFrame({
                'Feature': X.columns,
                'Score': selector.scores_
            }).sort_values('Score', ascending=False)

            selected_features['univariate'] = scores_df.head(k)['Feature'].tolist()
            print(f"   Top 10 features:")
            print(scores_df.head(10).to_string(index=False))

        # 2. RFE (Recursive Feature Elimination)
        if method in ['rfe', 'all']:
            print(f"\nüîπ Method 2: Recursive Feature Elimination (RFE)")

            if is_classification:
                estimator = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
            else:
                estimator = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)

            rfe = RFE(estimator=estimator, n_features_to_select=n_select)
            rfe.fit(X, y)

            rfe_df = pd.DataFrame({
                'Feature': X.columns,
                'Ranking': rfe.ranking_,
                'Selected': rfe.support_
            }).sort_values('Ranking')

            selected_features['rfe'] = rfe_df[rfe_df['Selected']]['Feature'].tolist()
            print(f"   Selected {len(selected_features['rfe'])} features")
            print(rfe_df.head(10).to_string(index=False))

        # 3. Feature Importance
        if method in ['importance', 'all']:
            print(f"\nüîπ Method 3: Tree-based Feature Importance")

            if is_classification:
                model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
            else:
                model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

            model.fit(X, y)

            importance_df = pd.DataFrame({
                'Feature': X.columns,
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)

            selected_features['importance'] = importance_df.head(k)['Feature'].tolist()
            print(f"   Top 10 features:")
            print(importance_df.head(10).to_string(index=False))

        # 4. Combine results and plot the votes
        if method == 'all':
            print(f"\nüéØ ENSEMBLE FEATURE SELECTION:")

            # Count in how many methods each feature was selected
            from collections import Counter
            feature_votes = Counter(
                feature
                for features_list in selected_features.values()
                for feature in features_list
            )

            # A stable sort keeps tied features in first-seen order, so the
            # truncation to k below is deterministic
            ensemble_df = pd.DataFrame(
                list(feature_votes.items()),
                columns=['Feature', 'Votes']
            ).sort_values('Votes', ascending=False, kind='mergesort')

            print(f"\n   Features xu·∫•t hi·ªán trong nhi·ªÅu methods nh·∫•t:")
            print(ensemble_df.head(15).to_string(index=False))

            # Select features with at least 2 votes
            final_features = ensemble_df[ensemble_df['Votes'] >= 2]['Feature'].tolist()
            selected_features['ensemble'] = final_features[:k]

            print(f"\n‚úÖ Ch·ªçn {len(final_features[:k])} features cu·ªëi c√πng (‚â•2 votes)")

            # Visualization
            fig, ax = plt.subplots(figsize=(12, 8))

            ensemble_df_plot = ensemble_df.head(20)
            positions = range(len(ensemble_df_plot))
            colors = ['green' if x >= 2 else 'orange' for x in ensemble_df_plot['Votes']]

            ax.barh(positions,
                    ensemble_df_plot['Votes'].values,
                    color=colors, alpha=0.7)
            ax.set_yticks(positions)
            ax.set_yticklabels(ensemble_df_plot['Feature'].values)
            ax.set_xlabel('Number of Votes (Methods)', fontsize=11)
            ax.set_title('Feature Selection - Ensemble Results',
                         fontsize=13, fontweight='bold')
            ax.axvline(x=2, color='red', linestyle='--',
                       linewidth=2, label='Threshold (2 votes)')
            ax.legend()
            ax.grid(True, alpha=0.3, axis='x')

            plt.tight_layout()
            plt.show()

        self.feature_importance['selected_features'] = selected_features

        return selected_features
    
    
    # ========================================================================
    # 5. BUILD THE FINAL DATASET FOR MODELING
    # ========================================================================
    
    def prepare_final_dataset(self, selected_features=None, include_pca=False):
        """
        Build the final dataset that is ready for modeling.

        Parameters:
        -----------
        selected_features : list
            Feature names to keep; the target column (when set) is always
            kept as well. Names missing from the data are skipped. None keeps
            every column.
        include_pca : bool
            Whether to append the PCA components stored in ``self.pca_results``

        Returns:
        --------
        pandas.DataFrame
            The final dataset, also stored on ``self.df_final``.
        """
        print("\n" + "="*80)
        print("üéÅ CHU·∫®N B·ªä FINAL DATASET")
        print("="*80)

        # Same lazy initialisation as the analysis methods
        if self.df_cleaned is None:
            self.df_cleaned = self.df.copy()

        df_final = self.df_cleaned.copy()

        # 1. Select features
        if selected_features is not None:
            # Accept any iterable of names; a bare string is one name, not characters
            if isinstance(selected_features, str):
                requested = [selected_features]
            else:
                requested = list(selected_features)

            if self.target_column:
                requested.append(self.target_column)

            # dict.fromkeys removes duplicates while preserving order, so the
            # target is not repeated when it is already among the features
            requested = list(dict.fromkeys(requested))
            cols_to_keep = [c for c in requested if c in df_final.columns]
            missing = [c for c in requested if c not in df_final.columns]
            if missing:
                print(f"‚ö†Ô∏è  Skipping columns not found in the data: {missing}")

            df_final = df_final[cols_to_keep]
            # Report what was actually kept, not what was asked for
            n_kept = sum(1 for c in cols_to_keep if c != self.target_column)
            print(f"\n‚úì Ch·ªçn {n_kept} features")

        # 2. Add PCA components
        if include_pca and self.pca_results:
            pca_df = self.pca_results['components']

            if not pca_df.index.equals(df_final.index):
                if len(pca_df) == len(df_final):
                    # Same rows under different labels: align by position so
                    # concat does not scatter the components over NaN rows
                    pca_df = pca_df.copy()
                    pca_df.index = df_final.index
                else:
                    print("‚ö†Ô∏è  PCA components and data have different row counts; "
                          "aligning on index may introduce missing values")

            df_final = pd.concat([df_final, pca_df], axis=1)
            print(f"‚úì Th√™m {len(pca_df.columns)} PCA components")

        # 3. Final checks
        print(f"\nüìä FINAL DATASET:")
        print(f"  ‚Ä¢ Shape: {df_final.shape}")
        print(f"  ‚Ä¢ Missing values: {df_final.isnull().sum().sum()}")
        print(f"  ‚Ä¢ Memory: {df_final.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

        # 4. Summary report
        print(f"\nüìã FEATURES SUMMARY:")
        numeric_cols = df_final.select_dtypes(include=[np.number]).columns.tolist()
        if self.target_column in numeric_cols:
            numeric_cols.remove(self.target_column)

        print(f"  ‚Ä¢ Numeric features: {len(numeric_cols)}")
        print(f"  ‚Ä¢ Target: {self.target_column}")

        # Save
        self.df_final = df_final

        print(f"\nüíæ ƒê·ªÉ truy c·∫≠p:")
        print(f"   df_final = pipeline.df_final")

        return df_final
    
    
    # ========================================================================
    # 6. COMPLETE SUMMARY REPORT
    # ========================================================================
    
    def generate_complete_report(self, save_path=None):
        """
        Print a summary of the whole pipeline and optionally save it as JSON.

        Parameters:
        -----------
        save_path : str or None
            When given, the report dictionary is written to this path as JSON

        Returns:
        --------
        dict
            'original_shape', 'final_shape' and the list 'processing_steps'.
        """
        print("\n" + "="*80)
        print("üìä B√ÅO C√ÅO T·ªîNG K·∫æT PIPELINE")
        print("="*80)

        # df_final only exists once prepare_final_dataset() has run. Fall back
        # to the cleaned data, and to the raw data when no cleaning step has
        # populated df_cleaned yet, instead of failing on None.
        final_frame = getattr(self, 'df_final', None)
        has_final = final_frame is not None
        if not has_final:
            final_frame = self.df_cleaned if self.df_cleaned is not None else self.df

        report = {
            'original_shape': self.df.shape,
            'final_shape': final_frame.shape,
            'processing_steps': []
        }
        steps = report['processing_steps']

        print(f"\nüî¢ THAY ƒê·ªîI D·ªÆ LI·ªÜU:")
        print(f"  ‚Ä¢ Shape ban ƒë·∫ßu: {self.df.shape}")
        if has_final:
            print(f"  ‚Ä¢ Shape cu·ªëi c√πng: {final_frame.shape}")
        else:
            print(f"  ‚Ä¢ Shape sau cleaning: {final_frame.shape}")

        print(f"\nüîß C√ÅC B∆Ø·ªöC ƒê√É TH·ª∞C HI·ªÜN:")

        # Missing values
        if self.imputation_strategies:
            print(f"  ‚úì Missing Value Imputation: {len(self.imputation_strategies)} c·ªôt")
            steps.append('imputation')

        # Encoding
        if self.encoding_strategies:
            print(f"  ‚úì Categorical Encoding: {len(self.encoding_strategies)} c·ªôt")
            steps.append('encoding')

        # Scaling
        if self.scaling_strategies:
            print(f"  ‚úì Feature Scaling: {self.scaling_strategies.get('method', 'N/A')}")
            steps.append('scaling')

        # Outliers
        if 'outliers' in self.report:
            print(f"  ‚úì Outlier Detection & Handling")
            steps.append('outlier_handling')

        # PCA
        if self.pca_results:
            n_comp = self.pca_results.get('n_components', 0)
            variance = sum(self.pca_results.get('explained_variance_ratio', []))
            print(f"  ‚úì PCA: {n_comp} components ({variance*100:.1f}% variance)")
            steps.append('pca')

        # Feature Selection
        if self.feature_importance:
            print(f"  ‚úì Feature Selection")
            steps.append('feature_selection')

        print(f"\n‚úÖ PIPELINE HO√ÄN TH√ÄNH!")
        print(f"   D·ªØ li·ªáu s·∫µn s√†ng cho modeling")

        # Save report
        if save_path:
            import json
            # Explicit encoding keeps the output independent of the platform default
            with open(save_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2)
            print(f"\nüíæ B√°o c√°o ƒë√£ l∆∞u t·∫°i: {save_path}")

        return report


# ============================================================================
# COMPLETE USAGE GUIDE - FULL PIPELINE
# ============================================================================

"""
=============================
COMPLETE PIPELINE USAGE GUIDE
=============================

# ============ STEP 1: INITIALIZATION ============
pipeline = AdvancedAnalysisPipeline(
    df=your_dataframe,
    target_column='your_target'  # Optional
)

# ============ STEP 2: EDA ============
# Part 1: Initial analysis
pipeline.initial_assessment()
pipeline.descriptive_statistics()
pipeline.analyze_missing_values(plot=True)
pipeline.visualize_distributions(max_cols=12)

# ============ STEP 3: CLEANING ============
# Part 2: Cleaning and preprocessing
pipeline.handle_missing_values(threshold=0.7)
pipeline.detect_outliers(methods=['iqr', 'zscore'], visualize=True)
pipeline.handle_outliers(strategy='cap')
pipeline.encode_categorical(max_categories=10)
pipeline.scale_features(method='standard')
pipeline.clean_data_inconsistencies()

# ============ STEP 4: ADVANCED ANALYSIS ============
# Part 3: Advanced analysis
pipeline.analyze_correlations(method='pearson', threshold=0.8, visualize=True)
pipeline.analyze_feature_target_relationship(task='auto')
pca_components, loadings = pipeline.perform_pca(variance_threshold=0.95, visualize=True)
selected_features = pipeline.select_features(method='all', k=20)

# ============ STEP 5: FINAL DATASET ============
df_final = pipeline.prepare_final_dataset(
    selected_features=selected_features.get('ensemble', None),
    include_pca=False
)

# ============ STEP 6: REPORT & SAVE ============
report = pipeline.generate_complete_report(save_path='pipeline_report.json')

# Save the final data
df_final.to_csv('cleaned_data_final.csv', index=False)
df_final.to_parquet('cleaned_data_final.parquet', index=False)

# ============ ACCESSING RESULTS ============
# Original data: pipeline.df
# Cleaned data: pipeline.df_cleaned
# Final data: pipeline.df_final
# Correlation matrix: pipeline.correlation_matrix
# PCA results: pipeline.pca_results
# Feature importance: pipeline.feature_importance
"""

# Closing banner: one print call, one argument per output line.
print(
    "\n" + "=" * 80,
    "‚úÖ ƒê√É HO√ÄN TH√ÄNH T·∫§T C·∫¢ 3 PARTS!",
    "=" * 80,
    "\nüìö PIPELINE BAO G·ªíM:",
    "  Part 1: Data Loading & Initial EDA",
    "  Part 2: Data Cleaning & Preprocessing",
    "  Part 3: Relationship Analysis & Feature Selection",
    "\nüéâ B·∫°n c√≥ th·ªÉ s·ª≠ d·ª•ng pipeline n√†y cho B·∫§T K·ª≤ t·∫≠p d·ªØ li·ªáu n√†o!",
    "=" * 80,
    sep="\n",
)