In [1]:
import pandas as pd 
import numpy as np


In [2]:
# Read the master products CSV into a DataFrame for a first look at the raw data.
products_df = pd.read_csv('skincare_datasets/master_products.csv')

In [3]:
# Preview the first rows to check the column layout.
products_df.head()

Unnamed: 0,product_name,brand,category,ingredients,price,rating,source
0,Blu Mediterraneo MINIATURE Set,Acqua Di Parma,Fragrance,Arancia di Capri Eau de Toilette: Alcohol Dena...,66.0,4.0,sephora
1,Colonia,Acqua Di Parma,Cologne,unknown,66.0,4.5,sephora
2,Arancia di Capri,Acqua Di Parma,Perfume,Alcohol Denat.- Water- Fragrance- Limonene- Li...,180.0,4.5,sephora
3,Mirto di Panarea,Acqua Di Parma,Perfume,unknown,120.0,4.5,sephora
4,Colonia Miniature Set,Acqua Di Parma,Fragrance,Colonia: Alcohol Denat.- Water- Fragrance- Lim...,72.0,3.5,sephora


In [4]:
"""
Skincare Data Preprocessing, Cleaning, EDA & Feature Engineering Pipeline
Complete pipeline for preparing skincare data for ML models
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# NLP libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data
# nltk.download() reports an ordinary failure (e.g. no network) by returning
# False instead of raising, so both the return values and exceptions are checked.
# A list (not a generator) is passed to all() so every resource is attempted
# even when an earlier one fails.
try:
    _nltk_ready = all([
        nltk.download(resource, quiet=True)
        for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger')
    ])
except Exception:
    # Best-effort: the notebook keeps running without the NLTK corpora.
    _nltk_ready = False

if not _nltk_ready:
    print("‚ö†Ô∏è  NLTK downloads failed. Some features may not work.")


class SkincareDataPreprocessor:
    def __init__(self, data_dir='skincare_datasets'):
        """Initialize preprocessor"""
        self.data_dir = Path(data_dir)
        self.processed_dir = self.data_dir / 'processed'
        self.processed_dir.mkdir(exist_ok=True)
        
        # Harmful/risky ingredients database
        self.harmful_ingredients = {
            'high_risk': [
                'parabens', 'methylparaben', 'propylparaben', 'butylparaben',
                'formaldehyde', 'toluene', 'phthalates', 'triclosan',
                'hydroquinone', 'oxybenzone', 'benzophenone', 'coal tar',
                'petrolatum', 'mineral oil', 'siloxanes', 'bha', 'bht'
            ],
            'moderate_risk': [
                'sulfates', 'sls', 'sodium lauryl sulfate', 'sodium laureth sulfate',
                'fragrance', 'parfum', 'alcohol denat', 'denatured alcohol',
                'peg compounds', 'dmdm hydantoin', 'quaternium-15'
            ],
            'comedogenic': [
                'coconut oil', 'cocoa butter', 'isopropyl myristate',
                'isopropyl palmitate', 'acetylated lanolin', 'algae extract'
            ],
            'irritants': [
                'menthol', 'camphor', 'eucalyptus', 'peppermint oil',
                'lemon', 'lime', 'grapefruit', 'witch hazel', 'sd alcohol'
            ]
        }
        
        # Beneficial ingredients
        self.beneficial_ingredients = {
            'anti_aging': [
                'retinol', 'retinoid', 'peptides', 'vitamin c', 'ascorbic acid',
                'hyaluronic acid', 'niacinamide', 'coenzyme q10', 'resveratrol'
            ],
            'moisturizing': [
                'glycerin', 'ceramides', 'squalane', 'shea butter',
                'jojoba oil', 'argan oil', 'aloe vera'
            ],
            'acne_fighting': [
                'salicylic acid', 'benzoyl peroxide', 'tea tree oil',
                'niacinamide', 'zinc', 'sulfur'
            ],
            'brightening': [
                'vitamin c', 'kojic acid', 'arbutin', 'licorice extract',
                'alpha arbutin', 'tranexamic acid'
            ]
        }
    
    def load_datasets(self):
        """Load all master datasets"""
        print("üìÇ Loading datasets...")
        
        datasets = {}
        
        files = {
            'products': 'master_products.csv',
            'reviews': 'master_reviews.csv',
            'ingredients': 'ingredient_database.csv'
        }
        
        for name, filename in files.items():
            filepath = self.data_dir / filename
            if filepath.exists():
                datasets[name] = pd.read_csv(filepath)
                print(f"‚úÖ Loaded {name}: {len(datasets[name]):,} rows")
            else:
                print(f"‚ö†Ô∏è  {name} not found at {filepath}")
                datasets[name] = None
        
        return datasets
    
    def clean_products_data(self, df):
        """Clean and preprocess products dataset"""
        print("\nüßπ Cleaning products data...")
        
        if df is None:
            return None
        
        df_clean = df.copy()
        initial_rows = len(df_clean)
        
        # 1. Remove duplicates
        df_clean = df_clean.drop_duplicates(subset=['product_name', 'brand'])
        print(f"   Removed {initial_rows - len(df_clean)} duplicates")
        
        # 2. Clean text columns
        text_columns = ['product_name', 'brand', 'category', 'ingredients']
        for col in text_columns:
            if col in df_clean.columns:
                df_clean[col] = df_clean[col].astype(str)
                df_clean[col] = df_clean[col].str.strip()
                df_clean[col] = df_clean[col].replace('nan', np.nan)
        
        # 3. Remove products without ingredients
        before = len(df_clean)
        df_clean = df_clean[df_clean['ingredients'].notna()]
        df_clean = df_clean[df_clean['ingredients'].str.len() > 20]
        print(f"   Removed {before - len(df_clean)} products without ingredients")
        
        # 4. Clean price column
        if 'price' in df_clean.columns:
            df_clean['price'] = pd.to_numeric(df_clean['price'], errors='coerce')
            df_clean['price'] = df_clean['price'].clip(lower=0, upper=1000)
        
        # 5. Clean rating column
        if 'rating' in df_clean.columns:
            df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')
            df_clean['rating'] = df_clean['rating'].clip(lower=0, upper=5)
        
        # 6. Standardize category names
        if 'category' in df_clean.columns:
            df_clean['category'] = df_clean['category'].str.lower()
            df_clean['category'] = df_clean['category'].fillna('uncategorized')
        
        # 7. Create product ID
        df_clean['product_id'] = range(1, len(df_clean) + 1)
        df_clean['product_id'] = 'PROD_' + df_clean['product_id'].astype(str).str.zfill(6)
        
        print(f"‚úÖ Products cleaned: {len(df_clean):,} products remaining")
        
        return df_clean
    
    def clean_reviews_data(self, df):
        """Clean and preprocess reviews dataset"""
        print("\nüßπ Cleaning reviews data...")
        
        if df is None:
            return None
        
        df_clean = df.copy()
        initial_rows = len(df_clean)
        
        # 1. Remove duplicates
        df_clean = df_clean.drop_duplicates(subset=['product_name', 'user_id', 'review_text'])
        print(f"   Removed {initial_rows - len(df_clean)} duplicate reviews")
        
        # 2. Clean text columns
        df_clean['review_text'] = df_clean['review_text'].astype(str)
        df_clean['review_text'] = df_clean['review_text'].str.strip()
        
        # 3. Remove very short reviews (less than 10 characters)
        before = len(df_clean)
        df_clean = df_clean[df_clean['review_text'].str.len() >= 10]
        print(f"   Removed {before - len(df_clean)} very short reviews")
        
        # 4. Clean rating column
        if 'rating' in df_clean.columns:
            df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')
            df_clean = df_clean[df_clean['rating'].notna()]
            df_clean['rating'] = df_clean['rating'].clip(lower=1, upper=5)
        
        # 5. Remove reviews with missing ratings
        before = len(df_clean)
        df_clean = df_clean[df_clean['rating'].notna()]
        print(f"   Removed {before - len(df_clean)} reviews without ratings")
        
        # 6. Convert timestamps
        if 'timestamp' in df_clean.columns:
            df_clean['timestamp'] = pd.to_datetime(df_clean['timestamp'], errors='coerce')
        
        # 7. Create review ID
        df_clean['review_id'] = range(1, len(df_clean) + 1)
        df_clean['review_id'] = 'REV_' + df_clean['review_id'].astype(str).str.zfill(8)
        
        print(f"‚úÖ Reviews cleaned: {len(df_clean):,} reviews remaining")
        
        return df_clean
    
    def perform_eda(self, products_df, reviews_df):
        """Plot and print summary statistics for the cleaned datasets.

        For each frame that is not None, a 2x2 figure is saved under
        ``self.processed_dir / 'visualizations'`` (``products_eda.png`` and
        ``reviews_eda.png``) and headline statistics are printed.

        Args:
            products_df: products table, or None to skip the products section.
            reviews_df: reviews table, or None to skip the reviews section.
        """
        print("\nüìä Performing Exploratory Data Analysis...")

        # Figures are written next to the processed data.
        viz_dir = self.processed_dir / 'visualizations'
        viz_dir.mkdir(exist_ok=True)

        # Plot styling (applies to the global matplotlib/seaborn state).
        sns.set_style('whitegrid')
        plt.rcParams['figure.figsize'] = (12, 6)

        # === PRODUCTS EDA ===
        if products_df is not None:
            print("\nüì¶ Products Analysis:")

            has_price = 'price' in products_df.columns
            has_rating = 'rating' in products_df.columns

            fig, axes = plt.subplots(2, 2, figsize=(15, 12))
            ax_brands = axes[0, 0]
            ax_categories = axes[0, 1]
            ax_price = axes[1, 0]
            ax_rating = axes[1, 1]

            # Panel 1: the 15 brands with the most products
            brand_counts = products_df['brand'].value_counts().head(15)
            ax_brands.barh(brand_counts.index, brand_counts.values, color='steelblue')
            ax_brands.set_xlabel('Number of Products')
            ax_brands.set_title('Top 15 Brands by Product Count')
            ax_brands.invert_yaxis()

            # Panel 2: share of the 10 most common categories
            category_counts = products_df['category'].value_counts().head(10)
            ax_categories.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
            ax_categories.set_title('Product Categories Distribution')

            # Panel 3: price histogram (only prices below 200 are drawn)
            if has_price:
                prices = products_df['price'].dropna()
                median_price = prices.median()
                ax_price.hist(prices[prices < 200], bins=50, color='green', alpha=0.7)
                ax_price.set_xlabel('Price ($)')
                ax_price.set_ylabel('Frequency')
                ax_price.set_title('Price Distribution (< $200)')
                ax_price.axvline(median_price, color='red', linestyle='--',
                                 label=f'Median: ${median_price:.2f}')
                ax_price.legend()

            # Panel 4: rating histogram with the mean marked
            if has_rating:
                ratings = products_df['rating'].dropna()
                mean_rating = ratings.mean()
                ax_rating.hist(ratings, bins=20, color='orange', alpha=0.7)
                ax_rating.set_xlabel('Rating')
                ax_rating.set_ylabel('Frequency')
                ax_rating.set_title('Product Rating Distribution')
                ax_rating.axvline(mean_rating, color='red', linestyle='--',
                                  label=f'Mean: {mean_rating:.2f}')
                ax_rating.legend()

            plt.tight_layout()
            plt.savefig(viz_dir / 'products_eda.png', dpi=300, bbox_inches='tight')
            print(f"   ‚úÖ Products visualization saved")
            plt.close()

            # Headline numbers
            print(f"\n   Products Statistics:")
            print(f"   - Total products: {len(products_df):,}")
            print(f"   - Unique brands: {products_df['brand'].nunique():,}")
            print(f"   - Unique categories: {products_df['category'].nunique():,}")
            if has_price:
                print(f"   - Avg price: ${products_df['price'].mean():.2f}")
                print(f"   - Median price: ${products_df['price'].median():.2f}")
            if has_rating:
                print(f"   - Avg rating: {products_df['rating'].mean():.2f}")

        # === REVIEWS EDA ===
        if reviews_df is not None:
            print("\nüí¨ Reviews Analysis:")

            fig, axes = plt.subplots(2, 2, figsize=(15, 12))
            ax_stars = axes[0, 0]
            ax_length = axes[0, 1]
            ax_source = axes[1, 0]
            ax_trend = axes[1, 1]
            star_ticks = range(1, 6)

            # Panel 1: number of reviews per rating value
            star_counts = reviews_df['rating'].value_counts().sort_index()
            ax_stars.bar(star_counts.index, star_counts.values, color='purple', alpha=0.7)
            ax_stars.set_xlabel('Rating')
            ax_stars.set_ylabel('Count')
            ax_stars.set_title('Review Rating Distribution')
            ax_stars.set_xticks(star_ticks)

            # Panel 2: review length histogram (only lengths below 1000 are drawn)
            review_lengths = reviews_df['review_text'].str.len()
            ax_length.hist(review_lengths[review_lengths < 1000], bins=50, color='teal', alpha=0.7)
            ax_length.set_xlabel('Review Length (characters)')
            ax_length.set_ylabel('Frequency')
            ax_length.set_title('Review Length Distribution')

            # Panel 3: share of reviews per source
            per_source = reviews_df['source'].value_counts()
            ax_source.pie(per_source.values, labels=per_source.index, autopct='%1.1f%%')
            ax_source.set_title('Reviews by Source')

            # Panel 4: mean review length for each rating value
            mean_length_by_star = review_lengths.groupby(reviews_df['rating']).mean()
            ax_trend.plot(mean_length_by_star.index, mean_length_by_star.values,
                          marker='o', linewidth=2, markersize=8, color='red')
            ax_trend.set_xlabel('Rating')
            ax_trend.set_ylabel('Avg Review Length')
            ax_trend.set_title('Average Review Length by Rating')
            ax_trend.set_xticks(star_ticks)
            ax_trend.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.savefig(viz_dir / 'reviews_eda.png', dpi=300, bbox_inches='tight')
            print(f"   ‚úÖ Reviews visualization saved")
            plt.close()

            # Headline numbers
            total_reviews = len(reviews_df)
            print(f"\n   Reviews Statistics:")
            print(f"   - Total reviews: {total_reviews:,}")
            print(f"   - Avg rating: {reviews_df['rating'].mean():.2f}")
            print(f"   - Median rating: {reviews_df['rating'].median():.2f}")
            print(f"   - Avg review length: {review_lengths.mean():.0f} characters")
            print(f"   - Rating breakdown:")
            for star in sorted(reviews_df['rating'].unique()):
                star_total = len(reviews_df[reviews_df['rating'] == star])
                share = star_total / total_reviews * 100
                print(f"     {star} stars: {star_total:,} ({share:.1f}%)")

        print(f"\n‚úÖ EDA complete! Visualizations saved to {viz_dir}")
    
    def extract_ingredient_features(self, products_df):
        """Extract ingredient-based features"""
        print("\nüß™ Extracting ingredient features...")
        
        if products_df is None:
            return None
        
        df = products_df.copy()
        
        # Parse ingredients into lists
        def parse_ingredients(ing_text):
            if pd.isna(ing_text):
                return []
            ingredients = re.split(r'[,;]', str(ing_text).lower())
            return [ing.strip() for ing in ingredients if len(ing.strip()) > 2]
        
        df['ingredient_list'] = df['ingredients'].apply(parse_ingredients)
        df['ingredient_count'] = df['ingredient_list'].apply(len)
        
        # Risk scores
        def calculate_risk_scores(ing_list):
            scores = {
                'high_risk_count': 0,
                'moderate_risk_count': 0,
                'comedogenic_count': 0,
                'irritant_count': 0,
                'beneficial_count': 0
            }
            
            for ing in ing_list:
                # Check harmful ingredients
                if any(harmful in ing for harmful in self.harmful_ingredients['high_risk']):
                    scores['high_risk_count'] += 1
                if any(harmful in ing for harmful in self.harmful_ingredients['moderate_risk']):
                    scores['moderate_risk_count'] += 1
                if any(harmful in ing for harmful in self.harmful_ingredients['comedogenic']):
                    scores['comedogenic_count'] += 1
                if any(harmful in ing for harmful in self.harmful_ingredients['irritants']):
                    scores['irritant_count'] += 1
                
                # Check beneficial ingredients
                for benefit_type, benefit_ings in self.beneficial_ingredients.items():
                    if any(beneficial in ing for beneficial in benefit_ings):
                        scores['beneficial_count'] += 1
                        break
            
            return pd.Series(scores)
        
        risk_scores = df['ingredient_list'].apply(calculate_risk_scores)
        df = pd.concat([df, risk_scores], axis=1)
        
        # Calculate overall risk score (0-100)
        df['risk_score'] = (
            (df['high_risk_count'] * 3) + 
            (df['moderate_risk_count'] * 2) + 
            (df['comedogenic_count'] * 1.5) + 
            (df['irritant_count'] * 1)
        ) * 10
        df['risk_score'] = df['risk_score'].clip(upper=100)
        
        # Risk category
        df['risk_category'] = pd.cut(
            df['risk_score'],
            bins=[0, 20, 40, 60, 100],
            labels=['Low', 'Moderate', 'High', 'Very High']
        )
        
        # Beneficial score
        df['beneficial_score'] = (df['beneficial_count'] / df['ingredient_count'] * 100).fillna(0)
        
        print(f"‚úÖ Ingredient features extracted")
        print(f"   - Avg ingredients per product: {df['ingredient_count'].mean():.1f}")
        print(f"   - Avg risk score: {df['risk_score'].mean():.1f}")
        print(f"   - Risk categories:")
        print(df['risk_category'].value_counts())
        
        return df
    
    def extract_review_features(self, reviews_df):
        """Extract features from reviews using NLP"""
        print("\nüí¨ Extracting review features...")
        
        if reviews_df is None:
            return None
        
        df = reviews_df.copy()
        
        # 1. Sentiment analysis
        def get_sentiment(text):
            try:
                blob = TextBlob(str(text))
                return blob.sentiment.polarity
            except:
                return 0
        
        print("   Analyzing sentiment...")
        df['sentiment_score'] = df['review_text'].apply(get_sentiment)
        df['sentiment_category'] = pd.cut(
            df['sentiment_score'],
            bins=[-1, -0.1, 0.1, 1],
            labels=['Negative', 'Neutral', 'Positive']
        )
        
        # 2. Review metrics
        df['review_length'] = df['review_text'].str.len()
        df['word_count'] = df['review_text'].str.split().str.len()
        
        # 3. Skin concern detection
        skin_concerns = {
            'acne': ['acne', 'pimple', 'breakout', 'blemish', 'zit'],
            'dryness': ['dry', 'flaky', 'dehydrated', 'tight'],
            'oily': ['oily', 'greasy', 'shiny', 'sebum'],
            'aging': ['wrinkle', 'fine line', 'aging', 'sagging', 'anti-aging'],
            'sensitivity': ['sensitive', 'irritat', 'redness', 'burning', 'stinging'],
            'dark_spots': ['dark spot', 'hyperpigmentation', 'discoloration', 'uneven tone'],
            'rosacea': ['rosacea', 'redness', 'flush']
        }
        
        for concern, keywords in skin_concerns.items():
            pattern = '|'.join(keywords)
            df[f'mentions_{concern}'] = df['review_text'].str.lower().str.contains(pattern, na=False).astype(int)
        
        # 4. Rating alignment (does sentiment match rating?)
        df['rating_normalized'] = (df['rating'] - 3) / 2  # Scale to -1 to 1
        df['sentiment_rating_alignment'] = 1 - abs(df['sentiment_score'] - df['rating_normalized'])
        
        # 5. Helpful review score
        if 'helpful_votes' in df.columns:
            df['helpful_votes'] = df['helpful_votes'].fillna(0)
            df['helpful_score'] = np.log1p(df['helpful_votes'])
        
        print(f"‚úÖ Review features extracted")
        print(f"   - Avg sentiment: {df['sentiment_score'].mean():.3f}")
        print(f"   - Sentiment distribution:")
        print(df['sentiment_category'].value_counts())
        
        return df
    
    def create_feature_matrices(self, products_df, reviews_df):
        """Create final feature matrices for ML"""
        print("\nüîß Creating feature matrices...")
        
        # Products feature matrix
        if products_df is not None:
            product_features = products_df[[
                'product_id', 'product_name', 'brand', 'category',
                'ingredient_count', 'risk_score', 'risk_category',
                'high_risk_count', 'moderate_risk_count', 'comedogenic_count',
                'irritant_count', 'beneficial_count', 'beneficial_score'
            ]].copy()
            
            if 'price' in products_df.columns:
                product_features['price'] = products_df['price']
            if 'rating' in products_df.columns:
                product_features['avg_rating'] = products_df['rating']
            
            # Encode categorical features
            le_brand = LabelEncoder()
            le_category = LabelEncoder()
            
            product_features['brand_encoded'] = le_brand.fit_transform(product_features['brand'].fillna('unknown'))
            product_features['category_encoded'] = le_category.fit_transform(product_features['category'].fillna('unknown'))
            
            # Save
            product_features.to_csv(self.processed_dir / 'product_features.csv', index=False)
            print(f"‚úÖ Product features: {product_features.shape}")
        
        # Reviews feature matrix
        if reviews_df is not None:
            review_features = reviews_df[[
                'review_id', 'product_name', 'rating', 'sentiment_score',
                'sentiment_category', 'review_length', 'word_count'
            ]].copy()
            
            # Add skin concern columns
            concern_cols = [col for col in reviews_df.columns if col.startswith('mentions_')]
            for col in concern_cols:
                review_features[col] = reviews_df[col]
            
            if 'helpful_score' in reviews_df.columns:
                review_features['helpful_score'] = reviews_df['helpful_score']
            
            # Save
            review_features.to_csv(self.processed_dir / 'review_features.csv', index=False)
            print(f"‚úÖ Review features: {review_features.shape}")
        
        # Aggregate reviews by product
        if reviews_df is not None and products_df is not None:
            print("\nüìä Aggregating reviews by product...")
            
            review_aggregates = reviews_df.groupby('product_name').agg({
                'rating': ['mean', 'count', 'std'],
                'sentiment_score': 'mean',
                'review_length': 'mean',
                'word_count': 'mean'
            }).reset_index()
            
            review_aggregates.columns = ['product_name', 'avg_rating', 'review_count', 
                                         'rating_std', 'avg_sentiment', 'avg_review_length',
                                         'avg_word_count']
            
            # Add skin concern aggregates
            concern_cols = [col for col in reviews_df.columns if col.startswith('mentions_')]
            for col in concern_cols:
                concern_agg = reviews_df.groupby('product_name')[col].sum().reset_index()
                review_aggregates = review_aggregates.merge(concern_agg, on='product_name', how='left')
            
            review_aggregates.to_csv(self.processed_dir / 'review_aggregates_by_product.csv', index=False)
            print(f"‚úÖ Review aggregates: {review_aggregates.shape}")
        
        print("\n‚úÖ Feature matrices created!")
    
    def generate_data_summary_report(self):
        """Generate comprehensive data summary"""
        print("\n" + "="*70)
        print("üìã DATA PREPROCESSING & FEATURE ENGINEERING SUMMARY")
        print("="*70)
        
        files = {
            'Product Features': 'product_features.csv',
            'Review Features': 'review_features.csv',
            'Review Aggregates': 'review_aggregates_by_product.csv'
        }
        
        for name, filename in files.items():
            filepath = self.processed_dir / filename
            if filepath.exists():
                df = pd.read_csv(filepath)
                print(f"\nüìä {name}:")
                print(f"   Shape: {df.shape}")
                print(f"   Columns: {', '.join(df.columns[:8])}...")
                print(f"   File size: {filepath.stat().st_size / 1024 / 1024:.2f} MB")
                print(f"   Missing values: {df.isnull().sum().sum()}")
            else:
                print(f"\n‚ö†Ô∏è  {name}: Not found")
        
        print("\n" + "="*70)
        print("‚úÖ All processing complete! Ready for ML model training.")
        print("="*70)


def main():
    """Run the preprocessing pipeline end to end.

    Order of stages: load datasets, clean products and reviews, EDA,
    ingredient features, review features, feature-matrix export, summary
    report. Progress is printed as numbered steps.
    """
    divider = "=" * 70

    def announce(step_title):
        # Blank line + rule, the step title, then a closing rule.
        print("\n" + divider)
        print(step_title)
        print(divider)

    print("üöÄ Skincare Data Preprocessing & Feature Engineering Pipeline")
    print(divider)

    preprocessor = SkincareDataPreprocessor()

    announce("STEP 1: Loading Datasets")
    datasets = preprocessor.load_datasets()

    announce("STEP 2: Data Cleaning")
    products_clean = preprocessor.clean_products_data(datasets['products'])
    reviews_clean = preprocessor.clean_reviews_data(datasets['reviews'])

    announce("STEP 3: Exploratory Data Analysis")
    preprocessor.perform_eda(products_clean, reviews_clean)

    announce("STEP 4: Feature Engineering - Products")
    products_with_features = preprocessor.extract_ingredient_features(products_clean)

    announce("STEP 5: Feature Engineering - Reviews")
    reviews_with_features = preprocessor.extract_review_features(reviews_clean)

    announce("STEP 6: Creating Feature Matrices")
    preprocessor.create_feature_matrices(products_with_features, reviews_with_features)

    announce("STEP 7: Summary Report")
    preprocessor.generate_data_summary_report()

    closing_lines = (
        "\nüéâ Pipeline Complete!",
        "\nüìÇ Check the 'skincare_datasets/processed/' folder for:",
        "   - product_features.csv",
        "   - review_features.csv",
        "   - review_aggregates_by_product.csv",
        "   - visualizations/ (EDA plots)",
        "\nüöÄ Next: Train ML models for risk analysis and recommendations!",
    )
    for line in closing_lines:
        print(line)

# Run the full pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

üöÄ Skincare Data Preprocessing & Feature Engineering Pipeline

STEP 1: Loading Datasets
üìÇ Loading datasets...
‚úÖ Loaded products: 9,538 rows
‚úÖ Loaded reviews: 0 rows
‚úÖ Loaded ingredients: 10,791 rows

STEP 2: Data Cleaning

üßπ Cleaning products data...
   Removed 0 duplicates
   Removed 1660 products without ingredients
‚úÖ Products cleaned: 7,878 products remaining

üßπ Cleaning reviews data...
   Removed 0 duplicate reviews
   Removed 0 very short reviews
   Removed 0 reviews without ratings
‚úÖ Reviews cleaned: 0 reviews remaining

STEP 3: Exploratory Data Analysis

üìä Performing Exploratory Data Analysis...

üì¶ Products Analysis:
   ‚úÖ Products visualization saved

   Products Statistics:
   - Total products: 7,878
   - Unique brands: 388
   - Unique categories: 127
   - Avg price: $47.88
   - Median price: $35.00
   - Avg rating: 4.01

üí¨ Reviews Analysis:
   ‚úÖ Reviews visualization saved

   Reviews Statistics:
   - Total reviews: 0
   - Avg rating: nan
   -

In [13]:
"""
Comprehensive ML Models for Skincare Risk Analysis & Recommendation System
Fixed for scikit-learn compatibility issues
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_recall_fscore_support, roc_auc_score, roc_curve,
    mean_squared_error, mean_absolute_error, r2_score
)

# Classification Models
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, VotingClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Try importing XGBoost and LightGBM (optional)
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("‚ö†Ô∏è  XGBoost not available, using alternatives")

try:
    from lightgbm import LGBMClassifier
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("‚ö†Ô∏è  LightGBM not available, using alternatives")

# Recommendation
from sklearn.metrics.pairwise import cosine_similarity


class SkincareMLPipeline:
    def __init__(self, data_dir='skincare_datasets/processed'):
        """Initialize ML pipeline"""
        self.data_dir = Path(data_dir)
        self.models_dir = self.data_dir / 'models'
        self.models_dir.mkdir(exist_ok=True)
        
        self.results_dir = self.data_dir / 'results'
        self.results_dir.mkdir(exist_ok=True)
        
        self.models = {}
        self.scalers = {}
        self.encoders = {}
        
    def load_data(self):
        """Load preprocessed datasets"""
        print("üìÇ Loading preprocessed data...")
        
        datasets = {}
        files = {
            'products': 'product_features.csv',
            'reviews': 'review_features.csv',
            'aggregates': 'review_aggregates_by_product.csv'
        }
        
        for name, filename in files.items():
            filepath = self.data_dir / filename
            if filepath.exists():
                datasets[name] = pd.read_csv(filepath)
                print(f"‚úÖ Loaded {name}: {datasets[name].shape}")
            else:
                print(f"‚ö†Ô∏è  {name} not found")
                datasets[name] = None
        
        return datasets
    
    def prepare_risk_classification_data(self, products_df):
        """Build the feature matrix and encoded target for risk classification.

        Rows whose ``risk_category`` is missing are dropped. Missing feature
        values are replaced by 0, except ``price`` which is filled with the
        column median first.

        Args:
            products_df: Product feature table. It is not modified; a copy is
                used internally.

        Returns:
            Tuple ``(X, y_encoded, le, feature_cols)``: the feature DataFrame,
            the label-encoded target array, the fitted ``LabelEncoder`` and the
            list of feature column names actually used.

        Raises:
            KeyError: If ``risk_category`` is missing, or if none of the
                candidate feature columns exist.
        """
        print("\nüéØ Preparing Risk Classification Data...")

        df = products_df.copy()

        # Fail with a readable message instead of a bare KeyError further down.
        if 'risk_category' not in df.columns:
            raise KeyError("'risk_category' column not found in products data")

        # Features for prediction
        # NOTE(review): risk_category is presumably derived upstream from these
        # same counts; if so, near-perfect scores are expected rather than a
        # sign of a strong model -- confirm against the preprocessing step.
        feature_cols = [
            'ingredient_count', 'high_risk_count', 'moderate_risk_count',
            'comedogenic_count', 'irritant_count', 'beneficial_count',
            'beneficial_score'
        ]

        # Add encoded features if available
        if 'brand_encoded' in df.columns:
            feature_cols.append('brand_encoded')
        if 'category_encoded' in df.columns:
            feature_cols.append('category_encoded')
        if 'price' in df.columns:
            df['price'] = df['price'].fillna(df['price'].median())
            feature_cols.append('price')

        # Keep only columns that exist (same policy as the sentiment
        # preparation), but say so loudly: a missing count column usually
        # means the preprocessing output is incomplete.
        missing = [col for col in feature_cols if col not in df.columns]
        if missing:
            print(f"   ‚ö†Ô∏è  Missing feature columns skipped: {missing}")
            feature_cols = [col for col in feature_cols if col in df.columns]
        if not feature_cols:
            raise KeyError("No risk feature columns found in products data")

        # Remove rows with missing target
        df = df[df['risk_category'].notna()].copy()

        X = df[feature_cols].fillna(0)
        y = df['risk_category']

        # Encode target
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        print(f"   Features shape: {X.shape}")
        print(f"   Target distribution:")
        print(df['risk_category'].value_counts())

        return X, y_encoded, le, feature_cols
    
    def prepare_sentiment_classification_data(self, reviews_df):
        """Prepare data for review sentiment classification"""
        print("\nüí¨ Preparing Sentiment Classification Data...")
        
        df = reviews_df.copy()
        
        # Check if sentiment_category column exists
        if 'sentiment_category' not in df.columns:
            print("   ‚ö†Ô∏è  'sentiment_category' column not found!")
            print("   Available columns:", df.columns.tolist())
            return None, None, None, None
        
        # Remove rows with missing sentiment
        df = df[df['sentiment_category'].notna()].copy()
        
        if len(df) == 0:
            print("   ‚ö†Ô∏è  No valid sentiment data found after filtering!")
            return None, None, None, None
        
        feature_cols = ['rating', 'review_length', 'word_count']
        
        # Add skin concern features
        concern_cols = [col for col in df.columns if col.startswith('mentions_')]
        feature_cols.extend(concern_cols)
        
        if 'helpful_score' in df.columns:
            feature_cols.append('helpful_score')
        
        # Keep only columns that exist
        feature_cols = [col for col in feature_cols if col in df.columns]
        
        if len(feature_cols) == 0:
            print("   ‚ö†Ô∏è  No valid feature columns found!")
            return None, None, None, None
        
        X = df[feature_cols].fillna(0)
        y = df['sentiment_category']
        
        # Encode target
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)
        
        print(f"   Features shape: {X.shape}")
        print(f"   Features used: {feature_cols}")
        print(f"   Target distribution:")
        print(df['sentiment_category'].value_counts())
        
        return X, y_encoded, le, feature_cols
    
    def train_risk_classification_models(self, X, y, label_encoder):
        """Train and compare several classifiers for the product risk category.

        The data is split 80/20 (stratified on ``y``). Each candidate is fitted
        on the training part, scored on the held-out part and cross-validated
        (5 folds) on the training part. The candidate with the highest weighted
        F1-score is kept in ``self.models['risk_classifier']``.

        Scale-sensitive estimators (Logistic Regression, K-Nearest Neighbors)
        are wrapped in a ``StandardScaler`` pipeline. Every model in the
        results -- including the one that gets persisted -- therefore takes the
        raw, unscaled feature matrix, and cross-validation re-fits the scaler
        inside each fold instead of reusing statistics of the whole training
        set.

        Args:
            X: Feature matrix, one row per product.
            y: Encoded risk labels aligned with ``X``.
            label_encoder: Encoder that produced ``y``; stored in
                ``self.encoders['risk_encoder']``.

        Returns:
            ``(results, best_model_name)``. ``results`` maps each successfully
            trained model name to a dict with the keys ``model``, ``accuracy``,
            ``precision``, ``recall``, ``f1_score``, ``cv_mean``, ``cv_std``,
            ``y_test``, ``y_pred``, ``y_pred_proba`` and ``error_rate``.

        Raises:
            ValueError: If no candidate could be trained.
        """
        # Local import: sklearn is already a dependency of this file, only the
        # pipeline helper is new here.
        from sklearn.pipeline import make_pipeline

        print("\nü§ñ Training Risk Classification Models...")
        print("="*70)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Stand-alone scaler fitted on the training part. It is still stored
        # (and pickled as risk_scaler.pkl) for existing consumers, but the
        # stored classifier does not need it: scaling now lives inside the
        # models that require it.
        scaler = StandardScaler()
        scaler.fit(X_train)
        self.scalers['risk_scaler'] = scaler

        # Define models (compatible versions)
        models = {
            'Random Forest': RandomForestClassifier(
                n_estimators=200, max_depth=15, min_samples_split=5,
                random_state=42, n_jobs=-1
            ),
            'Gradient Boosting': GradientBoostingClassifier(
                n_estimators=150, max_depth=7, learning_rate=0.1,
                random_state=42
            ),
            'Logistic Regression': make_pipeline(
                StandardScaler(),
                LogisticRegression(
                    max_iter=1000, random_state=42, n_jobs=-1
                )
            ),
            'Decision Tree': DecisionTreeClassifier(
                max_depth=15, min_samples_split=5, random_state=42
            ),
            'K-Nearest Neighbors': make_pipeline(
                StandardScaler(),
                KNeighborsClassifier(
                    n_neighbors=5, n_jobs=-1
                )
            )
        }

        # Add XGBoost if available
        if XGBOOST_AVAILABLE:
            models['XGBoost'] = XGBClassifier(
                n_estimators=200, max_depth=8, learning_rate=0.1,
                random_state=42, n_jobs=-1,
                eval_metric='mlogloss',
                use_label_encoder=False
            )

        # Add LightGBM if available
        if LIGHTGBM_AVAILABLE:
            models['LightGBM'] = LGBMClassifier(
                n_estimators=200, max_depth=8, learning_rate=0.1,
                random_state=42, n_jobs=-1, verbose=-1
            )

        results = {}

        for name, model in models.items():
            print(f"\nüìä Training {name}...")

            # Best effort per model: one failing candidate must not stop the
            # comparison of the others.
            try:
                # Train and predict on raw features for every candidate.
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                if hasattr(model, 'predict_proba'):
                    y_pred_proba = model.predict_proba(X_test)
                else:
                    y_pred_proba = None

                # Evaluate
                accuracy = accuracy_score(y_test, y_pred)
                precision, recall, f1, _ = precision_recall_fscore_support(
                    y_test, y_pred, average='weighted', zero_division=0
                )

                # Cross-validation; falls back to the hold-out accuracy so the
                # result dict always has cv_mean / cv_std.
                try:
                    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
                    cv_mean = cv_scores.mean()
                    cv_std = cv_scores.std()
                except Exception as e:
                    print(f"   ‚ö†Ô∏è  CV failed: {str(e)[:50]}")
                    cv_mean = accuracy
                    cv_std = 0.0

                results[name] = {
                    'model': model,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1_score': f1,
                    'cv_mean': cv_mean,
                    'cv_std': cv_std,
                    'y_test': y_test,
                    'y_pred': y_pred,
                    'y_pred_proba': y_pred_proba,
                    'error_rate': 1 - accuracy
                }

                print(f"   Accuracy: {accuracy:.4f}")
                print(f"   Precision: {precision:.4f}")
                print(f"   Recall: {recall:.4f}")
                print(f"   F1-Score: {f1:.4f}")
                print(f"   Error Rate: {1-accuracy:.4f}")
                print(f"   CV Score: {cv_mean:.4f} (+/- {cv_std:.4f})")

            except Exception as e:
                print(f"   ‚ùå Failed to train {name}: {str(e)[:100]}")

        if not results:
            raise ValueError("No models were successfully trained!")

        # Find best model (highest weighted F1 on the hold-out set)
        best_model_name = max(results, key=lambda x: results[x]['f1_score'])
        print(f"\nüèÜ Best Model: {best_model_name} (F1: {results[best_model_name]['f1_score']:.4f})")

        # Save best model
        self.models['risk_classifier'] = results[best_model_name]['model']
        self.encoders['risk_encoder'] = label_encoder

        return results, best_model_name
    
    def train_sentiment_classification_models(self, X, y, label_encoder):
        """Train and compare several classifiers for review sentiment.

        The data is split 80/20 (stratified on ``y``); each candidate is fitted
        on the training part and scored on the held-out part. The candidate
        with the highest weighted F1-score is kept in
        ``self.models['sentiment_classifier']``.

        Logistic Regression and Naive Bayes are wrapped in a
        ``StandardScaler`` pipeline, so every model in the results --
        including the one that gets persisted -- takes the raw, unscaled
        feature matrix.

        Args:
            X: Feature matrix, one row per review.
            y: Encoded sentiment labels aligned with ``X``.
            label_encoder: Encoder that produced ``y``; stored in
                ``self.encoders['sentiment_encoder']``.

        Returns:
            ``(results, best_model_name)``. ``results`` maps each successfully
            trained model name to a dict with the keys ``model``, ``accuracy``,
            ``precision``, ``recall``, ``f1_score``, ``y_test``, ``y_pred`` and
            ``error_rate``.

        Raises:
            ValueError: If no candidate could be trained.
        """
        # Local import: sklearn is already a dependency of this file, only the
        # pipeline helper is new here.
        from sklearn.pipeline import make_pipeline

        print("\nü§ñ Training Sentiment Classification Models...")
        print("="*70)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Stand-alone scaler fitted on the training part. Still stored (and
        # pickled as sentiment_scaler.pkl) for existing consumers; the stored
        # classifier itself no longer depends on it.
        scaler = StandardScaler()
        scaler.fit(X_train)
        self.scalers['sentiment_scaler'] = scaler

        # Define models
        models = {
            'Random Forest': RandomForestClassifier(
                n_estimators=150, max_depth=12, random_state=42, n_jobs=-1
            ),
            'Logistic Regression': make_pipeline(
                StandardScaler(),
                LogisticRegression(
                    max_iter=1000, random_state=42, n_jobs=-1
                )
            ),
            'Naive Bayes': make_pipeline(StandardScaler(), GaussianNB()),
            'Gradient Boosting': GradientBoostingClassifier(
                n_estimators=100, max_depth=5, random_state=42
            )
        }

        # Add XGBoost if available
        if XGBOOST_AVAILABLE:
            models['XGBoost'] = XGBClassifier(
                n_estimators=150, max_depth=6, learning_rate=0.1,
                random_state=42, n_jobs=-1,
                eval_metric='mlogloss',
                use_label_encoder=False
            )

        results = {}

        for name, model in models.items():
            print(f"\nüìä Training {name}...")

            # Best effort per model: one failing candidate must not stop the
            # comparison of the others.
            try:
                # Train and predict on raw features for every candidate.
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                # Evaluate
                accuracy = accuracy_score(y_test, y_pred)
                precision, recall, f1, _ = precision_recall_fscore_support(
                    y_test, y_pred, average='weighted', zero_division=0
                )

                results[name] = {
                    'model': model,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1_score': f1,
                    'y_test': y_test,
                    'y_pred': y_pred,
                    'error_rate': 1 - accuracy
                }

                print(f"   Accuracy: {accuracy:.4f}")
                print(f"   F1-Score: {f1:.4f}")
                print(f"   Error Rate: {1-accuracy:.4f}")

            except Exception as e:
                print(f"   ‚ùå Failed to train {name}: {str(e)[:100]}")

        if not results:
            raise ValueError("No models were successfully trained!")

        # Best model (highest weighted F1 on the hold-out set)
        best_model_name = max(results, key=lambda x: results[x]['f1_score'])
        print(f"\nüèÜ Best Model: {best_model_name} (F1: {results[best_model_name]['f1_score']:.4f})")

        self.models['sentiment_classifier'] = results[best_model_name]['model']
        self.encoders['sentiment_encoder'] = label_encoder

        return results, best_model_name
    
    def plot_confusion_matrices(self, results, task_name, label_encoder):
        """Draw one confusion-matrix heatmap per model and save them as a PNG.

        The figure is written to
        ``self.results_dir / f'{task_name}_confusion_matrices.png'`` with up to
        three heatmaps per row.

        Args:
            results: Mapping of model name to a result dict holding at least
                ``y_test``, ``y_pred``, ``accuracy`` and ``error_rate``.
            task_name: Prefix used for the output file name.
            label_encoder: Fitted encoder whose ``classes_`` label the axes.
                Assumes ``y_test`` / ``y_pred`` hold this encoder's integer
                codes (0 .. n_classes-1), as produced by the prepare_* methods.
        """
        print(f"\nüìà Generating confusion matrices for {task_name}...")

        n_models = len(results)
        if n_models == 0:
            # Nothing to draw; the grid arithmetic below would divide by zero.
            print("   ‚ö†Ô∏è  No results to plot")
            return

        cols = min(3, n_models)
        rows = (n_models + cols - 1) // cols

        # squeeze=False always returns a 2-D array of axes, so the single-model
        # case needs no special handling before flattening.
        fig, axes = plt.subplots(
            rows, cols, figsize=(6*cols, 5*rows), squeeze=False
        )
        axes = axes.flatten()

        class_names = label_encoder.classes_
        # Pin the matrix to every known class: without labels=, a class absent
        # from both y_test and y_pred shrinks the matrix and the tick labels
        # no longer line up with the rows/columns.
        class_ids = np.arange(len(class_names))

        try:
            for ax, (name, result) in zip(axes, results.items()):
                cm = confusion_matrix(
                    result['y_test'], result['y_pred'], labels=class_ids
                )

                # Plot
                sns.heatmap(
                    cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names,
                    ax=ax, cbar=True
                )

                ax.set_title(f'{name}\nAcc: {result["accuracy"]:.4f} | Err: {result["error_rate"]:.4f}')
                ax.set_ylabel('True Label')
                ax.set_xlabel('Predicted Label')

            # Hide unused subplots
            for ax in axes[n_models:]:
                ax.axis('off')

            plt.tight_layout()
            filename = self.results_dir / f'{task_name}_confusion_matrices.png'
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            print(f"‚úÖ Saved: {filename}")
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close(fig)
    
    def plot_model_comparison(self, results, task_name):
        """Save a 2x2 comparison figure of all trained models.

        Panels: accuracy bars, error-rate bars, grouped precision/recall/F1
        bars and a metrics table. The table row of the model with the highest
        F1-score is highlighted, matching the criterion the training methods
        use to pick the best model. The figure is written to
        ``self.results_dir / f'{task_name}_model_comparison.png'``.

        Args:
            results: Mapping of model name to a result dict holding
                ``accuracy``, ``precision``, ``recall``, ``f1_score`` and
                ``error_rate``.
            task_name: Prefix used for the output file name.
        """
        print(f"\nüìä Generating model comparison for {task_name}...")

        if not results:
            # max() on the empty metric lists below would raise.
            print("   ‚ö†Ô∏è  No results to plot")
            return

        # Prepare data
        models = list(results.keys())
        accuracies = [results[m]['accuracy'] for m in models]
        precisions = [results[m]['precision'] for m in models]
        recalls = [results[m]['recall'] for m in models]
        f1_scores = [results[m]['f1_score'] for m in models]
        error_rates = [results[m]['error_rate'] for m in models]

        # Create comparison plots
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        try:
            # 1. Accuracy comparison
            axes[0, 0].barh(models, accuracies, color='steelblue', alpha=0.8)
            axes[0, 0].set_xlabel('Accuracy')
            axes[0, 0].set_title('Model Accuracy Comparison')
            axes[0, 0].set_xlim(0, 1)
            for i, v in enumerate(accuracies):
                axes[0, 0].text(v + 0.01, i, f'{v:.4f}', va='center')

            # 2. Error Rate comparison
            axes[0, 1].barh(models, error_rates, color='crimson', alpha=0.8)
            axes[0, 1].set_xlabel('Error Rate')
            axes[0, 1].set_title('Model Error Rate Comparison')
            # When every model is perfect the scaled maximum is 0, which would
            # give identical lower/upper limits; fall back to the full range.
            error_upper = max(error_rates) * 1.2
            axes[0, 1].set_xlim(0, error_upper if error_upper > 0 else 1)
            for i, v in enumerate(error_rates):
                axes[0, 1].text(v + 0.005, i, f'{v:.4f}', va='center')

            # 3. Precision, Recall, F1 comparison
            x = np.arange(len(models))
            width = 0.25

            axes[1, 0].bar(x - width, precisions, width, label='Precision', alpha=0.8)
            axes[1, 0].bar(x, recalls, width, label='Recall', alpha=0.8)
            axes[1, 0].bar(x + width, f1_scores, width, label='F1-Score', alpha=0.8)
            axes[1, 0].set_ylabel('Score')
            axes[1, 0].set_title('Precision, Recall, F1-Score Comparison')
            axes[1, 0].set_xticks(x)
            axes[1, 0].set_xticklabels(models, rotation=45, ha='right')
            axes[1, 0].legend()
            axes[1, 0].set_ylim(0, 1)
            axes[1, 0].grid(axis='y', alpha=0.3)

            # 4. Overall metrics table
            axes[1, 1].axis('off')
            col_labels = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Error']
            table_data = [
                [
                    model,
                    f"{results[model]['accuracy']:.4f}",
                    f"{results[model]['precision']:.4f}",
                    f"{results[model]['recall']:.4f}",
                    f"{results[model]['f1_score']:.4f}",
                    f"{results[model]['error_rate']:.4f}"
                ]
                for model in models
            ]

            table = axes[1, 1].table(
                cellText=table_data,
                colLabels=col_labels,
                cellLoc='center',
                loc='center',
                colWidths=[0.25, 0.15, 0.15, 0.15, 0.15, 0.15]
            )
            table.auto_set_font_size(False)
            table.set_fontsize(9)
            table.scale(1, 2)

            # Style header (table row 0 holds the column labels)
            for i in range(len(col_labels)):
                table[(0, i)].set_facecolor('#40466e')
                table[(0, i)].set_text_props(weight='bold', color='white')

            # Highlight the best model. F1 is used (not accuracy) so the
            # highlighted row is the same model the training step reports and
            # persists as best. +1 skips the header row.
            best_idx = f1_scores.index(max(f1_scores))
            for i in range(len(col_labels)):
                table[(best_idx + 1, i)].set_facecolor('#90EE90')

            plt.tight_layout()
            filename = self.results_dir / f'{task_name}_model_comparison.png'
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            print(f"‚úÖ Saved: {filename}")
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close(fig)
    
    def create_product_recommendation_system(self, products_df):
        """Compute pairwise product similarity for content-based recommendation.

        The similarity features are standardized and compared with cosine
        similarity, giving one row and one column per product.

        Args:
            products_df: Product feature table; not modified (a copy is used).

        Returns:
            ``(similarity_matrix, df)``: the square similarity matrix and the
            working copy of the products (``price`` median-filled when the
            column exists). Row ``i`` of the matrix corresponds to
            ``df.iloc[i]``.
        """
        print("\nüéØ Creating Product Recommendation System...")

        catalog = products_df.copy()

        # Columns that describe a product for similarity purposes.
        similarity_features = [
            'ingredient_count', 'risk_score', 'beneficial_score',
            'high_risk_count', 'moderate_risk_count', 'comedogenic_count'
        ]

        # Optional columns are only used when the frame provides them.
        if 'category_encoded' in catalog.columns:
            similarity_features.append('category_encoded')
        if 'price' in catalog.columns:
            median_price = catalog['price'].median()
            catalog['price'] = catalog['price'].fillna(median_price)
            similarity_features.append('price')

        # Standardize so no single feature dominates the cosine similarity.
        feature_matrix = catalog[similarity_features].fillna(0)
        standardized = StandardScaler().fit_transform(feature_matrix)

        similarity_matrix = cosine_similarity(standardized)

        print(f"‚úÖ Recommendation system created")
        print(f"   Similarity matrix shape: {similarity_matrix.shape}")

        return similarity_matrix, catalog
    
    def recommend_safer_alternatives(self, product_idx, similarity_matrix, products_df, n=5):
        """Recommend safer alternative products"""
        
        # Get product details
        product = products_df.iloc[product_idx]
        product_risk = product['risk_score']
        
        # Find similar products with lower risk
        similarities = similarity_matrix[product_idx]
        
        # Create recommendation scores
        rec_scores = []
        for i, sim in enumerate(similarities):
            if i != product_idx:
                other_risk = products_df.iloc[i]['risk_score']
                if other_risk < product_risk:
                    rec_score = sim * (1 - (other_risk / 100))
                    rec_scores.append((i, sim, rec_score))
        
        # Sort by recommendation score
        rec_scores.sort(key=lambda x: x[2], reverse=True)
        
        # Get top N recommendations
        recommendations = []
        for i, sim, rec_score in rec_scores[:n]:
            rec_product = products_df.iloc[i]
            recommendations.append({
                'product_name': rec_product['product_name'],
                'brand': rec_product['brand'],
                'risk_score': rec_product['risk_score'],
                'risk_category': rec_product['risk_category'],
                'similarity': sim,
                'recommendation_score': rec_score,
                'risk_reduction': product_risk - rec_product['risk_score']
            })
        
        return recommendations
    
    def save_models(self):
        """Save all trained models"""
        print("\nüíæ Saving models...")
        
        # Save models
        for name, model in self.models.items():
            filename = self.models_dir / f'{name}.pkl'
            with open(filename, 'wb') as f:
                pickle.dump(model, f)
            print(f"‚úÖ Saved {name}")
        
        # Save scalers
        for name, scaler in self.scalers.items():
            filename = self.models_dir / f'{name}.pkl'
            with open(filename, 'wb') as f:
                pickle.dump(scaler, f)
            print(f"‚úÖ Saved {name}")
        
        # Save encoders
        for name, encoder in self.encoders.items():
            filename = self.models_dir / f'{name}.pkl'
            with open(filename, 'wb') as f:
                pickle.dump(encoder, f)
            print(f"‚úÖ Saved {name}")
    
    def generate_final_report(self, risk_results, sentiment_results):
        """Generate comprehensive performance report"""
        print("\n" + "="*70)
        print("üìã FINAL MODEL PERFORMANCE REPORT")
        print("="*70)
        
        print("\nüéØ RISK CLASSIFICATION MODELS:")
        print("-" * 70)
        for name, result in risk_results.items():
            print(f"\n{name}:")
            print(f"  Accuracy:  {result['accuracy']:.4f} ({result['accuracy']*100:.2f}%)")
            print(f"  Precision: {result['precision']:.4f}")
            print(f"  Recall:    {result['recall']:.4f}")
            print(f"  F1-Score:  {result['f1_score']:.4f}")
            print(f"  Error Rate: {result['error_rate']:.4f} ({result['error_rate']*100:.2f}%)")
            if 'cv_mean' in result:
                print(f"  CV Score:  {result['cv_mean']:.4f} (+/- {result['cv_std']:.4f})")
        
        if sentiment_results:
            print("\n" + "="*70)
            print("\nüí¨ SENTIMENT CLASSIFICATION MODELS:")
            print("-" * 70)
            for name, result in sentiment_results.items():
                print(f"\n{name}:")
                print(f"  Accuracy:  {result['accuracy']:.4f} ({result['accuracy']*100:.2f}%)")
                print(f"  Precision: {result['precision']:.4f}")
                print(f"  Recall:    {result['recall']:.4f}")
                print(f"  F1-Score:  {result['f1_score']:.4f}")
                print(f"  Error Rate: {result['error_rate']:.4f} ({result['error_rate']*100:.2f}%)")
        
        print("\n" + "="*70)
        print("‚úÖ All models trained and evaluated successfully!")
        print("="*70)


def main():
    """Run the end-to-end training pipeline.

    Steps: load the preprocessed data, train and plot the risk classifiers,
    train and plot the sentiment classifiers when review data is usable,
    build the recommendation matrix and print a sample recommendation, save
    the trained artifacts and print the final report.

    Returns early (after printing an error) when the product data file is
    missing. A failure of the optional sentiment task is reported and skipped
    so that the already-trained risk models are still saved.
    """
    print("üöÄ Skincare ML Model Training Pipeline")
    print("="*70)

    # Initialize pipeline
    pipeline = SkincareMLPipeline()

    # Load data
    print("\n" + "="*70)
    print("STEP 1: Loading Data")
    print("="*70)
    datasets = pipeline.load_data()

    if datasets['products'] is None:
        print("‚ùå Error: Product data not found. Run preprocessing first!")
        return

    # === TASK 1: RISK CLASSIFICATION ===
    print("\n" + "="*70)
    print("TASK 1: PRODUCT RISK CLASSIFICATION")
    print("="*70)

    X_risk, y_risk, le_risk, _ = pipeline.prepare_risk_classification_data(
        datasets['products']
    )

    risk_results, _ = pipeline.train_risk_classification_models(
        X_risk, y_risk, le_risk
    )

    pipeline.plot_confusion_matrices(risk_results, 'risk_classification', le_risk)
    pipeline.plot_model_comparison(risk_results, 'risk_classification')

    # === TASK 2: SENTIMENT CLASSIFICATION ===
    sentiment_results = {}
    if datasets['reviews'] is not None and len(datasets['reviews']) > 0:
        print("\n" + "="*70)
        print("TASK 2: REVIEW SENTIMENT CLASSIFICATION")
        print("="*70)

        X_sentiment, y_sentiment, le_sentiment, sentiment_features = \
            pipeline.prepare_sentiment_classification_data(datasets['reviews'])

        # Only train if we have valid data (all must be not None)
        if (X_sentiment is not None and y_sentiment is not None and
            le_sentiment is not None and sentiment_features is not None and
            len(X_sentiment) > 0):

            # The trainer signals failure by raising ValueError (e.g. no model
            # could be trained). This task is optional, so report it and carry
            # on; otherwise the risk models trained above would never be saved.
            try:
                sentiment_results, _ = pipeline.train_sentiment_classification_models(
                    X_sentiment, y_sentiment, le_sentiment
                )
            except ValueError as exc:
                print(f"‚ö†Ô∏è  Sentiment classification failed: {exc}")
                sentiment_results = {}

            if sentiment_results:  # Check if training was successful
                pipeline.plot_confusion_matrices(sentiment_results, 'sentiment_classification', le_sentiment)
                pipeline.plot_model_comparison(sentiment_results, 'sentiment_classification')
        else:
            print("‚ö†Ô∏è  Skipping sentiment classification (invalid/empty data)")
    else:
        print("\n‚ö†Ô∏è  Skipping sentiment classification (no review data)")

    # === TASK 3: RECOMMENDATION SYSTEM ===
    print("\n" + "="*70)
    print("TASK 3: PRODUCT RECOMMENDATION SYSTEM")
    print("="*70)

    similarity_matrix, products_for_rec = pipeline.create_product_recommendation_system(
        datasets['products']
    )

    # Test recommendation (first product in the table is used as the sample)
    print("\nüìã Testing Recommendation System:")
    test_idx = 0
    test_product = products_for_rec.iloc[test_idx]
    print(f"\nOriginal Product: {test_product['product_name']}")
    print(f"Risk Score: {test_product['risk_score']:.1f}")
    print(f"Risk Category: {test_product['risk_category']}")

    recommendations = pipeline.recommend_safer_alternatives(
        test_idx, similarity_matrix, products_for_rec, n=5
    )

    if recommendations:
        print("\n‚ú® Safer Alternatives:")
        for i, rec in enumerate(recommendations, 1):
            print(f"\n{i}. {rec['product_name']} ({rec['brand']})")
            print(f"   Risk Score: {rec['risk_score']:.1f} (‚Üì{rec['risk_reduction']:.1f})")
            print(f"   Risk Category: {rec['risk_category']}")
            print(f"   Similarity: {rec['similarity']:.3f}")
    else:
        print("\n‚ö†Ô∏è  No safer alternatives found for this product.")

    # Save models
    print("\n" + "="*70)
    print("STEP 4: Saving Models")
    print("="*70)
    pipeline.save_models()

    # Final report
    pipeline.generate_final_report(risk_results, sentiment_results)

    print("\nüéâ ML Pipeline Complete!")
    print("\nüìÇ Check these folders:")
    print(f"   - Models: {pipeline.models_dir}")
    print(f"   - Results: {pipeline.results_dir}")
    print("\nüöÄ Next Steps:")
    print("   1. Review the confusion matrices and model comparisons")
    print("   2. Test the recommendation system with different products")
    print("   3. Integrate the trained models into your web application")
    print("   4. Use the saved .pkl files for making predictions")

# Entry point: run the training pipeline when this code is executed directly
# (as a script or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

🚀 Skincare ML Model Training Pipeline

STEP 1: Loading Data
📂 Loading preprocessed data...
✅ Loaded products: (7878, 17)
✅ Loaded reviews: (0, 15)
✅ Loaded aggregates: (0, 14)

TASK 1: PRODUCT RISK CLASSIFICATION

🎯 Preparing Risk Classification Data...
   Features shape: (5841, 10)
   Target distribution:
risk_category
High         2428
Low          2082
Moderate      997
Very High     334
Name: count, dtype: int64

🤖 Training Risk Classification Models...

📊 Training Random Forest...
   Accuracy: 0.9966
   Precision: 0.9966
   Recall: 0.9966
   F1-Score: 0.9966
   Error Rate: 0.0034
   CV Score: 0.9940 (+/- 0.0029)

📊 Training Gradient Boosting...
   Accuracy: 0.9983
   Precision: 0.9983
   Recall: 0.9983
   F1-Score: 0.9983
   Error Rate: 0.0017
   CV Score: 0.9961 (+/- 0.0014)

📊 Training Logistic Regression...
   Accuracy: 0.9940
   Precision: 0.9940
   Recall: 0.9940
   F1-Score: 0.9940
   Error Rate: 0.0060
   CV Score: 0.9951 (+/- 0.0029)

üìä Traini