In [5]:
import pandas as pd

In [6]:
def analyze_apple_feedback():
    """
    Analyze feedback for Apple products with the following criteria:
    1. Only include feedback with actual content
    2. For products with >1000 reviews, take top 1000 based on content length
    3. Return specific columns in the final output
    """
    # Read the CSV files
    brand_df = pd.read_csv('warehouse/product/Brand.csv', encoding='utf-8-sig')
    product_df = pd.read_csv('warehouse/product/Product.csv', encoding='utf-8-sig')
    feedback_df = pd.read_csv('warehouse/feedback/FeedbackDetail.csv', encoding='utf-8-sig')

    # Merge the tables to get Apple product feedback
    apple_feedback = (feedback_df
        .merge(product_df[['ProductID', 'BrandID', 'Name']], on='ProductID', how='inner')
        .merge(brand_df[['BrandID', 'Name']], on='BrandID', how='inner')
        .query('BrandID == 17827'))
    
    # Rename product name column
    apple_feedback = apple_feedback.rename(columns={'Name_x': 'Product_Name'})
    
    # Filter for entries with actual content and add word count
    apple_feedback = apple_feedback[
        apple_feedback['Content'].notna() & 
        (apple_feedback['Content'].str.strip() != '')
    ]
    
    # Add word count for content
    apple_feedback['word_count'] = apple_feedback['Content'].str.split().str.len()

    def sample_reviews(group):
        """
        If group has >1000 reviews, take top 1000 by word count
        Otherwise return all reviews
        """
        if len(group) > 2000:
            return group.nlargest(1000, 'word_count')
        return group

    # Apply sampling by product
    apple_feedback = (apple_feedback
        .groupby('Product_Name', group_keys=False)
        .apply(sample_reviews)
        .reset_index(drop=True))
    
    # Select final columns
    final_columns = [
        'ProductID',
        'UserID',
        'GeneralFeedbackID',
        'Title',
        'Content',
        'Upvote',
        'Rating',
        'CreatedDate',
        'FeedbackDetailID'
    ]
    
    apple_feedback_filtered = apple_feedback[final_columns]
    
    # Print summary statistics
    product_review_counts = apple_feedback.groupby('Product_Name').size().sort_values(ascending=False)
    print("\nFinal review counts per product after filtering:")
    print(product_review_counts)
    
    print(f"\nTotal number of reviews in final dataset: {len(apple_feedback_filtered)}")
    
    # Sample of content lengths
    print("\nContent length distribution (words):")
    print(apple_feedback['word_count'].describe())
    
    return apple_feedback_filtered

In [7]:

# Run the analysis
apple_feedback_df = analyze_apple_feedback()


Final review counts per product after filtering:
Product_Name
Apple iPhone 13                                1974
Apple iPhone 11                                1000
MacBook Air M1 13 inch 2020                     821
Apple iPad 10.2-inch (9th Gen) Wi-Fi, 2021      420
Apple iPhone 15 Pro Max                         166
Apple iPhone 14                                 123
Apple iPad Air (5th Gen) Wi-Fi, 2022            104
Apple iPhone 15                                  74
MacBook Air M2                                   68
Apple iPhone 14 Plus                             60
Apple iPhone 16 Pro Max                          55
Apple iPhone 15 Plus                             40
Apple iPad 10.9-inch (10th Gen) Wi-Fi, 2022      36
Apple iPhone 15 Pro                              35
Apple iPhone 16                                  11
Apple iPhone 16 Pro                              10
Apple iPhone 16 Plus                              6
Apple iPad Pro M4 11-Inch Wi-Fi                   4
A

  .apply(sample_reviews)


In [8]:
# drop duplicate row
apple_feedback_df = apple_feedback_df.drop_duplicates()
apple_feedback_df.to_csv('CustomerSatisfaction.csv', index=False, encoding='utf-8-sig')