In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Jumia operations dataset (comma-separated CSV) from the parent directory.
# NOTE(review): the path is relative to the kernel's working directory — confirm
# the file location before running.
df = pd.read_csv("../jumia_full_operations_dataset.csv", sep=",")
# Preview the first rows to sanity-check the load.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../jumia_full_operations_dataset.csv'

In [None]:
# List the column labels of the loaded frame.
df.columns

Index(['order_id', 'customer_id', 'city', 'product_category', 'product_price',
       'discount', 'quantity', 'payment_method', 'delivery_status',
       'delivery_time_days', 'seller_rating', 'customer_rating',
       'return_reason', 'seller_id', 'rider_id', 'rider_delivery_success_rate',
       'warehouse_id', 'warehouse_stock_level', 'support_ticket_opened',
       'support_category', 'marketing_campaign', 'campaign_conversion_rate'],
      dtype='object')

In [2]:
def process_category_discounts(df: pd.DataFrame) -> pd.DataFrame:
    """
    Average the ``discount`` column within each ``product_category``.

    Args:
        df: Frame containing at least ``product_category`` and ``discount``.

    Returns:
        A frame with columns ``['product_category', 'avg_discount']``, one row
        per category, ordered by ``product_category`` ascending.
    """
    # Mean discount per category, as a Series indexed by product_category.
    mean_by_category = df.groupby("product_category")["discount"].mean()

    # Name the value column, then move the category index back into a column.
    summary = mean_by_category.rename("avg_discount").reset_index()

    return summary.sort_values("product_category", ascending=True)


In [3]:
def generate_analytics(data: pd.DataFrame) -> str:
    """
    Return the product category with the highest average discount.

    Rows missing ``product_category`` or ``discount`` are ignored, and
    ``discount`` is cast to float before averaging.

    Args:
        data: Order-level frame with ``product_category`` and ``discount`` columns.

    Returns:
        The ``product_category`` label whose mean discount is the largest.
        (The value is a single label, not a Series.)

    Raises:
        ValueError: If no row has both a category and a discount, so there is
            nothing to rank.
    """
    # Build the cleaned frame with assign() instead of writing a column into
    # the dropna() result, which can raise SettingWithCopyWarning.
    cleaned = data.dropna(subset=["product_category", "discount"])
    cleaned = cleaned.assign(discount=cleaned["discount"].astype(float))

    # idxmax() on an empty result fails with an opaque message; fail early
    # with the same exception type and a clear explanation instead.
    if cleaned.empty:
        raise ValueError(
            "No rows with both 'product_category' and 'discount'; "
            "cannot determine the top category."
        )

    # Challenge1: mean discount per product category
    category_discounts = process_category_discounts(cleaned)

    # Row label of the largest average, then the category stored on that row.
    top_row = category_discounts["avg_discount"].idxmax()
    return category_discounts.loc[top_row, "product_category"]

In [None]:
def generate_analytics(data_json):
    """
    Summarise average discounts per product category and placeholder month.

    Args:
        data_json: Anything ``pd.DataFrame`` accepts (e.g. a list of record
            dicts) with ``order_id``, ``product_category`` and ``discount``.

    Returns:
        dict with three keys:
            - ``'monthly_avg_discounts'``: the aggregated rows as a list of dicts
            - ``'top_product_categories'``: categories that reach the maximum
              average discount
            - ``'max_average_discount'``: that maximum value

    Note:
        Reads the ``'average_discount'`` column from the frame returned by
        ``process_discount_aggregation``.
    """
    orders = pd.DataFrame(data_json)

    # There is no date column, so order_id stands in for time: every run of 30
    # consecutive ids becomes one placeholder month (1-30 -> 1, 31-60 -> 2, ...).
    placeholder_month = (orders['order_id'] - 1) // 30 + 1
    orders['month'] = placeholder_month.astype(int)

    # Challenge1: average discount per (product_category, month)
    monthly = process_discount_aggregation(orders)

    # Keep every category that reaches the overall peak, not just the first.
    peak = monthly['average_discount'].max()
    at_peak = monthly['average_discount'] == peak
    top_categories = monthly.loc[at_peak, 'product_category'].unique()

    return {
        'monthly_avg_discounts': monthly.to_dict(orient='records'),
        'top_product_categories': top_categories.tolist(),
        'max_average_discount': peak,
    }

def process_discount_aggregation(df: pd.DataFrame) -> pd.DataFrame:
    """
    Challenge1: average the discount for every (product_category, month) pair.

    Args:
        df: Frame with ``product_category``, ``month`` and ``discount`` columns.

    Returns:
        A frame with columns ``'product_category'``, ``'month'`` and
        ``'average_discount'``, one row per group.
    """
    # The value column must be named 'average_discount': that is the name this
    # function documents and the name generate_analytics reads.
    df_agg = (
        df.groupby(["product_category", "month"])["discount"]
        .mean()
        .reset_index()
        .rename(columns={"discount": "average_discount"})
    )
    return df_agg


In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict

def generate_analytics(data: List[Dict]) -> Dict[str, pd.DataFrame]:
    """
    Build two discount reports from raw order records.

    The records carry no date, so a stand-in ``month`` column is derived from
    ``order_id`` before the helpers are called.

    Args:
        data (List[Dict]): Raw order data as a list of dicts; each record needs
            an ``order_id`` plus the columns the two helpers read.

    Returns:
        Dict[str, pd.DataFrame]: Two dataframes:
            - 'discount_distribution': result of ``process_discount_distribution``
            - 'top_campaigns': result of ``process_top_marketing_campaign``
    """
    orders = pd.DataFrame(data)

    # Simulated month: order_id modulo 12, with a remainder of 0 mapped to 12
    # (so integer ids yield values 1..12).
    month_proxy = orders["order_id"] % 12
    orders["month"] = month_proxy.replace(0, 12)

    # Challenge1 feeds 'discount_distribution'; Challenge2 feeds 'top_campaigns'.
    return {
        "discount_distribution": process_discount_distribution(orders),
        "top_campaigns": process_top_marketing_campaign(orders),
    }

def process_discount_distribution(df: pd.DataFrame) -> pd.DataFrame:
    """
    Challenge1: describe the spread of ``discount`` for each month and category.

    Args:
        df: Frame with ``month``, ``product_category`` and ``discount`` columns.

    Returns:
        A frame with one row per (month, product_category) pair and the columns
        ``month``, ``product_category``, ``mean``, ``min``, ``max`` and ``std``.
    """
    # Select the discount values within each (month, category) group.
    grouped_discount = df.groupby(['month', 'product_category'])['discount']

    # One output column per statistic, named after the statistic.
    stats = grouped_discount.agg(['mean', 'min', 'max', 'std'])

    # Move the group keys from the index back into ordinary columns.
    return stats.reset_index()

def process_top_marketing_campaign(df: pd.DataFrame) -> pd.DataFrame:
    """
    Challenge2: find the marketing campaign with the highest average discount
    for every (month, product_category) pair.

    Args:
        df: Frame with ``month``, ``product_category``, ``marketing_campaign``
            and ``discount`` columns.

    Returns:
        A frame with a fresh 0..n-1 index and the columns ``month``,
        ``product_category``, ``marketing_campaign`` and ``average_discount``,
        holding one row (the top campaign) per (month, product_category) pair.
    """
    # Average discount of each campaign within each (month, category) group.
    campaign_avg = (
        df.groupby(['month', 'product_category', 'marketing_campaign'])['discount']
        .mean()
        .reset_index()
        .rename(columns={"discount": "average_discount"})
    )

    # Order groups ascending but discounts descending, so the best campaign is
    # the first row of each group. The flags must be real booleans: the strings
    # "True"/"False" are rejected by current pandas and are all truthy otherwise.
    ranked = campaign_avg.sort_values(
        ['month', 'product_category', 'average_discount'],
        ascending=[True, True, False],
    )

    # Keep only the first (highest-average) row of each (month, category) group.
    df_top = (
        ranked.groupby(['month', 'product_category'])
        .head(1)
        .reset_index(drop=True)
    )

    return df_top

KeyError: 'month'

In [3]:
# Load the shopping-behaviour dataset (comma-separated CSV) from ./datasets.
shopping_df = pd.read_csv("./datasets/shopping_behavior_updated.csv", sep=",")
# Preview the first rows to sanity-check the load.
shopping_df.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [4]:
# List the column labels of the shopping frame.
shopping_df.columns

Index(['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category',
       'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season',
       'Review Rating', 'Subscription Status', 'Shipping Type',
       'Discount Applied', 'Promo Code Used', 'Previous Purchases',
       'Payment Method', 'Frequency of Purchases'],
      dtype='object')