In [None]:
import ast
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Configuration
# Location of the CSV dataset read by this notebook (relative path).
DATASET_FILE = '../intern_data_ikarus.csv'

In [None]:
# --- Load the dataset and take a first look at its structure ---
# Echo the source path first so the run output records which file was read.
print(f"Loading dataset: {DATASET_FILE}")
df = pd.read_csv(filepath_or_buffer=DATASET_FILE)

# DataFrame.info() prints its own report, so only the heading needs print().
print("Initial DataFrame Info:")
df.info()

In [None]:
# Data Cleaning and Preprocessing for Analytics ---

#  Clean 'price' column and convert to numeric (Crucial for price analysis)
def clean_price(price):
    """Convert a raw price value to a number, or NaN when it cannot be parsed.

    String inputs have every '$' and ',' removed and surrounding whitespace
    stripped before conversion; no other characters are removed. Non-string
    inputs are handed to ``pd.to_numeric`` unchanged.

    Args:
        price: Raw cell value from the price column (string or otherwise).

    Returns:
        The numeric value produced by ``pd.to_numeric``; NaN for anything it
        cannot parse, because ``errors='coerce'`` is used.
    """
    candidate = price
    if isinstance(candidate, str):
        # e.g. "$1,234.50" -> "1234.50"
        candidate = candidate.replace('$', '').replace(',', '').strip()
    return pd.to_numeric(candidate, errors='coerce')

# Attach the parsed price as 'price_numeric', then discard every row whose
# price could not be parsed (clean_price leaves NaN in those rows).
df = (
    df
    .assign(price_numeric=df['price'].apply(clean_price))
    .dropna(subset=['price_numeric'])
)

In [None]:
# Clean 'categories', 'material' and 'brand' for grouping
# Replace NaNs with explicit placeholder labels so rows with missing values
# stay in the grouped counts/averages instead of being silently left out,
# and so every cell in these columns is a string.
df['categories'] = df['categories'].fillna('Unknown')
df['material'] = df['material'].fillna('Unknown')
df['brand'] = df['brand'].fillna('Generic')

In [None]:
# Exploratory Data Analysis (EDA) & Visualization ---

# Visualize Price Distribution
# Histogram of the cleaned prices (50 bins) with a KDE curve drawn on top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price_numeric'], bins=50, kde=True, ax=ax)
ax.set(
    title='Distribution of Product Prices',
    xlabel='Price ($)',
    ylabel='Count',
)
plt.show()

In [None]:
## Top 10 Brands
# Count products per brand and keep the ten largest counts.
top_brands = df['brand'].value_counts().nlargest(10)

fig, ax = plt.subplots(figsize=(12, 6))
top_brands.plot.bar(ax=ax)
ax.set_title('Top 10 Brands by Product Count')
ax.set_ylabel('Product Count')
# Slant the brand labels and anchor them at their right edge so that long
# names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
## Analyze Category Distribution (using a simplified first category)
def extract_primary_category(raw):
    """Return the first category name from a raw 'categories' cell.

    Cells are expected to hold the text of a Python list, e.g.
    "['Home & Kitchen', 'Furniture']". The text is parsed with
    ``ast.literal_eval`` so that quotes and brackets are removed reliably and
    commas inside a category name do not split it.

    Args:
        raw: Cell value from the 'categories' column.

    Returns:
        The first category as a stripped string, or 'Unknown' when the cell
        yields no category name.
    """
    text = str(raw).strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. the plain placeholder 'Unknown').
        parsed = None

    if isinstance(parsed, (list, tuple)):
        name = str(parsed[0]).strip() if parsed else ''
    else:
        # Fallback for plain or malformed text: take everything before the
        # first comma and trim list/quote punctuation from BOTH ends, so a
        # trailing quote is never left behind.
        name = text.split(',')[0].strip(" []'\"")
    return name or 'Unknown'


# Extract only the primary category for cleaner plotting
df['primary_category'] = df['categories'].apply(extract_primary_category)
top_categories = df['primary_category'].value_counts().nlargest(10)

# Pie chart of the ten most frequent primary categories; the percentages are
# shares within these ten, not of the whole dataset.
plt.figure(figsize=(12, 6))
top_categories.plot(kind='pie', autopct='%1.1f%%')
plt.title('Top Primary Category Distribution')
plt.ylabel('')
plt.show()

In [None]:
## Calculating Metrics for FastAPI Dashboard ---

# Total Products: number of rows currently in df
total_products = df.shape[0]

# Top 3 Categories by Product Count, as a two-column frame ('name', 'count')
top_categories_list = (
    df['primary_category']
    .value_counts()
    .nlargest(3)
    .rename_axis('name')
    .reset_index(name='count')
)
# One {'name': ..., 'count': ...} dict per category
top_categories_data = top_categories_list.to_dict(orient='records')

In [None]:
## Average Price by Material
# Mean price per material rounded to 2 decimals; only the 4 highest averages
# are kept, as a {material: average_price} dict.
avg_price_by_material = (
    df.groupby('material')['price_numeric']
    .mean()
    .round(2)
    .nlargest(4)
    .to_dict()
)
# Reasoning: Demonstrates grouping, aggregation, and sorting. This data is used for the BarChart in React.

# Top Brand Distribution: the 3 brands with the most products,
# as a two-column frame ('brand', 'count')
brand_distribution = (
    df['brand']
    .value_counts()
    .nlargest(3)
    .rename_axis('brand')
    .reset_index(name='count')
)
# One {'brand': ..., 'count': ...} dict per brand
brand_distribution_data = brand_distribution.to_dict(orient='records')

In [None]:
## Final Analytics JSON Output (To be used in FastAPI mock/DB) ---
# Bundle the metrics computed in the preceding cells into a single dict and
# print it as indented JSON. `json` is imported in the notebook's import cell
# rather than in the middle of this one.
analytics_output = {
    "total_products": total_products,
    "top_categories": top_categories_data,
    "avg_price_by_material": avg_price_by_material,
    "brand_distribution": brand_distribution_data
}

print("\n--- FINAL ANALYTICS OUTPUT (For FastAPI Dashboard) ---")
print(json.dumps(analytics_output, indent=4))