# 01 - Exploration
Understand the dataset and perform an initial exploratory data analysis (EDA).

In [None]:
# Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Apply seaborn's "whitegrid" theme to all subsequent plots
sns.set_theme(style="whitegrid")

# Location of the raw CSV, given relative to the current working directory
path_parts = ('..', 'data', 'raw', 'vg_sales_2024.csv')
raw_path = os.path.join(*path_parts)
df = pd.read_csv(raw_path)

# Preview the first rows (last expression, so the notebook renders it)
df.head(n=5)

## Top 10 by Global Sales

Bar plots of the ten genres, platforms, and publishers with the highest summed `total_sales`.

In [None]:
# Aggregate total_sales by category and plot top 10
import numpy as np

# Single row of three side-by-side panels
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# Helper to plot the top categories of a column by summed total_sales
def plot_top10(ax, col, title, data=None, top_n=10):
    """Draw a horizontal bar chart of the highest-selling values of `col`.

    Rows are grouped by `col`, `total_sales` is summed per group, and the
    `top_n` largest groups are drawn on `ax`, largest first.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw the bars on.
    col : str
        Name of the column to group by.
    title : str
        Title shown above the panel.
    data : pandas.DataFrame, optional
        Frame holding `col` and `total_sales`. Defaults to the
        notebook-level `df`.
    top_n : int, optional
        Number of groups to show (default 10).
    """
    frame = df if data is None else data
    totals = (
        frame.groupby(col, dropna=False)["total_sales"]
        .sum()
        .sort_values(ascending=False)
        .head(top_n)
    )
    # dropna=False keeps rows with a missing `col` as their own group, but
    # seaborn discards NaN category labels, so that bar would silently vanish.
    # Give it a printable label; str() also keeps the axis categorical.
    labels = pd.Index(
        ["(missing)" if pd.isna(key) else str(key) for key in totals.index]
    )
    sns.barplot(x=totals.to_numpy(), y=labels, ax=ax, palette="viridis")
    ax.set_title(title)
    ax.set_xlabel("Total Sales")
    ax.set_ylabel("")

# (column, panel title) pairs, drawn left to right across the axes
panel_specs = (
    ("genre", "Top 10 Genres"),
    ("platform", "Top 10 Platforms"),
    ("publisher", "Top 10 Publishers"),
)
for ax, (col, title) in zip(axes, panel_specs):
    plot_top10(ax, col, title)

# Tidy the spacing between panels, then render the figure
fig.tight_layout()
plt.show()

## Correlation Heatmap

Pairwise Pearson correlations between the numeric columns, including `total_sales` and `critic_score` when present.

In [None]:
# Start from the columns pandas already parsed as numeric
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()

# total_sales / critic_score should be in the matrix whenever they exist.
# If one was read as text, merely listing it is not enough: corr(numeric_only=True)
# would drop it again. Coerce it to numbers (unparseable entries become NaN)
# on a copy, leaving `df` itself untouched.
coerced = {}
for must_have in ["total_sales", "critic_score"]:
    if must_have in df.columns and must_have not in numeric_cols:
        coerced[must_have] = pd.to_numeric(df[must_have], errors="coerce")
        numeric_cols.append(must_have)

corr_df = df[numeric_cols].assign(**coerced).corr(numeric_only=True)

# Explicit figure/axes so the heatmap is drawn on a known target
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_df,
    cmap="coolwarm",
    vmin=-1,  # pin the diverging scale to the full correlation range so the
    vmax=1,   # neutral colour always means "no correlation"
    annot=False,
    square=False,
    cbar=True,
    ax=corr_ax,
)
corr_ax.set_title("Correlation Heatmap (numeric features)")
corr_fig.tight_layout()
plt.show()