# Preprocessing: OS Market Share Data from Steam Hardware Survey

This notebook loads the raw OS usage dataset, then filters and aggregates it into a clean monthly time series for Linux, macOS, and Windows. The output is saved as a CSV file in `data/processed/`.


In [None]:
import pandas as pd
import os

# Load the raw survey export.
df = pd.read_parquet("data/raw/steam_hw_survey.parquet")

# Keep only the rows of the "OS Version (total)" category; .copy() gives an
# independent frame so later changes cannot touch `df`.
df_total_os = df[df["category"] == "OS Version (total)"].copy()

# Reshape from long to wide: one row per date, one column per OS label.
df_os_share = df_total_os.pivot_table(
    index="date",
    columns="index",    # OS label (e.g., Windows, OSX, Linux)
    values="perc",      # Market share percentage
    aggfunc="sum",      # Rows sharing the same (date, OS label) are summed
)

# Forward fill any missing values: each gap takes the most recent earlier
# value in its column (gaps before a column's first value stay NaN).
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") has been
# deprecated since pandas 2.1 and raises a FutureWarning.
df_os_share = df_os_share.ffill()

# Make sure the output directory exists before writing the CSV.
os.makedirs("data/processed", exist_ok=True)
df_os_share.to_csv("data/processed/os_monthly_marketshare.csv")

print("Saved cleaned dataset to data/processed/os_monthly_marketshare.csv")
