In [2]:
import pandas as pd
import numpy as np

def clean_products(
    df: pd.DataFrame,
    fill_zero_cols: list[str] | None = None,
    duplicate_subset: list[str] | None = None,
    numeric_cols: list[str] = ["price", "rating"]

) -> tuple[pd.DataFrame, dict]:
    """Return a cleaned copy of df + a short summary dict."""
    report = {
        "rows_before": len(df),
        "dups_before": int(df.duplicated(subset=duplicate_subset).sum()),
        "missing_before": int(df.isna().sum().sum()),
    }
    out = df.copy()

#1 format each column
    out.columns = (
        out.columns
          .str.strip()
          .str.lower()
          .str.replace(" ", "_", regex=False)
    )

#2) trim white space, and fill empty cells as "nan"
    obj_cols = out.select_dtypes(include=["object"]).columns
    for c in obj_cols:
        out[c] = out[c].astype(str).str.strip()
    out.replace({"": np.nan, "nan": np.nan, "None": np.nan}, inplace=True)

#3) fill nan cells with "0"
    if fill_zero_cols is None:
        out = out.fillna(0)
    else:
        out[fill_zero_cols] = out[fill_zero_cols].fillna(0)

#4) drop all column duplicates
    out = out.drop_duplicates(subset=duplicate_subset, keep="first")

#5) ensure that each rating point is between the range 0-5, because that is the scale that is used
    if "price" in out.columns:
        out = out[out["price"] >= 0]
    if "rating" in out.columns:
        out["rating"] = out["rating"].clip(0, 5)

    report.update({
        "rows_after": len(out),
        "dups_after": int(out.duplicated(subset=duplicate_subset).sum()),
        "missing_after": int(out.isna().sum().sum()),
        "columns": out.columns.tolist(),
    })

    return out, report

# Clean the raw product export and save the cleaned table as a new CSV file.
df = pd.read_csv("products.csv")
df_clean, summary = clean_products(df, duplicate_subset=["product_id"])
df_clean.to_csv("products_clean.csv", index=False)

print(summary)
print("\nPreview:")
print(df_clean.head())

# Offer the cleaned file as a browser download when running in Google Colab.
# Outside Colab the module is not installed; the CSV has already been written
# to disk above, so there is nothing further to do.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('products_clean.csv')


{'rows_before': 13156, 'dups_before': 1787, 'missing_before': 1078, 'rows_after': 11369, 'dups_after': 0, 'missing_after': 0, 'columns': ['product_id', 'brand', 'title', 'price', 'category', 'rating', 'image_url', 'product_url']}

Preview:
   product_id         brand  \
0  B08YRWN3WB      JANSPORT   
1  B08YRXFZZM      JANSPORT   
2  B09Q2PQ7ZB       BAODINI   
3  B001BEAWXY  Calvin Klein   
4  B09FM5PMN3      JANSPORT   

                                               title   price    category  \
0  Big Student Large laptop backpack Black EK0A5B...  189.00  New season   
1                                Superbreak Day Pack  119.00  New season   
2  Mini Travel Umbrella With Case Small Compact U...   17.79  New season   
3           Men's Cotton Classics 3-Pack Boxer Brief  119.00  New season   
4                           Unisex Kids Bag Backpack  179.00  New season   

   rating                                          image_url  \
0     4.7  https://m.media-amazon.com/images/I/51y2E

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>