# **Amazon Metadata Dataset**

In [0]:
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import missingno as msno
import pandas as pd
from IPython.display import HTML, display
# import delta_sharing

In [0]:
## Outside Databricks: uncomment the section below (and the `delta_sharing`
## import) to load the table through Delta Sharing instead.
## NOTE(review): the `spark` lines at the bottom presumably need to be commented
## out in that case, since `spark` is not defined in this notebook -- confirm.
# get data
# profile_file = "config.share"
# client = delta_sharing.SharingClient(profile_file)
# print("Available tables:")
# pprint(client.list_all_tables())

# table_url = profile_file + "#share__products.silver.amazon_metadata_silver"
# df = delta_sharing.load_as_pandas(table_url, limit=200_000)

# Read at most 200_000 rows of the table and convert them to a pandas DataFrame.
metadata_sample = spark.table("products.silver.amazon_metadata_silver").limit(200_000)
df = metadata_sample.toPandas()

In [0]:
# Report (rows, columns) of the frame, then preview its first rows.
print(f"Dataset shape {df.shape}")

df.head()

**Insights**

- The columns that look like lists are actually numpy arrays. Why do they come in this format?

In [0]:
# Show the dtype pandas reports for each column of df.
df.dtypes

---

In [0]:
# Draw missingno's matrix plot for df to visualise where values are missing.
# The trailing ';' keeps the returned object's repr out of the cell output.
msno.matrix(df);

---

In [0]:
# Count how many rows share each "asin" value (counts above 1 indicate repeats).
df["asin"].value_counts()

In [0]:
# A row is flagged as a duplicate when its "asin" already appeared in an earlier
# row (keep="first" leaves the first occurrence unflagged).
num_dup = df.duplicated("asin", keep="first").sum()

# The rows of this frame are products keyed by "asin", so the label says
# "products" rather than "reviews".
print(f"Percentage of duplicated products {num_dup / len(df) * 100:.2f}%")

---

In [0]:
# Distinct array shapes found in the also_buy column; None marks a missing value.
def _shape_or_none(value):
    """Return ``value.shape``, or None when ``value`` is None."""
    return None if value is None else value.shape


df["also_buy"].map(_shape_or_none).unique()

**Insights**
- The values in the `also_buy` column are 1-dimensional numpy arrays, so they might be better represented as Python lists.

In [0]:
# Flag the products whose "also_buy" value holds at least one entry.
# Missing values are dropped before measuring length and then filled back in as
# False, so True in the table means "has entries" -- matching the printed label
# (the previous isnull() table had True meaning "missing").
also_buy_sizes = df["also_buy"].dropna().map(len)
has_also_buy = also_buy_sizes.gt(0).reindex(df.index, fill_value=False)

print("Products with 'also_buy' attr greater than 0:")
# Percentage of all rows per flag value.
(has_also_buy.value_counts() / len(df) * 100).reset_index()

In [0]:
# Keep only the rows whose also_buy value is present, then display them.
also_buy_present = df["also_buy"].notna()
also_buy_no_null = df.loc[also_buy_present, "also_buy"]
also_buy_no_null

---

In [0]:
# Number of rows per brand, returned as a frame (brand value next to its count).
df["brand"].value_counts().reset_index()

**Insights**
- These counts might not be representative, as this is only a small subset of the whole dataset.

In [0]:
# Flag the products whose "also_view" value holds at least one entry.
# Missing values are dropped before measuring length and then filled back in as
# False, so True in the table means "has entries" -- matching the printed label
# (the previous isnull() table had True meaning "missing").
also_view_sizes = df["also_view"].dropna().map(len)
has_also_view = also_view_sizes.gt(0).reindex(df.index, fill_value=False)

print("Products with 'also_view' attr greater than 0:")
# Percentage of all rows per flag value.
(has_also_view.value_counts() / len(df) * 100).reset_index()

---

In [0]:
# Tally how often each category label occurs across the non-null "category" values.
categories = Counter()

for product_categories in df["category"].dropna():
    # Each value is converted with .tolist() before being added to the tally.
    categories.update(product_categories.tolist())

print("Number of categories:", len(categories))

In [0]:
# Show the ten labels with the highest tallies, most frequent first.
print("Most common categories:")

top_categories = categories.most_common(10)
top_categories

---

In [0]:
# Length (via len) of every non-null description value.
descriptions_present = df["description"].dropna()
descriptions_present.apply(len)

In [0]:
# Print the description of one randomly chosen product that has one.
with_description = df.dropna(subset=["description"])
row = with_description.sample(1).iloc[0]
description = row["description"]
print(description)

**Insights**
- Some products don't have a description.

---

In [0]:
# Print the feature list of one randomly chosen product.
# Only "feature" has to be present: a plain dropna() would also discard rows with
# a missing value in any other column, shrinking (or emptying) the pool that
# sample() draws from.
for _, row in df.dropna(subset=["feature"]).sample(1).iterrows():
    features = row["feature"]
    print("Number of features:", len(features))
    print(features)

**Insights**
- Some `feature` values are in HTML format.

---

In [0]:
# Number of rows per main_category value, returned as a frame.
df["main_category"].value_counts().reset_index()

---

In [0]:
# Distribution of title lengths, showing the five shortest lengths.
# Null titles are dropped first because len() raises TypeError on a missing
# value; this mirrors how the description lengths are computed.
df["title"].dropna().map(len).value_counts().sort_index().to_frame().head()

In [0]:
# Print the titles of ten randomly chosen products.
# Only "title" has to be present: a plain dropna() would also discard rows with a
# missing value in any other column, and sample(10) raises when fewer than ten
# rows remain.
for _, row in df.dropna(subset=["title"]).sample(10).iterrows():
    title = row["title"]
    print(title)

---

In [0]:
# Show only the non-null entries of the similar_items column.
df["similar_items"].dropna()