# Which features can better describe the product?

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Both splits live under the same parsed/<version> directory of the bucket.
# Keeping that directory in one place prevents the train and eval paths from
# drifting to different dataset versions when the version is bumped.
DATASET_VERSION = "202104130952"
DATASET_DIR     = f"gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/{DATASET_VERSION}"

TRAIN_PATH = f"{DATASET_DIR}/train.csv"         # training split
EVAL_PATH  = f"{DATASET_DIR}/test/evalset.csv"  # evaluation split (stored under test/)

In [3]:
# Read the training split first, then the evaluation split, each into its own frame.
df_train, df_eval = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, EVAL_PATH))

In [5]:
# Count the missing values in each of the three product-category columns.
(
    df_train
    .loc[:, ["Product_Category_1", "Product_Category_2", "Product_Category_3"]]
    .isnull()
    .sum()
)

Product_Category_1        0
Product_Category_2    24621
Product_Category_3    54487
dtype: int64

"Product_Category_1" is the only one that doesn't have null values.  
Let's try to define the product by the combination of its own features, which will only be "Product_Category_1" for now.  
The user will still be defined as the combination of its own features, as in part 1.

In [7]:
# Columns whose combination is used to define a user.
USER_FEATURES = [
    "Gender",
    "Age",
    "Occupation",
    "City_Category",
    "Stay_In_Current_City_Years",
    "Marital_Status",
]
# Columns whose combination is used to define a product (a single column for now).
PRODUCT_FEATURES = ["Product_Category_1"]

## How well do user features map to product features?

In [10]:
# SELF NOTE: only the first entry of PRODUCT_FEATURES is compared, so this
# does not generalize to multiple product features!
train_products, eval_products = (
    frame[PRODUCT_FEATURES[0]].unique() for frame in (df_train, df_eval)
)
# Product values that occur in both the training and the evaluation split.
intersection = np.intersect1d(train_products, eval_products)
{
    "train_products_number": len(train_products),
    "eval_products_number": len(eval_products),
    "intersection_products_number": len(intersection),
}

{'train_products_number': 20,
 'eval_products_number': 20,
 'intersection_products_number': 20}

In [35]:
# For each combination of user features, collect the
# [Product_Category_1, Product_Category_2] pair of every row in that group.
example = (
    df_train
    .groupby(USER_FEATURES)[["Product_Category_1", "Product_Category_2"]]
    .apply(lambda group: group.values.tolist())
)

In [36]:
# Show the product pairs collected for the first user-feature combination.
print(example.iloc[0])

[[5.0, nan], [5.0, nan], [8.0, nan], [5.0, nan], [2.0, 4.0], [8.0, nan], [5.0, 8.0], [9.0, 15.0], [8.0, nan], [9.0, 15.0], [2.0, 5.0], [5.0, 8.0], [5.0, 8.0], [5.0, 14.0], [5.0, nan], [8.0, nan], [8.0, nan], [8.0, 14.0], [5.0, nan], [8.0, 14.0], [11.0, 15.0], [1.0, 2.0], [8.0, nan], [8.0, nan], [3.0, 13.0], [8.0, nan], [5.0, 8.0], [6.0, 8.0], [8.0, nan], [3.0, 12.0], [8.0, nan], [1.0, 2.0], [8.0, nan], [8.0, nan], [8.0, nan], [5.0, 14.0], [8.0, nan], [5.0, 9.0], [8.0, nan], [3.0, 4.0], [1.0, 4.0], [8.0, 16.0], [8.0, 13.0], [2.0, 4.0], [8.0, 15.0], [4.0, 15.0], [2.0, 9.0], [5.0, nan], [8.0, 16.0], [8.0, nan], [1.0, 2.0]]


In [None]:
# For each combination of user features, list the Product_Category_1 value of
# every row in that group.
# NOTE(review): the draft indexed each group with the empty column name "",
# which raises KeyError because no such column exists. "Product_Category_1" is
# assumed to be the intended column since it is the only entry in
# PRODUCT_FEATURES — confirm before relying on this cell.
example = df_train.groupby(USER_FEATURES)["Product_Category_1"].apply(list)