In [1]:
import os
import numpy as np
import pandas as pd 

In [2]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

In [3]:
# Local directory that holds the raw dataset files. The path is relative,
# so it is resolved against the kernel's current working directory.
BASE_DATASET = "../../data/mlspec-blackfriday/dataset/raw"
# Create the directory (and any missing parents); exist_ok=True keeps the
# cell safe to re-run when the directory is already there.
os.makedirs(BASE_DATASET, exist_ok=True)

In [4]:
# Recursively copy every object under the bucket's dataset/raw/ prefix into
# the local BASE_DATASET directory. IPython expands $BASE_DATASET from the
# Python variable before handing the command to the shell.
# NOTE(review): presumably requires gsutil on PATH and credentials with read
# access to the bucket - confirm for the target environment.
!gsutil cp -r gs://mlteam-ml-specialization-2021-blackfriday/dataset/raw/* $BASE_DATASET/

Copying gs://mlteam-ml-specialization-2021-blackfriday/dataset/raw/README.md...
Copying gs://mlteam-ml-specialization-2021-blackfriday/dataset/raw/test.csv...
Copying gs://mlteam-ml-specialization-2021-blackfriday/dataset/raw/train.csv...

Operation completed over 3 objects/33.5 MiB.                                     


In [4]:
# Path of the training CSV inside the local raw-data directory.
TRAIN_SET = os.path.join(BASE_DATASET, 'train.csv')
# Load the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(TRAIN_SET)

In [5]:
# Show the column labels of the loaded training frame.
df.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [6]:
# User_ID plus the other columns used together as the group-by keys for the
# per-user aggregation below.
USER_COLUMNS = [
    'User_ID',
    'Gender',
    'Age',
    'Occupation',
    'City_Category',
    'Stay_In_Current_City_Years',
    'Marital_Status',
]

In [44]:
# Aggregate the purchase rows into one row per distinct combination of the
# USER_COLUMNS values. Named aggregation yields flat column labels directly
# (Product_ID_count, Purchase_count, Purchase_sum, Purchase_mean,
# Purchase_max), and reset_index() turns the group keys back into ordinary
# columns.
df_totals = (
    df.groupby(USER_COLUMNS)
    .agg(
        Product_ID_count=('Product_ID', 'count'),
        Purchase_count=('Purchase', 'count'),
        Purchase_sum=('Purchase', 'sum'),
        Purchase_mean=('Purchase', 'mean'),
        Purchase_max=('Purchase', 'max'),
    )
    .reset_index()
)

In [56]:
# Display the Facets Dive visualization for the aggregated totals in df_totals.
# `display` and `HTML` come from the public `IPython.display` module: importing
# `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import HTML, display

# Serialise the frame as a JSON array with one object per row; it is assigned
# to the <facets-dive> element's `data` property by the script below.
jsonstr = df_totals.to_json(orient='records')
# `{jsonstr}` is the only str.format placeholder in the template, so no other
# literal braces may be added to it without doubling them.
HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
        <facets-dive id="elem" height="600"></facets-dive>
        <script>
          var data = {jsonstr};
          document.querySelector("#elem").data = data;
        </script>"""
html = HTML_TEMPLATE.format(jsonstr=jsonstr)
display(HTML(html))