<img src="https://i.imgur.com/eCtgnYS.png">

<center><h1>- Data Exploration -</h1></center>

>  🛍️ **Competition Goal:** For each customer within the training data we need to predict up to 12 products that the customer will buy in the next 7-day period *after* the training time period. We can predict up to *12 products* that the customer will likely be purchasing in the 7-day period.

### ⬇ Libraries

In [None]:
# Libraries
import os
import gc
import wandb
import time
import random
import math
import glob
from scipy import spatial
from tqdm import tqdm
import warnings
import cv2
import pandas as pd
import numpy as np
from numpy import dot, sqrt
import seaborn as sns
import matplotlib as mpl
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
from IPython.display import display_html
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
# Use a larger default font for every matplotlib figure in the notebook
plt.rcParams.update({'font.size': 16})

# Environment check
# Silence all Python warnings and the W&B console output
warnings.filterwarnings("ignore")
os.environ["WANDB_SILENT"] = "true"
# Configuration dictionary attached to every W&B run started below
CONFIG = {'competition': 'HandM', '_wandb_kernel': 'aot'}

# Custom colors
class clr:
    '''ANSI escape sequences used to highlight printed text.'''
    # S (start): bold followed by bright magenta foreground
    S = '\033[1m\033[95m'
    # E (end): reset all text attributes
    E = '\033[0m'
    
# Color palette shared by the plots of this notebook
my_colors = ["#AF0848", "#E90B60", "#CB2170", "#954E93", "#705D98", "#5573A8", "#398BBB", "#00BDE3"]
print(clr.S+"Notebook Color Scheme:"+clr.E)
# Preview the palette as a strip of color swatches
sns.palplot(sns.color_palette(my_colors))
plt.show()

# Background picture that is drawn behind the bar charts further below
bk_image = plt.imread("../input/hm-fashion-recommender-dataset/background.jpg")

### 🐝 W&B Fork & Run

In order to run this notebook you will need to input your own **secret API key** within the `! wandb login $secret_value_0` line. 

🐝**How do you get your own API key?**

Super simple! Go to **https://wandb.ai/site** -> Login -> Click on your profile in the top right corner -> Settings -> Scroll down to API keys -> copy your very own key (for more info check [this amazing notebook for ML Experiment Tracking on Kaggle](https://www.kaggle.com/ayuraj/experiment-tracking-with-weights-and-biases)).

<center><img src="https://i.imgur.com/fFccmoS.png" width=500></center>

In [None]:
# 🐝 Secrets
# Read the W&B API key that is stored under the "wandb" label in Kaggle secrets
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb")

# Log in to W&B through its command line tool using the retrieved key
! wandb login $secret_value_0

### ⬇ Helper Functions

In [None]:
def adjust_id(x):
    '''Returns the article ID as a string, restoring a dropped leading "0".

    x: the article ID (anything accepted by str())
    returns: str(x); IDs that are exactly 9 characters long get a "0"
             prepended, every other length is returned unchanged'''
    article_id = str(x)
    return "0" + article_id if len(article_id) == 9 else article_id


def insert_image(path, zoom, xybox, ax):
    '''Draws the image stored at `path` on a matplotlib axes.

    path: location of the image file to read
    zoom: scaling factor applied to the image
    xybox: position of the image box, forwarded to AnnotationBbox
    ax: the matplotlib axes that receives the image'''
    picture = mpimg.imread(path)
    # The annotated point is fixed at (0.5, 0.7); only the box position varies
    annotation = AnnotationBbox(OffsetImage(picture, zoom=zoom),
                                xy=(0.5, 0.7), xybox=xybox,
                                frameon=False, pad=1)
    ax.add_artist(annotation)
    
    
def show_values_on_bars(axs, h_v="v", space=0.4):
    '''Writes the value of every bar at the end of the bars of a seaborn barplot.

    axs: a single matplotlib ax or a numpy array of axes
    h_v: whether the barplot is vertical ("v") or horizontal ("h");
         any other value leaves the plot untouched
    space: horizontal gap between the end of a bar and its label
           (only used when h_v == "h")

    Bars whose length is NaN or infinite are skipped, because such a length
    cannot be converted to an integer label.'''

    def _show_on_single_plot(ax):
        for p in ax.patches:
            if h_v == "v":
                # Vertical bars: the value is the height, centered on the bar
                length = p.get_height()
                _x = p.get_x() + p.get_width() / 2
                ha = "center"
            elif h_v == "h":
                # Horizontal bars: the value is the width, placed after the bar
                length = p.get_width()
                _x = p.get_x() + p.get_width() + float(space)
                ha = "left"
            else:
                return

            if not np.isfinite(length):
                continue

            _y = p.get_y() + p.get_height()
            # Thousands separator, e.g. 1234 -> "1,234"
            ax.text(_x, _y, format(int(length), ','), ha=ha)

    if isinstance(axs, np.ndarray):
        for idx, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)


# === 🐝 W&B ===
def save_dataset_artifact(run_name, artifact_name, path):
    '''Saves dataset to W&B Artifactory.
    run_name: name of the experiment
    artifact_name: under what name should the dataset be stored
    path: path to the dataset

    A new W&B run is started for the upload and is always finished again,
    even when adding or logging the file fails. The success message is only
    printed when the upload went through.'''

    wandb.init(project='HandM',
               name=run_name,
               config=CONFIG)
    try:
        artifact = wandb.Artifact(name=artifact_name,
                                  type='dataset')
        artifact.add_file(path)

        wandb.log_artifact(artifact)
    finally:
        # Never leave the run open: a dangling run would swallow the logs of
        # the next experiment started in the notebook
        wandb.finish()
    print("Artifact has been saved successfully.")
    
    
def create_wandb_plot(x_data=None, y_data=None, x_name=None, y_name=None, title=None, log=None, plot="line"):
    '''Builds a W&B table from two series and logs it as a chart.
    x_data & y_data: Pandas Series containing x & y data
    x_name & y_name: strings containing axis names
    title: title of the graph
    log: string containing name of log
    plot: chart type - "line", "bar" or "scatter"; nothing is logged
          for any other value'''

    rows = [list(pair) for pair in zip(x_data, y_data)]
    table = wandb.Table(data=rows, columns=[x_name, y_name])

    if plot not in ("line", "bar", "scatter"):
        return

    # The chart type matches the name of the wandb.plot function
    make_chart = getattr(wandb.plot, plot)
    wandb.log({log: make_chart(table, x_name, y_name, title=title)})
        
        
def create_wandb_hist(x_data=None, x_name=None, title=None, log=None):
    '''Logs a histogram of a single series to W&B.
    x_data: Pandas Series containing x values
    x_name: string containing the axis name
    title: title of the graph
    log: string containing name of log'''

    # wandb.Table expects one list per row, hence the single-element rows
    table = wandb.Table(data=[[value] for value in x_data], columns=[x_name])
    histogram = wandb.plot.histogram(table, x_name, title=title)
    wandb.log({log: histogram})
    
    
# 🐝 Log Cover Photo
# Start a dedicated run, upload the cover picture under the "example" key
# and close the run again
run = wandb.init(project='HandM', name='CoverPhoto', config=CONFIG)
cover = plt.imread("../input/hm-fashion-recommender-dataset/pics/Kaggle Covers.png")
wandb.log({"example": wandb.Image(cover)})
wandb.finish()

# 1. Dataset

🛍️ **There are 3 metadata .csv files and 1 image folder:**
* `images` - folder containing the photo of *almost* all `article_ids`
* `articles.csv` - description features of all `article_ids` **(105,542 datapoints)**
* `customers.csv` - description features of the customer profiles **(1,371,980 datapoints)**
* `transactions_train.csv` - file containing the `customer_id`, the article that was bought and at what price **(31,788,324 datapoints)**

In [None]:
%%time

# Read in the data
# All four competition files are loaded with the pandas defaults
articles = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
customers = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/customers.csv")
transactions = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")
ss = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")

In [None]:
# Print the shape and show the first 3 rows of every dataframe
print(clr.S+"ARTICLES:"+clr.E, articles.shape)
display_html(articles.head(3))
print("\n", clr.S+"CUSTOMERS:"+clr.E, customers.shape)
display_html(customers.head(3))
print("\n", clr.S+"TRANSACTIONS:"+clr.E, transactions.shape)
display_html(transactions.head(3))
print("\n", clr.S+"SAMPLE_SUBMISSION:"+clr.E, ss.shape)
display_html(ss.head(3))

# 2. Articles

## I. Preprocessing

🛍️ **Important Notes**:
* There are *more* `article_ids` than actual images:
    * unique article ids: 105,542
    * unique images: 105,100
* Checking every image path one by one was taking too long, so the fastest way to do it (takes about 1 second) was to collect all article ids found within the `images` folder into a `set()` (remember, membership tests on a `set` are much faster than on a `list`), and then to set the `path` to `None` for every article within the `articles.csv` file that has no image.
* There are only 416 missing values within the `detail_desc` column - product description

In [None]:
# 🐝 W&B Experiment
# Everything logged in the "Articles" section goes to this run,
# until wandb.finish() is called
run = wandb.init(project='HandM', name='Articles', config=CONFIG)

In [None]:
# Count the missing descriptions on the column itself instead of taking the
# last element of the per-column counts by position
print(clr.S+"There are no missing values in any columns but 'Detail Description':"+clr.E,
      articles["detail_desc"].isna().sum(), "total missing values")

# Replace missing values
articles.fillna(value="No Description", inplace=True)

# Adjust the article ID and product code to be string & add "0"
articles["article_id"] = articles["article_id"].apply(adjust_id)
# NOTE: `product_code` is overwritten with the first 3 characters of the
# padded article ID, which is what the image path is built from below
articles["product_code"] = articles["article_id"].str[:3]

In [None]:
# Get all paths from the image folder
# (the pattern matches every file inside every sub-folder of `images`)
all_image_paths = glob.glob(f"../input/h-and-m-personalized-fashion-recommendations/images/*/*")

print(clr.S+"Number of unique article_ids within articles.csv:"+clr.E, len(articles), "\n"+
      clr.S+"Number of unique images within the image folder:"+clr.E, len(all_image_paths), "\n"+
      clr.S+"=> not all article_ids have a corresponding image!!!"+clr.E, "\n")

# 🐝 Log Distinct article IDs
wandb.log({"article_ids":len(articles)})

# Get all valid article ids
# Create a set() - as it moves faster than a list
all_image_ids = set()

for path in tqdm(all_image_paths):
    # The article id is the file name without its folder and extension
    article_id = path.split('/')[-1].split('.')[0]
    all_image_ids.add(article_id)

In [None]:
# An image path example: ../input/h-and-m-personalized-fashion-recommendations/images/010/0108775015.jpg

# Create full path to the article image
# (<images folder>/<first 3 characters of the id>/<article id>.jpg)
images_path = "../input/h-and-m-personalized-fashion-recommendations/images/"
articles["path"] = images_path + articles["product_code"] + "/" + articles["article_id"] + ".jpg"

# Adjust the incorrect paths and set them to None
# NOTE: the position `k` from enumerate() is used as a row label in .loc, so
# this relies on `articles` still having the default 0..n-1 index
for k, article_id in tqdm(enumerate(articles["article_id"])):
    if article_id not in all_image_ids:
        articles.loc[k, "path"] = None

## II. Explore

In [None]:
print(clr.S+"Total Number of unique Product Names:"+clr.E, articles["prod_name"].nunique())

# Data
# Name both columns explicitly ("index" = product name, "prod_name" = count):
# the default names produced by value_counts().reset_index() differ between
# pandas versions, which would break the column look-ups below.
prod_name = articles["prod_name"].value_counts().rename_axis("index")\
                                 .reset_index(name="prod_name").head(15)
total_prod_names = articles["prod_name"].nunique()
# Highlight the most frequent name (the maximum is computed only once)
top_count = prod_name["prod_name"].max()
clrs = ["#CB2170" if x==top_count else '#954E93' for x in prod_name["prod_name"]]

# Get images
# One example image per product name; groupby sorts the names alphabetically,
# so the image grid is not in the same order as the bars
prod_name_images = articles[articles["prod_name"].isin(prod_name["index"].tolist())].groupby("prod_name")["path"].first().reset_index()
image_paths = prod_name_images["path"].tolist()
image_names = prod_name_images["prod_name"].tolist()

# Plot
fig, ax = plt.subplots(figsize=(25, 13))
plt.title('- Most Frequent Product Names -', size=22, weight="bold")

sns.barplot(data=prod_name, x="prod_name", y="index", ax=ax,
            palette=clrs)
# Stretch the background picture over the whole plotting area
x0,x1 = ax.get_xlim()
y0,y1 = ax.get_ylim()
plt.imshow(bk_image, zorder=0, extent=[x0, x1, y0, y1], alpha=0.35, aspect='auto')

show_values_on_bars(axs=ax, h_v="h", space=0.4)
plt.ylabel("Product Name", size = 16, weight="bold")
plt.xlabel("")
plt.xticks([])
plt.yticks(size=16)
plt.tick_params(size=16)

insert_image(path='../input/hm-fashion-recommender-dataset/pics/dragonfly.jpg', zoom=0.45, xybox=(92, 11), ax=ax)

sns.despine(left=True, bottom=True)
plt.show();

print("\n")

# Plot
# 3 x 5 grid: one example image for each of the 15 product names
fig, axs = plt.subplots(3, 5, figsize=(23, 8))
fig.suptitle('- Example Images -', size=22, weight="bold")
axs = axs.flatten()

for k, (path, name) in enumerate(zip(image_paths, image_names)):
    axs[k].set_title(f"{name}", size = 16)
    img = plt.imread(path)
    axs[k].imshow(img)
    axs[k].axis("off")

plt.tight_layout()
plt.show()

In [None]:
# 🐝 Log Barplot to W&B
# "index" holds the product names and "prod_name" their frequencies
create_wandb_plot(x_data=prod_name["index"], y_data=prod_name["prod_name"],
                  x_name="Product Name", y_name="Frequency", 
                  title="- Most Frequent Product Names -", log="prod_name", plot="bar")

---

In [None]:
print(clr.S+"Total Number of unique Product Types:"+clr.E, articles["product_type_name"].nunique())

# Data
# Name both columns explicitly ("index" = product type, "product_type_name" =
# count): the default names produced by value_counts().reset_index() differ
# between pandas versions, which would break the column look-ups below.
prod_type = articles["product_type_name"].value_counts().rename_axis("index")\
                                         .reset_index(name="product_type_name").head(15)
total_prod_types = articles["product_type_name"].nunique()
# Highlight the most frequent type (the maximum is computed only once)
top_count = prod_type["product_type_name"].max()
clrs = ["#00BDE3" if x==top_count else '#398BBB' for x in prod_type["product_type_name"]]

# Get images
# One example image per product type; groupby sorts the types alphabetically,
# so the image grid is not in the same order as the bars
prod_type_images = articles[articles["product_type_name"].isin(prod_type["index"].tolist())].groupby("product_type_name")["path"].first().reset_index()
image_paths = prod_type_images["path"].tolist()
image_names = prod_type_images["product_type_name"].tolist()

# Plot
fig, ax = plt.subplots(figsize=(25, 13))
plt.title('- Most Frequent Product Types -', size=22, weight="bold")

sns.barplot(data=prod_type, x="product_type_name", y="index", ax=ax,
            palette=clrs)
# Stretch the background picture over the whole plotting area
x0,x1 = ax.get_xlim()
y0,y1 = ax.get_ylim()
plt.imshow(bk_image, zorder=0, extent=[x0, x1, y0, y1], alpha=0.35, aspect='auto')

show_values_on_bars(axs=ax, h_v="h", space=0.4)
plt.ylabel("Product Type", size = 16, weight="bold")
plt.xlabel("")
plt.xticks([])
plt.yticks(size=16)
plt.tick_params(size=16)

insert_image(path='../input/hm-fashion-recommender-dataset/pics/blue.jpg', zoom=0.45, xybox=(11000, 11), ax=ax)

sns.despine(left=True, bottom=True)
plt.show();

print("\n")

# Plot
# 3 x 5 grid: one example image for each of the 15 product types
fig, axs = plt.subplots(3, 5, figsize=(23, 8))
fig.suptitle('- Example Images -', size=22, weight="bold")
axs = axs.flatten()

for k, (path, name) in enumerate(zip(image_paths, image_names)):
    axs[k].set_title(f"{name}", size = 16)
    img = plt.imread(path)
    axs[k].imshow(img)
    axs[k].axis("off")

plt.tight_layout()
plt.show()

In [None]:
# 🐝 Log Barplot to W&B
# "index" holds the product types and "product_type_name" their frequencies
create_wandb_plot(x_data=prod_type["index"], y_data=prod_type["product_type_name"],
                  x_name="Product Type", y_name="Frequency", 
                  title="- Most Frequent Product Types -", log="prod_type", plot="bar")

---

In [None]:
print(clr.S+"Total Number of unique Product Group:"+clr.E, articles["product_group_name"].nunique())

# Data
# Name both columns explicitly ("index" = product group, "product_group_name"
# = count): the default names produced by value_counts().reset_index() differ
# between pandas versions, which would break the column look-ups below.
prod_group = articles["product_group_name"].value_counts().rename_axis("index")\
                                           .reset_index(name="product_group_name")
total_prod_groups = articles["product_group_name"].nunique()
# Highlight the most frequent group (the maximum is computed only once)
top_count = prod_group["product_group_name"].max()
clrs = ["#E90B60" if x==top_count else '#AF0848' for x in prod_group["product_group_name"]]

# Get images
# One example image per product group, sorted alphabetically by groupby
prod_group_images = articles[articles["product_group_name"].isin(prod_group["index"].tolist())].groupby("product_group_name")["path"].first().reset_index()
image_paths = prod_group_images["path"].tolist()
image_names = prod_group_images["product_group_name"].tolist()

# Plot
fig, ax = plt.subplots(figsize=(25, 13))
plt.title('- Most Frequent Product Groups -', size=22, weight="bold")

sns.barplot(data=prod_group, x="product_group_name", y="index", ax=ax,
            palette=clrs)
# Stretch the background picture over the whole plotting area
x0,x1 = ax.get_xlim()
y0,y1 = ax.get_ylim()
plt.imshow(bk_image, zorder=0, extent=[x0, x1, y0, y1], alpha=0.35, aspect='auto')

show_values_on_bars(axs=ax, h_v="h", space=0.4)
plt.ylabel("Product Group", size = 16, weight="bold")
plt.xlabel("")
plt.xticks([])
plt.yticks(size=16)
plt.tick_params(size=16)

insert_image(path='../input/hm-fashion-recommender-dataset/pics/chloe.jpg', zoom=0.45, xybox=(40000, 14), ax=ax)

sns.despine(left=True, bottom=True)
plt.show();

print("\n")

# Plot
fig, axs = plt.subplots(4, 6, figsize=(23, 10))
fig.suptitle('- Example Images -', size=22, weight="bold")
axs = axs.flatten()

for k, (path, name) in enumerate(zip(image_paths, image_names)):
    axs[k].set_title(f"{name}", size = 16)
    img = plt.imread(path)
    axs[k].imshow(img)
    axs[k].axis("off")

# Hide exactly the grid cells that received no image, instead of a
# hard-coded number of trailing cells
for unused_ax in axs[len(image_paths):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# 🐝 Log Barplot to W&B
# "index" holds the product groups and "product_group_name" their frequencies
create_wandb_plot(x_data=prod_group["index"], y_data=prod_group["product_group_name"],
                  x_name="Product Group", y_name="Frequency", 
                  title="- Most Frequent Product Group -", log="prod_group", plot="bar")

---

In [None]:
def change_color(x):
    '''Reduces a colour name to its last word when it carries a modifier.

    x: the colour name (string)
    returns: the part after the last space if the lower-cased name contains
             one of the modifier keywords, otherwise x unchanged'''
    modifiers = ("light", "dark", "greyish", "yellowish",
                 "greenish", "off", "other")
    lowered = x.lower().strip()

    # Substring match: a keyword anywhere inside the name counts
    if any(keyword in lowered for keyword in modifiers):
        return x.split(" ")[-1]

    return x

# Simplify every colour group name in place (e.g. modifiers such as "Light" are dropped)
articles["colour_group_name"] = articles["colour_group_name"].apply(lambda x: change_color(x))

In [None]:
# Appearance and color
print(clr.S+"Total Number of unique Product Appearances:"+clr.E, articles["graphical_appearance_name"].nunique())
print(clr.S+"Total Number of unique Product Colors (after preprocess):"+clr.E, articles["colour_group_name"].nunique())

# --- Data 1 ---
# Name both columns explicitly ("index" = category, second column = count):
# the default names produced by value_counts().reset_index() differ between
# pandas versions, which would break the column look-ups below.
prod_appearance = articles["graphical_appearance_name"].value_counts().rename_axis("index")\
                      .reset_index(name="graphical_appearance_name").head(15)
total_prod_appearances = articles["graphical_appearance_name"].nunique()
# Highlight the most frequent appearance (the maximum is computed only once)
top_appearance_count = prod_appearance["graphical_appearance_name"].max()
clrs1 = ["#AF0848" if x==top_appearance_count else '#E90B60' for x in prod_appearance["graphical_appearance_name"]]


# Get images
prod_appearance_images = articles[articles["graphical_appearance_name"].isin(prod_appearance["index"].tolist())].groupby("graphical_appearance_name")["path"].first().reset_index()
image_paths1 = prod_appearance_images["path"].tolist()
image_names1 = prod_appearance_images["graphical_appearance_name"].tolist()

# --- Data 2 ---
prod_color = articles["colour_group_name"].value_counts().rename_axis("index")\
                 .reset_index(name="colour_group_name").head(15)
total_prod_color = articles["colour_group_name"].nunique()
# Highlight the most frequent color (the maximum is computed only once)
top_color_count = prod_color["colour_group_name"].max()
clrs2 = ["#CB2170" if x==top_color_count else '#954E93' for x in prod_color["colour_group_name"]]

# Get images
prod_color_images = articles[articles["colour_group_name"].isin(prod_color["index"].tolist())].groupby("colour_group_name")["path"].first().reset_index()
image_paths2 = prod_color_images["path"].tolist()
image_names2 = prod_color_images["colour_group_name"].tolist()

# Plot
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(25, 13))

ax1.set_title('- Most Frequent Product Appearances -', size=22, weight="bold")
# The appearance chart uses its own palette (clrs1); clrs2 belongs to the
# color chart on the right
sns.barplot(data=prod_appearance, x="graphical_appearance_name", y="index", ax=ax1,
            palette=clrs1)
x0,x1 = ax1.get_xlim()
y0,y1 = ax1.get_ylim()
ax1.imshow(bk_image, zorder=0, extent=[x0, x1, y0, y1], alpha=0.35, aspect='auto')

show_values_on_bars(axs=ax1, h_v="h", space=0.4)
ax1.set_ylabel("Product Appearance", size = 16, weight="bold")
ax1.set_xlabel("")
ax1.set_xticks([])
# ax1.set_yticks(size=16)
# ax1.set_tick_params(size=16)

# insert_image(path='../input/hm-fashion-recommender-dataset/pics/blue.jpg', zoom=0.45, xybox=(11000, 11), ax=ax1)


ax2.set_title('- Most Frequent Product Colors -', size=22, weight="bold")
sns.barplot(data=prod_color, x="colour_group_name", y="index", ax=ax2,
            palette=clrs2)
x0,x1 = ax2.get_xlim()
y0,y1 = ax2.get_ylim()
ax2.imshow(bk_image, zorder=0, extent=[x0, x1, y0, y1], alpha=0.35, aspect='auto')

show_values_on_bars(axs=ax2, h_v="h", space=0.4)
ax2.set_ylabel("Product Colors", size = 16, weight="bold")
ax2.set_xlabel("")
ax2.set_xticks([])
# ax1.set_yticks(size=16)
# ax1.set_tick_params(size=16)

# insert_image(path='../input/hm-fashion-recommender-dataset/pics/blue.jpg', zoom=0.45, xybox=(11000, 11), ax=ax1)

sns.despine(left=True, bottom=True)
plt.show();

print("\n")

# Plot
# One example image per appearance (alphabetical order from groupby)
fig, axs = plt.subplots(3, 5, figsize=(23, 8))
fig.suptitle('- Example Images [Appearance] -', size=22, weight="bold")
axs = axs.flatten()

for k, (path, name) in enumerate(zip(image_paths1, image_names1)):
    axs[k].set_title(f"{name}", size = 16)
    img = plt.imread(path)
    axs[k].imshow(img)
    axs[k].axis("off")

plt.tight_layout()
plt.show()

# Plot
# One example image per color (alphabetical order from groupby)
fig, axs = plt.subplots(3, 5, figsize=(23, 8))
fig.suptitle('- Example Images [Color] -', size=22, weight="bold")
axs = axs.flatten()

for k, (path, name) in enumerate(zip(image_paths2, image_names2)):
    axs[k].set_title(f"{name}", size = 16)
    img = plt.imread(path)
    axs[k].imshow(img)
    axs[k].axis("off")

plt.tight_layout()
plt.show()

In [None]:
# 🐝 Log Barplot to W&B
# In both tables "index" holds the category and the second column its frequency
create_wandb_plot(x_data= prod_appearance["index"], y_data=prod_appearance["graphical_appearance_name"],
                  x_name="Product Appearance", y_name="Frequency", 
                  title="- Most Frequent Product Appearance -", log="prod_appearance", plot="bar")

create_wandb_plot(x_data= prod_color["index"], y_data=prod_color["colour_group_name"],
                  x_name="Product Color", y_name="Frequency", 
                  title="- Most Frequent Product Color -", log="prod_color", plot="bar")

---

🛍️ **Important Notes**:
* In order for the wordcloud to take the shape of the image you should input a `.jpg` image with **white** background (not black and not transparent - because the function will interpret the transparent background as black).
* More custom fonts like I used below can be found here: https://www.dafont.com/

In [None]:
def similar_color_func(word=None, font_size=None,
                       position=None, orientation=None,
                       font_path=None, random_state=None):
    '''Color function for the wordcloud: one fixed hue with random lightness.

    Only `random_state` is used; it must provide randint(a, b).
    The remaining parameters are accepted but ignored.
    returns: a CSS-style "hsl(h, s%, l%)" string'''

    hue, saturation = 270, 40  # hue: 0 - 360, saturation: 0 - 100
    lightness = random_state.randint(30, 70)  # 0 - 100 <- gradient

    return f"hsl({hue}, {saturation}%, {lightness}%)"

In [None]:
print(clr.S+"Total Number of unique Article Descriptions:"+clr.E, articles["detail_desc"].nunique(), "\n")

# Get descriptions and convert them to a string
# (every distinct description is used once, joined by single spaces)
text = articles["detail_desc"].unique()
text = " ".join(text)

# Get the mask - the form of the wordcloud
mask = np.array(Image.open('../input/hm-fashion-recommender-dataset/pics/mask.jpg'))

# Create wordcloud object
# The canvas has the size of the mask; word colors come from similar_color_func
wc = WordCloud(mask=mask, background_color="white", max_words=2000,
               stopwords=STOPWORDS, max_font_size=256,
               random_state=42, width=mask.shape[1],
               height=mask.shape[0], font_path="../input/hm-fashion-recommender-dataset/MorningRainbow.ttf",
               color_func=similar_color_func)
wc.generate(text)

# Plot
# `fig` is kept so that the rendered figure can be logged to W&B afterwards
fig = plt.figure(figsize=(15, 15))
plt.title("- Most Common Words found within Article Descriptions -",
           size=22, weight="bold")
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# 🐝 Save wordcloud to Dashboard
# Render the figure and read its pixels back as a (height, width, 3) uint8 array
# NOTE(review): `tostring_rgb` is deprecated in recent matplotlib releases -
# confirm it is still available in the environment that runs this notebook
fig.canvas.draw()
image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
image_from_plot = image_from_plot.reshape(fig.canvas.get_width_height()[::-1] + (3,))

wandb.log({"wordcloud": wandb.Image(image_from_plot)})

# Close the "Articles" run
wandb.finish()

In [None]:
# 🐝 Save the updated `articles` file.
# The export below is commented out; the artifact is created from an
# `articles.pqt` file that already exists in the input dataset.
# articles.to_parquet('articles.pqt', index=False)

save_dataset_artifact(run_name="save_articles", artifact_name="articles",
                      path="../input/hm-fashion-recommender-dataset/articles.pqt")

# 3. Customers

🛍️ **Important Notes**:
* In this dataset we have quite a few missing values:
    * for columns `FN` and `Active` I replaced all missing values with 0
    * for `club_member_status` and `fashion_news_frequency` I have set all missing values to `UNKNOWN`
    * for `age` I have imputed all missing values with the median age (which is 36)
* I have also created an `age_interval` column that groups the ages into 10-year intervals (16-25, 26-35, ..., 56-65), plus a last interval for 66 and above

In [None]:
# 🐝 W&B Experiment
# Everything logged in the "Customers" section goes to this run,
# until wandb.finish() is called
run = wandb.init(project='HandM', name='Customers', config=CONFIG)

In [None]:
def create_age_interval(x):
    '''Maps an age to the [lower, upper] interval it falls into.

    x: the age (number)
    returns: a new two-element list; ages up to 25 map to [16, 25], then
             10-year steps follow, and anything that is not <= 65
             maps to [66, 99]'''
    for lower, upper in ((16, 25), (26, 35), (36, 45), (46, 55), (56, 65)):
        if x <= upper:
            return [lower, upper]

    return [66, 99]

In [None]:
print(clr.S+"Missing values within customers dataset:"+clr.E)
print(customers.isna().sum())

# 🐝 Log Distinct customer IDs
wandb.log({"customer_ids":len(customers)})

# NOTE: the filled columns are assigned back to the dataframe; calling
# fillna(inplace=True) on a selected column is a chained assignment that
# newer pandas versions warn about and that no longer updates the dataframe
# once Copy-on-Write is enabled.

# Fill FN and Active - the only available value is "1"
customers["FN"] = customers["FN"].fillna(0)
customers["Active"] = customers["Active"].fillna(0)

# Set unknown the club member status & news frequency
customers["club_member_status"] = customers["club_member_status"].fillna("UNKNOWN")

# Merge the "None" spelling into "NONE" before filling the real gaps
customers["fashion_news_frequency"] = customers["fashion_news_frequency"].replace({"None":"NONE"})
customers["fashion_news_frequency"] = customers["fashion_news_frequency"].fillna("UNKNOWN")

# Set missing values in age with the median
customers["age"] = customers["age"].fillna(customers["age"].median())
customers["age_interval"] = customers["age"].apply(create_age_interval)

In [None]:
plt.figure(figsize=(24, 10))
plt.suptitle('- Customer Profile -', size=22, weight="bold")

# Layout: two plots in the top row, one full-width plot in the bottom row
ax1 = plt.subplot(2,2,1)
ax2 = plt.subplot(2,2,2)
ax3 = plt.subplot(2,1,2)

# Bars are ordered from the most to the least frequent status
sns.countplot(data=customers, x="club_member_status", ax=ax1,
              order=customers['club_member_status'].value_counts().index,
              palette=my_colors[2:])
show_values_on_bars(axs=ax1, h_v="v", space=0.4)
ax1.set_title("Club Member Status", size=18, weight="bold")
ax1.set_yticks([])
ax1.set_xlabel("")
ax1.set_ylabel("")

# Bars are ordered from the most to the least frequent value
sns.countplot(data=customers, x="fashion_news_frequency", ax=ax2,
              order=customers['fashion_news_frequency'].value_counts().index,
              palette=my_colors[2:])
show_values_on_bars(axs=ax2, h_v="v", space=0.4)
ax2.set_title("Fashion News frequency", size=18, weight="bold")
ax2.set_yticks([])
ax2.set_xlabel("")
ax2.set_ylabel("")

# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases -
# confirm it is available in the environment that runs this notebook
sns.distplot(customers["age"], color=my_colors[-3], ax=ax3,
             hist_kws=dict(edgecolor=my_colors[-3]))
ax3.set_title("Age Distribution", size=18, weight="bold")
ax3.set_ylabel("")

# Draw the background picture behind the two count plots only
for ax in [ax1, ax2]:
    x0,x1 = ax.get_xlim()
    y0,y1 = ax.get_ylim()
    ax.imshow(bk_image, zorder=0, extent=[x0, x1, y0, y1], alpha=0.35, aspect='auto')
    
# insert_image(path='../input/hm-fashion-recommender-dataset/pics/vans.jpg', zoom=0.5, xybox=(60, 0.00), ax=ax3)

sns.despine(left=True, bottom=True)
# Only the vertical spacing between the rows is changed
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=0.99);

In [None]:
# 🐝 Log Barplot to W&B
# Name both columns explicitly ("index" = category, second column = count):
# the default names produced by value_counts().reset_index() differ between
# pandas versions, which would break the column look-ups below.
dt = customers["club_member_status"].value_counts().rename_axis("index")\
                                    .reset_index(name="club_member_status")
create_wandb_plot(x_data= dt["index"], y_data=dt["club_member_status"],
                  x_name="Status", y_name="Frequency", 
                  title="- Club Member Status -", log="member_status", plot="bar")

dt = customers["fashion_news_frequency"].value_counts().rename_axis("index")\
                                        .reset_index(name="fashion_news_frequency")
create_wandb_plot(x_data= dt["index"], y_data=dt["fashion_news_frequency"],
                  x_name="News", y_name="Frequency", 
                  title="- Fashion News Frequency -", log="news_freq", plot="bar")

create_wandb_hist(x_data=customers["age"], x_name="Age", 
                  title="Age Distribution", log="age_dist")

# Close the "Customers" run
wandb.finish()

In [None]:
# 🪄🐝 Save the updated `customers` file.
# The export below is commented out; the artifact is created from a
# `customers.pqt` file that already exists in the input dataset.
# customers.to_parquet('customers.pqt', index=False)

save_dataset_artifact(run_name="save_customers", artifact_name="customers",
                      path="../input/hm-fashion-recommender-dataset/customers.pqt")

# 4. Transactions

🛍️ **Important Notes**:
* Denims, Trousers and Undergarments are sold the most.
* The **prices are altered**, with the highest one being 0.59 and the lowest being 0.0000169.
* The most expensive items are leather garments.
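* The average "order" has around 23 units and costs ~0.649. Note that the data is aggregated per `customer_id`, so an "order" here means all purchases of one customer over the whole training period.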
* The units/order is directly correlated with the price/order: as the units increase, the price within the order increases too.

In [None]:
# 🐝 W&B Experiment
# Everything logged in the "Transactions" section goes to this run,
# until wandb.finish() is called
run = wandb.init(project='HandM', name='Transactions', config=CONFIG)

In [None]:
print(clr.S+"Missing values within transactions dataset:"+clr.E)
print(transactions.isna().sum())

# 🐝 Log length of transactions
wandb.log({"transaction_ids":len(transactions)})

# Adjust article_id (as did for articles dataframe)
# After this step the ids are strings in the same format as articles["article_id"],
# which the merges on "article_id" further below rely on
transactions["article_id"] = transactions["article_id"].apply(lambda x: adjust_id(x))

---

In [None]:
# Get data
# The 15 articles with the most transaction rows, together with their names
top_sold_products = transactions["article_id"].value_counts().reset_index().head(15)
top_sold_products.columns = ["article_id", "count"]
top_sold_products = pd.merge(top_sold_products, articles, on="article_id")[["article_id", "count", "prod_name"]]

# Highlight the best seller (the maximum is computed only once)
top_count = top_sold_products["count"].max()
clrs = ["#E90B60" if x==top_count else '#AF0848' for x in top_sold_products["count"]]

# Get images
# Keep only the top articles that have an image; dropna() removes every
# missing path, no matter whether it is stored as None or as NaN
top_articles = articles[articles["article_id"].isin(top_sold_products["article_id"])]
top_articles = top_articles.dropna(subset=["path"])
image_paths = top_articles["path"].tolist()
image_names = top_articles["prod_name"].tolist()


# Plot
fig, ax = plt.subplots(figsize=(25, 13))
plt.title('- Products that sell the most (in UNITS) -', size=22, weight="bold")

sns.barplot(data=top_sold_products, x="count", y="prod_name", ax=ax,
            palette=clrs)
# Stretch the background picture over the whole plotting area
x0,x1 = ax.get_xlim()
y0,y1 = ax.get_ylim()
plt.imshow(bk_image, zorder=0, extent=[x0, x1, y0, y1], alpha=0.35, aspect='auto')

show_values_on_bars(axs=ax, h_v="h", space=0.4)
plt.ylabel("Product Name", size = 16, weight="bold")
plt.xlabel("")
plt.xticks([])
plt.yticks(size=16)
plt.tick_params(size=16)

# insert_image(path='../input/hm-fashion-recommender-dataset/pics/chloe.jpg', zoom=0.45, xybox=(40000, 14), ax=ax)

sns.despine(left=True, bottom=True)
plt.show();

print("\n")

# Plot
fig, axs = plt.subplots(3, 5, figsize=(23, 10))
fig.suptitle('- Images -', size=22, weight="bold")
axs = axs.flatten()

for k, (path, name) in enumerate(zip(image_paths, image_names)):
    axs[k].set_title(f"{name}", size = 16)
    img = plt.imread(path)
    axs[k].imshow(img)
    axs[k].axis("off")

# Hide exactly the grid cells that received no image; hiding a fixed number
# of trailing cells would hide real images whenever fewer images are missing
for unused_ax in axs[len(image_paths):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# 🐝 Log Barplot to W&B
# One bar per product name with the number of sold units
create_wandb_plot(x_data= top_sold_products["prod_name"], y_data=top_sold_products["count"],
                  x_name="Prod Name", y_name="Count", 
                  title="- Products that sell the most (UNITS) -", log="sold_most", plot="bar")

---

In [None]:
print(clr.S+"Maximum Price is:"+clr.E, transactions["price"].max(), "\n" +
      clr.S+"Minimum Price is:"+clr.E, transactions["price"].min(), "\n" +
      clr.S+"Average Price is:"+clr.E, transactions["price"].mean())

# Get data
# The 15 articles with the highest price ever paid for them
top_sold_products = transactions.groupby("article_id")["price"].max().reset_index()\
                                        .sort_values("price", ascending=False).head(15)
top_sold_products.columns = ["article_id", "price"]
top_sold_products = pd.merge(top_sold_products, articles, on="article_id")[["article_id", "price", "prod_name"]]

# Bar colors (kept for parity with the other cells; no barplot is drawn here)
clrs = ["#E90B60" if x==max(top_sold_products["price"]) else '#AF0848' for x in top_sold_products["price"]]

# Get images
# Keep only the articles that have an image; dropna() removes every missing
# path, no matter whether it is stored as None or as NaN
top_articles = articles[articles["article_id"].isin(top_sold_products["article_id"])]
top_articles = top_articles.dropna(subset=["path"])
image_paths = top_articles["path"].tolist()
image_names = top_articles["prod_name"].tolist()
image_ids = top_articles["article_id"].tolist()

# Prices are looked up by article id: several articles can share the same
# product name, so a look-up by name could show the price of another article
price_by_id = top_sold_products.set_index("article_id")["price"]

# Plot
fig, axs = plt.subplots(3, 5, figsize=(23, 10))
fig.suptitle('- Most Expensive Products -', size=22, weight="bold")
axs = axs.flatten()

for k, (path, name, article_id) in enumerate(zip(image_paths, image_names, image_ids)):
    prc = price_by_id[article_id]
    axs[k].set_title(f"{name} : {round(prc, 3)}", size = 16)
    img = plt.imread(path)
    axs[k].imshow(img)
    axs[k].axis("off")

# for a in [-1, -2]: axs[a].set_visible(False)
plt.tight_layout()
plt.show()

---

In [None]:
# Data
# NOTE: the aggregation is done per `customer_id`, so "units" and
# "order_price" are totals over all transactions of a customer; the word
# "order" in the labels below refers to these per-customer totals
basket = transactions.groupby("customer_id").agg({'article_id':'count', 
                                                  'price': 'sum'}).reset_index()
basket.columns = ["customer_id", "units", "order_price"]

print(clr.S+"=== UNITS/ORDER ==="+clr.E)
print(clr.S+"Maximum Units/Order is:"+clr.E, basket["units"].max(), "\n" +
      clr.S+"Minimum Units/Order is:"+clr.E, basket["units"].min(), "\n" +
      clr.S+"Average Units/Order is:"+clr.E, basket["units"].mean(), "\n")

print(clr.S+"=== SPENDING/ORDER ==="+clr.E)
print(clr.S+"Maximum Spending/Order is:"+clr.E, basket["order_price"].max(), "\n" +
      clr.S+"Minimum Spending/Order is:"+clr.E, basket["order_price"].min(), "\n" +
      clr.S+"Average Spending/Order is:"+clr.E, basket["order_price"].mean())

# Plot
plt.figure(figsize=(24, 15))
plt.suptitle('- Order Attributes -', size=22, weight="bold")

# Layout: two distributions in the top row, one full-width scatter plot below
ax1 = plt.subplot(2,2,1)
ax2 = plt.subplot(2,2,2)
ax3 = plt.subplot(2,1,2)

sns.distplot(basket["units"], color=my_colors[-3], ax=ax1,
             hist_kws=dict(edgecolor=my_colors[-3]))
ax1.set_title("Units/Order Distribution", size=18, weight="bold")
ax1.set_ylabel("")

sns.distplot(basket["order_price"], color=my_colors[-5], ax=ax2,
             hist_kws=dict(edgecolor=my_colors[-5]))
ax2.set_title("Spending/Order Distribution", size=18, weight="bold")
ax2.set_ylabel("")

# One point per customer, colored by the number of units
sns.scatterplot(data=basket, x="units", y="order_price", hue="units", palette="mako", 
                legend=None, ax=ax3)
ax3.set_title("Units x Price Correlation", size=18, weight="bold")
ax3.set_ylabel("Spending per Order")
ax3.set_xlabel("Units per Order")

sns.despine(left=True, bottom=True)
# All arguments are None, i.e. no spacing value is overridden here
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=None);

In [None]:
# 🐝 Log parameters
# Both averages are computed over the per-customer `basket` table
wandb.log({"average_units_per_order" : basket["units"].mean(),
           "average_spending_per_order" : basket["order_price"].mean()})

# Close the "Transactions" run
wandb.finish()

In [None]:
# 🐝 Save the updated `transactions` file.
# The export below is commented out; the artifact is created from a
# `transactions.pqt` file that already exists in the input dataset.
# transactions.to_parquet('transactions.pqt', index=False)

save_dataset_artifact(run_name="save_transactions", artifact_name="transactions",
                      path="../input/hm-fashion-recommender-dataset/transactions.pqt")

In [None]:
# Drop the pandas dataframes and helper tables of the exploration part and
# trigger a garbage collection before the next section starts
del articles, customers, transactions, ss
del top_sold_products, prod_appearance, prod_color, prod_name
gc.collect()

<img src="https://i.imgur.com/nMuocgz.png">

# 5. Market Basket Analysis

**What is a Recommender System**?

A recommender system is nothing more or less than an algorithm that tries to predict somebody's preference for an object or concept based on their preferences for other objects or concepts.

This can apply to **anything**: movies, songs, books, amazon orders, clothing or just Google Engine searches.

<center><img src="https://i.imgur.com/qKG6v9l.png" width=500></center>

> 🛍 **Turicreate**: we will be using `turicreate` model in order to create recommendations for users based on their previous purchases. For more details about the library you can [read the documentation](https://github.com/apple/turicreate). My main inspiration was this amazing article [How to Build a Recommendation System for Purchase Data (Step-by-Step)](https://medium.datadriveninvestor.com/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6).

> 🛍 **RAPIDS**: as RAPIDS outperformes pandas whenever we have large datasets, I will be using it to prepare the data.

In [None]:
!pip install turicreate --user

import turicreate as tc
import cudf
import cuml
import cupy

from cuml.model_selection import train_test_split

In [None]:
# Read in RAPIDS dataframes
# (`transactions` was deleted in the cleanup cell, so it is loaded again here with cudf)
transactions = cudf.read_parquet("../input/hm-fashion-recommender-dataset/transactions.pqt")

### Faster & Lighter dataframe

I will be using Chris Deotte's [trick within this discussion post](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/308635) to make the dataframes lighter.

In [None]:
# Keep only last 16 digits from customer_id and convert to int
# (the 16-character tail is parsed as a hexadecimal number and stored as int64)
transactions['customer_id'] = transactions['customer_id'].str[-16:].str.hex_to_int().astype('int64')
# Convert article_id from object to int32
# NOTE(review): this conversion is disabled, so article_id keeps the dtype it
# has in the parquet file.
# transactions['article_id'] = transactions['article_id'].astype('int32')
# Convert date from object to datetime
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])

# Keep only the date, customer and article columns
transactions = transactions[['t_dat','customer_id','article_id']]

### Decrease dataset size

🛍 I will decrease the dataset size as follows:
* Select only **TOP_N most frequently bought articles** - as there are 104,547 unique article IDs, the training dataframe would be extremely wide and the notebook would run out of memory. Hence, for showcasing purposes I have set the TOP_N used in this notebook to be very low (only 200 ids are used).
* Get only customers with many transactions (as customers with only 1 purchased item are harder to evaluate by the model).

In [None]:
# ------ PARAMETERS ------
# How many of the most active customers are kept for the recommender
TOP_CUSTOMERS = 300000
# How many of the most frequently bought articles are kept
TOP_N = 200
# ------------------------

In [None]:
# Count how often each article id occurs (most frequent first)
article_counts = transactions["article_id"].value_counts().reset_index()
article_counts.columns = ["article_id", "count"]
print(clr.S+"Total Unique IDs in Transactions:"+clr.E, len(article_counts))
print(clr.S+"Total Unique IDs that are selected:"+clr.E, TOP_N)

# Keep the ids of the TOP_N best sellers as a cupy array
top_article_rows = article_counts.head(TOP_N)
most_frequent_articles = cupy.asarray(top_article_rows["article_id"])

# Restrict the transactions to those best sellers
transactions = transactions[
    transactions["article_id"].isin(most_frequent_articles)
].reset_index(drop=True)

In [None]:
# Get customers with many transactions on these TOP_N items.
# value_counts() orders customers by descending number of transactions, so the
# first TOP_CUSTOMERS index labels are the most active customers. The ids are
# read straight from the index instead of from a reset_index() column named
# "index", because the name of that column differs between library versions.
# The labels of a value_counts() result are already unique.
customers_top_trans = list(transactions["customer_id"].value_counts()
                           .head(TOP_CUSTOMERS).index.to_pandas())
transactions = transactions[transactions["customer_id"].isin(customers_top_trans)].reset_index(drop=True)

print(clr.S+"Total unique users to recommend:"+clr.E, transactions["customer_id"].nunique())

In [None]:
# The id lists were only needed for filtering; drop them and collect garbage
del most_frequent_articles, customers_top_trans
gc.collect()

# 6. Prepare Datasets

We will be preparing 3 distinct datasets as follows:

<center><img src="https://i.imgur.com/ODESQDe.png" width=700></center>

### I. Train Dataset

In [None]:
# For every (customer, article) pair, count the purchases and expose the count
# under the name `purchase_count`
train = (
    transactions
    .groupby(["customer_id","article_id"])["t_dat"]
    .count()
    .reset_index()
    .rename(columns={"t_dat": "purchase_count"})
)

train.head()

### II. Dummy Dataset

In [None]:
# Same rows as `train`, plus a constant flag (1) on every (customer, article)
# pair; this column is the target of the "dummy" models.
dummy_train = train.copy()
dummy_train['purchase_dummy'] = 1

dummy_train.head()

### III. Normalized Dataset

In [None]:
def normalize_data(data):
    '''
    Min-max scales the purchase counts per article.
    data: dataframe with `customer_id`, `article_id` and `purchase_count`
          columns (must provide `.to_pandas()`)
    returns: long dataframe with the columns `customer_id`, `article_id` and
             `scaled_purchase_freq`; rows whose scaled value is missing are dropped
    '''

    # Wide layout: one row per customer, one column per article.
    # Using pandas here due to an error in cudf
    wide_counts = pd.pivot(data.to_pandas(), index="customer_id",
                           columns="article_id", values="purchase_count")
    df_matrix = cudf.DataFrame(wide_counts)

    # Scale every article column with (x - min) / (max - min)
    # NOTE(review): a column where max == min divides by zero - confirm how
    # those entries should be treated.
    col_min = df_matrix.min()
    col_range = df_matrix.max() - col_min
    scaled = ((df_matrix - col_min) / col_range).reset_index()
    scaled.index.names = ['scaled_purchase_freq']

    # Back to long layout, keeping only the entries that have a value
    long_form = cudf.melt(scaled, id_vars=['customer_id'],
                          value_name='scaled_purchase_freq').dropna()
    long_form.columns = ["customer_id", "article_id", "scaled_purchase_freq"]

    return long_form.reset_index(drop=True)

In [None]:
# Normalize count dataset
# (produces the `scaled_purchase_freq` version of `train`)
norm_train = normalize_data(data=train)

norm_train.head()

# 7. Data Validation

In [None]:
def split_data(data, test_size=0.3, random_state=None):
    '''
    Splits a dataframe into train and test parts and converts both to tc.SFrame.
    data: the dataframe to split (must be accepted by cuml's train_test_split
          and provide `.to_pandas()`)
    test_size: forwarded to train_test_split (default 0.3, as before)
    random_state: seed forwarded to train_test_split; pass an int for a
                  reproducible split. The default None keeps the previous
                  unseeded behaviour.
    returns: (train_data, test_data) as tc.SFrame objects
    '''

    train_part, test_part = train_test_split(data, test_size=test_size,
                                             random_state=random_state)
    # turicreate works on SFrames, which are built here from pandas frames
    train_data = tc.SFrame(train_part.to_pandas())
    test_data = tc.SFrame(test_part.to_pandas())

    return train_data, test_data

In [None]:
# TODO: fix norm_train error
# Split the count and dummy datasets; the normalized split stays disabled
# because of the error mentioned in the TODO.
train_data, test_data = split_data(train)
train_data_dummy, test_data_dummy = split_data(dummy_train)
# train_data_norm, test_data_norm = split_data(norm_train)

# 8. Models

In [None]:
# ------ PARAMETERS ------
# Column names holding the user and the item identifiers
user_id = 'customer_id'
item_id = 'article_id'
# Every distinct customer in `train`, as a plain Python list
users_to_recommend = list(train["customer_id"].unique().to_pandas())
# ------------------------

In [None]:
def train_model(train_data, name, user_id, item_id, target, users_to_recommend):
    '''
    Trains a recommender model and generates recommendations with it.
    train_data: the training tc.SFrame()
    name: can be 'popularity', 'cosine' or 'pearson'
    user_id & item_id: the names of the customer and article ID columns
    target: the value to be predicted, can be 'purchase_count', 'purchase_dummy' or 'scaled_purchase_freq'
    users_to_recommend: a unique list containing all customers for which we do the prediction
    returns: (model, recom) - the fitted model and its recommendations
             (at most 12 per customer)
    raises: ValueError if `name` is not one of the supported algorithms
    '''

    supported = ('popularity', 'cosine', 'pearson')
    # Fail early with a clear message: an unknown name used to fall through
    # every branch and crash later on an unbound `model` variable.
    if name not in supported:
        raise ValueError(f"Unknown model name {name!r}; expected one of {supported}")

    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data,
                                                 user_id=user_id,
                                                 item_id=item_id,
                                                 target=target, verbose=False)
    else:
        # 'cosine' and 'pearson' only differ in the similarity metric
        model = tc.item_similarity_recommender.create(train_data,
                                                      user_id=user_id,
                                                      item_id=item_id,
                                                      target=target,
                                                      similarity_type=name, verbose=False)

    # k is set to 12 => maximum items to recommend for one customer
    recom = model.recommend(users=users_to_recommend, k=12, verbose=False)

    return model, recom

## 8.1 Popularity Recommender System

> 🛍 **Popularity Algorithm**: this one recommends the most popular articles across all customers.

In [None]:
# --- TRAIN ---
name = 'popularity'
target = 'purchase_count'

# Popularity model on the purchase-count split; the recommendations land in `_`
popularity_count, _ = train_model(train_data=train_data, name=name,
                                  user_id=user_id, item_id=item_id,
                                  target=target,
                                  users_to_recommend=users_to_recommend)
# _.to_dataframe()["article_id"].nunique()

In [None]:
# --- DUMMY ---
name = 'popularity'
target = 'purchase_dummy'

# Popularity model on the dummy split; the recommendations land in `_`
popularity_dummy, _ = train_model(train_data=train_data_dummy, name=name,
                                  user_id=user_id, item_id=item_id,
                                  target=target,
                                  users_to_recommend=users_to_recommend)

## 8.2 Cosine Recommender System

> 🛍 **Cosine Algorithm**: uses the *collaborative filtering* methodology to find how similar one product is to another.

In [None]:
# --- TRAIN ---
name = 'cosine'
target = 'purchase_count'

# Cosine item-similarity model on the purchase-count split
cosine_count, _ = train_model(train_data=train_data, name=name,
                              user_id=user_id, item_id=item_id,
                              target=target,
                              users_to_recommend=users_to_recommend)

In [None]:
# --- DUMMY ---
name = 'cosine'
target = 'purchase_dummy'

# Cosine item-similarity model on the dummy split
cosine_dummy, _ = train_model(train_data=train_data_dummy, name=name,
                              user_id=user_id, item_id=item_id,
                              target=target,
                              users_to_recommend=users_to_recommend)

## 8.3 Pearson Recommender System

> 🛍 **Pearson Algorithm**: as explained in the Cosine Algorithm, this also uses the *collaborative filtering* methodology to find how similar one product is to another.

In [None]:
# --- TRAIN ---
name = 'pearson'
target = 'purchase_count'

# Pearson item-similarity model on the purchase-count split
pearson_count, _ = train_model(train_data=train_data, name=name,
                               user_id=user_id, item_id=item_id,
                               target=target,
                               users_to_recommend=users_to_recommend)

In [None]:
# --- DUMMY ---
name = 'pearson'
target = 'purchase_dummy'

# Pearson item-similarity model on the dummy split
pearson_dummy, _ = train_model(train_data=train_data_dummy, name=name,
                               user_id=user_id, item_id=item_id,
                               target=target,
                               users_to_recommend=users_to_recommend)

## 8.4 Evaluate

> 🛍 **Note**: The tables below show the **RMSE, Mean Precision & Mean Recall** output from the 3 algorithms (Popularity, Cosine and Pearson) on 2 types of datasets: Count and Dummy (for more details you can check out the `eval_counts.txt` and `eval_dummy.txt` files available in [my dataset](https://www.kaggle.com/datasets/andradaolteanu/hm-fashion-recommender-dataset))

<center><img src="https://i.imgur.com/8LP7Cr4.png" width=900></center>

*Side note: I have commented out the cells below so that the notebook commits faster - all the results are in the `eval_counts.txt` and `eval_dummy.txt` files available in [this dataset](https://www.kaggle.com/datasets/andradaolteanu/hm-fashion-recommender-dataset)*

In [None]:
# Group all models per type of dataset
# (the order of each list matches the order of the display names below)
model_counts = [popularity_count, cosine_count, pearson_count]
models_dummy = [popularity_dummy, cosine_dummy, pearson_dummy]

# Display names, one per model, in the same order as the lists above
names_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts',
                'Pearson Similarity on Purchase Counts']
names_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy',
               'Pearson Similarity on Purchase Dummy']

# The comparison itself is disabled; it would evaluate each group of models on
# the matching test split.
# # Evaluate the models
# eval_counts = tc.recommender.util.compare_models(test_data, model_counts, model_names=names_counts, verbose=False)
# eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_dummy, model_names=names_dummy, verbose=False)

In [None]:
# # Print outputs to .txt files
# f = open("eval_counts.txt", "a")
# print(eval_counts, file=f)
# f.close()

# f = open("eval_dummy.txt", "a")
# print(eval_dummy, file=f)
# f.close()

In [None]:
# 🐝 Save the evaluation outputs (`eval_counts.txt` and `eval_dummy.txt`).
# NOTE(review): save_dataset_artifact is defined earlier in the notebook -
# presumably it logs the file at `path` as a W&B artifact; confirm there.
save_dataset_artifact(run_name="save_output_counts", artifact_name="counts_results",
                      path="../input/hm-fashion-recommender-dataset/eval_counts.txt")

save_dataset_artifact(run_name="save_output_dummy", artifact_name="dummy_results",
                      path="../input/hm-fashion-recommender-dataset/eval_dummy.txt")

In [None]:
# The six comparison models are not used past this point; drop them and
# collect garbage.
del popularity_count, cosine_count, pearson_count
del popularity_dummy, cosine_dummy, pearson_dummy
gc.collect()

# 9. Prediction

## 9.1 Predictions using the Recommender System

In [None]:
# Similarity metric and target column used for the final model
METHOD = "cosine"
TARGET = "purchase_dummy"

# 🐝 W&B Experiment
# The run name encodes the method and the two dataset-size parameters
run = wandb.init(project='HandM', name=f'{METHOD}_customers{TOP_CUSTOMERS}_topArticles{TOP_N}', config=CONFIG)

In [None]:
# Final Model Training
# Trained on the full `dummy_train` frame (no train/test split here)
final_model = tc.item_similarity_recommender.create(tc.SFrame(dummy_train.to_pandas()), 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=TARGET,
                                                    similarity_type=METHOD,
                                                    verbose=False)

# Up to 12 recommendations for every customer in users_to_recommend
recom = final_model.recommend(users=users_to_recommend, k=12, verbose=False)

# Convert to dataframe
recom_df = recom.to_dataframe()

print(clr.S+"12 Recommendations for each Customer:"+clr.E)
recom_df.head(12)

In [None]:
# Adjust ID
# NOTE(review): adjust_id is defined earlier in the notebook - presumably it
# restores the article id format expected in the submission; confirm there.
recom_df["article_id"] = recom_df["article_id"].apply(adjust_id)

# Collapse to one row per customer (like in submission file): the customer's
# distinct recommended articles, in order of appearance, joined with spaces.
# Aggregating once per customer avoids building the same joined string for
# every recommendation row and then keeping only the first one.
recom_df = (
    recom_df.groupby("customer_id")["article_id"]
    .agg(lambda ids: " ".join(str(article) for article in ids.unique()))
    .rename("preds")
    .reset_index()
)

In [None]:
# 🐝 Log CV score
# NOTE(review): the CV value is a hard-coded constant, not computed in this
# cell - update it by hand when the parameters change.
wandb.log({"TOP_CUSTOMERS" : TOP_CUSTOMERS,
              "TOP_N" : TOP_N,
              "METHOD" : METHOD,
              "TARGET": TARGET,
              "CV" : 0.0051})

# Close the current W&B run
wandb.finish()

### Make Submission

In [None]:
# Import sample submission
ss = cudf.read_csv("../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
# Same int64 key as built for `transactions` (last 16 hex characters of the id),
# so the predictions can be joined on it
ss['customer_id_new'] = ss['customer_id'].str[-16:].str.hex_to_int().astype('int64')

In [None]:
# Merge with predicted preds
# Left join: every sample-submission row is kept; customers without a
# recommendation get a missing `preds`.
# Both frames carry a `customer_id` column, so the result holds suffixed copies
# of it (the final submission cell drops `customer_id_y`).
ss = ss.merge(cudf.DataFrame(recom_df[["customer_id", "preds"]]), 
              left_on="customer_id_new", right_on="customer_id", how="left")

## 9.2 Prediction using Common Sense

Because we **cannot predict every customer using the Recommender Systems** (because the dataset is too large), we will have to use other *simpler* methods to predict the rest of the customers.

I was heavily inspired by Chris Deotte's [notebook](https://www.kaggle.com/code/cdeotte/recommend-items-purchased-together-0-021/notebook) for this next section (which is as of now **work in progress**).

In [None]:
# Read in RAPIDS dataframes
# (full transactions file again - the earlier frame was cut down to the top
# articles and customers)
transactions = cudf.read_parquet("../input/hm-fashion-recommender-dataset/transactions.pqt")
print(clr.S+"Unique customer IDs:"+clr.E, transactions["customer_id"].nunique())

# Same lightening steps as before: int64 customer key, datetime, three columns
transactions['customer_id'] = transactions['customer_id'].str[-16:].str.hex_to_int().astype('int64')
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])
transactions = transactions[['t_dat','customer_id','article_id']]

In [None]:
# Compute last purchase date
# Latest transaction date per customer, attached to each of their rows
max_date = transactions.groupby('customer_id')["t_dat"].max().reset_index()
max_date.columns = ['customer_id','max_date']
transactions = transactions.merge(max_date, on=['customer_id'], how='left')
# Days between each purchase and that customer's latest purchase
# NOTE(review): diff_days is not read by any of the cells that follow.
transactions['diff_days'] = (transactions["max_date"] - transactions["t_dat"]).dt.days

del max_date
gc.collect()

### Compute most frequent purchased items

*TODO: enhance using EDA knowledge*

In [None]:
# temp = transactions.groupby(["customer_id", "article_id"])["t_dat"].count().reset_index()
# temp.columns = ["customer_id", "article_id", "count"]

# transactions = transactions.merge(temp, on=["customer_id", "article_id"], how='left')
# transactions = transactions.sort_values(["count", "t_dat"], ascending=False)
# transactions = transactions.drop_duplicates(["customer_id", "article_id"])
# transactions = transactions.sort_values(["count", "t_dat"], ascending=False)

### Compute Last Week's Most Popular Items

In [None]:
# Transactions dated 2020-09-16 or later
temp = transactions.loc[transactions["t_dat"] >= cudf.to_datetime('2020-09-16')]
# Only the 12 best sellers are needed, so trim before copying to host
top_ids = temp.article_id.value_counts().head(12).to_pandas().index.astype('str')
# Prefix every id with '0' (as before) and join with single spaces. There is no
# leading space: top12 is used as a complete prediction string on its own, not
# appended to an existing one.
top12 = ' '.join('0' + article for article in top_ids)

### Final Submission

In [None]:
# Fill customers with no prediction with top12
# TODO: change this
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on the Series returned by ss['preds'] is not guaranteed to update `ss`.
ss['preds'] = ss['preds'].fillna(top12)

# Drop unwanted columns and make submission
ss = ss.drop(columns=["prediction", "customer_id_new", "customer_id_y"])
# Rename by name rather than by position, so the result does not depend on the
# column order produced by the merge
ss = ss.rename(columns={"customer_id_x": "customer_id", "preds": "prediction"})

ss.to_csv('submission.csv', index=False)

In [None]:
# Submission needs to be 1371980 long
# (the shape is only printed here; compare the row count by eye)

print(clr.S+"Submission Shape:"+clr.E, ss.shape)
ss.head()

In [None]:
# === Work in Progress

<center><img src="https://i.imgur.com/0cx4xXI.png"></center>

### 🐝 W&B Dashboard

> My [W&B Dashboard](https://wandb.ai/andrada/HandM).

<center><video src="https://i.imgur.com/ni0pfbj.mp4" width=800 controls></center>

<center><img src="https://i.imgur.com/knxTRkO.png"></center>

### My Specs

* 🖥 Z8 G4 Workstation
* 💾 2 CPUs & 96GB Memory
* 🎮 NVIDIA Quadro RTX 8000
* 💻 Zbook Studio G7 on the go