# Create dataset 


In [1]:
from google.cloud import bigquery

# BigQuery client bound to the project that hosts the source table queried below.
client = bigquery.Client(project="artefact-taxonomy")

from sklearn.model_selection import train_test_split

In [2]:
# Destination prefix for the exported train / val / test parquet files.
# It must be defined: the "Push to bucket" section builds its paths from it.
base_uri = "gs://artefact-taxonomy-classification-datasets/3P_mirakl_multilang_v1"

# Minimum number of rows a label needs in order to be kept.
CUTOFF = 5

# Column names: product identifier, text fields and the raw target / label.
id_col = "adeo_product_id"
title_col = "title"
language_col = "lang"
description_raw = "description"
label_raw_col = "mirakl_model_code"

In [3]:
import pandas as pd

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes — consider removing.
pd.options.mode.chained_assignment = None  # default='warn'

In [4]:
# Pull the whole source table from BigQuery into a pandas DataFrame.
sql = """
SELECT * FROM `artefact-taxonomy.pem_uc_add_datasets.temp_simplon`
"""

query_job = client.query(sql)
df = query_job.to_dataframe()

# Check the data
- 100 mirakl classes
- with 4 other attributes:
    style: 02419
    color: 01746
    shape: 00562
    battery included: 15344

## Preprocessing

In [5]:
# Keep only the labels that have at least CUTOFF rows (labels with fewer are dropped).
label_counts = df[label_raw_col].value_counts()
rows_per_label = df[label_raw_col].map(label_counts)
df_cut = df[rows_per_label >= CUTOFF]

In [6]:
# Summary statistics (count / unique / top / freq) for every column of the filtered frame.
df_cut.describe()

Unnamed: 0,adeo_product_id,mirakl_model_code,title,lang,description
count,4565914,4565914,4565914,4565914,4565914
unique,3158410,3202,3885904,3,3040743
top,12105401,202265|PAPIER_PEINT|PAPIER_PEINT_FRISE_ET_FIBR...,Vis à métaux Ultima à tête cylindrique hexagon...,FR,"Papier peint intissé solide, résistant à l'eau..."
freq,9,148827,691,2810387,8015


In [7]:
# One row per product id, without the text and language columns. This is the frame
# that gets split, so a product can never land in two datasets.
df_unique = df_cut.drop(columns=[title_col, language_col, description_raw]).drop_duplicates(
    subset=id_col, keep="first"
)

In [8]:
# Rows and labels lost to the minimum-sample filter (labels with fewer than CUTOFF rows).
sample_cut_lackofsample = len(df) - len(df_cut)

n_labels_before = df[label_raw_col].nunique()
n_labels_after = df_cut[label_raw_col].nunique()
model_cut_lackofsample = n_labels_before - n_labels_after

In [9]:
# Tag every row with the text fields it provides. Rows that have neither a title nor a
# description keep the "None" marker so they can be filtered out afterwards.
has_title = df_cut[title_col].notnull()
has_description = df_cut[description_raw].notnull()

product_info = pd.Series("None", index=df_cut.index)
product_info[has_title & has_description] = "both"
product_info[has_title & ~has_description] = "title_only"
product_info[~has_title & has_description] = "description_only"

# Attach the column with `assign`, which returns a new frame. The former
# `df_cut["product_info"].loc[mask] = ...` pattern is chained assignment on a slice of
# `df`: it writes through an intermediate Series and silently does nothing under
# pandas Copy-on-Write.
df_cut = df_cut.assign(product_info=product_info)

In [10]:
# Discard the rows that carry neither a title nor a description.
df_cut = df_cut[df_cut["product_info"].ne("None")]

In [11]:
# Rows and labels lost because a product had neither a title nor a description,
# i.e. whatever is missing beyond the minimum-sample filter already counted above.
rows_removed_total = len(df) - len(df_cut)
labels_removed_total = df[label_raw_col].nunique() - df_cut[label_raw_col].nunique()

sample_cut_lackofinfo = rows_removed_total - sample_cut_lackofsample
model_cut_lackofinfo = labels_removed_total - model_cut_lackofsample

In [12]:
# Split the de-duplicated product ids (not the raw rows) so that every adeo_product_id
# ends up in exactly one of train / val / test, which avoids leakage between the splits.
# A fixed seed makes the split reproducible across kernel restarts.
SPLIT_SEED = 42

train_val, test = train_test_split(
    df_unique,
    test_size=0.1,
    stratify=df_unique[label_raw_col],
    random_state=SPLIT_SEED,
)
train, val = train_test_split(
    train_val,
    test_size=0.1,
    stratify=train_val[label_raw_col],
    random_state=SPLIT_SEED,
)

## Merge it back to the main df 

In [13]:
# Drop the label used for stratification, then pull every row of the selected
# product ids back in from the filtered frame (which carries the label itself).
val = val.drop(columns=label_raw_col).merge(df_cut, how="inner", on=id_col)

In [14]:
# Drop the label used for stratification, then pull every row of the selected
# product ids back in from the filtered frame (which carries the label itself).
train = train.drop(columns=label_raw_col).merge(df_cut, how="inner", on=id_col)

In [15]:
# Drop the label used for stratification, then pull every row of the selected
# product ids back in from the filtered frame (which carries the label itself).
test = test.drop(columns=label_raw_col).merge(df_cut, how="inner", on=id_col)

## Check for overlap between splits

In [16]:
import numpy as np


def check_if_overlap(df1, df2, on_column):
    """Report whether any key of ``df1`` also occurs in ``df2``.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        on_column: Column name, or list/tuple of column names, identifying an entry.

    Returns:
        bool: ``True`` if at least one key is present in both frames, ``False``
        otherwise. The verdict is also printed.
    """
    keys = list(on_column) if isinstance(on_column, (list, tuple)) else [on_column]

    # Compare the de-duplicated key columns only. Merging the full frames drags every
    # payload column along and multiplies rows whenever a key is repeated on both sides.
    left_keys = df1[keys].drop_duplicates()
    right_keys = df2[keys].drop_duplicates()
    shared_keys = left_keys.merge(right_keys, on=keys, how="inner")

    has_overlap = not shared_keys.empty
    if has_overlap:
        print("There are overlapped entries!!")
    else:
        print("These 2 dataframes are not overlapped over ", on_column)
    return has_overlap

In [17]:
# Validation vs. train: check that no product id is shared.
check_if_overlap(val, train, id_col)

These 2 dataframes are not overlapped over  adeo_product_id


In [18]:
# Test vs. train: check that no product id is shared.
check_if_overlap(test, train, id_col)

These 2 dataframes are not overlapped over  adeo_product_id


In [19]:
# Test vs. validation: check that no product id is shared.
check_if_overlap(test, val, id_col)

These 2 dataframes are not overlapped over  adeo_product_id


## Summary

In [20]:
def _print_banner(title):
    """Print a section title framed by two separator lines."""
    separator = "============================"
    print(separator)
    print(title)
    print(separator)


def _print_size(frame):
    """Print the row count and the number of distinct labels of ``frame``."""
    print("No. entries:", frame.shape[0])
    print("No. of models (categories / label): ", frame[label_raw_col].nunique())


_print_banner("Full Dataset")
_print_size(df)

print("No. of entries removed due to num of samples < CUTOFF: ", sample_cut_lackofsample)
print("No. of models removed due to num of samples < CUTOFF: ", model_cut_lackofsample)

print(
    "No. of entries removed due to lack of product information (no title & no description): ",
    sample_cut_lackofinfo,
)
# Message fixed: the original text read "due to to".
print(
    "No. of models removed due to lack of product information (no title & no description): ",
    model_cut_lackofinfo,
)

print("No. of entries after cut ", (df_cut.shape[0]))
print("No. of models after cut ", (df_cut[label_raw_col].nunique()))

_print_banner("Train Dataset")
_print_size(train)

_print_banner("Val Dataset")
_print_size(val)

_print_banner("Test Dataset")
_print_size(test)


# The three splits together must account for every row of the filtered frame.
_print_banner("Verify if Dataset is correct")
if train.shape[0] + val.shape[0] + test.shape[0] == df_cut.shape[0]:
    print("The total no. of entries matched!!")
else:
    print("No. of entries matching ERROR!!")

Full Dataset
No. entries: 4566310
No. of models (categories / label):  3372
No. of entries removed due to num of samples < CUTOFF:  396
No. of models removed due to num of samples < CUTOFF:  170
No. of entries removed due to lack of product information (no title & no description):  0
No. of models removed due to to lack of product information (no title & no description):  0
No. of entries after cut  4565914
No. of models after cut  3202
Train Dataset
No. entries: 3698108
No. of models (categories / label):  3202
Val Dataset
No. entries: 410900
No. of models (categories / label):  3104
Test Dataset
No. entries: 456906
No. of models (categories / label):  3152
Verify if Dataset is correct
The total no. of entries matched!!


## Push to bucket

In [21]:
# Preview the destination URI of the validation split (needs `base_uri` to be defined).
f"{base_uri}/val.parquet"

'gs://artefact-taxonomy-classification-datasets/3P_mirakl_multilang_v1/val.parquet'

In [25]:
# val.to_parquet(f"{base_uri}/val.parquet", index=False)

In [26]:
# test.to_parquet(f"{base_uri}/test.parquet", index=False)

In [27]:
# train.to_parquet(f"{base_uri}/train.parquet", index=False)