Import, Setup, Paths

In [1]:
%pip install torch torchvision torchaudio pandas matplotlib seaborn scikit-learn tqdm fastparquet

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from PIL import Image
import torch
import torchvision.transforms as T
import torchvision.models as models

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Path di base
DATA_PATH = 'data/raw/visuelle2'

df_sales = pd.read_csv(os.path.join(DATA_PATH, 'sales.csv'))
df_price = pd.read_csv(os.path.join(DATA_PATH, 'price_discount_series.csv'))
df_meteo = pd.read_csv(os.path.join(DATA_PATH, 'vis2_weather_data.csv'))
df_trends = pd.read_csv(os.path.join(DATA_PATH, 'vis2_gtrends_data.csv'))
df_restocks = pd.read_csv(os.path.join(DATA_PATH, 'restocks.csv'))

#### Caricamento labels & label encoding

In [3]:
# Carica mapping numerici (label encoding)
category_labels = torch.load(os.path.join(DATA_PATH, 'category_labels.pt'))
color_labels    = torch.load(os.path.join(DATA_PATH, 'color_labels.pt'))
fabric_labels   = torch.load(os.path.join(DATA_PATH, 'fabric_labels.pt'))

cat2idx  = {k: v for k, v in category_labels.items()}
col2idx  = {k: v for k, v in color_labels.items()}
fab2idx  = {k: v for k, v in fabric_labels.items()}

#### Wide → Long: Vendite settimanali

In [4]:
week_cols = [str(i) for i in range(12)]
df_long = pd.melt(
    df_sales,
    id_vars=[c for c in df_sales.columns if c not in week_cols],
    value_vars=week_cols,
    var_name='week',
    value_name='sales'
)
df_long['week'] = df_long['week'].astype(int)
df_long

Unnamed: 0.1,Unnamed: 0,external_code,retail,season,category,color,image_path,fabric,release_date,restock,week,sales
0,0,5,36,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,22,0,1.0
1,1,2,51,SS17,long sleeve,violet,PE17/00002.png,acrylic,2016-11-28,17,0,1.0
2,2,5,10,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,15,0,1.0
3,3,9,41,SS17,culottes,yellow,PE17/00009.png,scuba crepe,2016-11-28,32,0,1.0
4,4,5,13,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,26,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1282195,106845,5504,51,AW19,medium coat,grey,AI19/05504.png,foam rubber,2019-12-30,12,11,0.0
1282196,106846,5558,10,AW19,medium coat,black,AI19/05558.png,foam rubber,2019-12-30,12,11,0.0
1282197,106847,4988,108,AW19,medium coat,black,AI19/04988.png,cloth,2019-12-30,15,11,0.0
1282198,106848,4280,105,AW19,culottes,blue,AI19/04280.png,light jeans,2019-12-30,1,11,0.0


#### Aggiungi One-Hot delle Categorie/Colori/Fabbricati

In [5]:
df_long['cat_idx']    = df_long['category'].map(cat2idx)
df_long['color_idx']  = df_long['color'].map(col2idx)
df_long['fabric_idx'] = df_long['fabric'].map(fab2idx)
df_long

Unnamed: 0.1,Unnamed: 0,external_code,retail,season,category,color,image_path,fabric,release_date,restock,week,sales,cat_idx,color_idx,fabric_idx
0,0,5,36,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,22,0,1.0,3,0,7
1,1,2,51,SS17,long sleeve,violet,PE17/00002.png,acrylic,2016-11-28,17,0,1.0,3,8,7
2,2,5,10,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,15,0,1.0,3,0,7
3,3,9,41,SS17,culottes,yellow,PE17/00009.png,scuba crepe,2016-11-28,32,0,1.0,0,6,16
4,4,5,13,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,26,0,1.0,3,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1282195,106845,5504,51,AW19,medium coat,grey,AI19/05504.png,foam rubber,2019-12-30,12,11,0.0,4,0,24
1282196,106846,5558,10,AW19,medium coat,black,AI19/05558.png,foam rubber,2019-12-30,12,11,0.0,4,2,24
1282197,106847,4988,108,AW19,medium coat,black,AI19/04988.png,cloth,2019-12-30,15,11,0.0,4,2,4
1282198,106848,4280,105,AW19,culottes,blue,AI19/04280.png,light jeans,2019-12-30,1,11,0.0,0,7,27


#### Immagini: estrai embedding UNA sola volta e joinna

In [6]:
IMG_BASE = os.path.join(DATA_PATH, 'images')

cnn = models.resnet18(weights='IMAGENET1K_V1').eval()
layer = torch.nn.Sequential(*list(cnn.children())[:-1])
transf = T.Compose([
    T.Resize((224,224)), T.ToTensor(),
    T.Normalize(mean=[0.485,0.456,0.406],std=[0.229,0.224,0.225])
])

prods_imgs = df_long.drop_duplicates('external_code')[['external_code', 'image_path']].dropna()
img_emb_dict = {}

for _, row in tqdm(prods_imgs.iterrows(), total=len(prods_imgs)):
    prod = row['external_code']
    img_path = os.path.join(IMG_BASE, row['image_path'])
    try:
        im = Image.open(img_path).convert('RGB')
        t = transf(im).unsqueeze(0)
        with torch.no_grad():
            emb = layer(t).squeeze().numpy().flatten()
        img_emb_dict[prod] = emb
    except Exception as e:
        img_emb_dict[prod] = np.zeros(512)
        print(f"ERRORE {img_path}: {e}")

for i in range(512):
    df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512))[i])

 84%|████████▎ | 4476/5355 [01:53<00:25, 34.80it/s]

ERRORE data/raw/visuelle2/images/AI19/04442.png: unrecognized data stream contents when reading image file


100%|██████████| 5355/5355 [02:17<00:00, 38.88it/s]
  df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512))[i])
  df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512))[i])
  df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512))[i])
  df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512))[i])
  df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512))[i])
  df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512))[i])
  df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512))[i])
  df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512))[i])
  df_long[f'img_emb_{i}'] = df_long['external_code'].map(lambda p: img_emb_dict.get(p, np.zeros(512)

#### RESTOCK: join per product-negozio-week

In [9]:
# Adattare se serve anche per 'year'; qui chiavi semplici:
df_restocks.rename(columns={'qty':'restock_qty'}, inplace=True)
join_key = ['external_code','retail','week']
if 'year' in df_restocks.columns and 'year' in df_long.columns:
    join_key.append('year')
df_long = df_long.merge(
    df_restocks[join_key + ['restock_qty']],
    on=join_key, how='left'
)
df_long['restock_qty'] = df_long['restock_qty'].fillna(0)
df_long

Unnamed: 0.1,Unnamed: 0,external_code,retail,season,category,color,image_path,fabric,release_date,restock,...,img_emb_505,img_emb_506,img_emb_507,img_emb_508,img_emb_509,img_emb_510,img_emb_511,time_idx,target,restock_qty
0,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,3.715941,0.449606,0.012282,1.064582,1.417467,0.176237,0.143883,1,3.0,0.0
1,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,3.715941,0.449606,0.012282,1.064582,1.417467,0.176237,0.143883,2,3.0,0.0
2,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,3.715941,0.449606,0.012282,1.064582,1.417467,0.176237,0.143883,3,1.0,0.0
3,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,3.715941,0.449606,0.012282,1.064582,1.417467,0.176237,0.143883,4,4.0,0.0
4,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,3.715941,0.449606,0.012282,1.064582,1.417467,0.176237,0.143883,5,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1282599,106354,5577,108,AW19,trapeze dress,green,AI19/05577.png,milano stitch,2019-12-16,13,...,0.498303,0.337719,0.104122,0.958005,0.943867,0.597561,0.032088,8,1.0,0.0
1282600,106354,5577,108,AW19,trapeze dress,green,AI19/05577.png,milano stitch,2019-12-16,13,...,0.498303,0.337719,0.104122,0.958005,0.943867,0.597561,0.032088,9,2.0,0.0
1282601,106354,5577,108,AW19,trapeze dress,green,AI19/05577.png,milano stitch,2019-12-16,13,...,0.498303,0.337719,0.104122,0.958005,0.943867,0.597561,0.032088,10,0.0,0.0
1282602,106354,5577,108,AW19,trapeze dress,green,AI19/05577.png,milano stitch,2019-12-16,13,...,0.498303,0.337719,0.104122,0.958005,0.943867,0.597561,0.032088,11,1.0,0.0


#### Altri join temporali: (prezzo, sconto, meteo, trends)

In [10]:
# Price/discount
df_price_long = pd.melt(
    df_price,
    id_vars=['external_code','retail','price'],
    value_vars=[str(i) for i in range(12)],
    var_name='week', value_name='discount'
)
df_price_long['week'] = df_price_long['week'].astype(int)

df_long = df_long.merge(
    df_price_long[['external_code','retail','week','price','discount']],
    on=['external_code','retail','week'], how='left'
)

# Meteo
shop_weather_pairs = torch.load(os.path.join(DATA_PATH, 'shop_weather_pairs.pt'))
shop2loc = {k: v for k, v in shop_weather_pairs.items() if v is not None}
df_long['locality'] = df_long['retail'].map(shop2loc)
df_long['release_date'] = pd.to_datetime(df_long['release_date'], errors='coerce')
df_meteo['date'] = pd.to_datetime(df_meteo['date'], errors='coerce')
df_long['week_date'] = df_long['release_date'] + pd.to_timedelta(df_long['week'], "W")

df_long = df_long.merge(
    df_meteo,
    left_on=['locality','week_date'],
    right_on=['locality','date'],
    how='left',
    suffixes=('', '_meteo')
)

# Trends
df_trends['date'] = pd.to_datetime(df_trends['date'], errors='coerce')
df_long = df_long.merge(
    df_trends,
    left_on='week_date',
    right_on='date',
    how='left',
    suffixes=('', '_trend')
)

In [11]:
df_long

Unnamed: 0.1,Unnamed: 0,external_code,retail,season,category,color,image_path,fabric,release_date,restock,...,scottish,milano stitch,devore,hron,ottoman,fluid,flamed,fluid polyviscous,shiny jersey,goose
0,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,53.0,0.0,19.0,18.0,71.0,84.0,44.0,-1.0,0.0,51.0
1,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,57.0,37.0,20.0,15.0,64.0,78.0,37.0,-1.0,0.0,55.0
2,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,53.0,0.0,12.0,17.0,56.0,78.0,48.0,-1.0,35.0,58.0
3,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,54.0,0.0,27.0,20.0,77.0,67.0,74.0,-1.0,0.0,55.0
4,133,1,3,SS17,long sleeve,violet,PE17/00001.png,acrylic,2016-12-05,20,...,61.0,0.0,27.0,22.0,78.0,76.0,38.0,-1.0,0.0,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1282599,106354,5577,108,AW19,trapeze dress,green,AI19/05577.png,milano stitch,2019-12-16,13,...,,,,,,,,,,
1282600,106354,5577,108,AW19,trapeze dress,green,AI19/05577.png,milano stitch,2019-12-16,13,...,,,,,,,,,,
1282601,106354,5577,108,AW19,trapeze dress,green,AI19/05577.png,milano stitch,2019-12-16,13,...,,,,,,,,,,
1282602,106354,5577,108,AW19,trapeze dress,green,AI19/05577.png,milano stitch,2019-12-16,13,...,,,,,,,,,,


#### Aggiunta colonna time_idx e scelta target

In [12]:
# Forecast nel tempo per ogni (external_code, retail)
# Tipicamente: target = vendite

# Crea time_idx: numero progressivo di settimana per ogni prodotto-negozio
df_long = df_long.sort_values(['external_code', 'retail', 'release_date', 'week'])

# Calcolare time_idx globale oppure locale per serie:
# Esempio: time_idx = progressivo (week) per ogni (external_code, retail)
df_long['time_idx'] = (
    df_long.groupby(['external_code', 'retail'])['week'].rank(method='first').astype(int)
)


# Target finale:
df_long['target'] = df_long['sales']

# Visualizza un sample e salva
print(df_long[['external_code','retail','week','time_idx','target','cat_idx','color_idx','fabric_idx']].head())

df_long.to_parquet("long_table_multimodal_labelencoded_timeidx.parquet", index=False)

   external_code  retail  week  time_idx  target  cat_idx  color_idx  \
0              1       3     0         1     3.0        3          8   
1              1       3     1         2     3.0        3          8   
2              1       3     2         3     1.0        3          8   
3              1       3     3         4     4.0        3          8   
4              1       3     4         5     3.0        3          8   

   fabric_idx  
0           7  
1           7  
2           7  
3           7  
4           7  


TO-DO:

1. Applicare PCA sui 512 embeddings e ridurli a 20–50 dimensioni (es. img_pca_0, ..., img_pca_49).

2. Aggiungere solo questi al tuo tabellare.

3. Testare il modello in PyTorch Forecasting con e senza embeddings per valutare l’impatto reale.