## Import libraries and dataset

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# import articles
articles = pd.read_parquet("data/articles.parquet")
df_art = articles.copy()

In [4]:
# import customers
customers = pd.read_parquet("data/customers.parquet")
df_cust = customers.copy()

In [5]:
# import transactions
transactions = pd.read_parquet("data/transactions_train.parquet")
df_tran = transactions.copy()

In [7]:
df_art.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,12855,253,9,0,1010016,0,9,0,...,10,0,0,1,0,16,30,1002,2,8834
1,108775044,108775,12855,253,9,0,1010016,0,10,2,...,10,0,0,1,0,16,30,1002,2,8834
2,108775051,108775,44846,253,9,0,1010017,3,11,11,...,10,0,0,1,0,16,30,1002,2,8834
3,110065001,110065,8159,306,13,4,1010016,0,9,0,...,131,7,7,1,0,61,5,1017,4,8243
4,110065002,110065,8159,306,13,4,1010016,0,10,2,...,131,7,7,1,0,61,5,1017,4,8243


In [8]:
df_cust.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,6883939031699146327,-1,-1,0,0,49,6305
1,11246327431398957306,-1,-1,0,0,25,33726
2,18439897732908966680,-1,-1,0,0,24,3247
3,18352672461570950206,-1,-1,0,0,54,168643
4,18162778555210377306,1,1,0,1,52,168645


In [9]:
df_tran.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
25784,2018-09-20,1728846800780188,519773001,0.028458,2,0
25785,2018-09-20,1728846800780188,578472001,0.032525,2,0
5389,2018-09-20,2076973761519164,661795002,0.167797,2,0
5390,2018-09-20,2076973761519164,684080003,0.101678,2,0
47429,2018-09-20,2918879973994241,662980001,0.033881,1,0


## Features engineering CUSTOMERS

### Has kids

I'm going to add a column has_kids with 1 for those who bought baby articles and 0 for the rest

In [21]:
df_art["index_group_name"].value_counts()

0    39737
1    34711
2    15149
3    12553
4     3392
Name: index_group_name, dtype: int64

In [22]:
# df_art["index_group_name"] == Baby/Children 

In [23]:
baby_articles = df_art[df_art['index_group_name'] == 1]

baby_article_ids = baby_articles['article_id'].tolist()

len(baby_article_ids)

34711

In [24]:
transazioni_baby = df_tran[df_tran['article_id'].isin(baby_article_ids)]

customer_ids = transazioni_baby['customer_id'].unique()

len(customer_ids)

225761

In [25]:
df_cust['has_kids'] = 0

df_cust.loc[df_cust['customer_id'].isin(customer_ids), 'has_kids'] = 1

### FN, Active, fn_freq

In [26]:
# drop FN and Active
df_cust = df_cust.drop(["FN", "Active"], axis=1)

In [27]:
df_cust.isna().any()

customer_id               False
club_member_status        False
fashion_news_frequency    False
age                       False
postal_code               False
not_solid_neutral         False
has_kids                  False
dtype: bool

## Work on  graphical appearence +  color group

In [10]:
df_art["colour_group_name"].value_counts()
# 0 is black
# 2 is white
# 4 is grey

0     22670
1     12171
2      9542
3      5811
4      4487
5      3356
6      3308
7      3056
8      3012
9      2767
10     2731
11     2726
12     2712
13     2340
14     2106
15     2105
16     2063
17     1645
18     1520
19     1471
20     1377
21     1084
22     1027
23      984
24      886
25      818
26      815
27      779
28      750
29      709
30      681
31      574
32      553
33      473
34      435
35      315
36      285
37      226
38      225
39      182
40      153
41      129
42      114
43      105
44       94
45       51
46       46
47       31
48       28
49       14
Name: colour_group_name, dtype: int64

In [12]:
df_art["graphical_appearance_name"].value_counts()
# 0 is solid

0     49747
1     17165
2      5938
3      4990
4      4842
5      3215
6      3098
7      2178
8      1830
9      1513
10     1502
11     1341
12     1165
13     1132
14      958
15      830
16      806
17      681
18      586
19      515
20      376
21      346
22      322
23      153
24       86
25       86
26       66
27       52
28       15
29        8
Name: graphical_appearance_name, dtype: int64

In [11]:
#create a list of articles that have neutral color
just_neutral = df_art[(df_art['colour_group_name'] == 2) | (df_art['colour_group_name'] == 2) | (df_art['colour_group_name'] == 4)]
just_neutral_ids = just_neutral['article_id'].tolist()
len(just_neutral_ids)

14029

In [14]:
just_solid = df_art[df_art['graphical_appearance_name'] == 0]
just_solid_ids = just_solid['article_id'].tolist()
len(just_solid_ids)

49747

In [16]:
# find transactions of non neutral articles
transazioni_not_neutral= df_tran[~df_tran['article_id'].isin(just_neutral_ids)]
#find related customers
customer_not_neutral = transazioni_not_neutral['customer_id'].unique()
len(customer_not_neutral)

1339256

In [17]:
# find transactions of non solid articles
transazioni_not_solid= df_tran[~df_tran['article_id'].isin(just_solid_ids)]
#find the related customers
customer_not_solid = transazioni_not_solid['customer_id'].unique()
len(customer_not_solid)

1171395

In [19]:
#create a new column with 1 for people who also bought colorful and not solid clothes
df_cust['not_solid_neutral'] = 0
df_cust.loc[(df_cust['customer_id'].isin(customer_not_neutral)) & (df_cust['customer_id'].isin(customer_not_solid)), 'not_solid_neutral'] = 1

### Export new costumers

In [29]:
folder = "generated_data/"
df_cust.to_parquet(folder + "generated_customers_v3.parquet")

In [28]:
df_cust

Unnamed: 0,customer_id,club_member_status,fashion_news_frequency,age,postal_code,not_solid_neutral,has_kids
0,6883939031699146327,0,0,49,6305,1,1
1,11246327431398957306,0,0,25,33726,1,1
2,18439897732908966680,0,0,24,3247,1,0
3,18352672461570950206,0,0,54,168643,0,0
4,18162778555210377306,0,1,52,168645,1,0
...,...,...,...,...,...,...,...
1371975,7551062398649767985,0,0,24,50351,1,0
1371976,9305341941720086711,0,0,21,80169,1,1
1371977,10160427316885688932,0,1,21,106737,1,0
1371978,2551401172826382186,0,1,18,111894,1,0


## Features Engineering ARTICLES description

I want to classify every item as "winter" or "summer" usind its description and with a sistem based on semantic similarity. I'm gonna use bert

In [None]:
# drop null and missing values from the column
df_art.dropna(subset=['detail_desc'], inplace=True)

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m101.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.6 MB/s[0m eta [36m0:00:00[0m
Inst

In [None]:
# make a list of all the descriptions
sentences = df_art["detail_desc"].unique().tolist()
len(sentences)

43404

In [None]:
#some common imports
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch

### Attempt 1

I'm going to get the embedding of every sentence (I have to do that in batches) and compare that to the embedding of winter and summer

In [None]:
# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
sent = sentences

In [None]:
# create batches (200 sentences in every batch ca.)
num_batches = 200

batch_size = len(sent) // num_batches

# split sentences into batch
batches = [sent[i:i + batch_size] for i in range(0, len(sent), batch_size)]


In [None]:
type(batches[0])

list

In [None]:
# I create the sentences for the seasons
seasons = ["Autumn Winter", "Spring Summer"]

# Tokenize and encode sentences
inputs_seas = tokenizer(seasons, padding=True, truncation=True, return_tensors="pt")
outputs_seas = model(**inputs_seas)

# Get the sentence embeddings from the output
season_embeddings = outputs_seas.last_hidden_state.mean(dim=1)

In [None]:

# Tokenize and encode your sentences
inputs_sent = tokenizer(batches[0], padding=True, truncation=True, return_tensors="pt")
outputs_sent = model(**inputs_sent)

# Get the sentence embeddings from the output
sentence_embeddings = outputs_sent.last_hidden_state.mean(dim=1)

similarity_matrix = cosine_similarity(sentence_embeddings.detach().numpy(), season_embeddings.detach().numpy())

print(f"Similarità del coseno tra le frasi: {batches[0][0]} and {seasons[0]} is {similarity_matrix[0][0]}")
print(f"Similarità del coseno tra le frasi: {batches[0][0]} and {seasons[1]} is {similarity_matrix[0][1]}")


Similarità del coseno tra le frasi: Jersey top with narrow shoulder straps. and Winter Autumn warm clothes and cold weather is 0.44531017541885376
Similarità del coseno tra le frasi: Jersey top with narrow shoulder straps. and Summer Spring lightweight clothes and sunny weather is 0.5363214015960693


In [None]:
sent_to_emb = {}

In [None]:
sent_to_season = {}

In [None]:
for frase in sent[400:450]:
  # Tokenize and encode your sentences
  inputs_sent = tokenizer(frase, padding=True, truncation=True, return_tensors="pt")
  outputs_sent = model(**inputs_sent)

  # Get the sentence embeddings from the output
  sentence_embeddings = outputs_sent.last_hidden_state.mean(dim=1)

  #compute similarity matrix
  similarity_matrix = cosine_similarity(sentence_embeddings.detach().numpy(), season_embeddings.detach().numpy())

  if similarity_matrix[0][0] > similarity_matrix[0][1]:
    conto = 0
  else:
    conto = 1

  print(f"{similarity_matrix[0][0]} e {similarity_matrix[0][1]}")
  print(f"{frase} annd {conto}")
  print()

0.41624191403388977 e 0.4358997046947479
Top in patterned jersey with a boat neck and 3/4-length sleeves. annd 1

0.41179972887039185 e 0.42145317792892456
Top in soft jersey with short raglan sleeves with sewn-in turn-ups. annd 1

0.39721760153770447 e 0.393527090549469
Aviator-style,tinted sunglasses with metal and plastic frames and UV-protective, tinted lenses. annd 0

0.3768013119697571 e 0.39913439750671387
Tights that provide support for the tummy, thighs and calves as well as encouraging circulation in the legs. Elasticated waist and extra space for a growing tummy. 30 denier. annd 1

0.42579537630081177 e 0.45531636476516724
Long-sleeved T-shirt in waffled cotton jersey with ribbed cuffs. annd 1

0.34486767649650574 e 0.38484352827072144
Sports vest top in fast-drying functional fabric with a racer back that has a rubber print and ventilating hole pattern. annd 1

0.3928540349006653 e 0.4149923026561737
Trousers in sweatshirt fabric made from organic cotton with an elasticated

### Attempt 2

i'm gonna try to use clusters to see how sentences are divided

In [None]:
from sklearn.cluster import KMeans

# Example clothing descriptions (replace with your own data)
clothing_descriptions = sentences[:200]

# Load a pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Tokenize and obtain BERT embeddings for each description
def get_bert_embeddings(texts):
    tokenized_texts = [tokenizer(text, padding=True, truncation=True, return_tensors="pt") for text in texts]
    embeddings = [model(**text).last_hidden_state.mean(dim=1).squeeze().detach().numpy() for text in tokenized_texts]
    return np.array(embeddings)

description_embeddings = get_bert_embeddings(clothing_descriptions)

# Perform K-Means clustering
n_clusters = 2  # Set the number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(description_embeddings)

# Print the clusters and their corresponding descriptions
for cluster_id in range(n_clusters):
    cluster_indices = np.where(cluster_labels == cluster_id)[0]
    print(f"Cluster {cluster_id + 1}:")
    for idx in cluster_indices:
        print(f"- {clothing_descriptions[idx]}")




Cluster 1:
- Semi shiny nylon stockings with a wide, reinforced trim at the top. Use with a suspender belt. 20 denier.
- Tights with built-in support to lift the bottom. Black in 30 denier and light amber in 15 denier.
- Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist.
- Opaque matt tights. 200 denier.
- Sweatshirt in soft organic cotton with a  press-stud on one shoulder (sizes 12-18 months and 18-24 months without a press-stud). Brushed inside.
- 50 denier tights with reinforcement at the top for a shaping effect on the tummy and thighs.
- Plastic hair claws. Width 3.5 cm.
- Umbrella with a telescopic handle and matching cover. Length 23 cm folded.
- Matt tights with an elasticated waist and extra space for a growing tummy. 100 denier.
- Long-sleeved sports top in fast-drying, breathable functional fabric with overlocked seams for optimum comfort. Some of the polyester content of the top is recycled.
- 

### Attempt 3: OK

In [None]:
# some libraries
from transformers import AutoTokenizer, AutoModel

In [None]:
# create model
model_name = 'bert-base-cased'

# load model
model = AutoModel.from_pretrained(model_name, output_hidden_states=True, output_attentions=True)

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
#create a dictionary for seasons embeddings
seasons_emb = {}
#these are the sentences for the season collections
seasons = ["Spring Summer collection", "Autumn Winter collection"]

In [None]:
#for both the seasons
for frase_input in seasons:

  #tokenize and encode
  tokens = tokenizer.tokenize(frase_input, add_special_tokens=True)
  input_ids = tokenizer.encode(frase_input, add_special_tokens=True)

  input_ids = torch.tensor([input_ids])

  #apply the model
  with torch.no_grad():
    outputs = model(input_ids)

  all_hidden_states = outputs["hidden_states"]
  attention_heads = outputs["attentions"]

  n_strati = len(all_hidden_states)

  #select layer
  layer = 12
  # get hidden states at that layer (last one)
  hidden_state = all_hidden_states[layer]
  dim_hidden_state = hidden_state.shape
  #print("Dimensione del tensore:", dim_hidden_state)

  # get embeddings from the tokens and link them to representations
  embeddings_frase = hidden_state[0]
  word_embeddings = {k:v for k, v in zip(tokens, embeddings_frase)}

  #embedding of firs token CLS is considered to be the embedding of the sentence
  frase = word_embeddings["[CLS]"]

  # add embedding to the dictionary
  seasons_emb[frase_input] = word_embeddings["[CLS]"]

In [None]:
len(seasons_emb)

2

In [None]:
#get embeddings of seasons
estate = seasons_emb['Spring Summer collection']
inverno = seasons_emb['Autumn Winter collection']

#define functions to get cosine similarity between a season ad a sentence

def get_cosine_similarity_summer(estate, frase):
    return cosine_similarity(estate.reshape(1, -1), frase.reshape(1, -1))[0][0]

def get_cosine_similarity_winter(inverno, frase):
    return cosine_similarity(inverno.reshape(1, -1), frase.reshape(1, -1))[0][0]

In [None]:
#create a new dictionary for binary values
sentence_to_season = {}

In [None]:
#for every unique description
for frase_input in sentences:

  #tokenize and encode
  tokens = tokenizer.tokenize(frase_input, add_special_tokens=True)
  input_ids = tokenizer.encode(frase_input, add_special_tokens=True)

  input_ids = torch.tensor([input_ids])

  #apply the model
  with torch.no_grad():
    outputs = model(input_ids)

  all_hidden_states = outputs["hidden_states"]
  attention_heads = outputs["attentions"]
  n_strati = len(all_hidden_states)

  # select the layer
  layer = 12
  #get the hidden state
  hidden_state = all_hidden_states[layer]
  dim_hidden_state = hidden_state.shape

  #get embedding at that hidden state
  embeddings_frase = hidden_state[0]
  # link to representations
  word_embeddings = {k:v for k, v in zip(tokens, embeddings_frase)}

  #get embedding of the sentence
  frase = word_embeddings["[CLS]"]

  #update the dictionary with 0 for summer and 1 for winter
  if get_cosine_similarity_summer(estate, frase) > get_cosine_similarity_winter(inverno, frase):
    sentence_to_season[frase_input] = 0
  else:
    sentence_to_season[frase_input] = 1


In [None]:
len(sentence_to_season)

43404

In [None]:
#create a function to map dictionary values to a new feature
def assegna_categoria(frase):
  return sentence_to_season[frase]


In [None]:
#new column is Season
df_art['Season'] = df_art['detail_desc'].apply(assegna_categoria)

In [None]:
df_art

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,Season
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,1
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,1
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,1
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",1
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,5pk regular Placement1,302,Socks,Socks & Tights,1010014,Placement print,9,Black,...,F,Menswear,3,Menswear,26,Men Underwear,1021,Socks and Tights,Socks in a fine-knit cotton blend with a small...,1
105538,953763001,953763,SPORT Malaga tank,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,A,Ladieswear,1,Ladieswear,2,H&M+,1005,Jersey Fancy,Loose-fitting sports vest top in ribbed fast-d...,1
105539,956217002,956217,Cartwheel dress,265,Dress,Garment Full body,1010016,Solid,9,Black,...,A,Ladieswear,1,Ladieswear,18,Womens Trend,1005,Jersey Fancy,"Short, A-line dress in jersey with a round nec...",1
105540,957375001,957375,CLAIRE HAIR CLAW,72,Hair clip,Accessories,1010016,Solid,9,Black,...,D,Divided,2,Divided,52,Divided Accessories,1019,Accessories,Large plastic hair claw.,1


### Export new articles

In [None]:
#export to parquet
folder = "drive/MyDrive/UNI MAGISTRALE/AI Project/data/"
df_art.to_parquet(folder + "final_art.parquet")
