In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imread
import seaborn as sns
from mistune.markdown import preprocess
%matplotlib inline

In [2]:
x_train = pd.read_csv("Raw_data/x_train_update.csv", index_col=0)
x_train.head(20)

Unnamed: 0,designation,description,productid,imageid
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786
5,Afrique Contemporaine N° 212 Hiver 2004 - Doss...,,5862738,393356830
6,Christof E: Bildungsprozessen Auf Der Spur,,91920807,907794536
7,Conquérant Sept Cahier Couverture Polypro 240 ...,CONQUERANT CLASSIQUE Cahier 240 x 320 mm seyès...,344240059,999581347
8,Puzzle Scooby-Doo Avec Poster 2x35 Pieces,,4239126071,1325918866
9,Tente Pliante V3s5-Pro Pvc Blanc - 3 X 4m50 - ...,Tente pliante V3S5 Pro PVC 500 gr/m² - 3 x 4m5...,3793572222,1245644185


In [3]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84916 entries, 0 to 84915
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   designation  84916 non-null  object
 1   description  55116 non-null  object
 2   productid    84916 non-null  int64 
 3   imageid      84916 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 3.2+ MB


In [3]:
y_train = pd.read_csv("Raw_data/Y_train_CVw08PX.csv", index_col=0)
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84916 entries, 0 to 84915
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   prdtypecode  84916 non-null  int64
dtypes: int64(1)
memory usage: 1.3 MB


In [7]:
# identification du format des images :

train_pwd = "Raw_data/images/image_train"
img_lst = os.listdir(train_pwd)

img_size = []

for i in img_lst:
    img = imread(train_pwd + "/" + i)
    img_size.append(img.shape)
# plt.imshow(img)

img_size = pd.DataFrame(np.stack(img_size, axis=0))
img_size.value_counts(normalize=True)

# Toutes les images ont la même taille : 500*500 RGB

0    1    2
500  500  3    1.0
Name: proportion, dtype: float64

Prétraitement des données textuelles en vue d'une data-viz

In [4]:
# Regroupement des designations en fonction des classes :

df = pd.concat([x_train.designation, y_train], axis=1)
df = df.drop_duplicates().dropna()
df['designation'] = df['designation'].str.lower().str.strip()
grouped_dict = df.groupby("prdtypecode")['designation'].apply(list).to_dict()

print(grouped_dict.keys())
print(type(grouped_dict[10]))

dict_keys([10, 40, 50, 60, 1140, 1160, 1180, 1280, 1281, 1300, 1301, 1302, 1320, 1560, 1920, 1940, 2060, 2220, 2280, 2403, 2462, 2522, 2582, 2583, 2585, 2705, 2905])
<class 'list'>


In [29]:
# transformation en token et suppression des bad words
tokenized_dict = dict()

import spacy
nlp = spacy.load('xx_ent_wiki_sm') # Traitement multilingual

for key in grouped_dict.keys():
    doc = nlp(str(grouped_dict[key]))
    # Filtrage des tokens: bad words, ponctuation, espaces, caractères vides (emoji etc...
    tokenized_dict[key] = [token.text for token in doc if not (token.is_stop or token.is_punct or token.is_space or len(token) < 1)]

print(tokenized_dict[10][:20])

['olivia', 'personalisiertes', 'notizbuch', '150', 'seiten', 'punktraster', 'ca', 'din', 'a5', 'rosen', 'design', 'christof', 'e', 'bildungsprozessen', 'auf', 'der', 'spur', 'vassivière', '-en', 'limousin']


In [34]:
pd.Series(tokenized_dict[10]).value_counts()

de             1318
la              734
et              563
the             514
le              482
               ... 
régent            1
mercurius         1
heilsame          1
din               1
punktraster       1
Name: count, Length: 10542, dtype: int64

In [30]:
# Comptabilisation des tokens pour chaque catégorie
from collections import Counter

freq_by_category = {cat: Counter(tokens) for cat, tokens in tokenized_dict.items()}
all_tokens = set(token for tokens in tokenized_dict.values() for token in tokens)
common_tokens = set(token for token in all_tokens if sum(token in tokens for tokens in tokenized_dict.values()) > 1)

# Exclusion des tokens communs
filtered_freq_by_category = {
    cat: {token: freq for token, freq in freqs.items() if token not in common_tokens}
    for cat, freqs in freq_by_category.items()}


AttributeError: 'dict' object has no attribute 'head'

In [35]:
import torch
preprocess_nlp_data = {"tokenized_dict":tokenized_dict, "freq_by_category":freq_by_category, "filtered_freq_by_category":filtered_freq_by_category}
torch.save(preprocess_nlp_data, "Exported_data/preprocess_nlp_data.pth")

Création d'un nuage de mots en fonction des catégories

In [None]:
import torch
preprocess_nlp_data = torch.load("Exported_data/preprocess_nlp_data.pth", weights_only=True)
filtered_freq_by_category = preprocess_nlp_data["filtered_freq_by_category"]

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Couleurs par catégorie
colors = {"Category A": "blue", "Category B": "green", "Category C": "red"}

# Fonction pour appliquer les couleurs
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    for category, tokens in filtered_freq_by_category.items():
        if word in tokens:
            return colors[category]
    return "black"  # Par défaut

# Fusionner toutes les fréquences pour le nuage
merged_freq = {}
for freqs in filtered_freq_by_category.values():
    merged_freq.update(freqs)

# Créer le nuage de mots
wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(merged_freq)

# Appliquer les couleurs par catégorie
wordcloud.recolor(color_func=color_func)

# Afficher le nuage de mots
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
