In [1]:
# Version de Python utilisée : 3.9.xx
# Import des librairies
import pandas as pd

from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)

## Access to the raw dataset iterators

In [2]:
from src.data import data

# Import des fichiers contenant les données
df = data.load_data("data").fillna("")

# data !!!!!
features = df['designation']+" "+df['description']
target = df['prdtypecode']

# split to train, valid and test
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)

## Pre-processing & vectorisation pipelines

In [3]:
from src.data.vectorization_pipeline import BagOfWordsDefault, TfidfV1
from src.data.text_preproc_pipeline import TextPreprocess

bow_preproc = TextPreprocess(TfidfV1())
bow_preproc.fit(X_train)

X_train_preproc = bow_preproc.transform(X_train)
X_test_preproc = bow_preproc.transform(X_test)

bow_preproc.save(X_train_preproc, "data/X_train_preproc")
bow_preproc.save(X_test_preproc, "data/X_test_preproc")

bow_preproc.save(y_train, "data/y_train")
bow_preproc.save(y_test, "data/y_test")

print(bow_preproc.save_voc("data/voc"))





{'collection': 1421, 'colle': 1419, 'figure': 2463, 'home': 2850, 'table': 5217, 'console': 1556, 'extensible': 2351, 'rectangulaire': 4533, 'jusqu': 3137, '140': 83, 'cm': 1398, 'salle': 4815, 'manger': 3386, 'sejour': 4894, 'couleur': 1639, 'chene': 1311, 'clair': 1365, 'brosse': 1103, 'personnes': 4038, 'dimensions': 1919, 'fermee': 2434, '50': 335, 'longueur': 3309, '90': 447, 'largeur': 3188, '78': 422, 'hauteur': 2809, 'finition': 2498, 'haute': 2806, 'qualite': 4440, 'resistance': 4652, 'acier': 511, 'grande': 2739, 'capacite': 1169, 'charge': 1280, 'meuble': 3503, 'polyvalent': 4203, 'multifonctionnel': 3646, 'design': 1841, 'simple': 4969, 'elegant': 2115, 'permettra': 4027, 'adapter': 534, 'importe': 2931, 'quelle': 4450, 'piece': 4068, 'maison': 3370, 'canape': 1158, 'places': 4103, 'convertible': 1603, '100': 20, 'coton': 1630, 'coloris': 1437, 'chocolat': 1334, 'fois': 2539, 'pratique': 4270, 'star': 5113, 'salon': 4817, 'trouvera': 5455, 'place': 4100, 'univers': 5499, 'c