In [1]:
# Python version used: 3.9.xx
# Library imports
import pandas as pd

from sklearn.model_selection import train_test_split

# Never truncate long text values when a DataFrame/Series is displayed.
pd.options.display.max_colwidth = None

## Load the raw dataset and split it into train and test sets

In [2]:
from src.data import data

# Load the files containing the data; missing values are replaced with empty
# strings (presumably so the two text columns below can be concatenated
# without producing NaN -- confirm against data.load_data).
df = data.load_data("data").fillna("")

# Model input: the 'designation' and 'description' columns joined by one space.
features = df['designation']+" "+df['description']
# Prediction target: the 'prdtypecode' column.
target = df['prdtypecode']

# Split into train and test sets only (no separate validation set is created
# here). test_size is left at train_test_split's default; random_state is
# fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)

## Pre-processing & vectorisation pipelines

In [3]:
from src.data.vectorization_pipeline import (
    BagOfWordsDefault,
    TfidfV1,
)
from src.data.text_preproc_pipeline import TextPreprocess

# NOTE(review): the variable is named `bow_preproc` but wraps TfidfV1 here.
bow_preproc = TextPreprocess(TfidfV1())

# Fit on the training split only, then apply the fitted pipeline to the
# training split and the test split, in that order.
bow_preproc.fit(X_train)
X_train_preproc, X_test_preproc = [
    bow_preproc.transform(split) for split in (X_train, X_test)
]

# Persistence of the matrices, labels and vocabulary is currently disabled:
# bow_preproc.save(X_train_preproc, "data/X_train_preproc")
# bow_preproc.save(X_test_preproc, "data/X_test_preproc")
# bow_preproc.save(y_train, "data/y_train")
# bow_preproc.save(y_test, "data/y_test")
# print(bow_preproc.save_voc("data/voc"))

# NOTE(review): other cells in this notebook read the vocabulary through
# `bow_preproc.pipeline.get_voc()`; confirm TextPreprocess also exposes
# `get_voc()` directly.
voc = bow_preproc.get_voc()






In [8]:
from src.data.vectorization_pipeline import BOW_Stemming
from src.data.text_preproc_pipeline import TextPreprocess

# Rebinds `bow_preproc` (and the *_preproc matrices) to the BOW_Stemming variant.
bow_preproc = TextPreprocess(BOW_Stemming())

# Fit on the training split only, then apply the fitted pipeline to the
# training split and the test split, in that order.
bow_preproc.fit(X_train)
X_train_preproc, X_test_preproc = [
    bow_preproc.transform(split) for split in (X_train, X_test)
]

# Persistence of the matrices, labels and vocabulary is currently disabled:
# bow_preproc.save(X_train_preproc, "data/X_train_preproc")
# bow_preproc.save(X_test_preproc, "data/X_test_preproc")
# bow_preproc.save(y_train, "data/y_train")
# bow_preproc.save(y_test, "data/y_test")
# print(bow_preproc.save_voc("data/voc"))

# Print the vocabulary size, then display the vocabulary itself as the cell
# output. NOTE(review): this displays the whole vocabulary; consider showing
# only a sample.
print(len(bow_preproc.pipeline.get_voc()))
bow_preproc.pipeline.get_voc()


118111


{'kant': 68809,
 'collect': 41301,
 'kan': 68773,
 'coll': 41268,
 'kashim': 69085,
 'lpm': 73781,
 'figur': 55217,
 'kai': 68696,
 'seg': 98781,
 'hom': 63465,
 'innov': 66262,
 'tabl': 105399,
 'consol': 42345,
 'extensibl': 53814,
 'rectangulair': 93509,
 'avec': 29791,
 'rallong': 92696,
 'jusqu': 68485,
 '140': 4126,
 'cm': 40648,
 'pour': 89417,
 'sall': 97335,
 'mang': 75219,
 'et': 53122,
 'séjour': 105162,
 'couleur': 43303,
 'chên': 39763,
 'clair': 40119,
 'bross': 34850,
 'person': 86607,
 'dimens': 47359,
 'de': 45581,
 'la': 70859,
 'ferm': 54926,
 '50': 16385,
 'longueur': 73538,
 '90': 22689,
 'largeur': 71386,
 '78': 21069,
 'hauteur': 62123,
 'finit': 55458,
 'en': 51846,
 'mélamin': 80231,
 'haut': 62115,
 'qualit': 91870,
 'agglomer': 25537,
 'résist': 96721,
 'rail': 92629,
 'aci': 24819,
 'une': 110389,
 'grand': 60098,
 'capac': 36960,
 'charg': 38845,
 'ce': 38110,
 'meubl': 77093,
 'polyvalent': 88907,
 'multifonctionnel': 79746,
 'au': 29381,
 'design': 46584,

In [6]:
from src.data.vectorization_pipeline import BagOfWordsDefault
from src.data.text_preproc_pipeline import TextPreprocess

# Rebinds `bow_preproc` (and the *_preproc matrices) to the BagOfWordsDefault variant.
bow_preproc = TextPreprocess(BagOfWordsDefault())

# Fit on the training split only, then apply the fitted pipeline to the
# training split and the test split, in that order.
bow_preproc.fit(X_train)
X_train_preproc, X_test_preproc = [
    bow_preproc.transform(split) for split in (X_train, X_test)
]

# Persistence of the matrices, labels and vocabulary is currently disabled:
# bow_preproc.save(X_train_preproc, "data/X_train_preproc")
# bow_preproc.save(X_test_preproc, "data/X_test_preproc")
# bow_preproc.save(y_train, "data/y_train")
# bow_preproc.save(y_test, "data/y_test")
# print(bow_preproc.save_voc("data/voc"))

# Display the fitted vocabulary as the cell output.
# NOTE(review): this displays the whole vocabulary; consider showing only a sample.
bow_preproc.pipeline.get_voc()

{'kantai': 85507,
 'collection': 47311,
 'kan': 85466,
 'colle': 47282,
 'kashima': 85795,
 'lpm': 91354,
 'figure': 67613,
 'kai': 85379,
 'sega': 125686,
 'home': 78173,
 'innovation': 82052,
 'table': 134270,
 'console': 49176,
 'extensible': 65696,
 'rectangulaire': 117912,
 'avec': 32765,
 'rallonges': 116696,
 'jusqu': 85144,
 '140': 4161,
 'cm': 46522,
 'pour': 111989,
 'salle': 123841,
 'manger': 93194,
 'et': 64634,
 'séjour': 133958,
 'couleur': 50678,
 'chêne': 45344,
 'clair': 45826,
 'brossé': 39079,
 'personnes': 108262,
 'dimensions': 56013,
 'de': 53691,
 'la': 87691,
 'fermée': 67223,
 '50': 16584,
 'longueur': 91049,
 '90': 22951,
 'largeur': 88376,
 '78': 21315,
 'hauteur': 76637,
 'finition': 67966,
 'en': 62615,
 'mélaminé': 99764,
 'haute': 76629,
 'qualité': 115750,
 'aggloméré': 26588,
 'résistance': 122910,
 'rails': 116594,
 'extensibles': 65697,
 'acier': 25489,
 'une': 140825,
 'grande': 74002,
 'capacité': 41665,
 'charge': 44077,
 'ce': 43116,
 'meuble': 9

In [7]:
# Number of entries in the vocabulary of the pipeline currently bound to
# `bow_preproc` (the value depends on which pipeline cell was run last).
len(bow_preproc.pipeline.get_voc())

150628

In [3]:


from src.data.vectorization_pipeline import TfidfStemmingV1
from src.data.text_preproc_pipeline import TextPreprocess

# NOTE(review): `tdfidf` looks like a typo for `tfidf`; the name is kept
# because another cell in this notebook refers to `tdfidf_stem_preproc`.
tdfidf_stem_preproc = TextPreprocess(TfidfStemmingV1())

# Fit on the training split only, then apply the fitted pipeline to the
# training split and the test split, in that order.
tdfidf_stem_preproc.fit(X_train)
X_train_preproc, X_test_preproc = [
    tdfidf_stem_preproc.transform(split) for split in (X_train, X_test)
]

# Persistence of the matrices, labels and vocabulary is currently disabled
# (the calls below now target this cell's pipeline object rather than `bow_preproc`):
# tdfidf_stem_preproc.save(X_train_preproc, "data/X_train_preproc")
# tdfidf_stem_preproc.save(X_test_preproc, "data/X_test_preproc")
# tdfidf_stem_preproc.save(y_train, "data/y_train")
# tdfidf_stem_preproc.save(y_test, "data/y_test")
# print(tdfidf_stem_preproc.save_voc("data/voc"))

# Display the fitted vocabulary as the cell output.
# NOTE(review): this displays the whole vocabulary; consider showing only a sample.
tdfidf_stem_preproc.pipeline.get_voc()



{'collect': 802,
 'coll': 800,
 'figur': 1663,
 'hom': 1983,
 'innov': 2104,
 'tabl': 3923,
 'consol': 901,
 'extensibl': 1581,
 'rectangulair': 3343,
 'rallong': 3293,
 'jusqu': 2218,
 'cm': 776,
 'sall': 3571,
 'mang': 2427,
 'sejour': 3645,
 'couleur': 974,
 'chen': 695,
 'clair': 748,
 'bross': 514,
 'person': 2950,
 'dimens': 1207,
 'ferme': 1639,
 'longueur': 2358,
 'largeur': 2258,
 'hauteur': 1945,
 'finit': 1681,
 'melamin': 2487,
 'haut': 1944,
 'qualit': 3256,
 'resist': 3443,
 'rail': 3290,
 'aci': 40,
 'grand': 1883,
 'capacit': 583,
 'charg': 675,
 'meubl': 2514,
 'polyvalent': 3082,
 'multifonctionnel': 2631,
 'design': 1146,
 'simpl': 3704,
 'eleg': 1369,
 'permettr': 2948,
 'adapt': 49,
 'import': 2046,
 'quel': 3266,
 'piec': 2983,
 'maison': 2413,
 'canap': 574,
 'plac': 3006,
 'convertibl': 940,
 'coton': 966,
 'color': 810,
 'chocolat': 718,
 'lois': 2351,
 'fois': 1717,
 'pratiqu': 3129,
 'star': 3830,
 'salon': 3572,
 'trouv': 4111,
 'univer': 4149,
 'contemporai

In [5]:
# Number of entries in the vocabulary of the pipeline bound to `tdfidf_stem_preproc`.
len(tdfidf_stem_preproc.pipeline.get_voc())

4373