In [1]:
# Python version used: 3.9.xx
# Library imports
import pandas as pd

from sklearn.model_selection import train_test_split

# Do not truncate long cell contents when pandas displays a column
# (None removes the column-width limit)
pd.set_option('display.max_colwidth', None)

## Load the raw dataset and split it into train and test sets

In [2]:
from src.data import data

# Load the files containing the data and replace missing values with empty strings.
# NOTE(review): load_data is a project helper; it presumably returns a pandas
# DataFrame, since the result is indexed by column name below -- confirm.
df = data.load_data("data").fillna("")

# Model input: the 'designation' and 'description' columns joined by a single space
features = df['designation']+" "+df['description']
# Prediction target: the 'prdtypecode' column
target = df['prdtypecode']

# Split into train and test sets only (no validation set is produced here).
# No test_size is given, so scikit-learn's default split proportion applies;
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=123)

## Pre-processing & vectorisation pipelines

In [3]:
from src.data.vectorization_pipeline import BagOfWordsDefault, TfidfV1
from src.data.text_preproc_pipeline import TextPreprocess

# Build the pre-processing object around a TfidfV1 instance and fit it on the
# training texts only, so nothing from the test split is seen during fitting.
bow_preproc = TextPreprocess(TfidfV1())
bow_preproc.fit(X_train)

# Apply the fitted pre-processing to each split.
X_train_preproc = bow_preproc.transform(X_train)
X_test_preproc = bow_preproc.transform(X_test)

# Persist the transformed features first, then the labels (same order and
# destinations as the individual save calls this loop replaces).
for _artifact, _destination in (
    (X_train_preproc, "data/X_train_preproc"),
    (X_test_preproc, "data/X_test_preproc"),
    (y_train, "data/y_train"),
    (y_test, "data/y_test"),
):
    bow_preproc.save(_artifact, _destination)

# Save the vocabulary and print the value returned by save_voc.
print(bow_preproc.save_voc("data/voc"))





{'collection': 1421, 'colle': 1419, 'figure': 2463, 'home': 2850, 'table': 5217, 'console': 1556, 'extensible': 2351, 'rectangulaire': 4533, 'jusqu': 3137, '140': 83, 'cm': 1398, 'salle': 4815, 'manger': 3386, 'sejour': 4894, 'couleur': 1639, 'chene': 1311, 'clair': 1365, 'brosse': 1103, 'personnes': 4038, 'dimensions': 1919, 'fermee': 2434, '50': 335, 'longueur': 3309, '90': 447, 'largeur': 3188, '78': 422, 'hauteur': 2809, 'finition': 2498, 'haute': 2806, 'qualite': 4440, 'resistance': 4652, 'acier': 511, 'grande': 2739, 'capacite': 1169, 'charge': 1280, 'meuble': 3503, 'polyvalent': 4203, 'multifonctionnel': 3646, 'design': 1841, 'simple': 4969, 'elegant': 2115, 'permettra': 4027, 'adapter': 534, 'importe': 2931, 'quelle': 4450, 'piece': 4068, 'maison': 3370, 'canape': 1158, 'places': 4103, 'convertible': 1603, '100': 20, 'coton': 1630, 'coloris': 1437, 'chocolat': 1334, 'fois': 2539, 'pratique': 4270, 'star': 5113, 'salon': 4817, 'trouvera': 5455, 'place': 4100, 'univers': 5499, 'c

In [8]:
from src.data.vectorization_pipeline import BOW_Stemming
from src.data.text_preproc_pipeline import TextPreprocess

# Vocabulary experiment with BOW_Stemming.
# NOTE: this rebinds the notebook-level names bow_preproc, X_train_preproc and
# X_test_preproc, so any later cell sees the objects produced here.
bow_preproc = TextPreprocess(BOW_Stemming())
# Fit on the training texts only
bow_preproc.fit(X_train)

# Apply the fitted pre-processing to both splits
X_train_preproc = bow_preproc.transform(X_train)
X_test_preproc = bow_preproc.transform(X_test)

# Persistence is switched off in this cell; the calls are kept commented out for reference.
# bow_preproc.save(X_train_preproc, "data/X_train_preproc")
# bow_preproc.save(X_test_preproc, "data/X_test_preproc")

# bow_preproc.save(y_train, "data/y_train")
# bow_preproc.save(y_test, "data/y_test")

#print(bow_preproc.save_voc("data/voc"))
# Print the number of vocabulary entries, then display the whole vocabulary
# (a token -> integer dict in the recorded output) as the cell result.
# The recorded run shows 118111 entries, so this display is very large.
print(len(bow_preproc.pipeline.get_voc()))
bow_preproc.pipeline.get_voc()


118111


{'kant': 68809,
 'collect': 41301,
 'kan': 68773,
 'coll': 41268,
 'kashim': 69085,
 'lpm': 73781,
 'figur': 55217,
 'kai': 68696,
 'seg': 98781,
 'hom': 63465,
 'innov': 66262,
 'tabl': 105399,
 'consol': 42345,
 'extensibl': 53814,
 'rectangulair': 93509,
 'avec': 29791,
 'rallong': 92696,
 'jusqu': 68485,
 '140': 4126,
 'cm': 40648,
 'pour': 89417,
 'sall': 97335,
 'mang': 75219,
 'et': 53122,
 'séjour': 105162,
 'couleur': 43303,
 'chên': 39763,
 'clair': 40119,
 'bross': 34850,
 'person': 86607,
 'dimens': 47359,
 'de': 45581,
 'la': 70859,
 'ferm': 54926,
 '50': 16385,
 'longueur': 73538,
 '90': 22689,
 'largeur': 71386,
 '78': 21069,
 'hauteur': 62123,
 'finit': 55458,
 'en': 51846,
 'mélamin': 80231,
 'haut': 62115,
 'qualit': 91870,
 'agglomer': 25537,
 'résist': 96721,
 'rail': 92629,
 'aci': 24819,
 'une': 110389,
 'grand': 60098,
 'capac': 36960,
 'charg': 38845,
 'ce': 38110,
 'meubl': 77093,
 'polyvalent': 88907,
 'multifonctionnel': 79746,
 'au': 29381,
 'design': 46584,

In [6]:
from src.data.vectorization_pipeline import BagOfWordsDefault
from src.data.text_preproc_pipeline import TextPreprocess

# Vocabulary experiment with BagOfWordsDefault.
# NOTE: this rebinds the notebook-level names bow_preproc, X_train_preproc and
# X_test_preproc, so any later cell sees the objects produced here.
bow_preproc = TextPreprocess(BagOfWordsDefault())
# Fit on the training texts only
bow_preproc.fit(X_train)

# Apply the fitted pre-processing to both splits
X_train_preproc = bow_preproc.transform(X_train)
X_test_preproc = bow_preproc.transform(X_test)

# Persistence is switched off in this cell; the calls are kept commented out for reference.
# bow_preproc.save(X_train_preproc, "data/X_train_preproc")
# bow_preproc.save(X_test_preproc, "data/X_test_preproc")

# bow_preproc.save(y_train, "data/y_train")
# bow_preproc.save(y_test, "data/y_test")

#print(bow_preproc.save_voc("data/voc"))
# Display the whole vocabulary (a token -> integer dict in the recorded output)
# as the cell result; the output is large.
bow_preproc.pipeline.get_voc()

{'kantai': 85507,
 'collection': 47311,
 'kan': 85466,
 'colle': 47282,
 'kashima': 85795,
 'lpm': 91354,
 'figure': 67613,
 'kai': 85379,
 'sega': 125686,
 'home': 78173,
 'innovation': 82052,
 'table': 134270,
 'console': 49176,
 'extensible': 65696,
 'rectangulaire': 117912,
 'avec': 32765,
 'rallonges': 116696,
 'jusqu': 85144,
 '140': 4161,
 'cm': 46522,
 'pour': 111989,
 'salle': 123841,
 'manger': 93194,
 'et': 64634,
 'séjour': 133958,
 'couleur': 50678,
 'chêne': 45344,
 'clair': 45826,
 'brossé': 39079,
 'personnes': 108262,
 'dimensions': 56013,
 'de': 53691,
 'la': 87691,
 'fermée': 67223,
 '50': 16584,
 'longueur': 91049,
 '90': 22951,
 'largeur': 88376,
 '78': 21315,
 'hauteur': 76637,
 'finition': 67966,
 'en': 62615,
 'mélaminé': 99764,
 'haute': 76629,
 'qualité': 115750,
 'aggloméré': 26588,
 'résistance': 122910,
 'rails': 116594,
 'extensibles': 65697,
 'acier': 25489,
 'une': 140825,
 'grande': 74002,
 'capacité': 41665,
 'charge': 44077,
 'ce': 43116,
 'meuble': 9

In [7]:
# Number of entries in the vocabulary of whichever pipeline bow_preproc currently
# refers to. The name is rebound by every vectorisation cell, so the value
# depends on which of those cells was executed last.
len(bow_preproc.pipeline.get_voc())

150628

In [3]:


from src.data.vectorization_pipeline import TfidfStemmingV1
from src.data.text_preproc_pipeline import TextPreprocess

# Vocabulary experiment with TfidfStemmingV1.
# NOTE: this rebinds the notebook-level names bow_preproc, X_train_preproc and
# X_test_preproc, so any later cell sees the objects produced here.
bow_preproc = TextPreprocess(TfidfStemmingV1())
# Fit on the training texts only
bow_preproc.fit(X_train)

# Apply the fitted pre-processing to both splits
X_train_preproc = bow_preproc.transform(X_train)
X_test_preproc = bow_preproc.transform(X_test)

# Persistence is switched off in this cell; the calls are kept commented out for reference.
# bow_preproc.save(X_train_preproc, "data/X_train_preproc")
# bow_preproc.save(X_test_preproc, "data/X_test_preproc")

# bow_preproc.save(y_train, "data/y_train")
# bow_preproc.save(y_test, "data/y_test")

#print(bow_preproc.save_voc("data/voc"))
# Display the whole vocabulary (a token -> integer dict in the recorded output)
# as the cell result.
bow_preproc.pipeline.get_voc()



{'collect': 1247,
 'coll': 1245,
 'figur': 2106,
 'hom': 2419,
 'innov': 2542,
 'tabl': 4352,
 'consol': 1346,
 'extensibl': 2025,
 'rectangulair': 3775,
 'rallong': 3725,
 'jusqu': 2656,
 '140': 84,
 'cm': 1225,
 'sall': 4004,
 'mang': 2860,
 'sejour': 4076,
 'couleur': 1419,
 'chen': 1146,
 'clair': 1197,
 'bross': 970,
 'person': 3377,
 'dimens': 1652,
 'ferme': 2082,
 '50': 331,
 'longueur': 2794,
 '90': 444,
 'largeur': 2695,
 '78': 419,
 'hauteur': 2382,
 'finit': 2124,
 'melamin': 2920,
 'haut': 2381,
 'qualit': 3687,
 'resist': 3875,
 'rail': 3722,
 'aci': 502,
 'grand': 2320,
 'capacit': 1036,
 'charg': 1126,
 'meubl': 2947,
 'polyvalent': 3509,
 'multifonctionnel': 3059,
 'design': 1591,
 'simpl': 4134,
 'eleg': 1814,
 'permettr': 3375,
 'adapt': 511,
 'import': 2484,
 'quel': 3697,
 'piec': 3410,
 'maison': 2846,
 'canap': 1027,
 'plac': 3433,
 'convertibl': 1385,
 '100': 20,
 'coton': 1411,
 'color': 1255,
 'chocolat': 1169,
 'lois': 2787,
 'fois': 2160,
 'pratiqu': 3556,
 

In [4]:
# Vocabulary size for the pipeline currently bound to bow_preproc; the result
# reflects whichever vectorisation cell was run most recently in this kernel.
len(bow_preproc.pipeline.get_voc())

4799