In [16]:
# Version de Python utilisée : 3.9.xx
# Import des librairies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import pickle

import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

try:
    from nltk.corpus import stopwords
except:
    nltk.download('stopwords')
    from nltk.corpus import stopwords

pd.set_option('display.max_colwidth', None)

## Access to the raw dataset iterators

In [2]:
# Import des fichiers contenant les données
df = pd.concat(
        [pd.read_csv("../data/X_train.csv", index_col=0),
        pd.read_csv("../data/Y_train.csv", index_col=0)],
        axis=1)
#
df = df.fillna("")

data = df['designation']+" "+df['description']
data = df['description']

target = df['prdtypecode']

# remove tags
#html_tags = np.unique(np.array(re.findall("<[^>]*>", data.str.cat())))

html_tags = [x for y in data.apply(lambda x: np.unique(np.array(re.findall("<[^>]*>", x)))) for x in y ]
html_tags = np.unique(html_tags)
data.replace(html_tags, "", inplace=True)

# split to train, valid and test
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=123)



In [74]:
data

0                                                                                                                                                                 Olivia: Personalisiertes Notizbuch / 150 Seiten / Punktraster / Ca Din A5 / Rosen-Design
1                                           Journal Des Arts (Le) N° 133 Du 28/09/2001 - L'art Et Son Marche Salon D'art Asiatique A Paris - Jacques Barrere - Francois Perrier - La Reforme Des Ventes Aux Encheres Publiques - Le Sna Fete Ses Cent Ans.
2                                                                                                                                                                             Grand Stylet Ergonomique Bleu Gamepad Nintendo Wii U - Speedlink Pilot Style
3                                                                                                                                                                                          Peluche Donald - Europe - Disneyland 2000 (Marionnette À Doi

## Pre-processing & vectorisation pipelines

In [3]:
vectorisation_type = 'tf_idf' # "bag_of_word"  # or tf_idf

# nltk french and english stop words
final_stopwords_list = stopwords.words('english') + stopwords.words('french') + stopwords.words('german')

preproc_config_CountVect = {'strip_accents' : 'unicode',
                     'lowercase' : True,
                     'stop_words' : final_stopwords_list, # got from nltk (french and english)
                     'ngram_range' : (1,1),   # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams
                     'analyzer' : 'word',     # The feature should be made of word n-gram ('word') or character n-grams (‘char’)
                     'max_df' : 1., # the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
                     'min_df' : 0.001, # the vocabulary ignore terms that have a document frequency strictly lower than the given threshold
                     'max_features' : None, #build a vocabulary that only consider the top max_features ordered by term frequency across the corpus
                     'vocabulary' : None # Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix or iterable
                    }

preproc_config_TfidfVect = {'norm' : 'l2',
                            'use_idf' : True,
                            'smooth_idf' : True,
                            'sublinear_tf' : False
                            }

if vectorisation_type == "bag_of_word":
    # bag of word
    pipe = Pipeline([('count', CountVectorizer(**preproc_config_CountVect))]).fit(X_train.to_list())
if vectorisation_type == "tf_idf":
    # tfidf
    pipe = Pipeline([('count', CountVectorizer(**preproc_config_CountVect)),
                    ('tfid', TfidfTransformer(**preproc_config_TfidfVect))]).fit(X_train.to_list())



### Save vocabulary

In [18]:
vocabulary = pipe['count'].vocabulary_

with open('../data/training/voc_tfidf_min_df0_001.pkl','wb') as fp:
    pickle.dump(vocabulary, fp)

### Vectorisation X_train

In [21]:
X = pipe.transform(X_train.to_list()).toarray()
X_train_tfidf = pd.DataFrame(X).fillna(0)

#X_train_tfidf.to_pickle("../data/training/X_train_tfidf_min_df0_001_description.pkl")
y_train.to_pickle("../data/training/y_train_tfidf_min_df0_001_description.pkl")

### Vectorisation X_test

In [23]:
X = pipe.transform(X_test.to_list()).toarray()

X_test_tfidf = pd.DataFrame(X).fillna(0)

# save X_test
#X_test_tfidf.to_pickle("../data/training/X_test_tfidf_min_df0_001_description.pkl")
y_test.to_pickle("../data/training/y_test_tfidf_min_df0_001_description.pkl")

# view words with tfidf > 0.
if 0:
    for c in X_test_tfidf.columns:

        if X_test_tfidf.head(2)[c].values[0]>0.0:
            pos = list(pipe['count'].vocabulary_.values()).index(c)
            word = list(pipe['count'].vocabulary_.keys())[pos]
            print(word, X_test_tfidf.head(1)[c])
del(X_test_tfidf)

Check le corpus après preprocessing 

In [77]:
# recuperation de la fonction de preprocessing
preproc_ = pipe['count'].build_preprocessor()
# sur le dataframe data mais on peut aussi le faire que sur designation ou description
data_preproc = preproc_(data.str.cat())
# nombre des mots
len(data_preproc.split())

6753275

Statistique sur les mots du vocabulaire

In [54]:
# on peut faire varier 'min_df' dans preproc_config_CountVect : 0 0.001 0.002 0.003 etc.

print("taille du voc sur la base train:", len(pipe['count'].vocabulary_.keys()))
pipe['count'].vocabulary_.keys()
voc_words = pd.DataFrame(pipe['count'].vocabulary_.keys(), columns=['words'])
voc_words['nb_chars'] = voc_words.words.apply(lambda x: len(x))

print('Statistique corpus designation:')
print(voc_words.describe().apply(lambda x: x.apply('{0:<.2f}'.format)))


taille du voc sur la base train: 5778
Statistique corpus designation:
      nb_chars
count  5778.00
mean      6.58
std       2.62
min       2.00
25%       5.00
50%       6.00
75%       8.00
max      16.00


## Single vectorisation


## Texte preprocessing

voir https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words



### The Bag of Words representation

#### CountVectorizer

Convertir un ensemble de document texte dans une matrice d'occurance de token.

In [3]:
# nltk french and english stop words
final_stopwords_list = stopwords.words('english') + stopwords.words('french') + stopwords.words('german')
#final_stopwords_list


##### Au niveau mot

In [5]:
# scikit-learn
preproc_config_CountVect = {'strip_accents' : 'unicode',
                     'lower_case' : True,
                     'stop_word' : final_stopwords_list, # got from nltk (french and english)
                     'ngram_range' : (1,1),   # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams
                     'analyzer' : 'word',     # The feature should be made of word n-gram ('word') or character n-grams (‘char’)
                     'max_df' : 1., # the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
                     'min_df' : 0., # the vocabulary ignore terms that have a document frequency strictly lower than the given threshold
                     'max_features' : None, #build a vocabulary that only consider the top max_features ordered by term frequency across the corpus
                     'vocabulary' : None # Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix or iterable
                    }

count_vect = CountVectorizer()
#count_vect.input = df.designation.head(100)
count_vect.encoding = 'utf-8'
count_vect.strip_accents = preproc_config_CountVect['strip_accents']
count_vect.lower_case = preproc_config_CountVect['lower_case']
count_vect.stop_word = preproc_config_CountVect['stop_word']
count_vect.ngram_range = preproc_config_CountVect['ngram_range']
count_vect.analyzer = preproc_config_CountVect['analyzer']
count_vect.max_df = preproc_config_CountVect['max_df']
count_vect.min_df = preproc_config_CountVect['min_df']
count_vect.stop_word = preproc_config_CountVect['stop_word']
count_vect.vocabulary = preproc_config_CountVect['vocabulary']



In [6]:
count_vect.fit(X_train)
#X_counts = count_vect.fit_transform()
voc = count_vect.vocabulary_
print("Vocabulary size:",len(voc))
print("first 10 words from the voc")
print([k for i, (k,v) in enumerate(zip(voc.keys(),voc.values())) if i<10])

#print("Token:")
#print("nombre de token: ", len(count_vect.get_feature_names_out()))
#print(count_vect.get_feature_names_out())

Vocabulary size: 145969
first 10 words from the voc
['ramadan', 'eid', 'de', 'luxe', 'lumieres', 'del', 'interieur', 'lanternes', 'or', 'mnl535']


#### Au niveau N-grams

In [7]:
#X_train_count_vect = count_vect.transform(X_train)
X_test_count_vect = count_vect.transform(X_test)
X_test_count_vect = pd.DataFrame(X_test_count_vect.toarray())
                                 
X_test_count_vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,145959,145960,145961,145962,145963,145964,145965,145966,145967,145968
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

Convertir un ensemble de document brut dans une matrice de feature TF-IDF.

Si on dispose déjà de la matrice d'occurence des token nous pouvons utiliser la classe _TfidfTransformer_

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
preproc_config_TfidfVect = {'norm' : 'l2',
                            'use_idfbool' : True,
                            'smooth_idfbool' : True,
                            'sublinear_tf' : False
                            }

In [6]:
vectorizer = TfidfVectorizer()
vectorizer.norm = preproc_config_TfidfVect['norm']
vectorizer.use_idfbool = preproc_config_TfidfVect['use_idfbool']
vectorizer.smooth_idfbool = preproc_config_TfidfVect['smooth_idfbool']
vectorizer.sublinear_tf = preproc_config_TfidfVect['sublinear_tf']

#X_train = X_train.head(1000)
#X_test = X_test.head(1000)
vectorizer.fit(X_train)

                                 

In [None]:
X = vectorizer.transform(X_test)

X_test_tfidf = pd.DataFrame(X.toarray())

In [28]:
list(vectorizer.vocabulary_.values()).index(2720)
list(vectorizer.vocabulary_.keys())[338]

'bleu'

In [29]:
for c in X_test_tfidf.columns:
    #print(type(X_test_tfidf.head(1)[c]))
    #print(pd.DataFrame(X_test_tfidf.head(1)[c]))
    #print(pd.DataFrame(X_test_tfidf.head(1)[c])==0.)
    #print(X_test_tfidf.head(1)[c].values==0.)

    if X_test_tfidf.head(1)[c].values[0]>0.0:
        pos = list(vectorizer.vocabulary_.values()).index(c)
        word = list(vectorizer.vocabulary_.keys())[pos]
        print(word, X_test_tfidf.head(1)[c])
    #print(X_test_tfidf[X_test_tfidf>0.].head(1)[c])


bleu 0    0.190921
Name: 2720, dtype: float64
bluetooth 0    0.326458
Name: 2751, dtype: float64
dualshock 0    0.326458
Name: 4918, dtype: float64
fil 0    0.214033
Name: 5935, dtype: float64
gamepad 0    0.326458
Name: 6324, dtype: float64
playstation 0    0.285002
Name: 9876, dtype: float64
pour 0    0.084537
Name: 10078, dtype: float64
sans 0    0.152748
Name: 11380, dtype: float64
sony 0    0.616227
Name: 11893, dtype: float64
v2 0    0.326458
Name: 13135, dtype: float64


### Pipeline

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

preproc_config_CountVect = {'strip_accents' : 'unicode',
                     'lowercase' : True,
                     'stop_words' : final_stopwords_list, # got from nltk (french and english)
                     'ngram_range' : (1,1),   # The lower and upper boundary of the range of n-values for different word n-grams or char n-grams
                     'analyzer' : 'word',     # The feature should be made of word n-gram ('word') or character n-grams (‘char’)
                     'max_df' : 1., # the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
                     'min_df' : 0., # the vocabulary ignore terms that have a document frequency strictly lower than the given threshold
                     'max_features' : None, #build a vocabulary that only consider the top max_features ordered by term frequency across the corpus
                     'vocabulary' : None # Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix or iterable
                    }

preproc_config_TfidfVect = {'norm' : 'l2',
                            'use_idf' : True,
                            'smooth_idf' : True,
                            'sublinear_tf' : False
                            }

#count_vect = CountVectorizer()
#count_vect.input = df.designation.head(100)


#vectorizer = TfidfVectorizer()
#X_train.str.lower().to_list()[0]
pipe = Pipeline([('count', CountVectorizer(**preproc_config_CountVect)),
                 ('tfid', TfidfTransformer(**preproc_config_TfidfVect))]).fit(X_train.to_list())






In [7]:
X = pipe.transform(X_test.to_list()).toarray()

X_test_tfidf = pd.DataFrame(X)


if 1:
    for c in X_test_tfidf.columns:
        #print(type(X_test_tfidf.head(1)[c]))
        #print(pd.DataFrame(X_test_tfidf.head(1)[c]))
        #print(pd.DataFrame(X_test_tfidf.head(1)[c])==0.)
        #print(X_test_tfidf.head(1)[c].values==0.)

        if X_test_tfidf.head(2)[c].values[0]>0.0:
            pos = list(pipe['count'].vocabulary_.values()).index(c)
            word = list(pipe['count'].vocabulary_.keys())[pos]
            print(word, X_test_tfidf.head(1)[c])
del(X_test_tfidf)
#pipe['count'].vocabulary_.values()
#pipe['count'].vocabulary_.keys()


1pc 0    0.063711
Name: 7269, dtype: float64
30x35cm 0    0.167637
Name: 12065, dtype: float64
326 0    0.027789
Name: 12435, dtype: float64
34x35cm 0    0.167637
Name: 12897, dtype: float64
39 0    0.316589
Name: 13782, dtype: float64
43 0    0.052511
Name: 15092, dtype: float64
actionner 0    0.032332
Name: 25668, dtype: float64
affecter 0    0.059794
Name: 26380, dtype: float64
aident 0    0.02138
Name: 26830, dtype: float64
ainsi 0    0.012888
Name: 26936, dtype: float64
argile 0    0.021019
Name: 30593, dtype: float64
article 0    0.089787
Name: 31002, dtype: float64
aussi 0    0.012636
Name: 32248, dtype: float64
autour 0    0.08503
Name: 32519, dtype: float64
autres 0    0.147444
Name: 32526, dtype: float64
avoir 0    0.012546
Name: 32759, dtype: float64
balcon 0    0.1351
Name: 33696, dtype: float64
beaucoup 0    0.086232
Name: 34935, dtype: float64
besoin 0    0.01399
Name: 35603, dtype: float64
bon 0    0.065428
Name: 37365, dtype: float64
bonne 0    0.013136
Name: 37428, dty

KeyboardInterrupt: 

## Image preprocessing

voir https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words


### Patch extraction

### Connectivity graph of an image