In [1]:
import numpy as np

np.set_printoptions(threshold=10000, suppress=True)
import pandas as pd
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

### Importer ce jeu de données avec la librairie pandas (c.f. read_csv)

In [2]:
data = pd.read_csv('PubMed-multi-label-dataset.csv', sep=',', encoding="utf-8")
data.head(3)

Unnamed: 0,Title,abstractText,meshMajor,pmid,meshid,meshroot,A,B,C,D,E,F,G,H,I,J,L,M,N,Z
0,Expression of p53 and coexistence of HPV in pr...,Fifty-four paraffin embedded tissue sections f...,"['DNA Probes, HPV', 'DNA, Viral', 'Female', 'H...",8549602,"[['D13.444.600.223.555', 'D27.505.259.750.600....","['Chemicals and Drugs [D]', 'Organisms [B]', '...",0,1,1,1,1,0,0,1,0,0,0,0,0,0
1,Vitamin D status in pregnant Indian women acro...,The present cross-sectional study was conducte...,"['Adult', 'Alkaline Phosphatase', 'Breast Feed...",21736816,"[['M01.060.116'], ['D08.811.277.352.650.035'],...","['Named Groups [M]', 'Chemicals and Drugs [D]'...",0,1,1,1,1,1,1,0,1,1,0,1,1,1
2,[Identification of a functionally important di...,The occurrence of individual amino acids and d...,"['Amino Acid Sequence', 'Analgesics, Opioid', ...",19060934,"[['G02.111.570.060', 'L01.453.245.667.060'], [...","['Phenomena and Processes [G]', 'Information S...",1,1,0,1,1,0,1,0,0,0,1,0,0,0


In [3]:
data = data.sample(frac=0.1, random_state=0)

### Analyser votre jeu de données, essentiellement la target. Chaque texte est labélisée par un
ensemble de labels parmi les 14 labels suivants : Anatomy [A], Organisms [B], Diseases [C], Chemicals and Drugs [D], Analytical, Diagnostic and Therapeutic Techniques, and Equipment [E], Psychiatry and Psychology [F],Phenomena and Processes [G], Disciplines and Occupations [H], Anthropology, Education, Sociology, and Social Phenomena [I], Technology, Industry, and Agriculture [J], Information Science [L], Named Groups [M], Health Care [N], Geographicals [Z]

### Modéliser le problème d’apprentissage supervisé sur ces données.

On décide de travailler uniquement sur l'abstractText

In [4]:
corpus = data['abstractText']

### Traiter vos données textuelles en supprimant les bruits dans les textes et en les normalisant. 
Vous pouvez vous inspirer par exemple du code par ici1 (en l’améliorant s’il le faut) si vous utiliserez la librairie NLTK. Vous pouvez aussi utiliser d’autres librairies (A vos recherches). N’oubliez pas que cette étape de pré-traitement (preprocessing) dépend de vos données et du problème traité. Quelques traitements à faire sont :
- Suppression des ponctuations comme . , ! $( ) * % @ o Suppression des URLs
- Suppression des Stop words
- Transformation de tout le texte en minuscule.
- Tokenisation de vos textes
- Racinisation (Stemming)
- Lemmatisation (lemmatization) o Etc.

In [6]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer


def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)


def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)


def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words


def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words


def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words


def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems


def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas


def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    return words


def preprocess(sample):
    sample = remove_URL(sample)
    sample = replace_contractions(sample)
    # Tokenize
    words = nltk.word_tokenize(sample)

    # Normalize
    return normalize(words)

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/floriangeillon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
from tqdm import tqdm

words_data = []
for sample in tqdm(corpus):
    sample = remove_URL(sample)
    sample = replace_contractions(sample)

    # Tokenize
    words = nltk.word_tokenize(sample)

    # Normalize
    words = normalize(words)

    words_data.append(words)

100%|██████████| 5000/5000 [01:10<00:00, 70.77it/s]


In [8]:
import pickle

with open('words_data.pkl', 'wb') as f:
    pickle.dump(words_data, f)

In [8]:
import pickle

with open('words_data.pkl', 'rb') as f:
    words_data = pickle.load(f)

In [10]:
print(words_data[0])

['members', 'threezincfinger', 'family', 'transcription', 'factors', 'play', 'important', 'role', 'determining', 'basal', 'transcription', 'cloned', 'mouse', 'bteb3', 'mbteb3', 'new', 'member', 'basic', 'transcription', 'element', 'binding', 'protein', 'bteb', 'family', 'expressed', 'wide', 'variety', 'tissues', 'mbteb3', 'activates', 'transcription', 'simian', 'virus', 'forty', 'early', 'promoter', '4fold', 'tissuespecific', 'sm22alpha', 'promoter', '100fold', 'suggesting', 'like', 'bteb1', 'sp1', 'mbteb3', 'basal', 'transcription', 'factor']


### Séparer les données en jeu de données d’apprentissage et jeu de données de test (50-50).

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(words_data, data[
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']], test_size=0.5, random_state=0)

### Proposer une fonction run_models permettant de comparer plusieurs modèles d’apprentissage (en fonction de votre modélisation) sur ces données. 
Pour les approches multi-label, vous utiliserez l’approche EnsembleClassifierChain et MultiOutputCLassifier du package sklearn.multioutput. Ces classifieurs nécessite un classifieur de base (base_estimator) dont vous avez le libre choix d’utilisation. Votre évaluation se basera sur les deux mesures hamming_loss et zero_one_loss.

In [10]:
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import hamming_loss
from sklearn.metrics import zero_one_loss


def run_models(X_train, y_train, X_test, y_test, models, file_names=[]):
    for model in models:
        print(model)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("Hamming loss: ", hamming_loss(y_test, y_pred))
        print("Zero-one loss: ", zero_one_loss(y_test, y_pred))
        print()
        if len(file_names) > 0:
            with open(file_names[models.index(model)], 'wb') as f:
                pickle.dump(model, f)

On choisit d'utiliser un classifieur de base RandomForestClassifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

base_estimator = RandomForestClassifier()

models = [ClassifierChain(base_estimator=base_estimator),
          MultiOutputClassifier(estimator=base_estimator)]

### Proposer une première vectorisation de vos données textuelles par une représentation TF-IDF.

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

X_train_tfidf = [' '.join(words) for words in X_train]
X_test_tfidf = [' '.join(words) for words in X_test]

X_train_tfidf = vectorizer.fit_transform(X_train_tfidf)
X_test_tfidf = vectorizer.transform(X_test_tfidf)

### Exécuter ensuite votre fonction run_models sur vos données et interpréter les résultats obtenus.
Vous pouvez toujours appliquer votre fonction sur les données pré-traitées et les données non pré-
traitées afin d’analyser l’apport de la partie pré-traitement de données.

In [12]:
run_models(X_train_tfidf, y_train, X_test_tfidf, y_test, models,
           ['models/ClassifierChain_ap.pkl', 'models/MultiOutputClassifier_ap.pkl'])

ClassifierChain(base_estimator=RandomForestClassifier())


KeyboardInterrupt: 

Sans prétraitement

In [23]:
from sklearn.model_selection import train_test_split

corpus = data['abstractText']
vectorizer_sp = TfidfVectorizer(stop_words=None)

X_train_sp, X_test_sp, y_train_sp, y_test_sp = train_test_split(corpus, data[
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']], test_size=0.5, random_state=0)

X_train_tfidf_sp = vectorizer_sp.fit_transform(X_train_sp)
X_test_tfidf_sp = vectorizer_sp.transform(X_test_sp)

In [17]:
run_models(X_train_tfidf_sp, y_train_sp, X_test_tfidf_sp, y_test_sp, models,
           ['models/ClassifierChain_sp.pkl', 'models/MultiOutputClassifier_sp.pkl'])

ClassifierChain(base_estimator=RandomForestClassifier())
Hamming loss:  0.16888571428571428
Zero-one loss:  0.9088

MultiOutputClassifier(estimator=RandomForestClassifier())
Hamming loss:  0.16937142857142856
Zero-one loss:  0.9004


Pas énormément de différence entre les deux approches (avec et sans prétraitement) peut être à cause de RandomForestClassifier

### Appliquer la méthode SVD de réduction de dimensions (TruncatedSVD) afin de construire des "concepts" liés aux documents et aux termes. Elle permettra entre autres de résoudre les problèmes de synonymie (plusieurs mots avec un seul sens) et de polysémie (un seul mot avec plusieurs sens). 
La fonction suivante vous aidera à comprendre les concepts en affichant leurs mots les plus pertinents.

In [16]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Concept #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [13]:
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=100, random_state=0)
SVD.fit(X_train_tfidf)

In [17]:
feature_names = vectorizer.get_feature_names_out()
print_top_words(SVD, feature_names, 10)

Concept #0: patients thousand ninety seven thirty group study treatment cells eighty
Concept #1: cells cell expression protein gene activity dna human proteins binding
Concept #2: thousand ninety ci health seven eighty seventy thirty million women
Concept #3: thousand cells cancer ninety cell ci expression eighty seven tumor
Concept #4: group rats glucose insulin groups plasma levels control blood significantly
Concept #5: health cells cancer group cell care women levels risk effects
Concept #6: gene expression genes group patients protein health proteins levels family
Concept #7: group cancer groups breast method control lung survival surgery tumor
Concept #8: cancer treatment breast risk glucose therapy insulin activity lung prostate
Concept #9: expression gene genes left case right lung subjects pressure exercise
Concept #10: infection strains isolates virus cases children species cells infections strain
Concept #11: group case binding cases patient treatment report thousand disease

Même chose avec les données sans prétraitement

In [18]:
SVD_sp = TruncatedSVD(n_components=100, random_state=0)
SVD_sp.fit(X_train_tfidf_sp)

feature_names_sp = vectorizer_sp.get_feature_names_out()
print_top_words(SVD_sp, feature_names_sp, 10)

Concept #0: the of and in to with was were for patients
Concept #1: patients with were group was and had treatment who years
Concept #2: cells cell expression in il induced levels protein cancer receptor
Concept #3: group the was were mg rats kg groups glucose min
Concept #4: and health to ci 95 care women risk for research
Concept #5: il in group is rats glucose health insulin induced exercise
Concept #6: cancer cells the cell ci 95 group breast tumor lung
Concept #7: group in gene expression groups patients genes the were control
Concept #8: group binding of patients the cancer glucose protein ci insulin
Concept #9: il patients for alpha cancer treatment dose therapy binding group
Concept #10: of health il cells hiv care infection were cell anti
Concept #11: il the ifn expression 95 ci inflammatory mrna gene type
Concept #12: and cancer case tumor glucose breast levels of acid treatment
Concept #13: and health care protein binding cell patients beta cells the
Concept #14: expression 

In [19]:
X_train_tfidf_svd = SVD.transform(X_train_tfidf)
X_test_tfidf_svd = SVD.transform(X_test_tfidf)

X_train_tfidf_sp_svd = SVD_sp.transform(X_train_tfidf_sp)
X_test_tfidf_sp_svd = SVD_sp.transform(X_test_tfidf_sp)

In [23]:
run_models(X_train_tfidf_svd, y_train, X_test_tfidf_svd, y_test, models,
           ['models/ClassifierChain_ap_svd.pkl', 'models/MultiOutputClassifier_ap_svd.pkl'])

ClassifierChain(base_estimator=RandomForestClassifier())
Hamming loss:  0.16357142857142856
Zero-one loss:  0.8972

MultiOutputClassifier(estimator=RandomForestClassifier())
Hamming loss:  0.1652
Zero-one loss:  0.9052


In [24]:
run_models(X_train_tfidf_sp_svd, y_train_sp, X_test_tfidf_sp_svd, y_test_sp, models,
           ['models/ClassifierChain_sp_svd.pkl', 'models/MultiOutputClassifier_sp_svd.pkl'])

ClassifierChain(base_estimator=RandomForestClassifier())
Hamming loss:  0.16785714285714284
Zero-one loss:  0.9092

MultiOutputClassifier(estimator=RandomForestClassifier())
Hamming loss:  0.16994285714285715
Zero-one loss:  0.918


### Proposer uncode qui permettra d’apprendre votre propre modèle de plongement lexical Word2Vec sur vos données textuelles


In [11]:
from gensim.models import Word2Vec

model = Word2Vec(words_data, vector_size=100, window=5, min_count=1, workers=4)

### Évaluer visuellement et numériquement sur quelques mots clés votre nouveaumodèle de vectorisation (Embedding)

In [12]:
model.wv.most_similar('cancer')

[('breast', 0.9533255696296692),
 ('receptorpositive', 0.921899676322937),
 ('leukaemia', 0.9211985468864441),
 ('earlystage', 0.920826256275177),
 ('metastatic', 0.9075683355331421),
 ('prostate', 0.9035713076591492),
 ('nonmalignant', 0.9009960293769836),
 ('lung', 0.8916445374488831),
 ('colorectal', 0.8911237716674805),
 ('progression', 0.8892887234687805)]

In [13]:
model.wv.doesnt_match(['diabetes', 'heart', 'bike'])


'bike'

In [14]:
# evaluer visuellement (graphique)
from sklearn.manifold import TSNE

words = list(model.wv.index_to_key)
wvs = model.wv[words]

tsne = TSNE(n_components=2, random_state=0, n_iter=250, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(14, 8))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x + 1, y + 1), xytext=(0, 0), textcoords='offset points')
plt.show()

KeyboardInterrupt: 

In [15]:
def word2vec_generator(texts, model, vector_size):
    dict_word2vec = {}
    for index, word_list in enumerate(texts):
        arr = np.array([0.0 for i in range(0, vector_size)])
        nb_word = 0
        for word in word_list:
            try:
                arr += model.wv[word]
                nb_word = nb_word + 1
            except KeyError:
                continue
        if (len(word_list) == 0):
            dict_word2vec[index] = arr
        else:
            dict_word2vec[index] = arr / nb_word
    df_word2vec = pd.DataFrame(dict_word2vec).T
    return df_word2vec


### Exploiter votre modèle Word2Vec pour la vectorisation de vos textes (avec deux méthodes utilisant ou non le TF-IDF des mots)


In [18]:
# 1. avec prétraitement
X_train_w2v = word2vec_generator(X_train, model, 100)
X_test_w2v = word2vec_generator(X_test, model, 100)

run_models(X_train_w2v, y_train, X_test_w2v, y_test, models,
           ['models/ClassifierChain_ap_w2v.pkl', 'models/MultiOutputClassifier_ap_w2v.pkl'])

ClassifierChain(base_estimator=RandomForestClassifier())
Hamming loss:  0.18177142857142858
Zero-one loss:  0.9268

MultiOutputClassifier(estimator=RandomForestClassifier())
Hamming loss:  0.1818
Zero-one loss:  0.9204


In [24]:
# 2. sans prétraitement
X_train_w2v_sp = word2vec_generator(X_train_sp, model, 100)
X_test_w2v_sp = word2vec_generator(X_test_sp, model, 100)

run_models(X_train_w2v_sp, y_train_sp, X_test_w2v_sp, y_test_sp, models,
           ['models/ClassifierChain_sp_w2v.pkl', 'models/MultiOutputClassifier_sp_w2v.pkl'])

ClassifierChain(base_estimator=RandomForestClassifier())
Hamming loss:  0.2610571428571429
Zero-one loss:  0.9772

MultiOutputClassifier(estimator=RandomForestClassifier())
Hamming loss:  0.25948571428571426
Zero-one loss:  0.9808


### Idem en utilisant le modèle Word2Vec pré-entrainé de Google (GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin)

In [25]:
# Variation du code car on à un KeyedVectors et non un Word2Vec
def word2vec_generator_google(texts, model, vector_size):
    dict_word2vec = {}
    for index, word_list in enumerate(texts):
        arr = np.array([0.0 for i in range(0, vector_size)])
        nb_word = 0
        for word in word_list:
            try:
                arr += model[word]
                nb_word = nb_word + 1
            except KeyError:
                continue
        if (len(word_list) == 0):
            dict_word2vec[index] = arr
        else:
            dict_word2vec[index] = arr / nb_word
    df_word2vec = pd.DataFrame(dict_word2vec).T
    return df_word2vec


In [26]:
from gensim.models import KeyedVectors

model_google = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [27]:
# 1. avec prétraitement
X_train_w2v_google = word2vec_generator_google(X_train, model_google, 300)
X_test_w2v_google = word2vec_generator_google(X_test, model_google, 300)

run_models(X_train_w2v_google, y_train, X_test_w2v_google, y_test, models,
           ['models/ClassifierChain_ap_w2v_google.pkl', 'models/MultiOutputClassifier_ap_w2v_google.pkl'])


ClassifierChain(base_estimator=RandomForestClassifier())
Hamming loss:  0.15057142857142858
Zero-one loss:  0.8835999999999999

MultiOutputClassifier(estimator=RandomForestClassifier())
Hamming loss:  0.15228571428571427
Zero-one loss:  0.8924


In [28]:
# 2. sans prétraitement
X_train_w2v_google_sp = word2vec_generator_google(X_train_sp, model_google, 300)
X_test_w2v_google_sp = word2vec_generator_google(X_test_sp, model_google, 300)

run_models(X_train_w2v_google_sp, y_train_sp, X_test_w2v_google_sp, y_test_sp, models,
           ['models/ClassifierChain_sp_w2v_google.pkl', 'models/MultiOutputClassifier_sp_w2v_google.pkl'])


ClassifierChain(base_estimator=RandomForestClassifier())
Hamming loss:  0.23108571428571428
Zero-one loss:  0.9568

MultiOutputClassifier(estimator=RandomForestClassifier())
Hamming loss:  0.23174285714285714
Zero-one loss:  0.9636


### Exploiter votre modèle Word2Vec en entrée de la couche d’embedding d’un modèle à base de réseaux de neurones récurrents de type LSTM permettant :
- Dans un premier temps d’optimiser la 0/1 loss.
- Dans un deuxième temps d’optimiser la hamming loss


In [8]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM

In [9]:
train_tokenized = X_train
test_tokenized = X_test


def to_sequence(index, text):
    indexes = [index[word] for word in text if word in index]
    return indexes


word2idx = {word: idx for idx, word in enumerate(model_google.index_to_key)}
X_train_sequences = [to_sequence(word2idx, x) for x in train_tokenized]
X_test_sequences = [to_sequence(word2idx, x) for x in test_tokenized]

MAX_SEQ_LENGHT = 50
N_FEATURES = len(model_google.index_to_key)
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGHT, value=N_FEATURES)
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGHT, value=N_FEATURES)

EMBEDDINGS_LEN = model_google.vector_size
embeddings_index = np.zeros((len(model_google.index_to_key) + 1, EMBEDDINGS_LEN))

for word, idx in word2idx.items():
    try:
        embedding = model_google[word]
        embeddings_index[idx] = embedding
    except:
        pass

model_lstm = Sequential()
model_lstm.add(Embedding(len(model_google.key_to_index) + 1,
                         EMBEDDINGS_LEN,
                         weights=[embeddings_index],
                         trainable=False))

model_lstm.add(LSTM(300, dropout=0.2))
model_lstm.add(Dense(1))
model_lstm.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         900000300 
                                                                 
 lstm (LSTM)                 (None, 300)               721200    
                                                                 
 dense (Dense)               (None, 1)                 301       
                                                                 
Total params: 900721801 (3.36 GB)
Trainable params: 721501 (2.75 MB)
Non-trainable params: 900000300 (3.35 GB)
_________________________________________________________________


In [10]:
model_lstm.compile(loss='binary_crossentropy', optimizer='adam')
model_lstm.fit(X_train_sequences, y_train, epochs=20, batch_size=10, validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x375167090>

In [34]:
from keras import backend as K
from keras.utils import custom_object_scope


def hamming_loss_handmade(y_true, y_pred):
    return K.sum(K.abs(K.cast(y_true, 'float32') - y_pred), axis=-1)


with custom_object_scope({'hamming_loss': hamming_loss_handmade}):
    model_lstm.compile(loss='hamming_loss', optimizer='adam')
    model_lstm.fit(X_train_sequences, y_train, epochs=20, batch_size=10, validation_split=0.1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Pipeline : Automatiser l’enchainement de votre meilleur traitement dans une fonction ou un pipeline.

In [6]:
import numpy as np

np.set_printoptions(threshold=10000, suppress=True)
import pandas as pd
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')


# Variation du code car on à un KeyedVectors et non un Word2Vec
def word2vec_generator_google(texts, model, vector_size):
    dict_word2vec = {}
    for index, word_list in enumerate(texts):
        arr = np.array([0.0 for i in range(0, vector_size)])
        nb_word = 0
        for word in word_list:
            try:
                arr += model[word]
                nb_word = nb_word + 1
            except KeyError:
                continue
        if (len(word_list) == 0):
            dict_word2vec[index] = arr
        else:
            dict_word2vec[index] = arr / nb_word
    df_word2vec = pd.DataFrame(dict_word2vec).T
    return df_word2vec


from gensim.models import KeyedVectors

model_google = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain

pipeline = Pipeline([
    ('word2vec', FunctionTransformer(word2vec_generator_google, kw_args={'model': model_google, 'vector_size': 300})),
    ('classifier', ClassifierChain(base_estimator=RandomForestClassifier()))
])

In [7]:
import pickle

with open('words_data.pkl', 'rb') as f:
    words_data = pickle.load(f)

from sklearn.model_selection import train_test_split

data = pd.read_csv('PubMed-multi-label-dataset.csv', sep=',', encoding="utf-8")
data = data.sample(frac=0.1, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(words_data, data[
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'Z']], test_size=0.5, random_state=0)

from sklearn.metrics import hamming_loss, zero_one_loss

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Hamming loss: ", hamming_loss(y_test, y_pred))
print("Zero-one loss: ", zero_one_loss(y_test, y_pred))

Hamming loss:  0.15068571428571428
Zero-one loss:  0.8855999999999999
