# Test de Word/Sentence embedding

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer



In [2]:
data = pd.read_csv('data.csv')

In [3]:
data.columns

Index(['Title', 'Body', 'Tags', 'title_bow_lem', 'body_bow_lem',
       'title_bow_stem', 'body_bow_stem'],
      dtype='object')

In [4]:
for c in ['Tags', 'title_bow_lem', 'body_bow_lem', 'title_bow_stem', 'body_bow_stem']:
    data[c] = data[c].transform(lambda x: eval(x))

In [8]:
data = data.sample(frac=1)
data

Unnamed: 0,Title,Body,Tags,title_bow_lem,body_bow_lem,title_bow_stem,body_bow_stem
9614,image resources for ios,i m probably missing something obvious here y...,"[c, objective, xcode, ios]",[image],"[project, image, someimage, png, someimage, pn...",[imag],"[project, imag, someimag, png, someimag, png, ..."
20844,why response expectedcontentlength always re...,i have this code download_size is nsinteger ...,"[iphone, ios]","[response, expectedcontentlength]","[download_size, someone, use, effect, help]","[respons, expectedcontentlength]","[download_s, someon, use, effect, help]"
1771,why would you ever want to allocate memory on ...,possible duplicate when is it best to use a ...,"[c, memory]","[memory, heap, stack]","[duplicate, stack, heap, vice, versa, heap, v,...","[memori, heap, stack]","[duplic, stack, heap, vice, versa, heap, vs, s..."
16732,what is the best standard style for a tostring...,we have a lot of objects for which we like to ...,"[java, javascript, php]","[style, tostring, implementation]","[lot, output, standard, practice, style, case,...","[style, tostr, implement]","[lot, output, standard, practic, style, case, ..."
19418,sql server invalid column name after adding ne...,i just added an identity column to an existing...,"[server, sql]","[sql, server, column, name, column]","[identity, column, table, designer, table, que...","[sql, server, column, name, column]","[ident, column, tabl, design, tabl, queri, que..."
...,...,...,...,...,...,...,...
7351,removing trailing newline character from fgets...,i am trying to get some data from the user and...,"[c, string]",[character],"[user, function, gcc, character, end, string]",[charact],"[user, function, gcc, charact, end, string]"
20731,what is an internal address in java,in the javadoc for object hashcode it states...,"[java, memory]","[address, java]","[javadoc, hashcode, hashcode, method, class, a...","[address, java]","[javadoc, hashcod, hashcod, method, class, add..."
12193,detect carrier connection type 3g edge gprs,how can i get the type of connection of a carr...,"[c, objective, iphone, ios]","[detect, carrier, connection, type, edge, gprs]","[type, connection, carrier, network, connectio...","[detect, carrier, connect, type, edg, gprs]","[type, connect, carrier, network, connect, cla..."
2568,how to enable disable wifi from an application,i want to enable disable wifi from my android ...,"[android, java]","[wifi, application]","[wifi, application]","[wifi, applic]","[wifi, applic]"


In [10]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(data['Tags'])

In [11]:
resultats = {}

# Word2Vec

In [12]:
import gensim

import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

In [13]:
X = data['body_bow_lem'].apply(lambda x: ' '.join(x))

In [14]:
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100
maxlen = 755 # adapt to length of sentences

In [15]:
sentences = X.to_list()
sentences = [gensim.utils.simple_preprocess(text) for text in sentences]

In [16]:
w2v_model = gensim.models.Word2Vec(min_count=w2v_min_count, window=w2v_window,
                                                vector_size=w2v_size,
                                                seed=42,
                                                workers=3)

In [17]:
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

Vocabulary size: 20740
Word2Vec trained


In [18]:
# Préparation des sentences (tokenization)

print("Fit Tokenizer ...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
x_sentences = pad_sequences(tokenizer.texts_to_sequences(sentences),
                                                     maxlen=maxlen,
                                                     padding='post') 
                                                   
num_words = len(tokenizer.word_index) + 1
print("Number of unique words: %i" % num_words)

Fit Tokenizer ...
Number of unique words: 20741


In [19]:
# Création de la matrice d'embedding

print("Create Embedding matrix ...")
w2v_size = 300
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, w2v_size))
i=0
j=0
    
for word, idx in word_index.items():
    i +=1
    if word in w2v_words:
        j +=1
        embedding_vector = model_vectors[word]
        if embedding_vector is not None:
            embedding_matrix[idx] = model_vectors[word]
            
word_rate = np.round(j/i,4)
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))

Create Embedding matrix ...
Word embedding rate :  1.0
Embedding matrix: (20741, 300)


In [20]:
# Création du modèle

input=Input(shape=(len(x_sentences),maxlen),dtype='float64')
word_input=Input(shape=(maxlen,),dtype='float64')  
word_embedding=Embedding(input_dim=vocab_size,
                         output_dim=w2v_size,
                         weights = [embedding_matrix],
                         input_length=maxlen)(word_input)
word_vec=GlobalAveragePooling1D()(word_embedding)  
embed_model = Model([word_input],word_vec)

embed_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 755)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 755, 300)          6222300   
_________________________________________________________________
global_average_pooling1d (Gl (None, 300)               0         
Total params: 6,222,300
Trainable params: 6,222,300
Non-trainable params: 0
_________________________________________________________________


In [21]:
embeddings = embed_model.predict(x_sentences)
embeddings.shape

(24728, 300)

In [22]:
embeddings

array([[ 0.00068567,  0.02067879, -0.02711758, ...,  0.0087561 ,
         0.00513197,  0.01007272],
       [ 0.00019466, -0.0014124 , -0.00175916, ...,  0.00143563,
         0.00022033,  0.00356921],
       [-0.03191302, -0.00644214,  0.02830587, ...,  0.00451324,
        -0.01391686,  0.0004166 ],
       ...,
       [ 0.00385025,  0.00931357, -0.02145561, ...,  0.00442275,
         0.01606862,  0.00126241],
       [ 0.00173325, -0.00148214, -0.0016219 , ...,  0.00146125,
         0.00146609, -0.00124672],
       [-0.00402275,  0.03207737, -0.02735896, ...,  0.02952688,
         0.01471878,  0.0059419 ]], dtype=float32)

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(embeddings, y, random_state=21, train_size=0.8)

In [24]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score

In [25]:
clf = OneVsRestClassifier(LinearSVC(), n_jobs=1)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
jaccard_score(Y_test, pred, average='macro')

0.10699639347410564

In [26]:
resultats['Word2Vec'] = jaccard_score(Y_test, pred, average='macro')

# Bert

In [27]:
print(tf.__version__)
print(tensorflow.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())

2.6.0
2.6.0
Num GPUs Available:  1
True


In [28]:
import time
import transformers

In [29]:
# Fonction de préparation des sentences
def bert_inp_fct(sentences, bert_tokenizer, max_length) :
    input_ids=[]
    token_type_ids = []
    attention_mask=[]
    bert_inp_tot = []

    for sent in sentences:
        bert_inp = bert_tokenizer.encode_plus(sent,
                                              add_special_tokens = True,
                                              max_length = max_length,
                                              padding='max_length',
                                              return_attention_mask = True, 
                                              return_token_type_ids=True,
                                              truncation=True,
                                              return_tensors="tf")
    
        input_ids.append(bert_inp['input_ids'][0])
        token_type_ids.append(bert_inp['token_type_ids'][0])
        attention_mask.append(bert_inp['attention_mask'][0])
        bert_inp_tot.append((bert_inp['input_ids'][0], 
                             bert_inp['token_type_ids'][0], 
                             bert_inp['attention_mask'][0]))

    input_ids = np.asarray(input_ids)
    token_type_ids = np.asarray(token_type_ids)
    attention_mask = np.array(attention_mask)
    
    return input_ids, token_type_ids, attention_mask, bert_inp_tot
    

# Fonction de création des features
def feature_BERT_fct(model, model_type, sentences, max_length, b_size, mode='HF') :
    batch_size = b_size
    batch_size_pred = b_size
    bert_tokenizer = transformers.AutoTokenizer.from_pretrained(model_type)
    time1 = time.time()

    for step in range(len(sentences)//batch_size) :
        idx = step*batch_size
        input_ids, token_type_ids, attention_mask, bert_inp_tot = bert_inp_fct(sentences[idx:idx+batch_size], 
                                                                      bert_tokenizer, max_length)
        
        if mode=='HF' :    # Bert HuggingFace
            outputs = model.predict([input_ids, attention_mask, token_type_ids], batch_size=batch_size_pred)
            last_hidden_states = outputs.last_hidden_state

        if mode=='TFhub' : # Bert Tensorflow Hub
            text_preprocessed = {"input_word_ids" : input_ids, 
                                 "input_mask" : attention_mask, 
                                 "input_type_ids" : token_type_ids}
            outputs = model(text_preprocessed)
            last_hidden_states = outputs['sequence_output']
             
        if step ==0 :
            last_hidden_states_tot = last_hidden_states
            last_hidden_states_tot_0 = last_hidden_states
        else :
            last_hidden_states_tot = np.concatenate((last_hidden_states_tot,last_hidden_states))
    
    features_bert = np.array(last_hidden_states_tot).mean(axis=1)
    
    time2 = np.round(time.time() - time1,0)
    print("temps traitement : ", time2)
     
    return features_bert, last_hidden_states_tot

In [30]:
max_length = 64
batch_size = 10
model_type = 'bert-base-uncased'
model = transformers.TFAutoModel.from_pretrained(model_type)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [31]:
X = data.iloc[:2000]

In [32]:
sentences = X['Body'].to_list()

In [33]:
# Création des features

features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, sentences, 
                                                         max_length, batch_size, mode='HF')

temps traitement :  28.0


In [34]:
features_bert.shape

(2000, 768)

In [35]:
features_bert

array([[-0.08041111, -0.00679683,  0.645215  , ..., -0.13352282,
         0.02891598,  0.04615709],
       [-0.03140354, -0.09737708,  0.53651834, ..., -0.2158382 ,
        -0.09667631,  0.11912355],
       [-0.29708138,  0.05941759, -0.04106521, ..., -0.06608517,
         0.12023851,  0.22784819],
       ...,
       [-0.03171147, -0.23251066,  0.30946395, ..., -0.36356553,
         0.05622838, -0.00628305],
       [-0.24331787, -0.2569276 ,  0.01086113, ..., -0.33381334,
        -0.15886006,  0.32918283],
       [-0.30160648,  0.21060966,  0.15644309, ..., -0.39124644,
        -0.11557137,  0.31652117]], dtype=float32)

In [36]:
X_train, X_test, Y_train, Y_test = train_test_split(features_bert, y[:2000], random_state=21, train_size=0.8)

In [37]:
clf = OneVsRestClassifier(LinearSVC(), n_jobs=1)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
jaccard_score(Y_test, pred, average='macro')



0.16368632943420716

In [38]:
resultats['Bert'] = jaccard_score(Y_test, pred, average='macro')

## Sur les titres

In [39]:
max_length = 64
batch_size = 10
model_type = 'bert-base-uncased'
model = transformers.TFAutoModel.from_pretrained(model_type)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [40]:
sentences = data['Title'].iloc[:3000].to_list()

In [41]:
# Création des features

features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, sentences, 
                                                         max_length, batch_size, mode='HF')

temps traitement :  47.0


In [42]:
features_bert.shape

(3000, 768)

In [43]:
features_bert

array([[-0.11041918, -0.28854597,  0.24364178, ..., -0.17264582,
        -0.21411662,  0.11876289],
       [-0.31735778, -0.05406343,  0.4588824 , ..., -0.13966443,
         0.063547  ,  0.25379562],
       [-0.06042207, -0.03452685, -0.13640599, ...,  0.06004938,
        -0.06353816,  0.20839533],
       ...,
       [-0.0997923 , -0.33459848,  0.06694856, ..., -0.01030366,
         0.27915046,  0.22758228],
       [-0.12001015, -0.43744138,  0.09285793, ..., -0.29207155,
         0.10852643,  0.10941215],
       [-0.3182044 , -0.23519686, -0.09940531, ..., -0.10990528,
         0.25867304,  0.09447138]], dtype=float32)

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(features_bert, y[:3000], random_state=21, train_size=0.8)

In [45]:
clf = OneVsRestClassifier(LinearSVC(), n_jobs=1)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
jaccard_score(Y_test, pred, average='macro')



0.2139875046952236

In [46]:
resultats['Bert_sur_les_titres'] = jaccard_score(Y_test, pred, average='macro')

# USE

In [47]:
import tensorflow_hub as hub

In [48]:
def feature_USE_fct(sentences, b_size) :
    batch_size = b_size
    time1 = time.time()

    for step in range(len(sentences)//batch_size) :
        idx = step*batch_size
        feat = embed(sentences[idx:idx+batch_size])

        if step ==0 :
            features = feat
        else :
            features = np.concatenate((features,feat))

    time2 = np.round(time.time() - time1,0)
    return features

In [49]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [50]:
batch_size = 10

In [51]:
X = data.iloc[:2000]
sentences = X['Body'].to_list()

In [52]:
features_USE = feature_USE_fct(sentences, batch_size)

In [53]:
features_USE.shape

(2000, 512)

In [55]:
X_train, X_test, Y_train, Y_test = train_test_split(features_USE, y[:2000], random_state=21, train_size=0.8)

In [56]:
clf = OneVsRestClassifier(LinearSVC(), n_jobs=1)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
jaccard_score(Y_test, pred, average='macro')

0.28434121425943565

In [57]:
resultats['USE'] = jaccard_score(Y_test, pred, average='macro')

### Sur les titres

In [59]:
sentences = data['Title'].iloc[:4000].to_list()

In [60]:
features_USE = feature_USE_fct(sentences, batch_size)

In [61]:
X_train, X_test, Y_train, Y_test = train_test_split(features_USE, y[:4000], random_state=21, train_size=0.8)

In [62]:
clf = OneVsRestClassifier(LinearSVC(), n_jobs=1)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
jaccard_score(Y_test, pred, average='macro')

0.3591963055297009

In [63]:
resultats['USE_titres'] = jaccard_score(Y_test, pred, average='macro')

In [64]:
resultats

{'Word2Vec': 0.10699639347410564,
 'Bert': 0.16368632943420716,
 'Bert_sur_les_titres': 0.2139875046952236,
 'USE': 0.28434121425943565,
 'USE_titres': 0.3591963055297009}