<a href="https://colab.research.google.com/github/JeMigli/src/blob/master/inf8460_tp3_A20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# École Polytechnique de Montréal
# Département Génie Informatique et Génie Logiciel

# INF8460 – Traitement automatique de la langue naturelle - TP3

# Objectifs d’apprentissage
 • Utiliser des plongements lexicaux pré-entrainés pour de la classification
 
 • Entrainer des plongements lexicaux de type word2vec
 
 • Implanter des modèles de classification neuronaux

## Équipe et contributions 
Veuillez indiquer la contribution effective de chaque membre de l'équipe en pourcentage et en indiquant les modules ou questions sur lesquelles chaque membre a travaillé

Vincent Dandenault: 33% (détail)

Dominique Piché: 33% (détail)

Jérémie Miglierina: 33% (détail)

# Librairies externes

In [4]:
import gensim
import io
import nltk
import numpy as np
import os
import pandas as pd
import requests
import sklearn
import sklearn.naive_bayes
import tensorflow as tf
import time
from typing import Dict
import zipfile
import nltk
nltk.download('stopwords')
nltk.download("wordnet")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Téléchargement et lecture des données

In [5]:
DATA_PATH = os.path.join(os.getcwd(), "aclImdb")

## Téléchargement

In [6]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz
!rm aclImdb_v1.tar.gz
!echo Done!

--2020-10-18 14:24:35--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-10-18 14:24:37 (42.5 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]

Done!


In [7]:
def download_wikipedia_embeddings() -> None:
    if not os.path.exists(os.path.join(os.getcwd(), "model.txt")):
        res = requests.get("http://vectors.nlpl.eu/repository/11/3.zip")
        with zipfile.ZipFile(io.BytesIO(res.content)) as z:
            z.extractall("./")
        os.remove(os.path.join(os.getcwd(), "3.zip"))
        os.remove(os.path.join(os.getcwd(), "meta.json"))
        os.remove(os.path.join(os.getcwd(), "model.bin"))
        os.remove(os.path.join(os.getcwd(), "README"))

## Lecture

In [8]:
def read_data(path):
    traintest = ['train', 'test']
    classes = ['pos', 'neg']
    corpus = {cls: [] for cls in classes}

    # Each data is a list of strings(reviews)
    reviews = []
    labels = []
    for cls in classes:
        dir_path = os.path.join(path, cls)
        
        for filename in os.listdir(dir_path):
            file = os.path.join(dir_path, filename)
            with open(file, encoding = 'utf-8') as f:
                corpus[cls].append(f.read().replace("\n", " "))
        
    return corpus

In [9]:
train_data = read_data(os.path.join(DATA_PATH, 'train'))
test_data = read_data(os.path.join(DATA_PATH, 'test'))

In [10]:
def create_wikipedia_embeddings(word_indices: Dict[str, int], vocab_len: int) -> np.ndarray:
    with open("./model.txt", "r", encoding="UTF-8") as f:
        shape_string = f.readline()
        lines = f.readlines() 
        
    embedding = np.zeros((vocab_len, 300), dtype=float)
    for line in lines:
        splitted_line = line.split(" ")
        word = splitted_line[0].split("_")[0]
        if word in word_indices and word_indices[word] < vocab_len:
            embedding_line = splitted_line[1:]
            embedding[word_indices[word]] = list(map(float, embedding_line))
        
    return embedding

## Prétraitement

In [11]:
class Preprocess(object):
    def __init__(self, lemmatize=True):
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.lemmatize = lemmatize

    def preprocess_pipeline(self, data):
        clean_tokenized_data = self._clean_doc(data)
        if self.lemmatize:
            clean_tokenized_data = self._lemmatize(clean_tokenized_data)

        return clean_tokenized_data

    def _clean_doc(self, data):
        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
        return [
            [
                token.lower()
                for token in tokenizer.tokenize(review)
                if token.lower() not in self.stopwords
                and len(token) > 1
                and token.isalpha()
                and token != "br]"
            ]
            for review in data
        ]

    def _lemmatize(self, data):
        lemmatizer = nltk.stem.WordNetLemmatizer()
        return [[lemmatizer.lemmatize(word) for word in review] for review in data]

    def convert_to_reviews(self, tokenized_reviews):
        reviews = []
        for tokens in tokenized_reviews:
            reviews.append(" ".join(tokens))

        return reviews

In [12]:
pre = Preprocess()

train_pos = pre.preprocess_pipeline(train_data["pos"])
train_neg = pre.preprocess_pipeline(train_data["neg"])
test_pos = pre.preprocess_pipeline(test_data["pos"])
test_neg = pre.preprocess_pipeline(test_data["neg"])

y_train = [1] * len(train_pos) + [0] * len(train_neg)
y_test = [1] * len(test_pos) + [0] * len(test_neg)
X_train = [" ".join(sentence) for sentence in train_pos + train_neg]
X_test = [" ".join(sentence) for sentence in test_pos + test_neg]

print("{} training sentences: {} pos and {} neg".format(len(X_train), len(train_pos), len(train_neg)))
print("{} test sentences: {} pos and {} neg".format(len(X_test), len(test_pos), len(test_neg)))

25000 training sentences: 12500 pos and 12500 neg
25000 test sentences: 12500 pos and 12500 neg


In [13]:
print(X_test[0])

commenter stated movie worst ever forced upon child say though loved movie little still love today movie best running theme family togetherness considering time period movie released thought movie acted well wish could still find copy somewhere film watched kid favorite know probably watched least week brother mom would definitely recommend anyone know know find copy suggest watching wonderful heartwarming


# 1. Entrainement de plongements lexicaux

Vous devez réaliser les étapes suivantes:

## a) Utiliser Gensim pour entrainer un modèle word2vec sur le corpus. 

In [14]:


from gensim import utils

class MyCorpusXTrain(object):
    """An interator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in X_train:
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)


class MyCorpusXTest(object):
    """An interator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in X_test:
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)




In [15]:
import gensim.models

sentences = MyCorpusXTrain()
model = gensim.models.Word2Vec(sentences=sentences,size = 256,min_count=3, window=5, workers=4)



KeyboardInterrupt: ignored

In [None]:

print(model.wv.vectors.shape)
print(y_test)
print(X_test[0])

In [None]:
# Python3 program to count words 
# in a given string
OUT = 0
IN = 1

# Returns number of words in string
def countWords(string):
	state = OUT
	wc = 0

	# Scan all characters one by one
	for i in range(len(string)):

		# If next character is a separator, 
		# set the state as OUT
		if (string[i] == ' ' or string[i] == '\n' or
			string[i] == '\t'):
			state = OUT

		# If next character is not a word 
		# separator and state is OUT, then 
		# set the state as IN and increment 
		# word count
		elif state == OUT:
			state = IN
			wc += 1

	# Return the number of words
	return wc

# Driver Code
string = X_train[400]
print("No. of words : " + str(countWords(string)))

# This code is contributed by BHAVYA JAIN


## b) Décrire les paramètres du ou des modèles entraînés, leur taille sur disque, le nombre de mots encodés, le temps d'entraînement, etc.

Taille des vecteurs

In [None]:
model.vector_size 

Taille sur le disque


In [None]:
size_model = model.estimate_memory()
total_size_model = size_model["total"]
print("Taille en byte: " + str(total_size_model))

Nombre de mot encodé

In [None]:
print (str(model.corpus_total_words) + " mots encodés") ## Je sais pas vraiment si sa retourne les mots encod/


Temps d'entrainenement:

In [None]:

print(str(model.total_train_time) + " secondes")

## c) Décrire le cas échéant et de manière précise tout problème que vous avez eu à obtenir votre modèle et les façons de résoudre ces problèmes.

## d) Retrouvez les 5 mots voisins des mots suivants : excellent, terrible

In [None]:
print("Les 5 mots voisin de excellent avec leur score de rapprochement:")
result = model.similar_by_word("excellent")
for x in range(6):
  print("{}: {:.4f}".format(*result[x]))

In [None]:
print("Les 5 mots voisin de terrible avec leur score de rapprochement:")
result = model.wv.similar_by_word("terrible")
for x in range(6):
  print("{}: {:.4f}".format(*result[x]))

In [None]:
print(model.wv.syn0)
model

# 2. Classification avec des plongements lexicaux

On vous demande d’effectuer de la classification avec les plongements lexicaux obtenus.

## a) En reprenant le code développé dans le TP1 avec Scikitlearn, on vous demande cette fois de tester un modèle Naïve Bayes et de régression logistique avec des n-grammes (n=1,2,3 ensemble). Essayez de voir si une réduction de dimension améliore la classification. Ne fournissez que votre meilleur modèle. Evaluez vos algorithmes selon les métriques d’accuracy générale et de F1 par classe sur l’ensemble de test.

In [None]:
# Import pipeline
from sklearn.pipeline import Pipeline

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Import other preprocessing modules
#from sklearn.preprocessing import Imputer
from sklearn.feature_selection import chi2, SelectKBest

# Select 300 best features
chi_k = 300

# Import functional utilities
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler
from sklearn.pipeline import FeatureUnion

# Perform preprocessing
#get_text_data = FunctionTransformer(lambda x: X_train, validate=False)

# Create the token pattern: TOKENS_ALPHANUMERIC
# Instantiate pipeline: pl
pipe_log = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('text_features', Pipeline([
                    ('vectorizer', CountVectorizer(ngram_range=(1,3))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('scale', MaxAbsScaler()),
        ('clf', LogisticRegression())
    ])

pipe_log.fit(X_train, y_train)
pipe_log.score(X_test, y_test)

In [None]:
pipe_bayes = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('text_features', Pipeline([
                    ('vectorizer', CountVectorizer(ngram_range=(1,3))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('scale', MaxAbsScaler()),
        ('clf', MultinomialNB())
    ])

pipe_bayes.fit(X_train, y_train)
pipe_bayes.score(X_test, y_test)

## b) En utilisant Tensorflow (ou Pytorch), on vous demande de développer un classificateur perceptron multicouches et un bi-LSTM avec les vecteurs d’un modèle word2vec pré-entrainé sur Wikipédia en Anglais (enwiki_upos_skipgram_300_3_2019) disponible à http://vectors.nlpl.eu/repository/11/3.zip. 

On s’attend à ce que vous effectuiez une moyenne des vecteurs de mots de chaque document pour obtenir un plongement du document.  

Evaluez vos algorithmes selon les métriques d’accuracy générale et de F1 par classe sur l’ensemble de test. Pour chacun des modèles, indiquez ses performances et ses spécifications (nombre d’époques, régularisation, optimiseur, nombre de couches, etc.). N’hésitez pas à expérimenter avec différents paramètres. Vous ne devez reporter que votre meilleure expérimentation.

In [45]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


bi-LSTM

In [16]:
import gensim.models

sentences_X_train = MyCorpusXTrain()
model_X_train = gensim.models.Word2Vec(sentences=sentences_X_train,size = 256,min_count=3, window=5, workers=4)

# sentences_X_test = MyCorpusXTest()
# model_X_test = gensim.models.Word2Vec(sentences=sentences_X_test,size = 256,min_count=3, window=5, workers=4)

In [72]:


## Setup


import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer

max_features = 33058  # Only consider the top 20k words
maxlen = 150  # Only consider the first 200 words of each movie review


## Build the model


# # Input for variable-length sequences of integers
# inputs = keras.Input(shape=(None,), dtype="int32")
# # Embed each integer in a 128-dimensional vector
# x = layers.Embedding(max_features, 128)(inputs)
# # Add 2 bidirectional LSTMs
# x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
# x = layers.Bidirectional(layers.LSTM(64))(x)
# # Add a classifier
# outputs = layers.Dense(1, activation="sigmoid")(x)
# model = keras.Model(inputs, outputs)
# model.summary()


model_bi_lstm_word2vec = keras.Sequential(
    [
        layers.Embedding(max_features, 128),
        layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(64)),
      layers.Dense(1, activation="sigmoid")
     
    ]
)
model_bi_lstm_word2vec.summary()


word_index_vocab = {key:indx for indx,key in enumerate(list(model_X_train.wv.vocab.keys()))}
tokenizer = Tokenizer()
tokenizer.word_index = word_index_vocab
X_sequence =  tokenizer.texts_to_sequences(X_train)
X_sequences = tokenizer.texts_to_sequences(X_test)

print(type(X_sequence[0]))

##print(len(X_val), "Validation sequences")
# Use pad_sequence to standardize sequence length:
# this will truncate sequences longer than 200 words and zero-pad sequences shorter than 200 words.
x_train = keras.preprocessing.sequence.pad_sequences(X_sequence, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(X_sequences, maxlen=maxlen)



## Train and evaluate the model

model_bi_lstm_word2vec.compile("adam", "binary_crossentropy", metrics=["accuracy",f1_m, precision_m, recall_m])
model_bi_lstm_word2vec.fit(x_train, np.array(y_train), batch_size=32, epochs=10, validation_split=0.15)
metrics_bi_lstm_word2vec = model_bi_lstm_word2vec.evaluate(x_val, np.array(y_test), verbose=0)

Model: "sequential_28"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, None, 128)         4231424   
_________________________________________________________________
bidirectional_24 (Bidirectio (None, None, 128)         98816     
_________________________________________________________________
bidirectional_25 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dense_69 (Dense)             (None, 1)                 129       
Total params: 4,429,185
Trainable params: 4,429,185
Non-trainable params: 0
_________________________________________________________________
<class 'list'>
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


MLP


In [73]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Activation, Dense
from keras.preprocessing.text import Tokenizer

max_features = 33058  # Only consider the top 20k words
maxlen = 150  # Only consider the first 200 words of each movie review



model_MLP_word2vec = keras.Sequential(
    [
        layers.Embedding(max_features, 128),
        layers.Dense(16, activation='relu'),
        layers.Dense(8, activation='relu'),
        layers.Dense(1, activation="sigmoid")
     
    ]
)
model_MLP_word2vec.summary()

word_index_vocab = {key:indx for indx,key in enumerate(list(model_X_train.wv.vocab.keys()))}
tokenizer = Tokenizer()
tokenizer.word_index = word_index_vocab
X_sequence =  tokenizer.texts_to_sequences(X_train)
X_sequences = tokenizer.texts_to_sequences(X_test)

print(type(X_sequence[0]))

##print(len(X_val), "Validation sequences")
# Use pad_sequence to standardize sequence length:
# this will truncate sequences longer than 200 words and zero-pad sequences shorter than 200 words.
x_train = keras.preprocessing.sequence.pad_sequences(X_sequence, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(X_sequences, maxlen=maxlen)



## Train and evaluate the model

model_MLP_word2vec.compile("adam", "binary_crossentropy", metrics=["accuracy",f1_m, precision_m, recall_m])
model_MLP_word2vec.fit(x_train, np.array(y_train), batch_size=32, epochs=10, validation_data=(x_val, np.array(y_test)))
metrics_MLP_word2vec = model_MLP_word2vec.evaluate(x_val, np.array(y_test), verbose=0)


Model: "sequential_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, None, 128)         4231424   
_________________________________________________________________
dense_70 (Dense)             (None, None, 16)          2064      
_________________________________________________________________
dense_71 (Dense)             (None, None, 8)           136       
_________________________________________________________________
dense_72 (Dense)             (None, None, 1)           9         
Total params: 4,233,633
Trainable params: 4,233,633
Non-trainable params: 0
_________________________________________________________________
<class 'list'>
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [74]:
print(accuracy_MLP_word2vec)

[0.6810118556022644, 0.5432206392288208, 0.9805772304534912, 0.4996356666088104, 26.38811492919922]


## c) Ré-entrainez les modèles en b) avec vos propres vecteurs. Comparez maintenant la performance obtenue en en b) avec celles que vous obtenez en utilisant vos propres vecteurs de mots entrainés sur le corpus. 

## d) Générez une table ou un graphique qui regroupe les performances des modèles, leurs spécifications, la durée d’entraînement et commentez ces résultats. Quelle est l’influence des word embeddings sur les performances?  Quel est votre meilleur modèle ?

In [75]:
from prettytable import PrettyTable
    
x = PrettyTable()

x.field_names = ["Model","Accuracy", "F1","Recall","Precision", "# epoch"," #layers", "optimizer", "loss", "loss function"]

x.add_row(["Bayes", "","","" ,"" ,10,"","","",""])
x.add_row(["Bi-LSTM with Word2Vec", metrics_bi_lstm_word2vec[1],metrics_bi_lstm_word2vec[2], metrics_bi_lstm_word2vec[4], metrics_bi_lstm_word2vec[3],10,"4","Adam",metrics_bi_lstm_word2vec[0],"Binary Crossentropy"])
x.add_row(["Multi-Layer Perceptron with Word2Vec", metrics_MLP_word2vec[1],metrics_MLP_word2vec[2], metrics_MLP_word2vec[4], metrics_MLP_word2vec[3],10,"4","Adam",metrics_MLP_word2vec[0],"Binary Crossentropy"])
x.add_row(["Bi-LSTM with our vectors ", "","","" ,"" ,10,"","","",""])
x.add_row(["Multi-Layer Perceptron with our vectors", "","","" ,"" ,10,"","","",""])



print(x)

+-----------------------------------------+--------------------+--------------------+---------------------+--------------------+---------+----------+-----------+--------------------+---------------------+
|                  Model                  |      Accuracy      |         F1         |        Recall       |     Precision      | # epoch |  #layers | optimizer |        loss        |    loss function    |
+-----------------------------------------+--------------------+--------------------+---------------------+--------------------+---------+----------+-----------+--------------------+---------------------+
|                  Bayes                  |                    |                    |                     |                    |    10   |          |           |                    |                     |
|          Bi-LSTM with Word2Vec          | 0.818880021572113  | 0.467944473028183  | 0.44077685475349426 | 0.4998837411403656 |    10   |    4     |    Adam   | 0.8546251058578491