<a href="https://colab.research.google.com/github/LPDPasAI/autoencoder_nlp_cond/blob/main/autoencoderNlp_cond.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')  # Mount Google Drive so the notebook can access files under /content/drive

Mounted at /content/drive


In [None]:
import tensorflow


In [None]:
import csv
import nltk
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model, Sequential, Input
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.models import model_from_json

In [None]:
import os

In [None]:
# Root folder on the mounted Google Drive; the functions below build the
# dataset, embedding and model paths by appending to it.
BASE_PATH = '/content/drive/My Drive/COLAB'
# List the folder contents as a quick check that the path is reachable.
print(os.listdir(BASE_PATH))

['embedding', 'dataset', 'models']


In [None]:
# Default sentence length: used as the default padding length for sequences and
# as the default `len_seq` folder component in save_model / load_model.
SENTENCE_LENGTH = 15

In [None]:
# Width of the embedding vectors; also selects the GloVe file glove.6B.<dim>d.txt.
EMBEDDING_DIM = 50
# Language code used as a sub-folder for both the embeddings and the saved models.
LANGUAGE = 'en'
# NOTE(review): not referenced anywhere else in the visible notebook — confirm whether it is still needed.
WORD_LENGTH = 10
# CSV file (looked up under BASE_PATH + "/dataset/") from which the vocabulary is extracted.
SENTENCES_SELECTED = 'sentences_selected.csv'
# Maximum number of words kept by word_extractor_first_words; also passed as `num_words` to the Tokenizer.
NUM_WORDS = 24000

In [None]:
def np_array_from_csv(sentences_file_csv, ):
    """
    Read a CSV file from the dataset folder and return tokenized sentences and their labels.

    Column 0 of every row is tokenized with ``nltk.word_tokenize``; column 1 is
    kept as-is as the label (presumably the verb of the sentence, judging by the
    original local name ``verbo`` — TODO confirm against the dataset).
    Rows with fewer than two columns (e.g. blank lines) are skipped.

    :param sentences_file_csv: file name, resolved under ``BASE_PATH + "/dataset/"``
    :return: tuple ``(X_np, y_np)``; ``X_np`` is an object array holding one token
             list per row, ``y_np`` is an array with the matching labels
    """
    X = []
    y = []

    with open(BASE_PATH + "/dataset/" + sentences_file_csv, 'r', encoding='utf-8', newline='') as file_dataset:
        reader = csv.reader(file_dataset, delimiter=',')
        for riga in reader:
            if len(riga) < 2:
                # Blank or incomplete row: nothing to tokenize or no label.
                continue
            # The sentence and the label come from the current row, not from literal lists.
            X.append(nltk.word_tokenize(riga[0]))
            y.append(riga[1])

    # Token lists have different lengths, so an object array is requested explicitly
    # (recent numpy versions refuse to build ragged arrays implicitly).
    X_np = np.array(X, dtype=object)
    y_np = np.array(y)
    return X_np, y_np


In [None]:
def get_tokenizer_and_padded_sequences(X_np, sentence_length=SENTENCE_LENGTH) -> "tuple[Tokenizer, np.ndarray]":
    """
    Fit a Keras ``Tokenizer`` on the given texts and convert them to padded index sequences.

    :param X_np: the texts (or token lists) to fit the tokenizer on and to convert
    :param sentence_length: value passed as ``maxlen`` to ``pad_sequences``
    :return: tuple ``(tokenizer, X_index)`` with the fitted tokenizer and the
             padded integer sequences
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_np)
    print("Numero Parole:", len(tokenizer.word_index))
    X_index = tokenizer.texts_to_sequences(X_np)
    X_index = pad_sequences(X_index, maxlen=sentence_length)

    return tokenizer, X_index


In [None]:
def get_model(word_index, sentence_length=SENTENCE_LENGTH, embeddings_index=None, embedding_dim=EMBEDDING_DIM, trainable=False, language=LANGUAGE) -> "Model":
    """
    Build (without compiling) a Sequential classifier on top of a pretrained embedding layer.

    Layers, in order: the embedding layer returned by ``get_embeddings_layer``,
    two ``Dense`` layers of ``embedding_dim * 5`` and ``embedding_dim * 4`` units
    (no activation argument is passed), ``Flatten``, and a bias-free softmax
    ``Dense`` layer with one unit per entry of ``word_index``.

    :param word_index: dictionary mapping each word to its integer index
    :param sentence_length: number of tokens per input sequence
    :param embeddings_index: optional word -> vector dictionary, forwarded to ``get_embeddings_layer``
    :param embedding_dim: size of the embedding vectors
    :param trainable: whether the embedding weights are updated during training
    :param language: language code, forwarded to ``get_embeddings_layer``
    :return: the assembled model; the caller is responsible for compiling it
    """
    model = Sequential()
    embedding_layer = get_embeddings_layer(sentence_length, word_index, embeddings_index, embedding_dim, trainable, language)
    model.add(embedding_layer)
    model.add(Dense(embedding_dim * 5))
    model.add(Dense(embedding_dim * 4))
    model.add(Flatten())
    # The output has len(word_index) units, so class ids must lie in
    # [0, len(word_index)) — one less than the number of embedding rows.
    model.add(Dense(len(word_index), activation="softmax", use_bias=False))
    # summary() prints by itself and returns None, so it must not be wrapped in print().
    model.summary()
    return model


In [None]:
def diz_from_list(words):
    """
    Build a dictionary that maps every word to its position in ``words``.

    Positions start at 0. When a word occurs more than once, the position of its
    last occurrence is kept.

    :param words: iterable of hashable items (typically strings)
    :return: dictionary ``{word: position}``
    """
    return {word: position for position, word in enumerate(words)}



In [None]:
def get_embeddings_index(embedding_dim=EMBEDDING_DIM, language=LANGUAGE, word_index=None):
    """
    Load pretrained GloVe vectors into a ``word -> vector`` dictionary.

    The file read is
    ``BASE_PATH/embedding/<language>/glove/glove.6B.<embedding_dim>d.txt``; each
    line holds a word followed by its coefficients, separated by whitespace.

    :param embedding_dim: vector size, used to pick the GloVe file
    :param language: language code, used as a folder name
    :param word_index: optional container of words; when given, only words
                       contained in it are loaded
    :return: dictionary mapping each loaded word to a float32 numpy vector
    """
    embeddings_index = {}
    glove_path = BASE_PATH + "/embedding/"+language+"/glove/glove.6B."+str(embedding_dim)+"d.txt"

    with open(glove_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            elementi = line.split()
            if not elementi:
                # Blank line: there is no word to read.
                continue
            word = elementi[0]
            if word_index is not None and word not in word_index:
                continue
            try:
                embeddings_index[word] = np.asarray(elementi[1:], dtype='float32')
            except ValueError as e:
                # A malformed line is reported and skipped on its own, so it no
                # longer discards every line that follows it.
                print("Skipping malformed embedding line", line_number, ":", e)

    return embeddings_index



In [None]:
def get_embeddings_layer(sentence_length, word_index, embeddings_index=None, embedding_dim=EMBEDDING_DIM, trainable=False, language=LANGUAGE):
    """
    Build the Embedding layer whose rows are the pretrained vectors from
    ``embeddings_index``, placed at the indices given by ``word_index``.

    Words that have no pretrained vector keep an all-zero row.

    :param sentence_length: length of the input sentences
    :param word_index: dictionary built from the texts, mapping each word to its index
    :param embeddings_index: dictionary extracted from a pretrained embedding, with the word
                             as key and its vector as value; when omitted it is obtained by
                             calling get_embeddings_index
    :param embedding_dim: size of the vectors of the embedding in use
    :param trainable: when True this layer is trained too, so the vectors get modified
    :param language: language code forwarded to get_embeddings_index
    :return: the Embedding layer
    """
    if embeddings_index is None:
        embeddings_index = get_embeddings_index(embedding_dim, language, word_index)

    # One extra row on top of the vocabulary size.
    vocabulary_size = len(word_index) + 1
    weights_matrix = np.zeros((vocabulary_size, embedding_dim))

    for word, row in word_index.items():
        pretrained_vector = embeddings_index.get(word)
        if pretrained_vector is None:
            continue
        weights_matrix[row] = pretrained_vector

    return Embedding(vocabulary_size, embedding_dim, weights=[weights_matrix], input_length=sentence_length, trainable=trainable)



In [None]:
def one_hot_encode(sequence_of_index, n_unique=0):
    """
    One-hot encode a sequence of integer indices.

    :param sequence_of_index: sized sequence of integer indices
    :param n_unique: width of every one-hot vector; when 0 (default) it is
                     inferred as the larger of the sequence length and
                     ``max(sequence_of_index) + 1``, so every index fits
    :return: numpy array with one row per index, holding a single 1 at that index
    :raises IndexError: if an explicit ``n_unique`` is too small for some index
    """
    if n_unique == 0:
        n_unique = len(sequence_of_index)
        if n_unique:
            # The sequence length alone is too narrow when an index is >= the length.
            n_unique = max(n_unique, int(max(sequence_of_index)) + 1)

    vectors_sparse = []
    for value in sequence_of_index:
        vector = [0] * n_unique
        vector[value] = 1
        vectors_sparse.append(vector)
    return np.array(vectors_sparse)



In [None]:
def one_hot_decode(encoded_seq):
    """
    Turn a sequence of one-hot (or score) vectors back into indices.

    :param encoded_seq: iterable of vectors
    :return: list with the position of the maximum of each vector
    """
    decoded = []
    for vector in encoded_seq:
        decoded.append(np.argmax(vector))
    return decoded


In [None]:

def word_extractor_first_words(file_name, num_words: int):
    """
    Count the words in the first column of a CSV file and return the most frequent ones.

    Sentences are split on whitespace. Words are ranked by decreasing frequency;
    words with the same frequency keep the order in which they were first seen.

    :param file_name: CSV file name, resolved under ``BASE_PATH + "/dataset/"``
    :param num_words: maximum number of words to return
    :return: list with at most ``num_words`` words, most frequent first
    """
    words_freq = {}
    lines = 0
    with open(BASE_PATH + "/dataset/" + file_name, 'r', encoding='utf-8', newline='') as file_csv:
        reader = csv.reader(file_csv)
        for row in reader:
            lines += 1
            if row:  # blank lines yield an empty row with no sentence column
                for word in row[0].split():
                    # Real increment: the previous `freq = +freq` was a unary plus
                    # that left every count stuck at 1.
                    words_freq[word] = words_freq.get(word, 0) + 1
            if lines % 1000 == 0:
                print("Line:", lines)

    # Most frequent first, so truncating to num_words drops the rarest words.
    # NOTE(review): descending order is inferred from the function's purpose — confirm.
    words = sorted(words_freq, key=words_freq.get, reverse=True)
    print("Erano", len(words), "parole")
    words = words[:num_words]
    print("Selezionate le prime", len(words), "parole")
    return words



In [None]:
    # Build the vocabulary: take the selected dataset words and let the Keras
    # Tokenizer normalise them. The tokenizer's word_index can end up smaller
    # than the input list (the recorded run went from 24000 to 23860 words).
    words = word_extractor_first_words(SENTENCES_SELECTED, NUM_WORDS)
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(words)
    word_index = tokenizer.word_index
    words = list(word_index)
    print("**** Numero Parole:", len(words))
    # Show only a sample: printing the whole vocabulary floods the cell output.
    print(" Parole:", words[:20], "...")

    # Every word is its own class: the input is the word id and the target is
    # the one-hot vector with the same id.
    words_index = np.arange(len(words))
    # Identity matrix = one one-hot row per word. float32 halves the memory of
    # the default float64 matrix (len(words) x len(words) entries).
    vectors_sparse = np.eye(len(words_index), dtype='float32')

    # 0-based word -> id dictionary, aligned with words_index and vectors_sparse.
    word_index_diz = diz_from_list(words)

    # Sequences of length 1: the model sees a single word id per sample.
    model_10000 = get_model(word_index_diz, 1)
    model_10000.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
    history = model_10000.fit(words_index, vectors_sparse, epochs=100, verbose=1, batch_size=128)


Line: 1000
Line: 2000
Line: 3000
Line: 4000
Line: 5000
Line: 6000
Line: 7000
Line: 8000
Line: 9000
Line: 10000
Line: 11000
Line: 12000
Line: 13000
Line: 14000
Line: 15000
Line: 16000
Line: 17000
Line: 18000
Line: 19000
Line: 20000
Line: 21000
Line: 22000
Line: 23000
Line: 24000
<class 'list'>
Erano 24151 parole
Selezionate le prime 24000 parole
**** Numero Parole: 23860
<class 'dict_keys'>
<class 'tensorflow.python.keras.layers.embeddings.Embedding'>
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 50)             1193050   
_________________________________________________________________
dense (Dense)                (None, 1, 250)            12750     
_________________________________________________________________
dense_1 (Dense)              (None, 1, 200)            50200     
_______________________________________________________________

In [None]:
# Evaluate the model trained above on its own training data. `model` is never
# defined at notebook level, and the targets must be the one-hot rows used for
# training (the model is compiled with categorical_crossentropy).
print(model_10000.evaluate(words_index, vectors_sparse))

[0.023061025887727737, 0.9909909963607788]


In [None]:
def verifica_path(path):
    """
    Make sure the directory ``path`` exists, creating missing parents as needed.

    :param path: directory path to create if it is not there yet
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(path, exist_ok=True)

In [None]:
def save_model(model, dim_embedding=EMBEDDING_DIM, len_seq=SENTENCE_LENGTH, language=LANGUAGE, model_name='model'):
    """
    Save a model as a JSON architecture file plus an HDF5 weights file.

    Files are written to ``BASE_PATH/models/<language>/<len_seq>/`` and are named
    ``model_<model_name><dim_embedding>`` with the extensions ``.json`` and ``.h5``.

    :param model: the Keras model to save
    :param dim_embedding: embedding size, appended to the file name
    :param len_seq: sentence length, used as the last folder of the path
    :param language: language code, used as a folder of the path
    :param model_name: name placed between ``model_`` and the embedding size
    """
    # The folder depends on the language and on the sequence length.
    target_dir = "/".join((BASE_PATH + "/models", language, str(len_seq)))
    verifica_path(target_dir)
    # File name without extension, shared by the two files written below.
    file_stem = "".join((target_dir, "/model_", model_name, str(dim_embedding)))

    # Architecture only, serialized as JSON.
    architecture_json = model.to_json()
    with open(file_stem + ".json", "w") as architecture_file:
        architecture_file.write(architecture_json)

    # Weights, serialized to HDF5.
    model.save_weights(file_stem + ".h5")
    print("Saved model to disk")

In [None]:
# Store the trained model (JSON architecture + HDF5 weights) on Drive.
# NOTE(review): model_10000 was built with sentence_length=1 but is saved under
# the len_seq=SENTENCE_LENGTH folder, while load_model is later called with
# len_seq=1 — confirm which folder is intended.
save_model(model=model_10000, dim_embedding=50, len_seq=SENTENCE_LENGTH, model_name='corr_23860')

Saved model to disk


In [None]:
def load_model(dim_embedding=EMBEDDING_DIM, len_seq=SENTENCE_LENGTH, language=LANGUAGE, model_name='model'):
    """
    Retrieve a previously trained and saved model.

    The architecture is read from ``<path>.json`` and the weights from
    ``<path>.h5``, where ``<path>`` is
    ``BASE_PATH/models/<language>/<len_seq>/model_<model_name><dim_embedding>``.
    The model is then compiled with the adam optimizer.

    :param dim_embedding: the size of the embedding, appended to the file name
    :param len_seq: the length of the sentences, used as a folder of the path
    :param language: optional, the language (default: LANGUAGE)
    :param model_name: optional, the name of the model (default: 'model')
    :return: the compiled model, or None when it could not be loaded
    """
    path_models = BASE_PATH + "/models/"+language+"/"+str(len_seq)
    path_name_model = path_models+"/model_"+model_name+str(dim_embedding)
    print(path_name_model)
    try:
        # open() raises when the file is missing, so no extra None check is needed.
        with open(path_name_model+".json", 'r') as json_file:
            loaded_model_json = json_file.read()
        model = model_from_json(loaded_model_json)  # rebuild the architecture
        model.load_weights(path_name_model+".h5")  # restore the trained weights
        # NOTE(review): the training cell compiles with categorical_crossentropy;
        # binary_crossentropy here only matters for evaluate()/fit() — confirm.
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    except FileNotFoundError:
        print("Model not found:  " + path_name_model)
        return None
    except Exception as e:
        # Any other failure (invalid JSON, unreadable or incompatible weights) is
        # reported as such, and a half-initialised model is never returned.
        print("Model could not be loaded:  " + path_name_model + " (" + repr(e) + ")")
        return None

    print("Loaded model from disk: " + path_name_model)
    return model


In [None]:
# load_model itself prepends "model_" and appends the embedding size, so
# model_name="model_24000" resolves to ".../models/en/1/model_model_2400050".
model_24000 = load_model(50, 1, model_name="model_24000")

/content/drive/My Drive/COLAB/models/en/1/model_model_2400050
<class '_io.TextIOWrapper'>
<class 'str'>
<class 'tensorflow.python.keras.engine.sequential.Sequential'>
Loaded model from disk: /content/drive/My Drive/COLAB/models/en/1/model_model_2400050


In [None]:
# Show the files stored in the folder for language "en" and sequence length 1.
os.listdir(BASE_PATH + "/models/en/1/")

['model_model_2400050.h5',
 'model_model_2400050.json',
 'model_model_100050.json',
 'model_model_100050.h5']