In [1]:
import os
# Step up one directory so that the `src.*` imports in the next cell resolve.
# NOTE: this is relative to the current working directory, so re-running the
# cell moves up one more level each time.
os.chdir(os.pardir)

In [2]:
import numpy as np
import pandas as pd

from src.data import extract, load
from src.models import embedding as embedding_model, utils as model_utils
from src.tools.startup import settings
from src.tools import utils as tools_utils
import tensorflow as tf
from tensorboard.plugins import projector

2023-10-19 17:32:07 - INFO - startup.py - initialize_logger: Logger initialized


2023-10-19 17:32:08.121957: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-19 17:32:08.431976: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-19 17:32:08.432851: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Folder exported as SENTENCE_TRANSFORMERS_HOME; the SentenceTransformer log
# output further down shows models being looked up under this path.
sentence_transformers_cache = './.cache'
os.environ['SENTENCE_TRANSFORMERS_HOME'] = sentence_transformers_cache

## Constants

In [4]:
# Folders (read from the project settings)
volumes = settings['volumes']
interim_folder = os.path.join(volumes['interim'])
logs_folder = os.path.join(volumes['logs'], 'embeddings')

# Embedding types (the notebook works with the enum *values*)
embedding_types = embedding_model.EmbeddingType
multilingual = embedding_types.multilingual.value
multilingual_transformer = embedding_types.multilingual_transformer.value
separate_languages = embedding_types.separate_languages.value

# Language lists configured per embedding backend
embeddings_settings = settings['embeddings']
muse_languages = embeddings_settings['muse']['languages']
spacy_languages = embeddings_settings['spacy']['languages']

# TF-IDF method
tfidf_by_language = embedding_model.TfidfMethod.by_language.value

## Parameters

In [5]:
# Processed OCR word files, all stored under <interim_folder>/ocrs.
filenames = [
    os.path.join(interim_folder, 'ocrs', basename)
    for basename in (
        'all_words_bloc1.json',
        'all_words_bloc2.json',
        'all_words_bloc2_Revista.json',
        'all_words_bloc2_Revista nova.json',
    )
]

## Extract

In [6]:
# Every corpus is read from the same processed OCR files; only the language
# list passed to the reader differs.
multilingual_corpus_df = extract.read_processed_ocrs(muse_languages, filenames)
sep_lang_corpus_df = extract.read_processed_ocrs(spacy_languages, filenames)

# TODO: replace this stand-in with a function that extracts the full text.
# Until then the transformer corpus is read with the spaCy language list and
# its `ca_text` column is reused as `full_text`.
multilingual_transformer_df = extract.read_processed_ocrs(spacy_languages, filenames)
multilingual_transformer_df['full_text'] = multilingual_transformer_df['ca_text']

## Transform

In [7]:
from transformers import pipeline, AutoTokenizer
from sentence_transformers import SentenceTransformer, util as st_utils

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Chunk size handed to tools_utils.get_string_chunks in the next cell; the
# sample output below shows pieces of at most 10 characters.
sequence_len = 10

In [9]:
# Split each document's full text into chunks and store them in a new
# `texts_chunks` column (this adds the column to the frame in place).
full_texts = multilingual_transformer_df['full_text']
multilingual_transformer_df['texts_chunks'] = full_texts.apply(
    lambda text: tools_utils.get_string_chunks(text, sequence_len))

In [10]:
# Peek at the chunks produced for the first document.
multilingual_transformer_df['texts_chunks'].iloc[0]

['francis ad',
 'am abraham',
 ' alfred ab',
 'raham par ',
 'oliva vila',
 'nova casan',
 'ova']

In [11]:
# Chunk lists of the first ten documents, as a NumPy array.
texts = multilingual_transformer_df['texts_chunks'].iloc[:10].to_numpy()

In [12]:
# The same pretrained checkpoint name is used for the tokenizer and the model.
bert_checkpoint = 'bert-base-multilingual-cased'
transformer_settings = dict(
    task='feature-extraction',
    tokenizer=bert_checkpoint,
    model=bert_checkpoint,
)
feature_extraction = pipeline(**transformer_settings)

# Display the pipeline object to confirm what was built.
feature_extraction

<transformers.pipelines.feature_extraction.FeatureExtractionPipeline at 0x7f5e92da9ba0>

In [13]:
# Example sentence reused by the following cells.
sentence = 'Hola, Que tal?'

# Load the tokenizer on its own so the tokenisation can be inspected.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
encoded_seq = tokenizer(sentence)

In [14]:
# Show how the example sentence is split into tokens (word pieces carry a
# '##' prefix in the output below).
tokenizer.tokenize(sentence)

['Ho', '##la', ',', 'Que', 'tal', '?']

In [15]:
# Show the full encoding. `input_ids` has two more entries than the token list
# above (ids 101 and 102 at the ends) - presumably the special start/end
# tokens added by the tokenizer; confirm against the tokenizer docs.
encoded_seq

{'input_ids': [101, 20220, 10330, 117, 27158, 13675, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [62]:
# Run the feature-extraction pipeline on the example sentence. The next cell
# takes element [0] of the result and averages it over axis 0.
embeddings = feature_extraction(sentence)

In [63]:
# Keep the first element of the pipeline output as an array and average it
# over axis 0 to get a single vector for the sentence.
# NOTE: `embeddings` is rebound here, so this cell cannot be re-run without
# first re-running the cell that calls the pipeline.
embeddings = np.array(embeddings[0])
mean_embedding = embeddings.mean(axis=0)

In [64]:
# Cast to float32 before the similarity check below (presumably to match the
# dtype of the sentence-transformers vector - TODO confirm).
mean_embedding = mean_embedding.astype(np.float32)

In [67]:
# Load the same checkpoint through sentence-transformers. As the log line
# below reports, no ready-made sentence-transformers model exists under this
# name, so a new one is created with MEAN pooling.
model = SentenceTransformer('bert-base-multilingual-cased')

No sentence-transformers model found with name ./.cache/bert-base-multilingual-cased. Creating a new one with MEAN pooling.


In [68]:
# Encode the same example sentence with the sentence-transformers model.
st_embeddings = model.encode(sentence)

In [73]:
# Sanity check: cosine similarity between the manually mean-pooled pipeline
# embedding and the sentence-transformers embedding (1.0 in the saved output).
st_utils.cos_sim(mean_embedding, st_embeddings)

tensor([[1.0000]])

### Metadata

In [7]:
# revista_title_code = {
#     991001612299706717: 'instant',
#     991001624269706717: 'matrencada',
#     991001732409706717: 'amicarts',
#     991001813989706717: 'AC',
#     991002553879706717: 'Fulls grocs',
#     991003272219706717: 'algol',
#     991003294699706717: 'anti',
#     991005036609706717: 'iberia',
#     991005076959706717: 'esportcat',
#     991005105169706717: 'helix',
#     991005119309706717: 'monitor',
#     991006467819706717: 'arcvoltaic',
#     991006630279706717: 'themis',
#     991006631789706717: 'trocos',
#     991007018719706717: 'unenemicpob',
#     991010414779706717: 'dauset',
#     991011072099706717: '391',
#     991014134819706717: 'Proa',
#     991017182844906716: 'cobalto49',
#     991017182846406716: 'cobalto'   
# }

# selected_subjects = [
#     'Arquitectura',
#     'Art modern',
#     'Arts visuals',
#     'Avantguarda (Estètica)',
#     'Catalunya',
#     'Cultura',
#     'Dadaisme',
#     'Espanya',
#     'Esports',
#     'Guerra Mundial I, 1914-1918',
#     'Literatura catalana',
#     'Literatura francesa',
#     'Noucentisme (Art)',
#     'Poesia catalana',
#     'Poesia francesa',
#     'Segle XX'
# ]

# metadata_df['title_code'] = metadata_df['MMS Id'].map(revista_title_code)
# metadata_df['filtered_subjects'] = [[s for s in selected_subjects if s in subs]
#                                        for subs in metadata_df['Subjects']]
# metadata_df['subject_labels'] = metadata_df.filtered_subjects.str.join(',')

# metadata_df.head()

### Compute document embeddings

#### Multilingual

In [8]:
# Document embeddings for the 'multilingual' embedding type, weighted by
# TF-IDF using the by-language method.
multilingual_embedder_options = dict(
    embedding_type=multilingual,
    weight_by_tfidf=True,
    tfidf_method=tfidf_by_language,
)
multilingual_embedder = embedding_model.Embedder(
    multilingual_corpus_df, **multilingual_embedder_options)

multilingual_embedder.compute_doc_embeddings()

2023-08-16 12:37:32 - INFO - embedding.py - __init__: Running embedding type 'multilingual'...
2023-08-16 12:38:12 - INFO - embedding.py - compute_word_relevances: Running by_language for language 'ca'
2023-08-16 12:38:12 - INFO - embedding.py - _apply_tfidf: Running TF-IDF...
2023-08-16 12:38:15 - INFO - embedding.py - _apply_tfidf: Done.
2023-08-16 12:38:15 - INFO - embedding.py - compute_word_relevances: Running by_language for language 'es'
2023-08-16 12:38:15 - INFO - embedding.py - _apply_tfidf: Running TF-IDF...
2023-08-16 12:38:16 - INFO - embedding.py - _apply_tfidf: Done.
2023-08-16 12:38:17 - INFO - embedding.py - compute_word_relevances: Running by_language for language 'fr'
2023-08-16 12:38:17 - INFO - embedding.py - _apply_tfidf: Running TF-IDF...
2023-08-16 12:38:17 - INFO - embedding.py - _apply_tfidf: Done.
2023-08-16 12:38:17 - INFO - embedding.py - compute_word_relevances: Running by_language for language 'it'
2023-08-16 12:38:17 - INFO - embedding.py - _apply_tfidf:

  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])


2023-08-16 12:38:17 - INFO - embedding.py - _compute_cross_language: Computing embedding for document 391_19170605


  split_words = pd.Series(row[split_column])


2023-08-16 12:38:18 - INFO - embedding.py - _compute_cross_language: Computing embedding for document 391_19170706
2023-08-16 12:38:18 - INFO - embedding.py - _compute_cross_language: Computing embedding for document 391_19200312


  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])


2023-08-16 12:38:18 - INFO - embedding.py - _compute_cross_language: Computing embedding for document 391_19200713


  split_words = pd.Series(row[split_column])


2023-08-16 12:38:18 - INFO - embedding.py - _compute_cross_language: Computing embedding for document 391_19201114
2023-08-16 12:38:19 - INFO - embedding.py - _compute_cross_language: Computing embedding for document 391_19240516


  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])


2023-08-16 12:38:19 - INFO - embedding.py - _compute_cross_language: Computing embedding for document 391_19241019
2023-08-16 12:38:19 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC_193101


  split_words = pd.Series(row[split_column])


2023-08-16 12:38:19 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC_193102
2023-08-16 12:38:19 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC_193103
2023-08-16 12:38:20 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC_193104
2023-08-16 12:38:20 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC_193205
2023-08-16 12:38:20 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC_193206
2023-08-16 12:38:20 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC_193207
2023-08-16 12:38:21 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC_193208
2023-08-16 12:38:21 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC_193309
2023-08-16 12:38:21 - INFO - embedding.py - _compute_cross_language: Computing embedding for document AC

  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])


2023-08-16 12:38:34 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19481103
2023-08-16 12:38:34 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19481204
2023-08-16 12:38:34 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19490105
2023-08-16 12:38:34 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19490506


  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])


2023-08-16 12:38:34 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19490607
2023-08-16 12:38:35 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19490708


  split_words = pd.Series(row[split_column])


2023-08-16 12:38:35 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19491009


  split_words = pd.Series(row[split_column])


2023-08-16 12:38:35 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19500110


  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])


2023-08-16 12:38:35 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19500311
2023-08-16 12:38:36 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19500412
2023-08-16 12:38:36 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19500513


  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])


2023-08-16 12:38:36 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19500614
2023-08-16 12:38:36 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19500715


  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])


2023-08-16 12:38:36 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19500916
2023-08-16 12:38:37 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19501017


  split_words = pd.Series(row[split_column])


2023-08-16 12:38:37 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19501118
2023-08-16 12:38:37 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19501219


  split_words = pd.Series(row[split_column])
  split_words = pd.Series(row[split_column])


2023-08-16 12:38:37 - INFO - embedding.py - _compute_cross_language: Computing embedding for document dauset_19511220


  split_words = pd.Series(row[split_column])


2023-08-16 12:38:37 - INFO - embedding.py - _compute_cross_language: Computing embedding for document esportcat_19205117
2023-08-16 12:38:38 - INFO - embedding.py - _compute_cross_language: Computing embedding for document esportcat_19250401
2023-08-16 12:38:38 - INFO - embedding.py - _compute_cross_language: Computing embedding for document esportcat_19250402
2023-08-16 12:38:38 - INFO - embedding.py - _compute_cross_language: Computing embedding for document esportcat_19250403
2023-08-16 12:38:38 - INFO - embedding.py - _compute_cross_language: Computing embedding for document esportcat_19250404
2023-08-16 12:38:39 - INFO - embedding.py - _compute_cross_language: Computing embedding for document esportcat_19250505
2023-08-16 12:38:39 - INFO - embedding.py - _compute_cross_language: Computing embedding for document esportcat_19250506
2023-08-16 12:38:39 - INFO - embedding.py - _compute_cross_language: Computing embedding for document esportcat_19250507
2023-08-16 12:38:39 - INFO - emb

  split_words = pd.Series(row[split_column])


2023-08-16 12:39:39 - INFO - embedding.py - _compute_cross_language: Computing embedding for document instant_19190802
2023-08-16 12:39:39 - INFO - embedding.py - _compute_cross_language: Computing embedding for document instant_19190903
2023-08-16 12:39:39 - INFO - embedding.py - _compute_cross_language: Computing embedding for document instant_19190904
2023-08-16 12:39:39 - INFO - embedding.py - _compute_cross_language: Computing embedding for document instant_19191005
2023-08-16 12:39:39 - INFO - embedding.py - _compute_cross_language: Computing embedding for document matrencada_19241101
2023-08-16 12:39:40 - INFO - embedding.py - _compute_cross_language: Computing embedding for document matrencada_19241102
2023-08-16 12:39:40 - INFO - embedding.py - _compute_cross_language: Computing embedding for document matrencada_19241203
2023-08-16 12:39:40 - INFO - embedding.py - _compute_cross_language: Computing embedding for document matrencada_19241204
2023-08-16 12:39:40 - INFO - embeddi

  split_words = pd.Series(row[split_column])


2023-08-16 12:39:49 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19170402
2023-08-16 12:39:49 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19170603
2023-08-16 12:39:49 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19170704
2023-08-16 12:39:49 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19170805
2023-08-16 12:39:49 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19170906


  split_words = pd.Series(row[split_column])


2023-08-16 12:39:50 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19171107
2023-08-16 12:39:50 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19171108
2023-08-16 12:39:50 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19171209
2023-08-16 12:39:50 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19180110
2023-08-16 12:39:50 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19180211
2023-08-16 12:39:51 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19180212
2023-08-16 12:39:51 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19180513
2023-08-16 12:39:51 - INFO - embedding.py - _compute_cross_language: Computing embedding for document unenemicpob_19181014
2023-08-16 12:39

  split_words = pd.Series(row[split_column])


2023-08-16 12:40:01 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Gaseta de les Arts_19250117.pdf
2023-08-16 12:40:01 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Gaseta de les Arts_19250218.pdf
2023-08-16 12:40:01 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Gaseta de les Arts_19250219.pdf
2023-08-16 12:40:01 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Gaseta de les Arts_19250320.pdf
2023-08-16 12:40:02 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Gaseta de les Arts_19250321.pdf
2023-08-16 12:40:02 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Gaseta de les Arts_19250422.pdf
2023-08-16 12:40:02 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Gaseta de les Arts_19250423.pdf
2023-08-16 12:40:02 - INFO - embedding.py - _compute_cross_lan

  split_words = pd.Series(row[split_column])


2023-08-16 12:41:57 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Revista nova_19141131.pdf
2023-08-16 12:41:57 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Revista nova_19160532.pdf
2023-08-16 12:41:57 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Revista nova_19160533.pdf
2023-08-16 12:41:57 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Revista nova_19160634.pdf
2023-08-16 12:41:58 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Revista nova_19160635.pdf
2023-08-16 12:41:58 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Revista nova_19160736.pdf
2023-08-16 12:41:58 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Revista nova_19160737.pdf
2023-08-16 12:41:58 - INFO - embedding.py - _compute_cross_language: Computing embedding for document Re

#### Separate languages

In [None]:
# Document embeddings for the 'separate_languages' embedding type, weighted by
# TF-IDF using the by-language method.
sep_lang_embedder_options = dict(
    embedding_type=separate_languages,
    weight_by_tfidf=True,
    tfidf_method=tfidf_by_language,
)
sep_lang_embedder = embedding_model.Embedder(
    sep_lang_corpus_df, **sep_lang_embedder_options)

sep_lang_embedder.compute_doc_embeddings()

2023-08-16 12:42:00 - INFO - embedding.py - __init__: Running embedding type 'separate_languages'...
2023-08-16 12:42:07 - INFO - embedding.py - compute_word_relevances: Running by_language for language 'ca'
2023-08-16 12:42:07 - INFO - embedding.py - _apply_tfidf: Running TF-IDF...
2023-08-16 12:42:10 - INFO - embedding.py - _apply_tfidf: Done.
2023-08-16 12:42:10 - INFO - embedding.py - compute_word_relevances: Running by_language for language 'en'
2023-08-16 12:42:10 - INFO - embedding.py - _apply_tfidf: Running TF-IDF...
2023-08-16 12:42:10 - INFO - embedding.py - _apply_tfidf: Done.
2023-08-16 12:42:10 - INFO - embedding.py - compute_word_relevances: Running by_language for language 'es'
2023-08-16 12:42:10 - INFO - embedding.py - _apply_tfidf: Running TF-IDF...


#### Multilingual transformer

In [None]:
# Document embeddings for the multilingual-transformer embedding type, built
# from the frame that carries the `full_text` column, weighted by TF-IDF using
# the by-language method.
multilingual_transformer_embedder = embedding_model.Embedder(
    multilingual_transformer_df, 
    embedding_type=multilingual_transformer,
    weight_by_tfidf=True, 
    tfidf_method=tfidf_by_language)

# Fix: compute the embeddings on the embedder created in this cell. The
# original called `sep_lang_embedder.compute_doc_embeddings()` (copy-paste
# slip), which recomputed the separate-languages embeddings and left the
# transformer embedder without any document embeddings.
multilingual_transformer_embedder.compute_doc_embeddings()

## Tensorboard

#### Prepare data for Tensorboard

In [None]:
# Multilingual
embeddings, doc_ids = [], []
for doc_id, doc_embedding in multilingual_embedder.embeddings.items():
    doc_ids.append(doc_id)
    embeddings.append(doc_embedding)

multilingual_metadata_df = multilingual_corpus_df \
    .set_index('doc_id').loc[doc_ids]
multilingual_embeddings = np.array(embeddings)

# Separate languages
embeddings, doc_ids = [], []
for doc_id, doc_embedding in sep_lang_embedder.embeddings.items():
    doc_ids.append(doc_id)
    embeddings.append(doc_embedding)

sep_lang_metadata_df = sep_lang_corpus_df \
    .set_index('doc_id').loc[doc_ids]
sep_lang_embeddings = np.array(embeddings)

In [None]:
# Wrap both embedding matrices in named variables and save them in a single
# checkpoint under the embeddings log folder.
embedding_variables = {
    'multilingual_embeddings': tf.Variable(
        multilingual_embeddings, name='multilingual_embeddings'),
    'separate_languages_embeddings': tf.Variable(
        sep_lang_embeddings, name='separate_languages_embeddings'),
}
checkpoint = tf.train.Checkpoint(**embedding_variables)

checkpoint_prefix = os.path.join(logs_folder, "checkpoint_weighted_avg.ckpt")
checkpoint.save(checkpoint_prefix)

#### Metadata

In [None]:
# Metadata
metadata_columns = [
    'title',
    'doc_id',
    # 'langs',
    # 'subject_labels',
]

load.create_embeddings_metadata(
    multilingual_metadata_df.reset_index(), metadata_columns, 
    logs_folder, 'multilingual_embeddings.tsv')
load.create_embeddings_metadata(
    sep_lang_metadata_df.reset_index(), metadata_columns, 
    logs_folder, 'separate_languages_embeddings.tsv')

#### Configure TensorBoard

In [None]:
# Register each embedding set in the projector config together with its
# metadata file, which is named after the embedding set.
config = projector.ProjectorConfig()

for embedding_name in ('multilingual_embeddings', 'separate_languages_embeddings'):
    model_utils.add_embedding_to_projector_config(
        config, embedding_name, f'{embedding_name}.tsv')

projector.visualize_embeddings(logs_folder, config)