In [44]:
import io, json, os
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def load_word_embedding(embeddings_path, max_words=1000):
    """Read word vectors from a text embeddings file into a dictionary.

    The first line of the file is skipped. Every following line is split at
    its first space into a word and a space-separated list of numbers.

    Parameters
    ----------
    embeddings_path : str
        Path of the embeddings text file (read as UTF-8, undecodable bytes
        are dropped).
    max_words : int
        Reading stops as soon as this many words have been stored.

    Returns
    -------
    dict
        Maps each word to a 1-D numpy array, in file order.

    Raises
    ------
    AssertionError
        If the same word appears on two lines.
    """
    vectors_by_word = {}
    with io.open(embeddings_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        # The first line is a header, not a word vector.
        next(handle)
        for raw_line in handle:
            token, values = raw_line.rstrip().split(' ', 1)
            parsed = np.fromstring(values, sep=' ')
            assert token not in vectors_by_word, 'word found twice'
            vectors_by_word[token] = parsed

            if len(vectors_by_word) == max_words:
                break

    return vectors_by_word

A simple demo of how words in different languages relate to each other: for a given source word, list its closest words (smallest cosine distance) in every other loaded language.

In [3]:
def find_nearest_neighbors(source_word, source_language, embeddings_dicts, top_n = 10):
    """Print the closest words to `source_word` in every other language.

    Closeness is the cosine distance between word vectors, so smaller
    values mean closer words. Nothing is returned; results are printed.

    Parameters
    ----------
    source_word : str
        Word to look up; must be a key of `embeddings_dicts[source_language]`.
    source_language : str
        Key of `embeddings_dicts` that holds `source_word`.
    embeddings_dicts : dict
        Maps a language key to a dict of word -> vector.
    top_n : int
        Number of neighbours printed per language.

    Raises
    ------
    AssertionError
        If `source_word` is missing from the source language dictionary.
    """
    assert source_word in embeddings_dicts[source_language], 'word not found in source dict'

    query_vector = embeddings_dicts[source_language][source_word]
    for target_language, target_vectors in embeddings_dicts.items():
        # The source language itself is not searched.
        if target_language == source_language:
            continue

        distances = [(candidate, cosine(query_vector, vector))
                     for candidate, vector in target_vectors.items()]
        # Stable ascending sort: ties keep dictionary order.
        distances.sort(key=lambda pair: pair[1])

        print(f'\nTop {top_n} neighbors for {source_word} in {target_language}:')
        for candidate, distance in distances[:top_n]:
            print(f'{candidate} ({distance})')

In [4]:
# Language codes whose word-embedding files are loaded in the following cells.
languages = ['ca', 'es', 'it']

In [5]:
# One embeddings file per language code, named '<code>.txt'.
vectors_paths = {
    lang_code: f'../data/external/MUSE_word_embeddings/{lang_code}.txt'
    for lang_code in languages
}

In [6]:
# Load at most 500 word vectors from each language's embeddings file.
embeddings_dicts = {}
for lang, path in vectors_paths.items():
    embeddings_dicts[lang] = load_word_embedding(path, max_words=500)

In [7]:
# Print the nearest neighbours of the 'ca' word 'juliol' in every other loaded language.
find_nearest_neighbors('juliol', 'ca', embeddings_dicts)


Top 10 neighbors for juliol in es:
agosto (0.17332030813134447)
septiembre (0.18125311620064632)
junio (0.18204914911580217)
abril (0.18691855050942652)
enero (0.18771206002228202)
mayo (0.18822406549705029)
diciembre (0.18823122044500762)
noviembre (0.19571972482122646)
octubre (0.19804523841700405)
febrero (0.20352564006130414)

Top 10 neighbors for juliol in it:
giugno (0.17554887662923357)
luglio (0.17773827762604155)
maggio (0.17790697095248653)
settembre (0.17797054206645002)
agosto (0.17797642865963115)
marzo (0.1798108860278287)
aprile (0.1838171955828738)
ottobre (0.18482301804499845)
novembre (0.18843932543551611)
febbraio (0.195112209575663)


## Embedding creation

In [9]:
# Read the post-processed OCR words. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open('../data/interim/ocrs/all_words_Publicacions.json', 'r', encoding='utf-8') as f:
    postprocessed_ocrs = json.load(f)

In [10]:
def join_words(postprocessed_ocrs, language='all'):
    """Build one space-joined text per publication from its 'existing' words.

    Parameters
    ----------
    postprocessed_ocrs : dict
        Nested mapping revista -> publication id -> language -> dict with an
        'existing' list of words.
    language : str
        'all' concatenates the words of every language of a publication;
        any other value keeps only that language's words.

    Returns
    -------
    tuple
        `(texts, index_to_publication)`: the list of texts, and a dict that
        maps each position in `texts` to '<revista>_<publication id>'.
        Publications with no words are left out of both.
    """
    texts = []
    index_to_publication = {}
    for revista, publications in postprocessed_ocrs.items():
        for publication_id, by_language in publications.items():
            if language == 'all':
                tokens = [token
                          for entry in by_language.values()
                          for token in entry['existing']]
            elif language in by_language:
                tokens = by_language[language]['existing']
            else:
                tokens = []

            # Publications without words produce no text and no index entry.
            if len(tokens) == 0:
                continue

            index_to_publication[len(texts)] = f'{revista}_{publication_id}'
            texts.append(' '.join(tokens))
    return texts, index_to_publication

In [11]:
def apply_tfidf(texts, index_to_publication, max_df=0.8, min_df=1):
    """Compute per-document TF-IDF scores for every word of every text.

    Parameters
    ----------
    texts : list of str
        One text per document.
    index_to_publication : dict
        Maps the position of a text in `texts` to its publication key.
    max_df, min_df : float or int
        Passed unchanged to `TfidfVectorizer`.

    Returns
    -------
    dict
        Maps each publication key to a dict of word -> TF-IDF score, ordered
        by decreasing score. Words containing a digit and words with a score
        of zero are left out.
    """
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df)
    tfidf_matrix = vectorizer.fit_transform(texts)

    # `get_feature_names()` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Whether a feature contains a digit does not depend on the document,
    # so compute it once instead of once per document.
    has_digit = [any(char.isdigit() for char in name) for name in feature_names]

    word_relevances = {}
    for i, tfidf_vector in enumerate(tfidf_matrix):

        tfidf_vector = tfidf_vector.toarray().flatten()
        sorted_indices = np.argsort(tfidf_vector, axis=None)
        # Walk the indices from highest to lowest score.
        word_relevances[index_to_publication[i]] = {
            feature_names[index]: tfidf_vector[index]
            for index in reversed(sorted_indices)
            if (not has_digit[index]) and (tfidf_vector[index] > 0)}

    return word_relevances

The `method` argument of `compute_word_relevances` can be:

- cross_language: compute frequencies and relevance scores over all words together, regardless of their language
- by_language: compute frequencies and relevance scores separately for each language (any value other than `cross_language` selects this behaviour)

In [12]:
def compute_word_relevances(postprocessed_ocrs, max_df=0.8, min_df=1, 
                            method='cross_language', languages=['es','ca','it','fr']):
    """Compute TF-IDF word relevances, either over all languages or per language.

    Parameters
    ----------
    postprocessed_ocrs : dict
        Nested OCR mapping, passed to `join_words`.
    max_df, min_df : float or int
        Passed unchanged to `apply_tfidf`.
    method : str
        'cross_language' builds a single corpus with the words of every
        language; any other value builds one corpus per entry of `languages`.
    languages : list of str
        Languages processed when `method` is not 'cross_language'. The
        default list is only read, never modified.

    Returns
    -------
    dict
        With 'cross_language': `{'cross_language': relevances}`. Otherwise
        one entry per language. A corpus without any text yields an empty
        dict for its key instead of being passed to the vectorizer.
    """
    if method == 'cross_language':
        # `join_words` treats 'all' as "concatenate every language".
        corpora = {'cross_language': 'all'}
    else:
        corpora = {lang: lang for lang in languages}

    word_relevances = {}
    for key, language in corpora.items():
        texts, index_to_publication = join_words(postprocessed_ocrs, language)
        if not texts:
            # Fitting TF-IDF on zero documents raises; a language with no
            # text simply has no relevances.
            word_relevances[key] = {}
            continue
        word_relevances[key] = apply_tfidf(texts, index_to_publication, max_df=max_df, min_df=min_df)

    return word_relevances

In [13]:
def _infer_embedding_dim(embeddings_dicts, default=300):
    """Return the length of the first word vector found, or `default` if there is none."""
    for vectors in embeddings_dicts.values():
        for vector in vectors.values():
            return len(vector)
    return default


def create_doc_embeddings(postprocessed_ocrs, embeddings_dicts, weight_by_tfidf=True, 
                          max_df=0.8, min_df=1, tfidf_method='cross_language', 
                          languages=['es','ca','it','fr']):
    """Build one embedding per publication as a weighted mean of its word vectors.

    Each distinct 'existing' word of a publication that has a vector in the
    embeddings of its language contributes once. Languages without an entry
    in `embeddings_dicts` are ignored.

    Parameters
    ----------
    postprocessed_ocrs : dict
        Nested mapping revista -> publication id -> language -> dict with an
        'existing' list of words.
    embeddings_dicts : dict
        Maps a language key to a dict of word -> vector. All vectors are
        expected to share one length; it is taken from the first vector
        found (300 when there are no vectors at all).
    weight_by_tfidf : bool
        If True, each word is weighted by its TF-IDF relevance for the
        publication (0 when no relevance is available); if False every word
        has weight 1.
    max_df, min_df, tfidf_method, languages
        Forwarded to `compute_word_relevances`; only used when
        `weight_by_tfidf` is True.

    Returns
    -------
    dict
        Maps '<revista>_<publication id>' to a 1-D numpy array. Publications
        whose total weight is zero keep an all-zero vector.
    """
    if weight_by_tfidf:
        word_relevances = compute_word_relevances(postprocessed_ocrs,
                                                  max_df=max_df,
                                                  min_df=min_df,
                                                  method=tfidf_method,
                                                  languages=languages)

    # Size the accumulator from the data instead of assuming 300 dimensions.
    embedding_dim = _infer_embedding_dim(embeddings_dicts)

    doc_embeddings = {}
    for revista, publications in postprocessed_ocrs.items():
        for publication_id, by_language in publications.items():
            doc_key = f'{revista}_{publication_id}'
            embedding = np.zeros(embedding_dim)
            normalizer = 0
            for lang, entry in by_language.items():
                if lang not in embeddings_dicts:
                    continue
                vectors = embeddings_dicts[lang]

                if weight_by_tfidf:
                    # Relevances are stored under 'cross_language' or under the
                    # language key; a missing level means weight 0 for every word.
                    relevance_key = 'cross_language' if tfidf_method == 'cross_language' else lang
                    doc_relevances = word_relevances.get(relevance_key, {}).get(doc_key, {})

                # `set` makes every distinct word count once per language.
                for word in set(entry['existing']):
                    if word not in vectors:
                        continue
                    weight = doc_relevances.get(word, 0) if weight_by_tfidf else 1
                    embedding += weight * vectors[word]
                    normalizer += weight

            if normalizer > 0:
                embedding /= normalizer
            doc_embeddings[doc_key] = embedding

    return doc_embeddings

In [75]:
# Unweighted document embeddings: with weight_by_tfidf=False every word has weight 1,
# so tfidf_method, max_df and min_df are passed but not used by create_doc_embeddings.
doc_embeddings = create_doc_embeddings(postprocessed_ocrs, embeddings_dicts, weight_by_tfidf=False,
                                       tfidf_method='cross_language', max_df=0.8, min_df=3)

## Metadata

In [76]:
# Maps the numeric 'MMS Id' of each record in the metadata CSV to the short
# title code used to identify that magazine in this notebook.
revista_title_code = {
    991001612299706717: 'instant',
    991001624269706717: 'matrencada',
    991001732409706717: 'amicarts',
    991001813989706717: 'AC',
    991002553879706717: 'Fulls grocs',
    991003272219706717: 'algol',
    991003294699706717: 'anti',
    991005036609706717: 'iberia',
    991005076959706717: 'esportcat',
    991005105169706717: 'helix',
    991005119309706717: 'monitor',
    991006467819706717: 'arcvoltaic',
    991006630279706717: 'themis',
    991006631789706717: 'trocos',
    991007018719706717: 'unenemicpob',
    991010414779706717: 'dauset',
    991011072099706717: '391',
    991014134819706717: 'Proa',
    991017182844906716: 'cobalto49',
    991017182846406716: 'cobalto'   
}

In [77]:
# Subjects kept when filtering each record's 'Subjects' field; a subject is
# kept for a record when it occurs as a substring of that field.
selected_subjects = [
    'Arquitectura',
    'Art modern',
    'Arts visuals',
    'Avantguarda (Estètica)',
    'Catalunya',
    'Cultura',
    'Dadaisme',
    'Espanya',
    'Esports',
    'Guerra Mundial I, 1914-1918',
    'Literatura catalana',
    'Literatura francesa',
    'Noucentisme (Art)',
    'Poesia catalana',
    'Poesia francesa',
    'Segle XX'
]

In [78]:
# Load the ';'-separated metadata table.
metadata_df = pd.read_csv("../data/raw/metadata/registres_metaglam_columnes.csv", 
                                          delimiter = ';')
# Translate each 'MMS Id' into its short title code; an id missing from
# revista_title_code raises KeyError.
metadata_df['title_code'] = [revista_title_code[mms_id] for mms_id in metadata_df['MMS Id']]
# Keep, per record, the selected subjects that occur as substrings of 'Subjects'.
# NOTE(review): assumes every row has a string in 'Subjects' (no missing values) - confirm.
metadata_df['filtered_subjects'] = [[s for s in selected_subjects if s in subs]
                                       for subs in metadata_df['Subjects']]
metadata_df.head()

Unnamed: 0,MMS Id,URL Catàleg BC,Author (contributor),Title (Complete),Uniform Title,Publication Place,Place Code,Publisher,Publication Date,Begin Publication Date,End Publication Date,ISSN,Description,Notes_1,Notes_2,Subjects,Uniform Resource Identifier,title_code,filtered_subjects
0,991001612299706717,https://explora.bnc.cat/permalink/34CSUC_BC/4c...,"Biblioteca de Catalunya.; Pérez-Jorbà, J. 1878...",L'Instant [Recurs electrònic] : revue franco-c...,Instant (En línia).,París :,sp,sn,1918-1919,1918,1919,2604-4706; 2479-0710,,Fundada i dirigida per: Joan Pérez-Jorba ; col...,,Avantguarda (Estètica); Literatura catalana; P...,https://arca.bnc.cat/arcabib_pro/ca/consulta/r...,instant,"[Avantguarda (Estètica), Catalunya, Literatura..."
1,991001624269706717,https://explora.bnc.cat/permalink/34CSUC_BC/4c...,Biblioteca de Catalunya.; Ateneu Barcelonès.; ...,La Mà trencada [Recurs electrònic] : revista q...,Mà trencada (En línia).,Barcelona :,sp,Edicions Joan Merli,1924-1925,1924,1925,2604-5117; 1695-2960,,Dirigida per: Joan Merli ; col·laboradors: J. ...,,Avantguarda (Estètica); Literatura catalana; P...,https://arca.bnc.cat/arcabib_pro/ca/consulta/r...,matrencada,"[Art modern, Avantguarda (Estètica), Catalunya..."
2,991001732409706717,https://explora.bnc.cat/permalink/34CSUC_BC/4c...,Biblioteca de Catalunya.; Barcelona (Catalunya...,L'Amic de les arts [Recurs electrònic] : gaset...,"Amic de les arts (Sitges, Catalunya : 1926 : E...",Sitges :,spc,sn,1926-1929,1926,1929,2564-9671; 1135-8270,,Dirigida per: Josep Carbonell i Gener ; col·la...,,Avantguarda (Estètica); Literatura catalana; C...,https://arca.bnc.cat/arcabib_pro/ca/consulta/r...,amicarts,"[Avantguarda (Estètica), Catalunya, Literatura..."
3,991001813989706717,https://explora.bnc.cat/permalink/34CSUC_BC/4c...,GATCPAC.; Grupo de Arquitectos y Técnicos Espa...,AC : documentos de actividad contemporánea,,"Barcelona, Madrid, San Sebástian :",sp,GATEPAC,1931-1937,1931,1937,1579-1580,25 núm. : il. ; 27 cm,Subtítol del núm. 25 (juny 1937): documents d'...,,Arquitectura; Catalunya; Espanya; Revistes; Ar...,http://hemerotecadigital.bne.es/details.vm?q=i...,AC,"[Arquitectura, Catalunya, Espanya]"
4,991002553879706717,https://explora.bnc.cat/permalink/34CSUC_BC/4c...,Col·lecció de Premsa Agustí Pedro Pons (Biblio...,Fulls grocs,,[Barcelona :,spc,sn,1929],1929,1929,,1 núm. ; 55 cm,Només publiquen un núm.; Col·laboradors: G. Dí...,,Avantguarda (Estètica)--Catalunya--Revistes.,,Fulls grocs,"[Avantguarda (Estètica), Catalunya]"


In [79]:
# Title code -> list of filtered subjects (a later row wins if a title code repeats).
subjects_dict = dict(zip(metadata_df['title_code'], metadata_df['filtered_subjects']))

## Tensorboard

In [80]:
from tensorboard.plugins import projector
import tensorflow as tf

In [81]:
# Directory that receives the TensorBoard metadata, checkpoint and projector config.
log_dir='../reports/tensorboard/logs/embeddings/'
# exist_ok=True creates the directory when missing without a separate,
# race-prone existence check.
os.makedirs(log_dir, exist_ok=True)

In [82]:
embeddings = []
metadata = []
# The subject labels contain non-ASCII characters, so the encoding is pinned
# instead of relying on the platform default.
with open(os.path.join(log_dir, 'metadata.tsv'), "w", encoding='utf-8') as f:
    f.write('revista\tpub_id\tsubjects\n')
    for k in doc_embeddings:
        # Vectors are collected in the same order as the metadata rows.
        embeddings.append(doc_embeddings[k])
        # Keys are '<revista>_<publication id>'; split on the first underscore
        # only, so a publication id that itself contains '_' is kept whole.
        revista, pub_id = k.split('_', 1)
        subjects_label = ' '.join(subjects_dict[revista])
        f.write(f'{revista}\t{pub_id}\t{subjects_label}\n')

In [83]:
# Stack the collected document vectors into a single array (one row per document,
# in the order they were appended).
embeddings = np.array(embeddings)
# Wrap the array in a TensorFlow variable and save it as a checkpoint inside log_dir,
# under the attribute name 'embedding'.
weights = tf.Variable(embeddings)
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

'../reports/tensorboard/logs/embeddings/embedding.ckpt-1'

In [84]:
# Describe the saved embedding tensor and its metadata file for the projector.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# File name of the metadata written earlier into log_dir.
embedding.metadata_path = 'metadata.tsv'
# Register the projector configuration for log_dir.
projector.visualize_embeddings(log_dir, config)

In [85]:
%load_ext tensorboard
os.environ['TENSORBOARD_BINARY'] = '/home/luca.piras/.local/bin/tensorboard'

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [86]:
# Start (or reuse) TensorBoard on the embeddings log directory.
# NOTE(review): --bind_all presumably serves on every network interface - confirm this exposure is intended.
%tensorboard --logdir ../reports/tensorboard/logs/embeddings --bind_all

Reusing TensorBoard on port 6006 (pid 17782), started 1:26:34 ago. (Use '!kill 17782' to kill it.)