In [1]:
import json 
data = [json.loads(line) for line in open('subset_well_parsed.json', 'r')]

In [2]:
spanish_compatible_data = [d for d in data if d["fulltext_html_es"] is not None]
len(spanish_compatible_data)

996

In [3]:
spanish_compatible_data[0].keys()

dict_keys(['_id', 'ab_pt', 'fulltext_html_es', 'fulltext_pdf_en', 'ab', 'ti_en', 'journal_title', 'wok_subject_categories', 'keyword_es', 'fulltext_html_pt', 'da', 'subject_area', 'au', 'sponsor', 'available_languages', 'keyword_en', 'ti_es', 'arquivo', 'fulltext_html_en', 'fulltext_pdf_es', 'ti_pt', 'doi', 'ab_es', 'la', 'fulltext_pdf_pt', 'ur', 'wok_citation_index', 'keyword_pt', 'version', 'ti', 'ab_en', 'type', 'use_license', 'ta', 'body_content_en', 'body_content_es', 'body_content_pt'])

In [4]:
scientific_text_data = list()
for i,d in enumerate(spanish_compatible_data):
    spanish_text = str(d["body_content_es"])
    if "body_content_en" not in d:
        continue
    english_text = str(d["body_content_en"])
    article_object = {"en":english_text,"es":spanish_text}
    scientific_text_data.append(article_object)
scientific_text_data[0]

{'en': '{\'Description of evidence collection method\': [\'We conducted searches in multiple databases (Medline 1965-2012; Cochrane Library,               Lilacs) and cross-references with the collected material to identify studies with               better methodological design, followed by a critical evaluation of their contents and               classification according to the strength of evidence.\', \'We conducted searches between August and December 2012. The following strategies were               used for searches in PubMed:\', \'"regional anaesthesia" OR "anesthesia, conduction" OR "anesthesia" AND                     "conduction" OR "conduction anesthesia" OR "regional" AND "anesthesia" OR                     "regional anesthesia" AND "antithrombotic";\', \'"regional anaesthesia" OR "anesthesia, conduction" [MeSH Terms] AND "infection"                     [MeSH Terms] AND "thromboembolism" [MeSH Terms] OR "thromboembolism" [All                     Fields];\', \'"thromboemboli

In [1]:
" ".join([])

''

In [5]:
import json
with open("scientific_text_data.json","w") as scd:
    json.dump(scientific_text_data,scd)

In [1]:
import json
scientific_text_data = list()
with open("scientific_text_data.json","r") as st:
    scientific_text_data = json.load(st)
len(scientific_text_data)

995

In [2]:
from nltk.corpus import stopwords
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from unidecode import unidecode
import numpy as np


class TextPreparation(object):
    def __init__(self,text_series,cv_params={"min_df":0.001,"max_df":0.7},language="english"):
        self.text_series = text_series
        self.cv_params = cv_params
        self.language=language
        language_model_name_dict = {"english":"en_core_web_sm","spanish":"es_core_news_sm"}
        self.model = spacy.load(language_model_name_dict[self.language], disable = ['parser','ner'])

    def _get_stopwords(self):
        nltk.download("stopwords")
        english_stopwords = stopwords.words(self.language)
        self.stopwords = english_stopwords

    
    def _clean_text(self,text):
        self._get_stopwords()
        lower_text = text.str.lower()
        no_accents = lower_text.apply(unidecode)
        alpha_text =  no_accents.str.replace(r"(@\[a-z]+)|([^a-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "")
        nostops = alpha_text.apply(lambda x: " ".join([word for word in x.split() if word not in self.stopwords]))
        return nostops
    
    def _lemmatize_words(self,text):
        def _lemmatize_doc(doc):
            doc_obj = self.model(doc)
            lemmatized = " ".join([token.lemma_ for token in doc_obj])
            return lemmatized
        return text.apply(_lemmatize_doc)
    
    def _filter_words(self,text):
        cv = CountVectorizer(min_df=self.cv_params["min_df"],max_df=self.cv_params["max_df"])
        cv = CountVectorizer(max_features=10000)
        cv.fit(text)
        filtered_text = text.apply(lambda x: " ".join([word for word in x.split() if word in cv.vocabulary_]))
        return filtered_text
    
    def _indexes_to_eliminate(self,text):
        indexes_keep = np.where(text.str.len() >=0)
        self.indexes_to_keep = indexes_keep
        text = text.loc[indexes_keep]
        return text
    
    
    def prepare_text(self,pipeline=["clean","lemmatize","filter","keep"]):
        functions = {"clean":self._clean_text,"filter":self._filter_words,"lemmatize":self._lemmatize_words,"keep":self._indexes_to_eliminate}
        text = self.text_series
        for step in pipeline:
            text = functions[step](text)
        
        return text



In [1]:
import gc;gc.collect()
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
scientific_text_df = pd.read_csv("parallel_UN_100w.csv")
scientific_text_df.head()

Unnamed: 0,en,es
0,• awareness raising activities (seminar in Tur...,• actividades de sensibilización (seminario en...
1,The following outcomes have been broadly agree...,Los donantes han acordado los siguientes resul...
2,"Skills transferred, knowledge updated for part...",Transferencia de aptitudes Dy actualización de...
3,Operational new association of universities in...,Puesta en funcionamiento determinar el proyect...
4,"During the year, the following functions neede...",En el transcurso del año. ha sido necesa­rio c...


In [5]:
scientific_text_df.head()

Unnamed: 0,en,es
0,• awareness raising activities (seminar in Tur...,• actividades de sensibilización (seminario en...
1,The following outcomes have been broadly agree...,Los donantes han acordado los siguientes resul...
2,"Skills transferred, knowledge updated for part...",Transferencia de aptitudes Dy actualización de...
3,Operational new association of universities in...,Puesta en funcionamiento determinar el proyect...
4,"During the year, the following functions neede...",En el transcurso del año. ha sido necesa­rio c...


In [6]:
text_prep_en = TextPreparation(scientific_text_df.en)
prepared_text_en_science = text_prep_en.prepare_text()
prepared_text_en_science.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juan9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  alpha_text =  no_accents.str.replace(r"(@\[a-z]+)|([^a-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "")


0    awareness raise activity seminar turin follow ...
1    follow outcome broadly agree donor see consoli...
2    skill transfer knowledge update partner countr...
3    operational new association universitie specif...
4    year follow function need add system creation ...
Name: en, dtype: object

In [7]:
prepared_text_en_science

0        awareness raise activity seminar turin follow ...
1        follow outcome broadly agree donor see consoli...
2        skill transfer knowledge update partner countr...
3        operational new association universitie specif...
4        year follow function need add system creation ...
                               ...                        
19811    respect comparative vocational follow differen...
19812    order able assess however consideration must g...
19813    preparation planning building site carry pract...
19814    planning duty type carry germany practical tem...
19815    consequently also planning activity instead pr...
Name: en, Length: 19816, dtype: object

In [8]:
text_prep_es = TextPreparation(scientific_text_df.es,language="spanish")
prepared_text_es_science = text_prep_es.prepare_text()
prepared_text_es_science.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juan9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  alpha_text =  no_accents.str.replace(r"(@\[a-z]+)|([^a-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "")


0    actividad sensibilizacion seminario turin segu...
1    donante acordado siguiente resultado general v...
2    transferencia aptitud actualizacion conocimien...
3    puesto funcionamiento determinar proyecto obje...
4    transcurso ano ser necesario completar sistema...
Name: es, dtype: object

In [9]:
prepared_text_es_science.values[0]

'actividad sensibilizacion seminario turin seguido curso practico nacional especifico visto potenciar intercambio informacion acerca mecanismo instrumento desarrollado ue mejorar transparencia titulo academico reforzar intercambio conocimiento miembro ue país asociado acerca respectivo sistema titulacion academico certificacion estudios determinacion posible manera lograr especialista país asociado participar iniciativa red desarrollado ue fomento transparencia emprender amplio debate país asociado recomendación surgido estudio caso anteriormente mencionado'

In [10]:
from sentence_transformers import SentenceTransformer

In [15]:
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
import numpy as np

class TextEmbeddingGenerator(object):

    def __init__(self,texts,model='all-MiniLM-L6-v2'):
        self.model = model
        self.texts = texts
        self.kept_indices = list()

    def _load_transformer(self):
        self.transformer = SentenceTransformer(self.model)

    def unload_transformer(self):
        del self.transformer


    def _generate_text_embeddings(self):
        text_embeddings = list()
        for i,text in self.texts.items():
            tokenized_sentences = sent_tokenize(text)
            if len(tokenized_sentences) > 0:
                sentence_embeddings = self.transformer.encode(tokenized_sentences)
                averaged_sentences = sentence_embeddings.mean(axis=0)
                text_embeddings.append(averaged_sentences)
                self.kept_indices.append(i)
            else:
                continue
            
        return np.array(text_embeddings)
    

    def calculate_embeddings(self):
        self._load_transformer()
        embeddings = self._generate_text_embeddings()
        return embeddings
    

        
        

In [16]:
embedding_gen = TextEmbeddingGenerator(prepared_text_en_science,"paraphrase-multilingual-MiniLM-L12-v2")

In [17]:
embeddings = embedding_gen.calculate_embeddings()

In [18]:
embedding_gen.unload_transformer()

In [19]:
del embedding_gen
gc.collect()

147

In [20]:
embeddings.shape

(19474, 384)

In [50]:
from sklearn.mixture import GaussianMixture
import numpy as np

class TopicModel(object):

    def __init__(self,n_topics=50):
        self.model = GaussianMixture(n_components=n_topics,random_state=777)


    def _fit_model(self,embeddings):
        self.model.fit(embeddings)

    
    def _select_topics(self,embeddings,save=True):
        probs = self.model.predict_proba(embeddings)
        topics = [np.where(p > 0) for p in probs]
        if save:
            self.topics = topics
            self.probs = probs.tolist()
    

    def get_topics(self,embeddings,save=True):
        if save == True:
            self._fit_model(embeddings)
        self._select_topics(embeddings,save)
        return self.topics

In [51]:
topic_model  = TopicModel()
topics = topic_model.get_topics(embeddings)

In [52]:
topics

[(array([ 3,  9, 11, 14, 24, 34, 38, 48], dtype=int64),),
 (array([ 3, 11, 14, 18, 24, 34], dtype=int64),),
 (array([ 3,  9, 11, 14, 34], dtype=int64),),
 (array([ 3,  9, 11, 14, 24, 28, 34], dtype=int64),),
 (array([ 3, 11, 28, 34, 38], dtype=int64),),
 (array([1], dtype=int64),),
 (array([10], dtype=int64),),
 (array([ 3, 11, 14, 17, 18, 24], dtype=int64),),
 (array([ 3,  7,  9, 11, 14, 16, 17, 18, 24, 38, 48], dtype=int64),),
 (array([10], dtype=int64),),
 (array([10], dtype=int64),),
 (array([10], dtype=int64),),
 (array([ 3,  7,  9, 11, 14, 15, 16, 17, 18, 24, 34, 38, 48], dtype=int64),),
 (array([ 3,  9, 11, 14, 16, 17, 18, 24, 38], dtype=int64),),
 (array([10], dtype=int64),),
 (array([10], dtype=int64),),
 (array([14, 18, 24, 38], dtype=int64),),
 (array([ 3,  9, 11, 28, 38], dtype=int64),),
 (array([10], dtype=int64),),
 (array([10], dtype=int64),),
 (array([10], dtype=int64),),
 (array([10], dtype=int64),),
 (array([ 3,  7,  9, 11, 14, 16, 17, 18, 24, 34, 38, 48], dtype=int64

In [53]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize


class TopicUtilities(object):

    def __init__(self,coherence_mode="c_npmi",n_topics=20,topk=10):
        self.coherence_mode = coherence_mode
        self.n_topics = n_topics
        self.topk = topk


    def create_utility_objects(self,data):
        self.tokenized_word_sentences = [word_tokenize(s) for s in data.values]
        self.id2word = corpora.Dictionary(self.tokenized_word_sentences)

    

    def get_top_topic_tokens(self,topics):
        topic_top_n = list()
        for topic in range(self.n_topics):
            topic_indexes = [i for i,t in enumerate(topics) for p in t if topic in p]
            topic_sentences = [t for i,t in enumerate(self.tokenized_word_sentences) if i in topic_indexes]
            all_topic_words = [word for sentence in topic_sentences for word in sentence]
            frequency = FreqDist(all_topic_words)
            top_n = frequency.most_common(self.topk)
            top_n_words = [t[0] for t in top_n]
            topic_top_n.append(top_n_words)
        
        self.topic_top_n = topic_top_n
        return topic_top_n
    
    def get_topic_diversity(self,top_tokens):
        unique_words = set()
        for topic in range(self.n_topics):
            unique_words = unique_words.union(set(top_tokens[topic]))
        diversity = len(unique_words) / (self.topk * self.n_topics)
        return diversity

        

    

    def get_coherence(self,top_tokens):
        cm = CoherenceModel(topics=top_tokens,texts = self.tokenized_word_sentences, dictionary=self.id2word, coherence=self.coherence_mode,processes=1,topn=10)
        coherence = cm.get_coherence()
        return coherence

    

    



In [54]:
gc.collect()

79

In [55]:
utils = TopicUtilities()
utils.create_utility_objects(prepared_text_en_science)
top_tokens = utils.get_top_topic_tokens(topics)
coherence = utils.get_coherence(top_tokens)

In [56]:
coherence

0.06940398332433

In [43]:
diversity = utils.get_topic_diversity(top_tokens)
diversity

0.395

In [29]:
embedding_gen = TextEmbeddingGenerator(prepared_text_es_science,"paraphrase-multilingual-MiniLM-L12-v2")
spa_embeddings = embedding_gen.calculate_embeddings()
embedding_gen.unload_transformer()
del embedding_gen

In [30]:
gc.collect(
)

20

In [31]:
spanish_topics = topic_model.get_topics(spa_embeddings,save=False)

In [32]:
utils = TopicUtilities()
utils.create_utility_objects(prepared_text_es_science)
top_tokens = utils.get_top_topic_tokens(spanish_topics)
coherence = utils.get_coherence(top_tokens)

In [33]:
coherence

0.01455248030633817

In [34]:
diversity = utils.get_topic_diversity(top_tokens)
diversity

0.3

In [35]:
def average_topic_matching(english_topics,spanish_topics):
    matches= list()
    for i in range(len(english_topics)):
        english_document_topics = english_topics[i]
        spanish_document_topics = spanish_topics[i]
        match = np.array_equal(english_document_topics,spanish_document_topics)
        matches.append(match)
    return np.array(matches).mean()

In [36]:
average_topic_matching(topics,spanish_topics)

1.0

In [37]:
from collections import Counter

In [86]:
def get_tokenized_word_sentences(texts):
    return [word_tokenize(s) for s in texts]


def get_topic_counters(tokenized_word_sentences):
    topic_counters = list()
    for topic in range(20):
                topic_indexes = [i for i,t in enumerate(topics) for p in t if topic in p]
                topic_sentences = [t for i,t in enumerate(tokenized_word_sentences) if i in topic_indexes]
                all_topic_words = [word for sentence in topic_sentences for word in sentence]
                frequency = Counter(all_topic_words)
                topic_counters.append(frequency)
    return topic_counters

In [87]:
tokenized_word_sentences_en = get_tokenized_word_sentences(prepared_text_en_science.values)

In [88]:
topic_counters = get_topic_counters(tokenized_word_sentences_en)

In [89]:
all_words_en = [word for sentence in tokenized_word_sentences_en for word in sentence]
frequency_en = Counter(all_words_en)
len(frequency_en.keys())

10000

In [90]:
def calculate_word_probabilities(topic_counters,vocab_size,beta=0.5):
    word_dict_probs = list()
    for topic in topic_counters:
        word_probabilities = dict()
        denominator = ( sum(list(topic.values())) + (vocab_size * beta))
        for word,counter in topic.items():
            numerator = counter + beta
            prob = numerator / denominator
            word_probabilities[word] = prob
        
        word_dict_probs.append(word_probabilities)
    return word_dict_probs




In [91]:
word_probabilities_en = calculate_word_probabilities(topic_counters,len(frequency_en.keys()))

In [92]:
def calculate_word_vectors(word_probabilities,vocabulary):
    word_norms = dict()
    word_vectors = dict()
    for word in vocabulary:
        word_norm_total = 0
        for topic,word_topic_probabilities in enumerate(word_probabilities):
            if word in word_topic_probabilities:
                word_norm_total += word_topic_probabilities[word]
        word_norms[word] = word_norm_total
        word_vector = list()
        for topic,word_topic_probabilities in enumerate(word_probabilities):
            if word in word_topic_probabilities:
                word_vector.append(word_topic_probabilities[word])
            else:
                word_vector.append(0)
        word_vector_array = np.array(word_vector) / word_norms[word]
        word_vectors[word] = word_vector_array
    return word_vectors

In [93]:
word_vectors_en = calculate_word_vectors(word_probabilities_en,list(frequency_en.keys()))
word_vectors_en

{'description': array([0.        , 0.08978368, 0.01772622, 0.08035089, 0.04293376,
        0.08253212, 0.07319536, 0.03350046, 0.05397497, 0.08293647,
        0.05243161, 0.02270763, 0.0506701 , 0.04968555, 0.04361044,
        0.06082912, 0.01689122, 0.03684404, 0.07809644, 0.03129991]),
 'evidence': array([0.02176367, 0.03070796, 0.03869138, 0.06805479, 0.05473231,
        0.03790587, 0.05072187, 0.04522848, 0.07376388, 0.03033936,
        0.05115552, 0.07714724, 0.07777826, 0.02688068, 0.03944378,
        0.0786967 , 0.02842364, 0.04936807, 0.07166524, 0.04753129]),
 'collection': array([0.02130521, 0.08962829, 0.04469623, 0.06877342, 0.08044743,
        0.01700096, 0.04075982, 0.04800669, 0.05990586, 0.01804552,
        0.06670894, 0.03659527, 0.02733434, 0.01724049, 0.07768003,
        0.06258749, 0.00945594, 0.103129  , 0.09636279, 0.01433628]),
 'method': array([0.05276048, 0.04041226, 0.03130104, 0.06145774, 0.03716305,
        0.04123051, 0.05388116, 0.03717806, 0.04520926, 0.0

In [94]:
def get_ner(texts,language):
    language_model_name_dict = {"english":"en_core_web_sm","spanish":"es_core_news_sm"}
    model = spacy.load(language_model_name_dict[language], disable = ['parser'])
    entities = set()
    for text in texts:
        data = model(text)
        for ent in data.ents:
            entity_text = ent.text
            for entity in entity_text.split(" "):
                entities.add(entity)
    return entities

In [78]:
english_entities = get_ner(prepared_text_en_science.values,"english")

In [80]:
spanish_entities = get_ner(prepared_text_es_science.values,"spanish")

In [82]:
common_entities = english_entities.intersection(spanish_entities)

In [96]:
tokenized_word_sentences_es = get_tokenized_word_sentences(prepared_text_es_science.values)
topic_counters_es = get_topic_counters(tokenized_word_sentences_es)
all_words_es = [word for sentence in tokenized_word_sentences_es for word in sentence]
frequency_es = Counter(all_words_es)
word_probabilities_es = calculate_word_probabilities(topic_counters_es,len(frequency_es.keys()))
word_vectors_es = calculate_word_vectors(word_probabilities_es,list(frequency_es.keys()))

In [98]:
comparison_vectors = dict()
for word in common_entities:
    comparison_vectors[word] = dict()
    comparison_vectors[word]["en"] = word_vectors_en[word]
    comparison_vectors[word]["es"] = word_vectors_es[word]

In [101]:
en_stacked_vectors = np.array([v["en"] for v in comparison_vectors.values()])
es_stacked_vectors = np.array([v["es"] for v in comparison_vectors.values()])


In [103]:
from sklearn.metrics.pairwise import cosine_similarity
average_similarity = cosine_similarity(en_stacked_vectors,es_stacked_vectors).mean()
average_similarity

0.2835802875571471

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
scientific_text_df

Unnamed: 0,en,es
0,• awareness raising activities (seminar in Tur...,• actividades de sensibilización (seminario en...
1,The following outcomes have been broadly agree...,Los donantes han acordado los siguientes resul...
2,"Skills transferred, knowledge updated for part...",Transferencia de aptitudes Dy actualización de...
3,Operational new association of universities in...,Puesta en funcionamiento determinar el proyect...
4,"During the year, the following functions neede...",En el transcurso del año. ha sido necesa­rio c...
...,...,...
19811,With respect to comparative vocational trainin...,Paralos estudios comparativos de la formación ...
19812,In order to be able to assess these figurescor...,Para poder realizar una valoración completaen ...
19813,Preparation and planning of the building site ...,"Se encarga de la preparación de los trabajos, ..."
19814,Planning duties of this type are not carried o...,La formación de los művezető en Hungría seconc...


In [8]:
random_data = scientific_text_df.en[:100].values.tolist()
random_data[:5]

['• awareness raising activities (seminar in Turin followed by targeted national workshops) with a view to: a) promoting the transfer of information on mechanisms and tools developed at the EU level for improving the transparency of qualifications; b) reinforcing the exchange of knowledge between EU Member States and partner countries on their respective qualification and certification systems; c) identifying possible ways and conditions to involve experts from partner countries in initiatives and networks developed in the EU for promoting transparency; and d) engaging a wider debate in the partner countries on the recommendations provided by the case studies mentioned above.',
 'The following outcomes have been broadly agreed by all donors (see below): a) consolidation, deepening and dissemination of the results of the pilot phase: a) best practice and results of the pilot project on VET reform applied and disseminated in the regions; b) curricula completed and teaching materials deve

In [65]:
import numpy as np
tfidf = TfidfVectorizer()
transformed_data = tfidf.fit_transform(random_data)
top_word_indexes = np.squeeze(np.asarray(transformed_data.mean(axis=0))).argsort()[-10:]
wordlist = tfidf.get_feature_names_out()
top_words = [wordlist[i] for i in top_word_indexes]
top_words

['mark', 'trade', 'is', 'on', 'for', 'in', 'to', 'and', 'of', 'the']