In [6]:
# https://wefe.readthedocs.io/en/latest/loading_embeddings.html
# https://crscardellino.ar/SBWCE/
# https://github.com/dccuchile/spanish-word-embeddings#glove-embeddings-from-sbwc
# https://github.com/LaurentVeyssier/Unsupervised-text-classification-with-BERT-embeddings/blob/main/unsupervised_text_classification_with_BERT.ipynb
    
from wefe.query import Query
from wefe.word_embedding_model import WordEmbeddingModel
from wefe.metrics.WEAT import WEAT
from wefe.datasets.datasets import load_weat
from gensim.models import KeyedVectors

# Word sets shipped with WEFE, indexed by set name.
word_sets = load_weat()

# Read the binary word2vec-format vectors from disk and wrap them in WEFE's
# model interface under the label 'word2vec'.
# NOTE(review): the links above this cell point at Spanish (SBWC) embeddings;
# confirm the WEAT word sets are actually present in this vocabulary.
w2v_embeddings = KeyedVectors.load_word2vec_format(
    "models/sbw_vectors.bin",
    binary=True,
)
word2vec = WordEmbeddingModel(w2v_embeddings, 'word2vec')

# Two target sets compared against two attribute sets, each with a display name.
target_sets = [word_sets[key] for key in ('male_terms', 'female_terms')]
attribute_sets = [word_sets[key] for key in ('career', 'family')]
query = Query(
    target_sets,
    attribute_sets,
    ['Male terms', 'Female terms'],
    ['Career', 'Family'],
)

# Create the WEAT metric and evaluate the query on the wrapped embeddings.
weat = WEAT()
result = weat.run_query(query, word2vec)

# Final expression of the cell: the notebook displays the result.
result

{'query_name': 'Male terms and Female terms wrt Career and Family',
 'result': 0.14108615275472403,
 'weat': 0.14108615275472403,
 'effect_size': 0.5770427540727928,
 'p_value': nan}

In [1]:
# https://github.com/aitoralmeida/spanish_word2vec

from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

# Load the Word2Vec model saved at models/complete.model.
model = Word2Vec.load("models/complete.model")

# The raw numpy vector of a single word can be read with: model.wv['rey']
word_vectors = model.wv

# The ten entries most similar to 'rey', as (word, score) pairs.
sims = word_vectors.most_similar('rey', topn=10)
print(sims)

[('monarca', 0.8643362522125244), ('emperador', 0.8535380363464355), ('príncipe', 0.8354155421257019), ('soberano', 0.791935920715332), ('sultán', 0.7623835802078247), ('emir', 0.7423332929611206), ('faraón', 0.7316471338272095), ('califa', 0.7286533117294312), ('duque', 0.7255862951278687), ('regente', 0.7245674133300781)]


In [1]:
# https://stackoverflow.com/questions/63895772/understanding-get-sentence-vector-and-get-word-vector-for-fasttext

import fasttext
import fasttext.util
from gensim.models.wrappers import FastText

# Fetch the Spanish fastText model, then load the binary from the working
# directory. if_exists='ignore' presumably skips the download when the file is
# already present -- confirm against the fasttext.util documentation.
fasttext.util.download_model('es', if_exists='ignore')
model = fasttext.load_model('cc.es.300.bin')

# Other lookups available on the loaded model (kept for reference):
#sentencia = model.get_sentence_vector('poder')
#palabra   = model.get_word_vector('coche')
#print(palabra)

# Nearest neighbours of each word of interest (k=2000 per word).
# The arancia_*/kiwi_* names hold the neighbours of 'amor' and 'castillo'.
arancia_nn = model.get_nearest_neighbors('amor', k=2000)
kiwi_nn = model.get_nearest_neighbors('castillo', k=2000)

# Keep only the word (position 1) of each neighbour entry; the similarity
# value is discarded.
arancia_nn_words = {neighbor[1] for neighbor in arancia_nn}
kiwi_nn_words = {neighbor[1] for neighbor in kiwi_nn}

# Words that appear in both neighbourhoods.
common_similar_words = arancia_nn_words & kiwi_nn_words

print(common_similar_words)



set()


In [7]:
# https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne-sqac

import tensorflow as tf
import transformers
import os
from tensorflow import keras
from keras import layers
from keras.layers import Input, Conv2D, Dense, Flatten, Dropout
from keras.layers import GlobalMaxPooling2D, MaxPooling2D
from keras.layers import BatchNormalization
from keras.models import Model
from keras.models import load_model

# Set CUDA_VISIBLE_DEVICES to '0' for this process, then report what
# TensorFlow sees: the GPU devices, whether the build has CUDA support, and
# the TensorFlow version.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)

built_with_cuda = tf.test.is_built_with_cuda()
print(built_with_cuda)

print(tf.__version__)

def carga_modelo():
    """Build a small fully connected Keras classifier, print its summary and return it.

    The network takes a 784-feature input, applies two 64-unit ReLU dense
    layers and ends in a 10-unit softmax layer. Nothing is read from disk: the
    path-like string passed as ``name`` is only the label shown in the summary
    header.

    Returns:
        keras.Model: the newly built model (not compiled, not trained).
    """
    inputs = keras.Input(shape=(784,), name='digits')
    x = layers.Dense(64, activation='relu', name='dense_1')(inputs)
    x = layers.Dense(64, activation='relu', name='dense_2')(x)
    outputs = layers.Dense(10, activation='softmax', name='predictions')(x)

    # NOTE(review): this name looks like a file path but is used purely as a
    # display label; newer Keras releases may reject '/' in names -- confirm
    # before upgrading.
    model = keras.Model(inputs=inputs, outputs=outputs, name='datasets/roberta-base/pytorch_model.bin')

    model.summary()

    # Return the model so callers can use it; it used to be discarded.
    return model


# Bind the returned model so the cell does not echo the object repr as output.
modelo = carga_modelo()

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
True
2.9.1
Model: "datasets/roberta-base/pytorch_model.bin"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 digits (InputLayer)         [(None, 784)]             0         
                                                                 
 dense_1 (Dense)             (None, 64)                50240     
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 predictions (Dense)         (None, 10)                650       
                                                                 
Total params: 55,050
Trainable params: 55,050
Non-trainable params: 0
_________________________________________________________________


In [3]:
# Libraries: TfidfVectorizer
    
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: four lines of a nursery rhyme, one document per line.
corpus = [
    'Hey diddle, diddle,',
    'The cow jumped over the moon.',
    'The little dog laughed to see such sport,',
    'and the dish ran away with the spoon. '
]

# Fit the vocabulary and compute the TF-IDF weights in one step; X is a sparse
# document-by-term matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# toarray() gives a plain ndarray. todense() would return a numpy.matrix, a
# class NumPy no longer recommends. The resulting table is unchanged.
df_tf_idf = pd.DataFrame(
    data=X.toarray().round(2),
    columns=vectorizer.get_feature_names_out(),
)

print(df_tf_idf)

    and  away   cow  diddle  dish   dog   hey  jumped  laughed  little  moon  \
0  0.00  0.00  0.00    0.89  0.00  0.00  0.45    0.00     0.00    0.00  0.00   
1  0.00  0.00  0.42    0.00  0.00  0.00  0.00    0.42     0.00    0.00  0.42   
2  0.00  0.00  0.00    0.00  0.00  0.37  0.00    0.00     0.37    0.37  0.00   
3  0.36  0.36  0.00    0.00  0.36  0.00  0.00    0.00     0.00    0.00  0.00   

   over   ran   see  spoon  sport  such   the    to  with  
0  0.00  0.00  0.00   0.00   0.00  0.00  0.00  0.00  0.00  
1  0.42  0.00  0.00   0.00   0.00  0.00  0.54  0.00  0.00  
2  0.00  0.00  0.37   0.00   0.37  0.37  0.23  0.37  0.00  
3  0.00  0.36  0.00   0.36   0.00  0.00  0.46  0.00  0.36  


In [14]:
# https://www.kaggle.com/code/vpkprasanna/usage-of-transformers-pipelines#Sentence_Classification
from transformers import pipeline

# Sentiment-analysis pipeline using the
# 'distilbert-base-uncased-finetuned-sst-2-english' checkpoint.
classifier = pipeline(
    task="sentiment-analysis",
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

# Final expression of the cell: the notebook displays the predicted label and score.
classifier("I don´t like a shit the film!")

[{'label': 'NEGATIVE', 'score': 0.9944590330123901}]

In [16]:
# Question-answering pipeline; `pipeline` comes from the transformers import
# in the previous cell.
nlp_qa = pipeline(
    task='question-answering',
    model='distilbert-base-cased-distilled-squad',
)

# Final expression of the cell: the notebook displays the answer it extracts
# from the context.
nlp_qa(
    question='Where is based Hugging Face ?',
    context='Hugging Face is a French company based in New-York.',
)

{'score': 0.963358998298645, 'start': 42, 'end': 50, 'answer': 'New-York'}