In [33]:
from gensim.models.keyedvectors import load_word2vec_format
from gensim.models import KeyedVectors
import pandas as pd
import zipfile
import os
import urllib.request

--------------------------------------------------------------------------

### WORD2VEC

In [26]:
# Load the pretrained word2vec embeddings used in the assignment.
# The loader is invoked as the KeyedVectors classmethod, which is gensim's
# documented entry point and the same call this notebook uses for the .vec
# file; binary=True because the .bin file is read in binary word2vec format.
w2v_model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [27]:
# Tokens known to the word2vec model (the keys of its token -> index mapping)
w2v_words = [*w2v_model.key_to_index]

In [None]:
# Build the set of distinct tokens found in the three dataset CSV files.
csv_paths = ["finer_ord_train.csv", "finer_ord_validation.csv", "finer_ord_test.csv"]
fin_words = set()

for csv_path in csv_paths:
    # Read the token column as raw text with NA detection switched off.
    # With pandas defaults, literal tokens such as "NA", "N/A" or "null" are
    # parsed as NaN and would then be stored as the bogus word "nan";
    # dtype=str also keeps numeric-looking tokens exactly as written.
    df = pd.read_csv(csv_path, usecols=["gold_token"], dtype=str, keep_default_na=False)
    # Empty cells come back as "" and are not real tokens, so drop them.
    fin_words.update(token for token in df["gold_token"] if token)

In [32]:
# Coverage: percentage of distinct dataset tokens that also appear in the
# word2vec token list.
common_words = {word for word in w2v_words if word in fin_words}
percentage = (len(common_words) / len(fin_words)) * 100
print(f"w2v percentage: {percentage}")

w2v percentage: 88.28588826305374


-----------------------------------------------------------------------------------------

### GLOVE

In [None]:
glove_zip = "glove.6B.zip"
glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"

# Only touch the network / archive when the extracted vectors are missing, so
# a machine that already has glove.6B.100d.txt does not re-download the zip.
if not os.path.exists("glove.6B.100d.txt"):
    if not os.path.exists(glove_zip):
        print("Descargando GloVe...")
        # Download under a temporary name and rename only on success. An
        # interrupted transfer then never leaves a truncated file under the
        # final name, which the existence check would mistake for a complete
        # archive on the next run.
        partial_zip = glove_zip + ".part"
        urllib.request.urlretrieve(glove_url, partial_zip)
        os.replace(partial_zip, glove_zip)

    print("Extrayendo archivos...")
    with zipfile.ZipFile(glove_zip, 'r') as zip_ref:
        # Extracts every member of the archive into the working directory.
        zip_ref.extractall()

def cargar_glove(file_path):
    """Parse a GloVe-style text embedding file into a dictionary.

    Every non-blank line is expected to contain a token followed by its
    vector components, all separated by whitespace.

    Args:
        file_path: Path to the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each token (str) to its vector as a list of floats.
        If a token occurs on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Blank or whitespace-only lines (e.g. a trailing empty line)
            # carry no token; skip them instead of failing on values[0].
            if not values:
                continue
            word, *components = values
            embeddings[word] = [float(component) for component in components]
    return embeddings

# Parse the extracted vectors file into a {token: vector} dictionary.
glove_path = "glove.6B.100d.txt"
glove = cargar_glove(file_path=glove_path)

# Report how many tokens were loaded as a quick sanity check.
print(f"Número de palabras cargadas: {len(glove)}")

In [None]:
# Collect the distinct dataset tokens and measure how many have a GloVe vector.
csv_paths = ["finer_ord_train.csv", "finer_ord_validation.csv", "finer_ord_test.csv"]
fin_words = set()
glove_words = list(glove.keys())

for csv_path in csv_paths:
    # Read the token column as raw text with NA detection switched off.
    # With pandas defaults, literal tokens such as "NA", "N/A" or "null" are
    # parsed as NaN and would then be stored as the bogus word "nan";
    # dtype=str also keeps numeric-looking tokens exactly as written.
    df = pd.read_csv(csv_path, usecols=["gold_token"], dtype=str, keep_default_na=False)
    # Empty cells come back as "" and are not real tokens, so drop them.
    fin_words.update(token for token in df["gold_token"] if token)

# NOTE(review): tokens are matched with their original casing; if the GloVe
# vocabulary is lowercase-only this undercounts coverage — confirm.
common_words = fin_words.intersection(glove_words)
percentage = (len(common_words) / len(fin_words)) * 100
print(f"Glove percentage: {percentage}")

-------------------------------------------------------------------------------------------------------

### KEYED VECTORS

In [None]:
# Load the text-format (binary=False) vectors stored in cc.es.300.vec.
# NOTE(review): the "es" in the filename suggests Spanish vectors; confirm
# this matches the language of the dataset tokens.
key_model = KeyedVectors.load_word2vec_format(fname="cc.es.300.vec", binary=False)

In [None]:
# Tokens known to the loaded model (the keys of its token -> index mapping)
key_words = list(key_model.key_to_index)

In [None]:
# Build the set of distinct tokens found in the three dataset CSV files.
csv_paths = ["finer_ord_train.csv", "finer_ord_validation.csv", "finer_ord_test.csv"]
fin_words = set()

for csv_path in csv_paths:
    # Read the token column as raw text with NA detection switched off.
    # With pandas defaults, literal tokens such as "NA", "N/A" or "null" are
    # parsed as NaN and would then be stored as the bogus word "nan";
    # dtype=str also keeps numeric-looking tokens exactly as written.
    df = pd.read_csv(csv_path, usecols=["gold_token"], dtype=str, keep_default_na=False)
    # Empty cells come back as "" and are not real tokens, so drop them.
    fin_words.update(token for token in df["gold_token"] if token)

In [None]:
# Coverage: percentage of distinct dataset tokens that also appear in the
# loaded model's token list.
common_words = {word for word in key_words if word in fin_words}
percentage = (len(common_words) / len(fin_words)) * 100
print(f"key percentage: {percentage}")

w2v percentage: 88.28588826305374
