<h1 align="center">LAB1_S3. Static word-embeddings for text representation</h1>

<h3 style="display:block; margin-top:5px;" align="center">Natural Language and Information Retrieval</h3>
<h3 style="display:block; margin-top:5px;" align="center">Degree in Data Science</h3>
<h3 style="display:block; margin-top:5px;" align="center">2024-2025</h3>    
<h3 style="display:block; margin-top:5px;" align="center">ETSInf. Universitat Politècnica de València</h3>
<br>

Authors:
-   Marcos Ranchal
-   Marc Siquier

In [43]:
#Installing Gensim library
#!pip install -U gensim
#!pip install -U nltk
#!pip install -U fasttext

## Some libraries

In [1]:
#import fasttext.util
import re
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import gensim.downloader as downloader
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.keyedvectors import KeyedVectors


nltk.download("punkt_tab")#nltk.download("punkt")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\marcos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Load both corpora

In [45]:
# Corpus files, resolved relative to the current working directory.
path_english = "EXIST2024_EN_examples.csv"
path_spanish = "EXIST2024_ES_examples.csv"

# One DataFrame per language, keyed by language name; both files are tab-separated.
df = {
    language: pd.read_csv(csv_path, sep="\t")
    for language, csv_path in (("english", path_english), ("spanish", path_spanish))
}

# OR USING Google colab
#from google.colab import drive
#drive.mount('/content/drive')
#df = {
#    "english": pd.read_csv("/content/drive/MyDrive/LNR/LNR2025/Lab1/EXIST2024_EN_examples.csv", sep="\t"),
#    "spanish": pd.read_csv("/content/drive/MyDrive/LNR/LNR2025/Lab1/EXIST2024_ES_examples.csv", sep="\t")
#}


## Preprocess and tokenize the corpora:

- remove URLs
- remove hashtags
- remove users
- lowercase
- tokenize (using _word_tokenize_)
- remove stopwords (using nltk stopwords)

Note: tokenization and stopword removal are language-dependent.

In [46]:
# Info: nltk.tokenize.word_tokenize(text, language='english', preserve_line=False)

# Patterns for the tweet-specific noise that preprocess() strips out.
web_re = re.compile(r"https?:\/\/[^\s]+", re.U)      # http(s) URL up to the next whitespace
user_re = re.compile(r"(@\w+\-?(?:\w+)?)", re.U)     # @mention, optionally containing one hyphen
hashtag_re = re.compile(r"(#\w+\-?(?:\w+)?)", re.U)  # #hashtag, optionally containing one hyphen

# The stopword lists are a separate NLTK resource from the tokenizer data.
# Fetch it only when it is missing, so a fresh environment does not fail
# with LookupError on the lines below.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

# Stopword lists keyed by the same language names that tokenize() receives.
stopw = {
    "english": nltk.corpus.stopwords.words("english"),
    "spanish": nltk.corpus.stopwords.words("spanish")
}

def preprocess(text):
    """Strip URLs, @mentions and #hashtags from ``text`` and lowercase the rest.

    The patterns are applied in that order (``web_re``, ``user_re``,
    ``hashtag_re``); every match is replaced by the empty string.
    """
    for noise_pattern in (web_re, user_re, hashtag_re):
        text = noise_pattern.sub("", text)
    return text.lower()

def tokenize(text_list, lang="english"):
    """Preprocess, tokenize and stopword-filter every text in ``text_list``.

    Args:
        text_list: iterable of raw strings.
        lang: language name; passed to ``word_tokenize`` and used as the key
            into the module-level ``stopw`` dictionary.

    Returns:
        A list with one list of tokens per input text, in input order.

    Raises:
        KeyError: if ``stopw`` has no entry for ``lang``.
    """
    # Build the lookup set once per call: testing membership against the raw
    # stopword list costs a linear scan of that list for every single token.
    stopwords_lang = set(stopw[lang])
    return [
        [
            token
            for token in nltk.tokenize.word_tokenize(
                preprocess(text), language=lang, preserve_line=False
            )
            if token not in stopwords_lang
        ]
        for text in text_list
    ]

# Token lists for every tweet of each corpus, keyed by language name.
tokenized_text = {
    language: tokenize(df[language]["text"], language)
    for language in ("english", "spanish")
}

# Quick check: the same two sentences tokenized with each language's settings.
t = ["Hola, mi nombre es Antonio, ¿todo bien? https://www.upv.es @paquita", "Hi! my name is Peter"]
print(t)
print(tokenize(t, lang="spanish"))
print(tokenize(t, lang="english"))

['Hola, mi nombre es Antonio, ¿todo bien? https://www.upv.es @paquita', 'Hi! my name is Peter']
[['hola', ',', 'nombre', 'antonio', ',', '¿todo', 'bien', '?'], ['hi', '!', 'my', 'name', 'is', 'peter']]
[['hola', ',', 'mi', 'nombre', 'es', 'antonio', ',', '¿todo', 'bien', '?'], ['hi', '!', 'name', 'peter']]


## Text representation using static embeddings

ENGLISH

- word2vec-google-news-300 (using Gensim)
- fasttext-wiki-news-subwords-300 (using Gensim)
- glove-wiki-gigaword-300 (using Gensim)

SPANISH
- fastText (https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz) (using Gensim)

### Load the models

In [None]:

# COMPLETE
import gensim.downloader as download

# Load the three English models through the gensim downloader, in this order:
# word2vec, fastText, GloVe.
word2vec_model, fasttext_model, glove_model = (
    download.load(model_name)
    for model_name in (
        "word2vec-google-news-300",
        "fasttext-wiki-news-subwords-300",
        "glove-wiki-gigaword-300",
    )
)




In [23]:
# Spanish vectors, read from a local file in the text (non-binary) word2vec
# format; the path is relative to the current working directory.
es_model = KeyedVectors.load_word2vec_format(
    "cc.es.300.vec.gz",
    binary=False,
)

In [None]:
import os

from gensim.models import KeyedVectors

# Directory holding the already-downloaded models, one sub-folder per model.
# Set the GENSIM_MODELS_DIR environment variable to point somewhere else;
# the default is the location this notebook was originally run from.
gensim_models_dir = os.environ.get("GENSIM_MODELS_DIR", r"C:\Users\marcos\gensim")

# Each model lives in <dir>/<model-name>/<model-name>.gz
model_path_w2v = os.path.join(
    gensim_models_dir, "word2vec-google-news-300", "word2vec-google-news-300.gz"
)
model_path_glove = os.path.join(
    gensim_models_dir, "glove-wiki-gigaword-300", "glove-wiki-gigaword-300.gz"
)
model_path_fasttext = os.path.join(
    gensim_models_dir, "fasttext-wiki-news-subwords-300", "fasttext-wiki-news-subwords-300.gz"
)

# The word2vec file is read as binary; the GloVe and fastText files as text.
word2vec_model = KeyedVectors.load_word2vec_format(model_path_w2v, binary=True)
glove_model = KeyedVectors.load_word2vec_format(model_path_glove, binary=False)
fasttext_model = KeyedVectors.load_word2vec_format(model_path_fasttext, binary=False)



In [47]:
# Registry of loaded models per language; each key doubles as the name of the
# DataFrame column that stores that model's tweet embeddings.
models_en = dict(
    w2v300=word2vec_model,
    ftsub300=fasttext_model,
    glwiki300=glove_model,
)

models_es = dict(ftes300=es_model)


### Compute static word-embeddings representation of the tweets

In [None]:
# COMPLETE
def representacion_oracion_gensim(texto, vectores_clave, dim=300):
    """Average the embeddings of the tokens of ``texto`` known to ``vectores_clave``.

    Args:
        texto: iterable of tokens.
        vectores_clave: mapping indexed by token that raises ``KeyError`` for
            unknown tokens.
        dim: length of the embedding vectors.

    Returns:
        A ``dim``-long array with the mean of the vectors that were found, or
        all zeros when no token was found.
    """
    acumulado = np.zeros(dim)
    encontrados = 0
    for token in texto:
        try:
            acumulado += vectores_clave[token]
        except KeyError:
            # Out-of-vocabulary token: it does not contribute to the average.
            continue
        encontrados += 1
    if encontrados:
        return acumulado / encontrados
    return np.zeros(dim)

# Aplicar "representacion_oracion_gensim"
# (language, column name, model) for every embedding column to compute.
columnas_embeddings = (
    ("english", "w2v300", word2vec_model),
    ("english", "ftsub300", fasttext_model),
    ("english", "glwiki300", glove_model),
    ("spanish", "ftes300", es_model),
)

# Store one averaged embedding per tweet in a new column of that language's corpus.
for idioma_columna, nombre_columna, modelo_columna in columnas_embeddings:
    df[idioma_columna][nombre_columna] = [
        representacion_oracion_gensim(tweet, modelo_columna)
        for tweet in tokenized_text[idioma_columna]
    ]


## Compute cosine similarities

In [None]:
# COMPLETE
from sklearn.metrics.pairwise import cosine_similarity

def encontrar_mas_similar(vectores_texto, etiquetas, etiqueta_objetivo, modelo, idioma="english"):
    """Find the two tweets with a given label whose embeddings are most similar.

    Reads the module-level ``df`` dictionary of DataFrames.

    Args:
        vectores_texto: unused; kept so existing calls keep working.
        etiquetas: name of the column that holds the labels.
        etiqueta_objetivo: label value the tweets must have.
        modelo: name of the column that holds the embedding vectors.
        idioma: key of the corpus inside ``df``.

    Returns:
        ``((id_a, id_b), similarity)`` for the pair with the highest cosine
        similarity, or ``((None, None), 0)`` when fewer than two tweets carry
        the label or no pair has a similarity above 0.
    """
    # Filter the corpus once instead of re-applying the mask for every candidate.
    subconjunto = df[idioma][df[idioma][etiquetas] == etiqueta_objetivo]
    total = len(subconjunto)
    if total < 2:
        # No pair can be formed (also avoids np.stack failing on an empty subset).
        return (None, None), 0

    sub_vectores = np.stack(subconjunto[modelo].values)
    similitud_coseno = cosine_similarity(sub_vectores)

    # Only the strict upper triangle: each unordered pair once, no self-pairs.
    # The indices come in row-major order, so argmax keeps the first best pair,
    # exactly like a nested i < j loop with a strict ">" comparison.
    filas, columnas = np.triu_indices(total, k=1)
    candidatos = similitud_coseno[filas, columnas]
    mejor = int(np.argmax(candidatos))
    max_similitud = candidatos[mejor]
    if not max_similitud > 0:
        return (None, None), 0

    ids = subconjunto["id"].tolist()
    return (ids[filas[mejor]], ids[columnas[mejor]]), max_similitud

## Show results

In [None]:
# COMPLETE

# For every model and every label, report the most similar pair of tweets.
# The English and Spanish reports share the same layout, so one loop covers both.
for corpus_lang, corpus_models in (("english", models_en), ("spanish", models_es)):
    corpus = df[corpus_lang]

    for name, vectors in corpus_models.items():
        print(f"======================\n{name}\n" + "-" * 22)

        for label in ['NO', 'YES']:
            # The keyword must be `idioma`: that is the parameter name in the
            # signature of encontrar_mas_similar.
            best_pair, similarity = encontrar_mas_similar(vectors, "label", label, name, idioma=corpus_lang)
            print(f"Label: {label} \nTweets IDs: {best_pair} \nSimilarity: {similarity:.4f}")

            if best_pair[0] is None:
                # No pair was found, so there are no tweet texts to look up.
                print("Tweets: no pair found")
            else:
                # Look the texts up first: quotes nested inside an f-string
                # are only accepted from Python 3.12 on.
                first_text = corpus.loc[corpus["id"] == best_pair[0], "text"].values[0]
                second_text = corpus.loc[corpus["id"] == best_pair[1], "text"].values[0]
                print(f"Tweets: \n \t1: {first_text} \n \t2: {second_text}")

            if label == "NO":
                print("-" * 20)

w2v300 
----------------------
Label: NO 
Tweets IDs: (201173, 201177) 
Similarity: 0.9444
Tweets: 
 	1: @BLEEDTHISWAY replay free woman breebylon &gt;&gt;&gt; Flop this way 
 	2: replay&gt;alice&gt;babylon&gt;free woman https://t.co/WCEqeUxdtC
----------------------
Label: YES 
Tweets IDs: (201621, 201637) 
Similarity: 0.9614
Tweets: 
 	1: @WeaponizedRage Aerosmith in 1987: "Dude looks like a lady" 
 	2: Dude does not look like a lady! https://t.co/C62JmKSzy0
ftsub300 
----------------------
Label: NO 
Tweets IDs: (201173, 201177) 
Similarity: 0.9898
Tweets: 
 	1: @BLEEDTHISWAY replay free woman breebylon &gt;&gt;&gt; Flop this way 
 	2: replay&gt;alice&gt;babylon&gt;free woman https://t.co/WCEqeUxdtC
----------------------
Label: YES 
Tweets IDs: (201235, 201978) 
Similarity: 0.9716
Tweets: 
 	1: in the living room, all strocking to porn. all4guys+host had fucked me. 1st, straight,had cum in cumhole. I was so surprised&amp; pleased.even if short &amp;light, was so hot be gangbanged w