# Generate GloVe embeddings for pairwise keywords


In [5]:

import numpy as np
import pandas as pd
import os
import time
import spacy 
import scipy
from spacy import displacy
import gensim.downloader as api


# Make sure the GloVe output directory is present before anything is written.
output_folder = "../../../output/embedding_static/glove"
if not os.path.isdir(output_folder):
    os.makedirs(output_folder, exist_ok=True)

# Wall-clock timestamp taken at setup time.
# NOTE(review): start_time is not read anywhere in the visible code, and
# output_folder is reassigned before the CSV is saved further down.
start_time = time.time()

In [10]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api

# Read the input table; the 'text', 'dd_regime' and 'year' columns are used below.
input_csv = '~/Downloads/liwc_meta.csv'
df = pd.read_csv(input_csv)

# Fetch the pretrained GloVe vectors through gensim's downloader API.
print("Loading GloVe embeddings...")
glove_model_name = "glove-wiki-gigaword-100"
glove = api.load(glove_model_name)

# Tokenize and filter speeches
# Missing 'text' cells are read by pandas as float NaN, which has no .lower();
# treat them as empty speeches so they take the zero-vector fallback below
# instead of crashing the whole run.
speeches = df['text'].fillna('').astype(str).tolist()
print("Tokenizing and filtering speeches...")
# Lower-case, split on whitespace, and keep only tokens present in the GloVe
# vocabulary.
# NOTE(review): plain split() leaves punctuation attached to tokens (e.g.
# "peace."), so such tokens will not match the bare vocabulary entry — confirm
# this loss is acceptable.
filtered_speeches = [
    [word for word in speech.lower().split() if word in glove]
    for speech in speeches
]

# Compute embeddings
print("Computing document embeddings...")
# A document embedding is the unweighted mean of its in-vocabulary token
# vectors. Speeches with no in-vocabulary tokens get an all-zero vector so that
# `embeddings` stays aligned row-for-row with `df`.
embeddings = [
    np.mean([glove[word] for word in speech], axis=0)
    if speech
    else np.zeros(glove.vector_size)
    for speech in filtered_speeches
]

# Define keyword pairs
# NOTE(review): "interven" looks like a word stem rather than a full token; a
# pair is skipped below whenever either keyword is missing from the GloVe
# vocabulary — confirm whether "intervention" (or similar) was intended.
keyword_pairs = [
    ("sovereignty", "territory"),
    ("sovereignty", "responsibility"),
    ("rights", "responsibility"),
    ("rights", "interven"),
    ("rights", "sovereignty"),
    ("sovereignty", "colonial"),
    ("sovereignty", "westphalia"),
    ("liberal", "sovereignty"),
    ("war", "sovereignty"),
    ("war", "peace")
]

# Prepare output rows: one dict per (document, keyword pair) combination
output_rows = []

# Only documents with a non-missing dd_regime contribute rows. A pandas mask is
# used instead of np.isnan because np.isnan raises TypeError on None or string
# labels, while notna() handles any dtype.
has_regime = df['dd_regime'].notna()
years = df.loc[has_regime, 'year'].tolist()
regimes = df.loc[has_regime, 'dd_regime'].tolist()
kept_embeddings = [emb for emb, keep in zip(embeddings, has_regime) if keep]

# Stack the kept document vectors once so each keyword needs a single
# cosine_similarity call instead of one call per document.
doc_matrix = np.vstack(kept_embeddings) if kept_embeddings else None

# Cosine distances from a keyword to every kept document, keyed by keyword.
# Several pairs share the same first keyword, so each is computed only once.
keyword1_distances = {}

# Compute distances
print("Computing distances for keyword pairs...")
for keyword1, keyword2 in keyword_pairs:
    if keyword1 not in glove.key_to_index or keyword2 not in glove.key_to_index:
        print(f"Keywords '{keyword1}' or '{keyword2}' not found in the GloVe vocabulary.")
        continue

    if doc_matrix is None:
        # No document has a dd_regime value, so there is nothing to emit.
        continue

    # NOTE(review): 'distance' is the cosine distance between keyword1's vector
    # and the document embedding only. keyword2 is required to be in the
    # vocabulary and is written to the output, but its vector never enters the
    # calculation, so pairs sharing keyword1 get identical distances. Confirm
    # the intended definition of the pairwise measure.
    if keyword1 not in keyword1_distances:
        keyword_vector = glove[keyword1].reshape(1, -1)
        keyword1_distances[keyword1] = 1 - cosine_similarity(keyword_vector, doc_matrix)[0]

    # NOTE(review): documents represented by the all-zero fallback vector have
    # no direction, so their cosine distance is not meaningful — consider
    # excluding them downstream.
    output_rows.extend(
        {
            'year': year,
            'dd_regime': dd_regime,
            'keyword1': keyword1,
            'keyword2': keyword2,
            'distance': distance
        }
        for year, dd_regime, distance in zip(years, regimes, keyword1_distances[keyword1])
    )

# Assemble the long-format table from the collected row dicts
output_df = pd.DataFrame(output_rows)

# Write the table to CSV, creating the destination folder if it is missing
output_folder = "../../../output/embedding_glove/"
output_path = os.path.join(output_folder, "liwc_meta_keyword_distances_long_format.csv")
if not os.path.isdir(output_folder):
    os.makedirs(output_folder, exist_ok=True)
output_df.to_csv(path_or_buf=output_path, index=False)

print(f"Distances for all keyword pairs saved in long format as '{output_path}'.")


Loading GloVe embeddings...
Tokenizing and filtering speeches...
Computing document embeddings...
Computing distances for keyword pairs...
Keywords 'rights' or 'interven' not found in the GloVe vocabulary.
Distances for all keyword pairs saved in long format as '../../../output/embedding_glove/liwc_meta_keyword_distances_long_format.csv'.
