# Comparing embeddings across documents

In [1]:
from transformers import BertModel, BertTokenizer, AutoTokenizer
import numpy as np
import streamlit as st
import re
import pandas as pd
from datetime import datetime
import nltk
import torch
import spacy
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the BERT encoder and its tokenizer from the SAME checkpoint.
# The original paired the 'bert-base-uncased' model with the 'bert-base-cased'
# tokenizer; those checkpoints ship different vocabularies, so the token ids
# produced by the tokenizer would not line up with the model's embedding table.
BERT_CHECKPOINT = 'bert-base-uncased'

# output_hidden_states=True makes the model return the hidden states of every
# layer in addition to the final one.
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Read the preprocessed corpus. Per the original author's note, "light.csv"
# has already had stop words removed.
df = pd.read_csv('../../../data/processed/light.csv')

# Pull the two columns used downstream out as plain Python lists.
timestamps = df['year'].tolist()
texts = df['text'].tolist()

# Keep the second document (index 1) at hand as a single sample text.
text = texts[1]




In [None]:
# Load the spaCy pipeline used to tokenize the texts and to obtain a vector
# for each token via `token.vector`.
nlp = spacy.load("en_core_web_lg")


def _keyword_embeddings(doc, wanted):
    """Return one ``(text, vector)`` pair per distinct keyword found in *doc*.

    A token is kept when it is alphabetic and its exact text (case-sensitive)
    is in *wanted*. Only the first occurrence of each keyword is retained:
    the DataFrames built from this list are inner-merged on the word, and
    repeated words would otherwise be paired occurrence-by-occurrence,
    yielding k1 * k2 rows for a keyword seen k1 and k2 times.
    """
    first_seen = {}
    for token in doc:
        if token.is_alpha and token.text in wanted and token.text not in first_seen:
            first_seen[token.text] = token.vector
    return list(first_seen.items())


# Extract texts from different documents
text_document_1 = texts[2]
text_document_2 = texts[3]

# Specify keywords of interest
keywords = ['state', 'economy', 'security']

# Tokenize and compute embeddings for each word in the texts
tokens_document_1 = nlp(text_document_1)
tokens_document_2 = nlp(text_document_2)

# Extract words and their embeddings for the specified keywords
# (one entry per distinct keyword, see _keyword_embeddings).
# NOTE(review): en_core_web_lg presumably provides static, context-independent
# vectors, in which case the same word gets the same vector in both documents
# and every similarity below will be 1.0 -- confirm whether contextual
# embeddings were intended for this comparison.
wanted_keywords = frozenset(keywords)
words_embeddings_document_1 = _keyword_embeddings(tokens_document_1, wanted_keywords)
words_embeddings_document_2 = _keyword_embeddings(tokens_document_2, wanted_keywords)

# Create DataFrames for each document
df_document_1 = pd.DataFrame(words_embeddings_document_1, columns=['word', 'embedding'])
df_document_2 = pd.DataFrame(words_embeddings_document_2, columns=['word', 'embedding'])

# Merge DataFrames based on the words: keeps only keywords present in both
# documents, with the vectors in 'embedding_doc1' / 'embedding_doc2'.
merged_df = pd.merge(df_document_1, df_document_2, on='word', how='inner', suffixes=('_doc1', '_doc2'))

# Compute cosine similarities between word embeddings.
# Built as a list rather than with DataFrame.apply(axis=1): on an empty frame
# (no shared keyword) apply returns a DataFrame instead of a Series, which
# cannot be assigned to a single column.
merged_df['cosine_similarity'] = [
    cosine_similarity([vector_doc1], [vector_doc2])[0][0]
    for vector_doc1, vector_doc2 in zip(merged_df['embedding_doc1'], merged_df['embedding_doc2'])
]

# Print the resulting DataFrame with cosine similarities
print(merged_df[['word', 'cosine_similarity']])

# `merged_df` is reused exactly as built by the comparison step above (one row
# per shared keyword, vectors in 'embedding_doc1' / 'embedding_doc2'). The
# original re-ran the identical merge and cosine-similarity computation here,
# which produced the same frame again and only duplicated work.

# Visualize word embeddings in a scatter plot.
# Only the first two components of each vector are used as x/y coordinates;
# no dimensionality reduction is applied.
plt.figure(figsize=(10, 6))
for embedding_column, series_label, point_marker in (
    ('embedding_doc1', 'Document 1', 'o'),
    ('embedding_doc2', 'Document 2', 'x'),
):
    vectors = merged_df[embedding_column]
    plt.scatter(
        [vector[0] for vector in vectors],
        [vector[1] for vector in vectors],
        label=series_label,
        marker=point_marker,
    )

# Annotate points with words (document 1 in blue, document 2 in orange)
for _, row in merged_df.iterrows():
    plt.annotate(row['word'], (row['embedding_doc1'][0], row['embedding_doc1'][1]), color='blue')
    plt.annotate(row['word'], (row['embedding_doc2'][0], row['embedding_doc2'][1]), color='orange')

# Add labels and legend
plt.title('Word Embeddings - Document Comparison')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend()

