<a href="https://colab.research.google.com/github/Mel-Anden/Mel-Anden/blob/main/Visualize_statements_in_an_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Visualize Actor Statements in an Embedding

*DTU - Explore the controversy about Energy Island*

**Goal**:
- Compute a vector representation of each statement using an embedding model
- Reduce dimensionality using the algorithm UMAP
- Visualize for exploration

**Purpose**: reveal patterns in the statements, such as sub-controversies, differing arguments, time dynamics and more.

**How to use**:
- Edit settings then use "Runtime > Run all"
- Wait for each cell to run
- ⚠️ You may have to restart the runtime when installing libraries
- ⚠️ Allow the script to access your Google Drive data when prompted
- **RAG**: you can do retrieval-augmented generation (RAG) at the end of the notebook. Edit the query just under section *"DIY RAG (Retrieval augmented generation)"*, run that cell and all the subsequent ones, then copy-paste the generated prompt into an AI assistant like Claude, Gemini or ChatGPT.

## Settings

In [None]:
# SETTINGS (edit if necessary)
settings = {
    # Google Sheet that holds the statements
    'statements_spreadsheet_drive_URL': 'https://docs.google.com/spreadsheets/d/1c6U-tF4ZTi-csTkusGFaSclE2-gn3tvi0Xaj4Q8AKvk/edit?usp=sharing',

    # Column whose text gets embedded (swap the commented line to switch column)
    'column_text': 'Restated version (the transformed actor statement)',
    #'column_text': 'Original statement (the source text you transformed and translated)',

    # Set to True if you changed the documents
    'recompute_embeddings': True,

    # Attribute used to color the embedding plot (swap the commented line to switch attribute)
    'visualized_attribute': 'Year',
    #'visualized_attribute': 'Source medium type',
}

## Code

(You don't have to understand what's going on here, but feel free to take a look)

### Install stuff

In [None]:
# Install necessary libraries
!pip install pandas==2.0.3 gspread==5.10.0 google-auth==2.22.0 google-auth-oauthlib==1.0.0 google-auth-httplib2==0.1.0
!pip install chromadb bokeh umap-learn

In [None]:
# Import necessary libraries
import pandas as pd
import json
import umap

import chromadb
from chromadb.utils import embedding_functions

import bokeh
import bokeh.plotting as bp
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, CategoricalColorMapper

# Google / Colab helpers needed to reach the spreadsheet
from google.colab import auth
from google.auth import default
import gspread

# Sign the Colab user in, pick up the resulting default credentials,
# and build the gspread client used by the following cells
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

### Load data from the spreadsheet

In [None]:
# Open the spreadsheet directly from its sharing URL.
# gspread extracts the spreadsheet key from the URL itself, so the link does not
# have to contain '/edit' (the previous manual split produced a wrong key otherwise).
sh = gc.open_by_url(settings['statements_spreadsheet_drive_URL'])

# Key of the opened spreadsheet, kept available for monitoring and debugging
spreadsheet_key = sh.id

# Select the worksheet
worksheet_name = 'Form Responses'
worksheet = sh.worksheet(worksheet_name)

In [None]:
# Fetch every cell of the worksheet (a list of rows, each row a list of values)
data = worksheet.get_all_values()

# First row holds the column names, the remaining rows hold the answers
header_row, answer_rows = data[0], data[1:]
df = pd.DataFrame(answer_rows, columns=header_row)

# Derive the publication year; dates that do not match month/day/year become missing values
publication_dates = pd.to_datetime(
    df['Date of publication (today if not available)'],
    format='%m/%d/%Y',
    errors='coerce',
)
df['Year'] = publication_dates.dt.year

# Display dataframe for monitoring purposes
df

In [None]:
# Show the column names of df (useful for monitoring and debugging)
print(df.columns)

### Compute embeddings

In [None]:
# Initialize ChromaDB
client = chromadb.Client()

# The embedding function is created unconditionally: the RAG cells further down use it
# to embed the query, including when an already existing collection is reused.
embedding_function = embedding_functions.DefaultEmbeddingFunction()

if settings['recompute_embeddings']:
  # Delete the collection if it exists, so that the documents get embedded again
  try:
      client.delete_collection(name="my_documents")
      print("Existing collection 'my_documents' deleted.")
  except Exception:
      # Nothing to delete. `Exception` is caught rather than using a bare `except:`
      # so that a kernel interrupt is not swallowed; the precise error type raised
      # for a missing collection presumably depends on the ChromaDB version.
      pass

# Get the collection, creating it when it does not exist yet
collection = client.get_or_create_collection(name="my_documents")

if collection.count() == 0:
  # The collection is new (or was left empty by an earlier failed run): fill it

  # Get the text content and metadata from the DataFrame
  texts = df[settings['column_text']].tolist()
  metadata = df.drop(columns=['Original statement (the source text you transformed and translated)', 'Restated version (the transformed actor statement)']).to_dict(orient="records")

  # Embed the documents ourselves and pass the vectors to add() explicitly
  embeddings = embedding_function(texts)

  # Use DataFrame's row index as id
  ids = df.index.astype(str).tolist()  # Convert index to strings for ChromaDB

  # Add documents with embeddings and metadata
  collection.add(
      documents=texts,
      metadatas=metadata,
      embeddings=embeddings,
      ids=ids
  )
  print('Embeddings computed.')
else:
  print(f"Reusing the {collection.count()} embeddings already stored in 'my_documents'.")

In [None]:
# Read the stored vectors back from the collection (only the embeddings are requested)
stored_vectors = collection.get(include=['embeddings'])
embeddings = stored_vectors['embeddings']
# NOTE(review): the following cells pair vector i with row i of df; this presumably
# relies on get() returning the documents in insertion order — confirm.

### Reduce dimensionality
We use the UMAP algorithm to reduce the vector space to 2 dimensions.

In [None]:
# The projection is (re)computed when the embeddings were recomputed,
# or when no projection exists yet in this session
needs_projection = settings['recompute_embeddings'] or 'umap_result' not in locals()

if needs_projection:
  # UMAP parameters: 2D output, cosine distance, fixed seed
  umap_parameters = dict(
      n_neighbors=15,
      n_components=2,
      min_dist=0.05,
      metric='cosine',
      random_state=42,
  )
  reducer = umap.UMAP(**umap_parameters)

  # Project the embeddings
  umap_result = reducer.fit_transform(embeddings)

print("UMAP reduction complete.")

### Visualize statements

In [None]:
# Set attribute to visualize
painted_attribute = settings['visualized_attribute']


def _modality_label(value):
    """Return the legend label for one value of the painted column.

    'Year' becomes a float column as soon as one date could not be parsed, and a
    plain str() conversion would then give labels such as '2023.0' and 'nan'.
    Whole-number floats are therefore written without decimals, and missing
    values are labelled 'Unknown'. Every other value is converted with str().
    """
    if pd.isna(value):
        return 'Unknown'
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# One text label per statement (the color mapper below needs categorical strings)
painted_data = df[painted_attribute].map(_modality_label)

# Create a ColumnDataSource for Bokeh
source = ColumnDataSource(data=dict(
    x=umap_result[:, 0],
    y=umap_result[:, 1],
    attribute_to_paint=painted_data,
    author=df['Actor '],
    source=df['Source name (e.g. LinkedIn,  Jyske Vestkysten, Folketinget, etc.)'],
    date=df['Date of publication (today if not available)'],
    text=df[settings['column_text']]
))

# Get the sorted unique modalities and give each one its own color from the turbo palette
unique_modalities = sorted(painted_data.unique())
color_mapper = CategoricalColorMapper(factors=unique_modalities, palette=bokeh.palettes.turbo(len(unique_modalities)))

In [None]:
# Render Bokeh plots inline in the notebook
output_notebook()

# Canvas of the scatter plot
p = bp.figure(
    title="Actor statements",
    width=700,
    height=700,
    tools="pan,wheel_zoom,box_zoom,reset,hover",
    match_aspect=True,
)

# One marker per statement, colored by the painted attribute, with one legend entry per modality
marker_color = {'field': 'attribute_to_paint', 'transform': color_mapper}
p.scatter('x', 'y', source=source, size=10,
          color=marker_color,
          legend_group='attribute_to_paint')

# Axis and legend titles
p.xaxis.axis_label = "UMAP Dimension 1"
p.yaxis.axis_label = "UMAP Dimension 2"
p.legend.title = painted_attribute

# HTML template of the tooltip; the @fields are columns of the data source
tooltip_html = """
    <div style="width: 300px; word-wrap: break-word;">
        <div>@text</div>
        <div><em>&mdash;@author, @source, @date</em></div>
        <br>
    </div>
"""

# Configure the hover tool that was requested through the `tools` string
hover = p.select(dict(type=HoverTool))
hover.tooltips = tooltip_html
hover.mode = 'mouse'  # inspect the point under the mouse pointer

# Show the plot
show(p)

In [None]:
# Store the 2D UMAP coordinates of every statement in new columns 'X' and 'Y'.
# Whole columns are assigned at once: pandas raises an error if the projection and
# the DataFrame do not have the same number of rows, instead of silently pairing
# statements with a stale projection.
df['X'] = umap_result[:, 0]
df['Y'] = umap_result[:, 1]

# Save the DataFrame to a CSV file
df.to_csv('spatialized actor statements.csv', index=False)

# DIY RAG (Retrieval augmented generation)

Edit the cell below (query), run it and the subsequent cells, then copy the last output into an AI assistant.

In [None]:
# EDIT THE QUERY BELOW
# Then execute this cell and the following ones

# How many statements the similarity search brings back
number_of_retrieved_statements = 30

# We suggest querying a question, but it could be anything
query = "Samsø"

In [None]:
# Embed the query
query_embedding = embedding_function([query])

# Never ask for more statements than the collection holds. Requesting more is
# presumably an error or a warning depending on the ChromaDB version — clamping avoids both.
n_results = min(number_of_retrieved_statements, collection.count())

# Perform a similarity search
results = collection.query(
    query_embeddings=query_embedding,
    n_results=n_results
)

# Extract the statements from results ([0] = results of the first, and only, query)
retrieved_txt = results['documents'][0]
retrieved_id = results['ids'][0]

# Print them
print("# Extracted statements:\n")
for chunk in retrieved_txt:
  print("- "+chunk)

In [None]:
# Output the plot to the notebook
output_notebook()

# Create the figure
p = bp.figure(width=1500, height=700,
            title="Actor statements",
            tools="pan,wheel_zoom,box_zoom,reset,hover", match_aspect=True)

# Add scatter plot with color mapping
p.scatter('x', 'y', source=source, size=10,
        color={'field': 'attribute_to_paint', 'transform': color_mapper},
        legend_group='attribute_to_paint')

# Customize the plot (optional)
p.legend.title = painted_attribute
p.xaxis.axis_label = "UMAP Dimension 1"
p.yaxis.axis_label = "UMAP Dimension 2"

# Highlight the retrieved statements with large magenta markers.
# The ids stored in the collection are the DataFrame index converted to strings,
# so converting them back to int gives the row of each retrieved statement.
closest_indices = [int(i) for i in retrieved_id]

# Rows of the retrieved statements, in retrieval order
closest_rows = df.loc[closest_indices]

closest_source = ColumnDataSource(data=dict(
    x=[umap_result[i, 0] for i in closest_indices],
    y=[umap_result[i, 1] for i in closest_indices],
    author=closest_rows['Actor '].tolist(),
    source=closest_rows['Source name (e.g. LinkedIn,  Jyske Vestkysten, Folketinget, etc.)'].tolist(),
    date=closest_rows['Date of publication (today if not available)'].tolist(),
    text=closest_rows[settings['column_text']].tolist()
))

# scatter() is used like for the main layer: circle() with a `size` is deprecated in recent Bokeh versions
p.scatter('x', 'y', source=closest_source, size=25, color="#ee00ff", legend_label="Closest statements")


# HTML tooltip; the @fields are columns of the data sources
hover = p.select(dict(type=HoverTool))
hover.tooltips = """
    <div style="width: 300px; word-wrap: break-word;">
        <div>@text</div>
        <div><em>&mdash;@author, @source, @date</em></div>
        <br>
    </div>
"""
hover.mode = 'mouse' # Inspect the point under the mouse pointer

# Show the plot
show(p)

In [None]:
# Serialize the retrieved statements for the prompt.
# ensure_ascii=False keeps characters such as æ, ø, å readable instead of
# turning them into \uXXXX escapes; indent=2 puts one statement per line.
context_json = json.dumps(retrieved_txt, ensure_ascii=False, indent=2)

# Prompt to copy-paste into an AI assistant: the query, then the retrieved statements as context
prompt = f'''Given the context information provided, and not prior knowledge, answer the query.

QUERY
```txt
{query}
```

CONTEXT INFORMATION
```json
{context_json}
```
'''

print(prompt)