In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_random_exponential
import os

import openai

from utils.embedding_utils import get_embedding, sliding_window

In [2]:
victim = "Oppie"

df = pd.read_csv(f"../data/{victim}/db/df_text.csv")
df.head()

Unnamed: 0,source_type,text
0,paper,(Wednesday Moraine: Ele mentary Particles; J.R...
1,website,"J. Robert Oppenheimer - Wikipedia, Jump to con..."
2,website,"Manhattan Project - Wikipedia, Jump to content..."
3,website,"Oppenheimer security hearing - Wikipedia, Jump..."
4,website,"American Prometheus - Wikipedia, Jump to conte..."


In [3]:
df_filtered = df

In [4]:
window_size = 256  # Length of text chunks
stride = 192 # Stride of sliding window; have a bit of overlap

text_chunks = []
embeddings = []

for i in tqdm(range(len(df_filtered))):
    text = df_filtered['text'].values[i].replace('\n', ' ').strip()
    text_chunks_i = list(sliding_window(text, window_size, stride))
    if len(text_chunks_i) > 80:  # If text too long, truncate
        text_chunks_i = text_chunks_i[:80]
    embeddings_i = [get_embedding(text) for text in text_chunks_i]
    text_chunks += text_chunks_i
    embeddings += embeddings_i

100%|██████████| 16/16 [02:37<00:00,  9.84s/it]


In [5]:
embeddings = np.array(embeddings, dtype=np.float64)

In [6]:
data = [text_chunks]

# Transpose the data to have the inner lists as rows
transposed_data = list(map(list, zip(*data)))

# Column names for the DataFrame
columns = ['text_chunks']

# Create the DataFrame
df = pd.DataFrame(transposed_data, columns=columns)

In [7]:
df.to_csv(f'../data/{victim}/db/text_chunks.csv', index=False)
np.save(f'../data/{victim}/db/embeddings.npy', embeddings)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from utils.embedding_utils import get_embedding

def semantic_search(query_embedding, embeddings):
    """Load context prompt."""
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    ranked_indices = np.argsort(-similarities)
    return ranked_indices

In [9]:
emb = get_embedding("Tell me about the pion")
semantic_search(emb, embeddings)

array([309, 304, 311,   0,   6, 323,  28,  29,  43, 308,  39,  38, 315,
        44, 310,   1,  32, 411,  21,  86, 343, 336, 303, 305,   4, 338,
         8,  22,  24,  42, 316, 306,  17,  25,   3, 329,  20,  16,   9,
        27, 185, 337, 520, 335, 327, 312,  13, 320,   5,  37, 314,  31,
         2,  30,  19,  18, 110, 321, 509, 331, 326, 359, 510,   7, 386,
        61,  26,  56,  41, 184, 330, 381, 328,  55, 380,  10, 355,  23,
       361, 195,  35, 350, 313, 121, 435, 478,  15,  40, 300, 625, 197,
       340,  48, 379,  54, 317, 344, 522, 609, 154, 354, 324, 410, 284,
       351, 388, 347,  85, 360, 385,  60, 342, 554, 508, 288, 502, 641,
       352,  63,  33, 333, 268, 183, 287,  36, 318, 369, 511, 640,  14,
       446, 229, 397, 187, 612,  58, 213, 179,  71, 622, 537, 512, 593,
       364, 341, 271, 403, 153, 409, 505,  84,  11, 538,  81, 180,  67,
       297, 373, 588, 565, 262, 432, 273, 392, 406,  80, 107, 615, 263,
        96, 587, 610, 177, 129,  34, 405, 299, 285,  91, 590,  4

In [10]:
text_chunks[semantic_search(emb, embeddings)[0]]

'production is not at all surpri; ing 0 The discussion now shifted to the pion-nucleon scattering problem* Chew began, this discussion with what he characterized as a simple-minded theoreti\xad cal attempt to understand the problem on the basis of Yukawa 5s fundamental idea. He had agreed to make the following rather glib statements only with the understanding that Dyson and Bethe would not contradict him at this session» but would take up these points in the technical theoretical session* The main feat\xad ure of the Yukawa theory is that the fundamental process consists of the emiss\xad ion or absorption of a single pion 0 If we assume that the motion of the nucléon is unimportant compared to the motion of the pion 3 that is, that nucléon pairs are not important, then the large interaction between the nucléon and pion must be in p states. This can be seen by considering the following diagram: The slow nucléon has angular slow nucléon J = l/2; Parity = + (definition) r.ucleon j -1/2; 