In [1]:
from api_schemas import *
from api_utils import *
from twitter import *
from describe import *

In [6]:
import numpy as np
from numpy.linalg import norm
import faiss

def cos_sim(a: np.ndarray, b: np.ndarray) -> float:
    """
    Cosine similarity between two vectors.
    :param a: First vector.
    :param b: Second vector.
    :return: The product of `a` with the transpose of `b`, divided by the product of their norms.
    """
    inner = np.matmul(a, b.T)
    scale = norm(a) * norm(b)
    return inner / scale

def jina_embed(text: str, dim: int=64, timeout: float=30.0) -> np.ndarray:
    """
    Embeds a single text with the Jina embeddings HTTP API (model `jina-clip-v2`).
    Relies on `jina_key` being defined in the notebook namespace.
    :param text: The text to embed.
    :param dim: The requested embedding dimensionality.
    :param timeout: Seconds to wait for the HTTP request before giving up.
    :return: The embedding of `text` as a 1-D numpy array.
    :raises requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    payload = {
        'input': [{"text": text}],
        'model': 'jina-clip-v2',
        # NOTE(review): Jina's API reference appears to call this field
        # `embedding_type`; confirm that `encoding_type` is honoured.
        'encoding_type': 'float',
        # NOTE(review): sent as a string, as before; confirm the API does not
        # require an integer here.
        'dimensions': str(dim),
    }
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {jina_key}',
    }
    response = requests.post(
        'https://api.jina.ai/v1/embeddings',
        headers=headers,
        json=payload,
        timeout=timeout,  # without a timeout a stalled connection blocks the worker forever
    )
    # Fail with the real HTTP error (rate limit, bad key, ...) instead of an
    # opaque KeyError on 'data' when the body is an error document.
    response.raise_for_status()
    return np.array(response.json()['data'][0]['embedding'])

# Jina embedding API limits (per the original note): 500 rpm, 1000000 tpm.
# Both limits are scaled by 0.9 before being handed to the executor, and
# `token_count` is the measure function counted against the second limit.
executor_jina = RateLimitedExecutor(
    max_workers=25,
    requests_per_minute=0.9 * 500,
    measure_per_minute=0.9 * 1000000,
    measure=token_count,
)

_openai_default_client = None  # created on first use by openai_embed when no client is passed


def openai_embed(text: str, client: "OpenAI | None" = None) -> List[float]:
    """
    Embeds a single text with OpenAI's `text-embedding-3-small` model.
    :param text: The text to embed.
    :param client: The OpenAI client to use. When omitted, a default `OpenAI()`
        client is created once and reused; other cells in this notebook call
        `openai_embed(text)` with no client.
    :return: The embedding of `text` as a list of floats.
    """
    global _openai_default_client
    if client is None:
        # Concurrent first calls may each build a client; the last one wins and
        # the extras are simply discarded.
        if _openai_default_client is None:
            _openai_default_client = OpenAI()
        client = _openai_default_client
    emb = client.embeddings.create(input=text, model="text-embedding-3-small")
    return emb.data[0].embedding

# OpenAI embedding API limits (per the original note): 5000 rpm, 5000000 tpm.
# Both limits are scaled by 0.9 before being handed to the executor, and
# `token_count` is the measure function counted against the second limit.
executor_openai_embeddings = RateLimitedExecutor(
    max_workers=50,
    requests_per_minute=0.9 * 5000,
    measure_per_minute=0.9 * 5000000,
    measure=token_count,
)

Compute embeddings once the messages have been scraped.

In [None]:
# embed all good-enough tweets and replies

# Load the scraped messages: a dict keyed by user whose "tweets" and "replies"
# entries are lists of strings. JSON text is UTF-8 by spec, so decode it
# explicitly instead of relying on the platform's default encoding.
with open("messages.json", encoding="utf-8") as f:
    messages = json.load(f)

def wrapper(x: Tuple[str, str, str]) -> Tuple[str, str, str, np.ndarray]:
    """
    Embeds one queued message and records the result in the global `embeddings` dict.
    :param x: A (user, kind, text) triple, where kind is "tweets" or "replies".
    :return: The same triple extended with the embedding returned by `openai_embed`.
    """
    user, kind, text = x[0], x[1], x[2]
    # NOTE(review): openai_embed is called with the text only; confirm that its
    # signature allows omitting the client.
    vector = openai_embed(text)
    # Print roughly 1% of the results as a running sample.
    if random.random() < 0.01:
        print(user, kind, text, vector)
    embeddings[user][kind].append((text, vector))
    return (user, kind, text, vector)

to_embed = []

# Strips an optional leading "RT ", any run of leading "@handle " mentions and
# an optional trailing t.co link; group 1 is the remaining text.
# Raw string: "\w" in a normal string literal is an invalid escape sequence
# (a SyntaxWarning on recent Python versions). The pattern itself is unchanged.
# NOTE(review): "." does not match newlines and "$" only matches at the end, so
# multi-line messages do not match at all and are dropped -- confirm intended.
cleaner_regex = r"^(?:RT )?(?:@[a-zA-Z0-9_:]+ )*(.*?)(?:https://t.co/\w+)?$"


def find_match(rx, s):
    """Return the first capture group of `rx` searched in `s`, or None when `s` is empty or nothing matches."""
    if not s:
        return None
    found = re.search(rx, s)  # search once instead of twice per string
    return found.groups()[0] if found else None


# Filled in by `wrapper` as embeddings come back: user -> kind -> [(text, embedding), ...]
embeddings = {
    alice: {"tweets": [], "replies": []} for alice in messages
}


def filter_fn(alice, kind):
    """Return the cleaned, stripped messages of `kind` for user `alice` whose cleaned text exceeds 140 characters."""
    cleaned = (find_match(cleaner_regex, s) for s in messages[alice][kind] if s)
    # The length test is applied before stripping, as in the original filter.
    return [z.strip() for z in cleaned if z and len(z) > 140]


embeddable0 = {
    alice: {
        "tweets": filter_fn(alice, "tweets"),
        "replies": filter_fn(alice, "replies")
    } for alice in messages
}

# Drop users that have nothing left after filtering.
embeddable = {k: v for k, v in embeddable0.items() if len(v["tweets"]) + len(v["replies"]) > 0}

print(sum(len(v["tweets"]) + len(v["replies"]) for v in embeddable.values()))

# Flatten into (user, kind, text) work items for the executor.
for alice, data in embeddable.items():
    for kind, msgs in data.items():
        for msg in msgs:
            to_embed.append((alice, kind, msg))

print(len(to_embed))
mapped = executor_openai_embeddings.map(wrapper, to_embed)

Build the nearest-neighbor index once the embeddings have been computed.

In [None]:
# Type aliases for the embedding store and the index keys.
User = str  # user identifier; top-level key of the embeddings dict
Embedding = List[float] | np.ndarray  # one embedding vector, as a list of floats or an array
EmbeddedMsgs = Dict[str, List[Tuple[str, Embedding]]]  # message kind -> [(message text, embedding), ...]
FlatMsg = Tuple[User, str, str]  # (user, message kind, message text)

def load_json_data(filepath: str) -> Dict[User, EmbeddedMsgs]:
    """
    Loads JSON data from disk.
    :param filepath: The path of the JSON file to read.
    :return: The parsed JSON content.
    """
    # Decode explicitly as UTF-8 (the JSON interchange encoding) so that message
    # text is read the same way on every platform.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

def preprocess_data(data: Dict[User, EmbeddedMsgs]) -> Tuple[List[FlatMsg], Embedding]:
    """
    Walks the nested author -> message type -> messages structure and splits it
    into two parallel sequences.
    :param data: Mapping of author to message type to (message, embedding) pairs.
    :return: A tuple of the (author, message type, message) keys and a numpy
        array built from the embeddings, in the same order.
    """
    flat = [
        ((author, msg_type, text), vector)
        for author, by_type in data.items()
        for msg_type, entries in by_type.items()
        for text, vector in entries
    ]
    keys = [key for key, _ in flat]
    vectors = [vector for _, vector in flat]
    return keys, np.array(vectors)

def create_faiss_index(embeddings: List[Embedding], index_path: str) -> faiss.IndexFlatL2:
    """
    Creates a Faiss index, adds the embeddings, and saves it to disk.
    :param embeddings: The embeddings to add to the index: a 2-D array, or a
        list of equally sized vectors (lists or 1-D arrays).
    :param index_path: The path to save the index.
    :return: The Faiss index, or None when `embeddings` is empty (nothing is written).
    :raises ValueError: If the embeddings do not form a 2-D (count, dim) matrix.
    """
    if len(embeddings) == 0:
        return None
    # Normalise every accepted input form to the contiguous float32 matrix that
    # Faiss works with. The previous code read `.shape` off a plain list when
    # its elements were arrays, and handed a Python list to `index.add`.
    matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
    if matrix.ndim != 2:
        raise ValueError(f"expected a 2-D (count, dim) set of embeddings, got shape {matrix.shape}")
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    faiss.write_index(index, index_path)  # Save the index to disk
    return index

def load_faiss_index(index_path: str) -> faiss.IndexFlatL2:
    """
    Reads a previously saved Faiss index back from disk.
    :param index_path: Location of the saved index file.
    :return: The index object produced by `faiss.read_index`.
    """
    index = faiss.read_index(index_path)
    return index

def find_nearest(queries: List[T], index: faiss.IndexFlatL2, keys: List[FlatMsg], embedder: Callable[[T], Embedding], top_n: int) -> List[Tuple[FlatMsg, float]]:
    """
    Finds the top_n nearest neighbors to the query embedding.
    Every query is embedded and searched, but only the neighbors of the FIRST
    query are returned (unchanged from the previous behaviour).
    :param queries: The queries to find nearest neighbors for.
    :param index: The Faiss index.
    :param keys: The keys corresponding to the embeddings in the index.
    :param embedder: The function to embed the queries.
    :param top_n: The number of nearest neighbors to return.
    :return: A list of tuples containing the keys and distances of the nearest
        neighbors; shorter than top_n when the index holds fewer vectors, and
        empty when `queries` is empty.
    """
    if len(queries) == 0:
        return []
    # Hand the index a contiguous float32 matrix explicitly.
    vectors = np.ascontiguousarray([embedder(query) for query in queries], dtype=np.float32)
    Dist, Idx = index.search(vectors, top_n)
    # Faiss pads missing neighbours with label -1; indexing keys[-1] would
    # silently report the last key, so those slots are skipped.
    return [
        (keys[int(i)], float(distance))
        for i, distance in zip(Idx[0], Dist[0])
        if i >= 0
    ]

json_path = 'embeddings.json'
index_path = 'faiss_index.bin'
# The index keys get their own file. They used to be written to `index_path`,
# which overwrote the Faiss index that had just been saved, and were read back
# from `json_path`, which is the embeddings store rather than a key list.
keys_path = 'faiss_keys.json'

data = embeddings
# data=load_json_data(json_path)
try:
    index = load_faiss_index(index_path)
    with open(keys_path, 'r', encoding='utf-8') as f:
        # JSON stores the key tuples as lists; turn them back into tuples.
        keys = [tuple(key) for key in json.load(f)]
    print("Loaded Faiss index from disk.")
except Exception as e:
    print("Faiss index not found, creating a new one...")
    # Show why loading failed so that unrelated errors are not hidden.
    print(f"\t(load failed with {type(e).__name__}: {e})")
    keys, embeds = preprocess_data(data)
    index = create_faiss_index(embeds, index_path)
    with open(keys_path, 'w', encoding='utf-8') as f:
        json.dump(keys, f)
    # create_faiss_index returns None (and writes nothing) when there are no embeddings.
    if index is not None:
        print("Created and saved Faiss index to disk.")

In [48]:
# Free-text query whose nearest stored messages are looked up below.
query = """The canvas of experience does exist in an ultimate sense, I think, but its division of itself into 'people' with normative obligations is a self-stabilizing fiction—just like biological species—that can only exist in a conventional sense"""
N = 80  # number of neighbours to list

# NOTE(review): openai_embed is called with the text only; confirm that its
# signature allows omitting the client.
nearest_messages = find_nearest(
    queries=np.array([query]),
    index=index,
    keys=keys,
    embedder=lambda x: np.array(openai_embed(x)),
    top_n=N,
)

print(f"Top {N} nearest messages to '{query}':")
for i, (key, distance) in enumerate(nearest_messages):
    print(f"\t({i}) Keys: {key}, Distance: {distance}")

Top 80 nearest messages to 'The canvas of experience does exist in an ultimate sense, I think, but its division of itself into 'people' with normative obligations is a self-stabilizing fiction—just like biological species—that can only exist in a conventional sense':
	(0) Keys: ('psychiel', 'tweets', "The canvas of experience does exist in an ultimate sense, I think, but its division of itself into 'people' with normative obligations is a self-stabilizing fiction—just like biological species—that can only exist in a conventional sense."), Distance: 0.022105978801846504
	(1) Keys: ('tr_babb', 'replies', "even if we think everything in the cyan circle has an experience (as a ~panpsychist, I do), there are things to be aware of besides one's own existence, and I think most experiences are probably of the former type and not the latter"), Distance: 0.7925213575363159
	(2) Keys: ('RadiantNous', 'replies', 'does really nothing exist as such? surely there\'s at least the observer, the first-p

In [None]:
# with open("record.json", "r") as file:
#     record = json.load(file)

# record.users["alice"]: {
#     "data": {
#         "followers_count": int,        | the number of followers alice has
#         "friends_count": int,          | the number of users alice follows
#         "name": str,                   | the user's name
#         "screen_name": str,            | the user's screen name
#         "description": str,            | the user's description
#         "profile_image": str,          | the user's profile image
#         ...
#     },
#     "depth": int,                      | alice's degrees of separation from the root user
#     "follows_to": List[str],           | the users that alice follows
#     "follows_from": List[str],         | the users that follow alice
#     "cursors": {
#         "follows_to": Optional[str],   | the cursor for alice's follows list, if it wasn't completely fetched
#         "follows_from": Optional[str]  | the cursor for alice's followers list, if it wasn't completely fetched
#     }
# }

# with open("descriptions.json", "r") as file:
#     descriptions = json.load(file)

# descriptions[alice]: {
#     "confidence": int,                | the confidence score for the description
#     "description": str,               | the description of the user
#     "interests": List[str],           | the user's interests
#     "aptitudes": List[{               | the user's aptitudes
#         "field": str,                 |     the field of expertise
#         "level": int                  |     the user's proficiency in the field, on a scale of 1 to 5
#     }]
# }

# with open("messages.json", "r") as file:
#     messages = json.load(file)

# messages[alice]: {
#     "tweets": List[str],             | the user's tweets
#     "replies": List[str],            | the user's replies
#     "cursors": {
#         "tweets": Optional[str],     | the cursor for the user's tweets, if it wasn't completely fetched
#         "replies": Optional[str]     | the cursor for the user's replies, if it wasn't completely fetched
#     }
# }

# record.json: `<str>: {str}: <str>: (<str>: Any), int, [str], (<str>: str?)`
#           502 first-order connections
#           * stop at 1000 followers and 1000 follows
#           => 330,382 total users (across all orders) tabulated
#
# messages.json: `{str}: <str>: [str], (<str>: str?)`
#           2,000 second-order connections (195 inaccessible, too little data for 66)
#           * stop at 200 tweets and 400 replies per user (no metadata kept)
#           => 584,859 total tweets and replies
#           => 79,142,511 chars, 20,038,493 tokens ($0.40 for text-embedding-3-small)
#
# descriptions.json: `{str}: <str>: int, str, [str], [<str>: str, int]`
#           evaluation of all second-order connections with sufficient data
#           => 1,739 users with descriptions
