In [2]:
import create_embedding as ce
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

In [3]:
import json
import pandas as pd

In [4]:
import convokit
corpus_dir = "data/Arknights_plot/corpus"
corpus = convokit.model.corpus.Corpus(corpus_dir)

In [3]:
corpus.print_summary_stats()

Number of Speakers: 2031
Number of Utterances: 88493
Number of Conversations: 6405


## Task : create embedding using openAI for ALL utterance

This is the sample to use OpenAI embedding model on utterance. It write the embedding into a json file with key to utterance id in real-time.

**Warning**: this is the final code I used on the entire dataset, costing around $.34; please try on smaller dataset to avoid unexpected expenses.

In [4]:
def process_utterance(client, utterance, model="text-embedding-3-large"):
    """
    Given an utterance, returns the id and the embedding of the utterance
    """
    text = utterance.text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    embedding = response.data[0].embedding
    return utterance.id, embedding


In [7]:
def generate_embeddings(client, corpus, num_workers=10, model="text-embedding-3-large", output_file="utterance_embeddings.json"):
    # Initialize a lock for thread-safe writing to the file
    lock = Lock()

    # Open the output file in write mode
    with open(output_file, 'w') as f:
        f.write('{')  # Start the JSON object

    def process_and_write_utterance(client, utterance, model):
        utterance_id, embedding = process_utterance(client, utterance, model)
        # Convert the embedding to a string representation for JSON
        embedding_str = json.dumps(embedding)
        # Prepare the JSON entry for this utterance
        json_entry = f'"{utterance_id}": {embedding_str},'

        # Use the lock to ensure thread-safe writing to the file
        with lock:
            with open(output_file, 'a') as f:
                f.write(json_entry)

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Create a future for each utterance in the corpus
        futures = [executor.submit(process_and_write_utterance, client, utt, model) for utt in corpus.iter_utterances()]

        # Use tqdm to create a progress bar
        for _ in tqdm(as_completed(futures), total=len(futures), desc="Processing Utterances"):
            pass  # Each future writes its result to the file upon completion

    # Correct the final JSON format by removing the last comma and closing the JSON object
    with open(output_file, 'rb+') as f:
        f.seek(-1, 2)  # Go to the last character
        f.truncate()  # Remove the last comma
        f.write(b'}')  # Close the JSON object

    print(f"Embeddings saved to {output_file}")

In [8]:
client = OpenAI()
num_workers = 50  # Set the number of workers as desired
generate_embeddings(client, corpus, num_workers)


Processing Utterances: 100%|██████████| 88493/88493 [11:29<00:00, 128.30it/s]

Embeddings saved to utterance_embeddings.json





#### Here, the resulting json is extremly large, using `ijson` to manage it.

In [1]:
import ijson

In [2]:
utt_embed_dir = "data\\Arknights_plot\\embedding\\utterance_embeddings.json"

In [4]:
with open(utt_embed_dir, 'r') as file:
    # Parse the JSON objects one by one
    parser = ijson.items(file, 'item')
    
    # Iterate over the JSON objects
    for item in parser:
        # Process each JSON object as needed
        print(item)

In [18]:
# The result file is VERY LARGE 
# using ijson to read the file

# take a look at the first 5 lines

import ijson

def read_embeddings(filename):
    with open(filename, 'rb') as f:
        keys = ijson.items(f, 'item.key')  # Get all keys
        f.seek(0)  # Reset file pointer to the beginning
        values = ijson.items(f, 'item.value')  # Get all values

        for utterance_id, embedding in zip(keys, values):
            print(f"Utterance ID: {utterance_id}, Embedding: {embedding}")




In [19]:
read_embeddings(utt_embed_dir)

### Sample usage: A machine learning classification task

In [10]:
interested_characters = ["Amiya", "Blaze", "Gavial"]

In [15]:
def filter_embeddings(filename, characters):
    filtered_embeddings = {}
    with open(filename, 'rb') as f:
        keys = ijson.items(f, 'item.key')
        f.seek(0)  # Reset file pointer to the beginning
        values = ijson.items(f, 'item.value')

        for utterance_id, embedding in zip(keys, values):
            character_name = utterance_id.split("_")[0]  # Assuming the utterance ID format includes the character name
            if character_name in characters:
                filtered_embeddings[utterance_id] = embedding
    return filtered_embeddings

In [16]:
interested_embeddings = filter_embeddings(utt_embed_dir, interested_characters)

In [17]:
interested_embeddings

{}

In [12]:
import numpy as np

X = []  # Feature matrix
y = []  # Labels

for utterance_id, embedding in interested_embeddings.items():
    character_name = utterance_id.split("_")[0]
    X.append(embedding)
    y.append(character_name)

X = np.array(X)
y = np.array(y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Encoding labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Training a classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Making predictions and evaluating
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# PCA for dimensionality reduction
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

# Visualization
plt.figure(figsize=(10, 7))
for i, character in enumerate(interested_characters):
    idx = y == character
    plt.scatter(X_reduced[idx, 0], X_reduced[idx, 1], label=character)
plt.legend()
plt.title("Character Embeddings Visualization")
plt.xlabel("PCA1")
plt.ylabel("PCA2")
plt.show()