In [None]:
import openai
import pandas as pd

import os

# Chat model used to generate the final answer.
COMPLETIONS_MODEL = "gpt-3.5-turbo"
# Embedding model used for sentences, aggregated clusters and questions.
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# Take the key from the OPENAI_API_KEY environment variable instead of pasting
# it into the notebook (secrets in notebooks leak through version control and
# shared outputs). Falls back to the empty string, as before, when unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# Count the number of tokens if necessary
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        Length of the token sequence produced by that encoding.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)


In [None]:
# Load the raw transcripts; the "output" column holds the transcript text.
raw_transcript_df = pd.read_csv("raw_transcript.csv")
# Split into sentences on a period that is followed by whitespace or the end of
# the text. A bare split('.') also cuts decimal figures such as "4.82" in two,
# which mangles the numbers in the transcript. A multi-character pattern is
# treated as a regular expression by pandas' str.split.
raw_transcript_df["sentence"] = raw_transcript_df["output"].str.split(r'\.(?:\s+|$)')
# One row per sentence; the other columns are repeated for every sentence.
exploded_df = raw_transcript_df.explode("sentence")

In [None]:
from openai.embeddings_utils import get_embedding

# Work on an explicit copy so the column assignments below never target a
# slice of another frame (avoids pandas' SettingWithCopyWarning).
exploded_df = exploded_df.copy()
# Trim whitespace left over from sentence splitting so that whitespace-only
# fragments are dropped instead of being sent to the embeddings API.
exploded_df['sentence'] = exploded_df['sentence'].str.strip()
exploded_df = exploded_df[exploded_df['sentence'].str.len() > 0].copy()
# One embeddings request per remaining sentence.
exploded_df['embedding'] = exploded_df['sentence'].apply(lambda row: get_embedding(row, engine=EMBEDDINGS_MODEL))
exploded_df.to_csv('embeddings_transcript.csv')

In [None]:
#k-means clustering
import numpy as np
from sklearn.cluster import KMeans

import ast  # used to parse the stringified embeddings without executing them

# Reload the sentence-level embeddings from disk.
exploded_parent_df = pd.read_csv("embeddings_transcript.csv")

MAX_CLUSTERS = 10  # arbitrary upper bound on clusters per transcript
clustered_columns = ['url', 'title', 'cluster', 'aggregated_text']

# Collect one frame per URL and concatenate once at the end; concatenating
# inside the loop re-copies the accumulated frame on every iteration.
per_url_frames = []

# Rows without a URL cannot be grouped into a transcript, so skip them.
for item in exploded_parent_df['url'].dropna().unique():
    embedding_df = exploded_parent_df.loc[exploded_parent_df['url'] == item].copy()
    # The CSV stores each embedding as the text of a Python list.
    # ast.literal_eval parses literals only, whereas eval would execute
    # arbitrary code found in the file.
    embedding_df["embedding"] = embedding_df.embedding.apply(ast.literal_eval).apply(np.array)
    matrix = np.vstack(embedding_df.embedding.values)

    # KMeans raises when asked for more clusters than there are samples, so
    # short transcripts get one cluster per sentence at most.
    n_clusters = min(MAX_CLUSTERS, len(matrix))

    # n_init is set explicitly because its default differs between
    # scikit-learn versions; random_state keeps the clustering reproducible.
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10, random_state=42)
    kmeans.fit(matrix)
    labels = kmeans.labels_
    embedding_df["cluster"] = labels

    # Join the sentences of each cluster into one passage, prefixed by the title.
    combined_df = embedding_df.groupby(['url', 'title', 'cluster'])['sentence'].apply('. '.join).reset_index()
    combined_df['aggregated_text'] = combined_df['title'] + ', ' + combined_df["sentence"]
    combined_df = combined_df.drop(['sentence'], axis=1)
    per_url_frames.append(combined_df)

if per_url_frames:
    clustered_text_df = pd.concat(per_url_frames)
else:
    # No URLs at all: keep the expected columns so the CSV still has a header.
    clustered_text_df = pd.DataFrame(columns=clustered_columns)

clustered_text_df.to_csv('clustered_text.csv')

In [None]:
# Reload the per-cluster aggregated text from disk.
clustered_text_df = pd.read_csv("clustered_text.csv")

# Request one embedding per aggregated passage and persist the result.
aggregated_passages = clustered_text_df['aggregated_text']
clustered_text_df['embedding'] = aggregated_passages.apply(
    lambda passage: get_embedding(passage, engine=EMBEDDINGS_MODEL)
)
clustered_text_df.to_csv('clustered_embeddings.csv')

In [None]:
from openai.embeddings_utils import cosine_similarity

import ast  # used to parse the stringified embeddings without executing them

clustered_embeddings_df = pd.read_csv('clustered_embeddings.csv')
# The CSV stores each embedding as the text of a Python list.
# ast.literal_eval parses literals only, whereas eval would execute arbitrary
# code found in the file.
clustered_embeddings_df['embedding'] = clustered_embeddings_df['embedding'].apply(ast.literal_eval).apply(np.array)

question1 = "How did Adobe perform this quarter?"
question1_vector = get_embedding(question1, engine=EMBEDDINGS_MODEL)

question2 = "How is PagerDuty using and planning to use AIOps?"
question2_vector = get_embedding(question2, engine=EMBEDDINGS_MODEL)

# Score every aggregated passage against question2 and keep the three best.
clustered_embeddings_df["similarities"] = clustered_embeddings_df['embedding'].apply(lambda x: cosine_similarity(x, question2_vector))
sorted_embeddings = clustered_embeddings_df.sort_values("similarities", ascending=False).head(3)

sorted_embeddings

In [None]:
# Cap each matched passage at 1300 characters. String slicing counts
# characters, not tokens, so this is only a rough bound on prompt size.
context_passages = [
    passage[:1300]
    for passage in sorted_embeddings['aggregated_text']
]

# `context` is the newline-joined string used by the prompt; `text` is kept as
# an alias and displayed as the cell output.
text = "\n".join(context_passages)
context = text
text

In [None]:
# Instructions that constrain the model to the retrieved context.
system_prompt = "Answer the following question using only the context provided. Answer in the style of a financial analyst. If you don't know the answer for certain, say I don't know."

# Retrieved passages followed by the question, in a Q/A layout.
user_prompt = f"""
Context:
{context}

Q: {question2}
A:"""

chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

chat_response = openai.ChatCompletion.create(
    model=COMPLETIONS_MODEL,
    messages=chat_messages,
    temperature=0.2,
    max_tokens=700,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

# Display the assistant message of the first (only) choice.
chat_response["choices"][0]["message"]