In [None]:
import pandas as pd
import openai
import os
import tiktoken

In [None]:
# Baseline question, asked before any company data is supplied to the model.
prompt = "What does the start-up company Pentera do and who invested in it?"

response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=500,
    temperature=0,
)
# Show only the first completion, trimmed of surrounding spaces and newlines.
print(response["choices"][0]["text"].strip(" \n"))

In [None]:
# Load the company table from the local CSV file into a DataFrame.
df = pd.read_csv("unicorns.csv") 

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
import ast 
def summary(company,crunchbase_url,city,country,industry,investor_list):
    """Build a one-paragraph, natural-language description of a company.

    Parameters
    ----------
    company, crunchbase_url, city, country, industry
        Values interpolated verbatim into the text.
    investor_list
        Either a string holding a Python list literal (for example
        "['A', 'B']"), an actual list/tuple of names, or a missing value
        (None / NaN).

    Returns
    -------
    str
        Sentences covering headquarters, industry, investors and the
        Crunchbase link.

    Raises
    ------
    ValueError, SyntaxError
        If `investor_list` is a string that is not a valid Python literal.
    """
    if isinstance(investor_list, str):
        # literal_eval only parses literals, so it cannot execute code from the file.
        names = ast.literal_eval(investor_list)
    elif isinstance(investor_list, (list, tuple)):
        names = investor_list
    else:
        # Missing cells arrive as None or NaN (a float), not as a string.
        names = []

    if names:
        # join() puts separators only *between* names, avoiding a dangling
        # ", " before the sentence-ending period.
        investors = "The investors in the company are " + ", ".join(str(name) for name in names) + "."
    else:
        investors = "No investors are listed."

    text = f"{company} has headquarters in {city} in {country} and is in the field of {industry}. {investors} You can find more information at {crunchbase_url}"

    return text

In [None]:
# Build one natural-language summary per company from its descriptive columns.
df['summary'] = df.apply(
    lambda row: summary(
        row['Company'],
        row['Crunchbase Url'],
        row['City'],
        row['Country'],
        row['Industry'],
        row['Investors'],
    ),
    axis=1,
)

In [None]:
# Preview the data again, now including the generated `summary` column.
df.head()

In [None]:
# Inspect the generated summary text for the row labelled 0.
df['summary'][0]

In [None]:
import tiktoken

def num_tokens_from_string(string, encoding_name):
    """Count the tokens `string` is split into by the named tiktoken encoding.

    `encoding_name` is passed to `tiktoken.get_encoding`; the text is encoded
    with the resulting encoding and the length of the encoded sequence is
    returned.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

In [None]:
# Sanity-check the token counter on a single summary before applying it to every row.
num_tokens_from_string(df['summary'][0],encoding_name='cl100k_base')

In [None]:
# Record how many `cl100k_base` tokens each summary contains.
df['token_count'] = df['summary'].apply(
    lambda summary_text: num_tokens_from_string(summary_text, 'cl100k_base')
)

In [None]:
# Preview the data with the new `token_count` column.
df.head()

In [None]:
# Total token count across all summaries, scaled by 0.0004 per 1,000 tokens.
# NOTE(review): 0.0004 looks like a per-1K-token embedding price in USD — confirm against current pricing.
df['token_count'].sum() * 0.0004 / 1000

In [None]:
# 8191 tokens is the input limit for ada embeddings, so list any summaries that exceed it.
df[df['token_count'] > 8191]

In [None]:
#Create the function to get embeddings
def get_embedding(text):
    """Return the embedding that OpenAI's `text-embedding-ada-002` model gives for `text`.

    Sends one embedding request and returns the `embedding` field of the
    first entry in the response's `data` list. The OpenAI API key must
    already be configured before this is called.
    """
    response = openai.Embedding.create(input=text, model='text-embedding-ada-002')
    first_item = response["data"][0]
    return first_item["embedding"]


In [None]:
# Try the embedding call on a single summary first; this displays the full returned vector.
get_embedding(df['summary'][0])

In [None]:
# Embed every summary. `get_embedding` is called once per row, so this takes a while.
df['embedding'] = df['summary'].apply(get_embedding)
# Optional: uncomment to save the embeddings so they need not be recomputed.
#df.to_csv('unicorns_with_embeddings.csv',index=False)

In [None]:
# Optional: uncomment to reload previously saved embeddings instead of recomputing them.
# NOTE(review): after a CSV round trip the `embedding` column is presumably text rather
# than lists of floats — confirm, and parse it before computing similarities.
#df = pd.read_csv('unicorns_with_embeddings.csv')
df.head()

In [None]:
# The question to answer; its embedding is compared against every company summary
# to find the most relevant document.
prompt = "What does the company Pentera do and who invested in it?"

In [None]:
# Embed the prompt with the same function (and therefore the same model) used for
# the summaries, so the vectors can be compared.
prompt_embedding = get_embedding(prompt)

In [None]:
import numpy as np
# For larger numbers of vectors, dedicated services exist:
# take a look at vector search engines such as Pinecone or Weaviate.
def vector_similarity(vec1,vec2):
    """
    Compute the dot product of two vectors given as array-likes.

    OpenAI embeddings are normalized to length 1, so for them the dot product equals the cosine similarity.
    """
    left = np.array(vec1)
    right = np.array(vec2)
    return np.dot(left, right)

In [None]:
# Score every company by how similar its embedding is to the prompt's embedding.
df["prompt_similarity"] = df['embedding'].apply(
    lambda embedding: vector_similarity(embedding, prompt_embedding)
)

In [None]:
# Preview the data with the new `prompt_similarity` column.
df.head()

In [None]:
# Save the table, including the embeddings and similarity scores, to a CSV file.
df.to_csv('unicorns_with_embeddings_and_similarity.csv')

In [None]:
# Show the rows with the highest similarity to the prompt, best match first.
df.sort_values("prompt_similarity", ascending=False).head()

In [None]:
# Index label of the single best-matching row.
df['prompt_similarity'].idxmax()

In [None]:
# Show the summary of the best-matching company. The row is looked up by the
# label that idxmax() returns rather than by a hard-coded position, so the cell
# stays correct when the data or the prompt changes.
df.loc[df['prompt_similarity'].idxmax(), 'summary']

## Combine this information with the ChatGPT API

In [None]:
# Retrieve the summary text of the company most similar to the prompt.
# The bare name `summary` is the helper *function* that builds summaries, so
# interpolating it would send the function's repr to the model instead of
# the company description.
best_match_summary = df.loc[df['prompt_similarity'].idxmax(), 'summary']

response = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    messages = [
        {"role":"system", "content":"You are an assistant that gives info about startups."},
        # Supply the retrieved summary plus one extra fact as context for the question.
        {"role":"assistant", "content":f"{best_match_summary}. Valuation of Pentera in December 2022 is about $1.5 bn"},
        {"role":"user", "content":"Considering the information above, can you please tell me their final valuation with the general information about Pentera?"}
    ],
    max_tokens = 256
)

In [None]:
# Print the text of the first reply returned by the chat model.
print(response['choices'][0]['message']['content'])