## Imports and Functions

In [22]:
import requests
import pandas
import pickle
import gzip
import os
from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)
import openai
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
load_dotenv()

# Configuration
# Default model name handed to get_embedding by embedding_from_string.
EMBEDDING_MODEL = "text-embedding-ada-002"

# API key comes from the environment; this is None when OPENAI_API_KEY is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

def get_key_words(element_content, prompt):
    """
        Asks the chat model for keywords that describe an element

        The prompt and the element content are joined with a single space and
        sent as one user message. Text longer than 2.5 * max_tokens characters
        is cut down to 2 * max_tokens characters before it is sent.

        Returns the content of the first choice in the response, or an empty
        string when the request raises an exception (the error is printed).
    """
    max_tokens = 4097
    text = " ".join((prompt, element_content))

    # Character-count cut-off: the threshold (2.5x) is higher than the length kept (2x)
    if len(text) > max_tokens * 2.5:
        text = text[:int(max_tokens * 2)]
        print("TRUNCATED: ", len(text))

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": text},
    ]

    try:
        response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Best effort: report the problem and let the caller see an empty result
        print("ERROR: ", e)
        return ""

def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
) -> list:
    """
        Returns the embedding of a string

        Thin wrapper: forwards `string` and `model` to get_embedding and
        hands back whatever that call returns.
    """
    return get_embedding(string, model)

def get_embeddings_data(file: str):
    """
        Returns the object pickled inside a gzip-compressed file

        NOTE: unpickling can execute arbitrary code, so only load files
        that come from a trusted source.
    """
    with gzip.open(file, mode='rb') as compressed:
        # Decompress and unpickle in one pass
        return pickle.load(compressed)

def get_source_elements(file: str):
    """
        Returns the source elements pickled in a file

        Args:
            file: path of the pickle file to read.

        Returns:
            The unpickled object, or an empty list when the file does not
            exist (a message is printed in that case).

        Any other failure (e.g. a corrupt pickle or a permission error)
        propagates to the caller instead of being reported as a missing file.

        NOTE: unpickling can execute arbitrary code, so only load files
        that come from a trusted source.
    """
    try:
        with open(file, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        # The previous bare `except` fell through to `return source_elements`
        # with the name never bound, which raised UnboundLocalError.
        print("No source elements file found")
        return []

def get_element_info(element):
    """
        Returns the element's name and content formatted as one string

        The element's 'name' must already be a string; its 'content' is
        converted with str().
    """
    pieces = [
        "{'name': ",
        element['name'],
        ", 'description': ",
        str(element['content']),
        "}",
    ]
    return "".join(pieces)

def get_recommendations(input_embedding, source_data, max_distance: float = 2.0, max_courses: int = 10, print_results: bool = False):
    """
        Returns the source elements closest to the input embedding

        Args:
            input_embedding: embedding that the source elements are compared to.
            source_data: sequence of dicts, each with a 'name' and an
                'embeddings' entry.
            max_distance: elements with a cosine distance above this value
                are left out.
            max_courses: maximum number of elements returned.
            print_results: when truthy, every recommendation is also printed.

        Returns:
            A list of {"course": <name>, "distance": <str>} dicts in the order
            given by indices_of_nearest_neighbors_from_distances; the distance
            is rounded to 3 decimals and converted to a string.
    """
    # Collect every element's embedding, keeping the order of source_data
    source_embeddings = [element['embeddings'] for element in source_data]

    # get distances between the input embedding and the source embeddings (function from embeddings_utils.py)
    distances = distances_from_embeddings(input_embedding, source_embeddings, distance_metric="cosine")

    # get indices of nearest neighbors (function from embeddings_utils.py)
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    recommended_elements = []
    for rank, i in enumerate(indices_of_nearest_neighbors, start=1):
        # stop once max_courses elements have been collected
        if rank > max_courses:
            break

        # stop at the first element beyond max_distance; this relies on the
        # indices being ordered by increasing distance
        if distances[i] > max_distance:
            break

        if print_results:
            # print out the similar element and its distance
            print(
                f"""
            --- Recommendation #{rank} (nearest neighbor {rank}) ---
            Element: {source_data[i]['name']}
            Distance: {distances[i]:0.3f}"""
            )

        recommended_elements.append({"course": source_data[i]['name'], "distance": str(round(distances[i], 3))})

    return recommended_elements

## Load source Embeddings data

In [4]:
# Path of the gzip-compressed pickle that get_embeddings_data reads.
embeddings_data_file = "embeddings_data.gz"
# Loaded once here; the last cell passes it to get_recommendations as the source data.
embeddings_data = get_embeddings_data(embeddings_data_file)

## Define Input Element
Here you can create your own input element or choose one of the source elements.

In [19]:

# Ask how the input element should be defined: typed in by hand (1) or
# picked from the pickled source elements (2).
choice = input("Do you want to:  1) Create your own input element       2) Choose one from source elements      Type 1 or 2").strip()

if choice == "1":
    element_name = input("Type the name of the element")
    element_content = input("Type the content of the element")

    input_element = {
        'id': "100", 
        'name':  element_name,
        'content': element_content
    }
    print("Input element: ", input_element)
elif choice == "2":
    source_elements_file = "source.pickle"
    source_elements = get_source_elements(source_elements_file)

    # Without elements the selection loop below could never be satisfied
    if len(source_elements) == 0:
        raise ValueError("No source elements available to choose from")

    print("The following elements represent online courses:")
    for position, element in enumerate(source_elements, start=1):
        print(str(position) + ": " + element['name'])

    chosen_index = input("Choose the element that you want to use as input").strip()

    # isdecimal() only accepts characters that int() can parse (isnumeric()
    # also accepts e.g. "²"), and the lower bound rejects "0", which would
    # otherwise index source_elements[-1] and silently pick the last element.
    while not (chosen_index.isdecimal() and 1 <= int(chosen_index) <= len(source_elements)):
        print("Choice must be just a number and must be between the possible choices")
        chosen_index = input("Choose the element that you want to use as input").strip()

    chosen_index = int(chosen_index)

    # The menu is 1-based, the list is 0-based
    input_element = source_elements[chosen_index - 1]
    print("Input Element: " + input_element['name'])

else:
    # Stop here so later cells cannot run with an undefined or stale input_element
    raise ValueError("Invalid choice, exiting: type 1 or 2")



The following elements represent online courses:
1: Data Analytics
2: Databases
3: Cybersecurity
4: Machine learning
5: Web development
6: Digital marketing
Input Element: Databases


## Define prompt for input element keywords generation

Define the prompt that ChatGPT will use to generate the keywords for your input element.\
Use the default prompt only if the input element represents an online course, because the default prompt is worded for course descriptions.

In [20]:
# Default instruction placed in front of the element info; it is worded for online courses.
default_prompt = "This is the information about an online course. You must generate 5 words that describe as precisely as possible the main topic of the course. You must write the 5 words in one single line."
# Alias kept for any code that still uses the original, misspelled name.
default_promtp = default_prompt

print("Default prompt: " + default_prompt)
choice = input("Do you want to: 1) customize the prompt   2) use the default prompt \n Only use default prompt if the input element represents an online course ").strip()

if choice == "1":
    prompt = input("Enter new prompt. The format should be as close to the default prompt as possible")

    # A blank prompt would send only the element info to the model
    if not prompt.strip():
        prompt = default_prompt
        print("Empty prompt entered, using default prompt!")

else:
    # Any answer other than "1" selects the default prompt
    prompt = default_prompt
    print("Using default prompt!")


Default prompt: This is the information about an online course. You must generate 5 words that describe as precisely as possible the main topic of the course. You must write the 5 words in one single line.
Using default prompt!


## Get recommended elements based on input element

In [23]:
# Format the input element's name and content as a single string
input_element_info = get_element_info(input_element)

# Get keywords from input element
input_element_keywords = get_key_words(input_element_info, prompt)

# get_key_words signals a failed request by returning "", so stop here
# instead of embedding an empty string and ranking against it.
if not input_element_keywords.strip():
    raise RuntimeError("Keyword generation returned no keywords; cannot compute recommendations")
print("Keywords from input element: ", input_element_keywords)

# Get embeddings from the generated keywords (not from the raw element text):
input_element_embeddings = embedding_from_string(input_element_keywords)
print("Embeddings generated!")

# Get recommended source elements from input element:
recommended_elements = get_recommendations(input_element_embeddings, embeddings_data, print_results=True)

Keywords from input element:  Database Development, SQL, Python, NoSQL, Relational Mapping.
Embeddings generated!

            --- Recommendation #1 (nearest neighbor 1) ---
            Element: Databases
            Distance: 0.055

            --- Recommendation #2 (nearest neighbor 2) ---
            Element: Web development
            Distance: 0.176

            --- Recommendation #3 (nearest neighbor 3) ---
            Element: Data Analytics
            Distance: 0.201

            --- Recommendation #4 (nearest neighbor 4) ---
            Element: Machine learning
            Distance: 0.202

            --- Recommendation #5 (nearest neighbor 5) ---
            Element: Digital marketing
            Distance: 0.213

            --- Recommendation #6 (nearest neighbor 6) ---
            Element: Cybersecurity
            Distance: 0.231
