## Imports and Functions

In [1]:
# imports
import pandas
import pickle
from googletrans import Translator, constants
import requests
import pandas
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import gzip
import json


translator = Translator()

from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)
import openai
from dotenv import load_dotenv
import os
load_dotenv()

from config import *

# constants
# Name of the embedding model; used as the default `model` argument of
# embedding_from_string() defined further down in this cell.
EMBEDDING_MODEL = "text-embedding-ada-002"

# Read the OpenAI API key from the environment (load_dotenv() was called above,
# presumably to populate it from a local .env file). os.environ.get() returns
# None instead of raising when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY')


def get_key_words(element_content, prompt, model="gpt-3.5-turbo", max_tokens=4097):
    """
        Returns keywords extracted from element content

        Sends ``prompt + " " + element_content`` as a single user message to the
        OpenAI chat-completion endpoint and returns the text of the first choice.

        Parameters:
            element_content (str): textual description of the element.
            prompt (str): instruction that is prepended to the content.
            model (str): name of the chat model passed to the API.
            max_tokens (int): token budget from which the character limits
                below are derived.

        Returns:
            str: the reply of the model, or "" when the request fails for any
            reason (the error is printed, not raised).
    """

    text = prompt + " " + element_content

    # Character-based approximation of the token budget: anything longer than
    # 2.5 characters per token is cut down to 2 characters per token.
    truncate_threshold = max_tokens * 2.5
    truncated_length = int(max_tokens * 2)

    if len(text) > truncate_threshold:
        text = text[:truncated_length]
        print("TRUNCATED: ", len(text))

    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": text},
            ],
        )

        key_words = response["choices"][0]["message"]["content"]
    except Exception as e:
        # Deliberately best-effort: the keyword-generation loop in this
        # notebook treats "" as the signal to stop and resume later.
        print("ERROR: ", e)
        key_words = ""
    return key_words


def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
) -> list:
    """
        Computes the embedding of a string.

        Thin wrapper that forwards ``string`` and ``model`` (in that order)
        to ``get_embedding`` and hands back whatever it returns.
    """
    return get_embedding(string, model)
    
def get_element_info(element):
    """
        Returns a one-line textual summary of an element

        Parameters:
            element (dict): must provide the keys 'name' and 'content'.

        Returns:
            str: "{'name': <name>, 'description': <content>}". The values are
            inserted as plain text (no quoting), so the result is meant to be
            read by the language model, not parsed.
    """
    # Both fields go through str() so a non-string name (e.g. a number or
    # None) no longer raises TypeError during concatenation.
    name = str(element['name'])
    description = str(element['content'])
    return "{'name': " + name + ", 'description': " + description + "}"
    



## Load elements from pickle

In [2]:
# Load the list of source elements from the pickle file named in the config.
source_file = source_elements_file
source_elements = []
try:
    # NOTE(review): pickle.load can execute arbitrary code; only open files
    # produced by a trusted step of this pipeline.
    with open(source_file, 'rb') as handle:
        source_elements = pickle.load(handle)
except FileNotFoundError:
    print("No source file found")
except Exception as error:
    # Still best-effort (source_elements stays empty), but report the real
    # cause instead of claiming the file is missing. KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    print("Could not load source file: ", error)

## Get Keywords from source elements

### Define prompt

Declare here the instruction that will be sent to ChatGPT to extract the keywords.
The prompt should be adapted to what your elements represent.
The example below is parameterised by `element_type`, which is set to "online blog post"; change it to describe your own elements (for example, "online course").

In [7]:
# Kind of element being described; it is interpolated twice into the prompt.
element_type = "online blog post"
prompt = (
    f"This is the information about a {element_type}. "
    f"You must generate 5 words that describe as precisely as possible the main topic of the {element_type}. "
    "You must write the 5 words in one single line."
)

### Generate keywords

In [12]:
# Column-oriented store: the three lists grow in lock-step, one entry per element.
keywords_data = {'id': [], 'name': [], 'keywords': []}
keywords_file = source_keywords_file

print(source_elements)
try:
    # Resume from a previous run if a keywords file already exists.
    with open(keywords_file, 'rb') as handle:
        keywords_data = pickle.load(handle)
except FileNotFoundError:
    print("No current keywords found")
except Exception as error:
    # Unreadable/corrupt file: start from scratch as before, but say why.
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    print("Could not load current keywords, starting from scratch: ", error)

print("TOTAL ELEMENTS: ", len(source_elements))
# Elements are processed in order, so the number of stored keywords is also
# the index of the first element that still needs processing.
current_keywords = len(keywords_data['name'])
counter = current_keywords
for element in source_elements[current_keywords:]:
    print("element: "+str(counter)+" of "+str(len(source_elements)))
    element_info = get_element_info(element)
    keywords = get_key_words(element_info, prompt)

    if keywords == "":
        # get_key_words returns "" when the request failed; stop here so the
        # stored lists stay aligned with source_elements.
        print("No keywords returned for element "+str(counter)+"; stopping. Re-run this cell to resume.")
        break

    keywords_data['id'].append(element['id'])
    keywords_data['name'].append(element['name'])
    keywords_data['keywords'].append(keywords)
    # Checkpoint after every element so an interrupted run can be resumed.
    with open(keywords_file, 'wb') as handle:
        pickle.dump(keywords_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

    counter = counter + 1
    

[{'id': 1, 'name': 'Data Analytics', 'content': 'Data Scientists try to make sense of the data that’s all around us. Taking a data science course can help you make informed decisions, create beautiful visualizations, and even try to predict future events through Machine Learning. If you’re curious about what you can learn about the world using the data produced every day, then data science might be for you!'}, {'id': 1, 'name': 'Databases', 'content': 'Learn database development from our best-in-class instructors. Our courses help you skill up in SQL, Python, NoSQL, Object relational mapping and more. Our authors show you how to gather the information needed, analyze the requirements, design a schema and implement the final solution.'}, {'id': 1, 'name': 'Cybersecurity', 'content': 'Today’s interconnected world makes everyone more susceptible to cyber-attacks. Whether you’re attracted to the relativity new world of cybersecurity as a professional, or just interested in protecting yours

## Write keywords to Excel

In [5]:
## Writing the keywords to Excel:
keywords_excel_file = source_keywords_excel_file

# keywords_file is assigned in the keyword-generation cell above.
with open(keywords_file, 'rb') as handle:
    keywords_data = pickle.load(handle)

df = pandas.DataFrame(keywords_data)

# The context manager saves and closes the workbook even if to_excel raises,
# so the file handle is never leaked.
with pandas.ExcelWriter(keywords_excel_file, engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name="EN")

## Generate element embeddings from their Keywords

In [6]:
embeddings_file = embeddings_data_file

with open(keywords_file, 'rb') as handle:
    keywords_data_df = pickle.load(handle)

# Reshape the column-wise structure into one record per element.
keywords_data = [
    {
        'id': keywords_data_df['id'][row],
        'name': keywords_data_df['name'][row],
        'keywords': keywords_data_df['keywords'][row],
    }
    for row in range(len(keywords_data_df['name']))
]

for counter, element in enumerate(keywords_data, start=1):
    print(f"Element {counter} of {len(keywords_data)}")
    element["embeddings"] = embedding_from_string(element['keywords'])

    # The whole list is re-written to the gzip file after each element,
    # so partial progress is always on disk.
    with gzip.open(embeddings_file, 'wb') as f:
        pickle.dump(keywords_data, f)
    



Element 1 of 7
Element 2 of 7
Element 3 of 7
Element 4 of 7
Element 5 of 7
Element 6 of 7
Element 7 of 7
