## Imports and Functions

In [65]:
# imports
import pandas
import pickle
from googletrans import Translator, constants
import requests
import pandas
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import gzip
import json


# googletrans client created once at import time; none of the cells visible
# in this notebook reference it afterwards.
translator = Translator()

from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)
import openai
from dotenv import load_dotenv
import os
load_dotenv()

from config import *

# constants
# Default model name used by embedding_from_string when no model is given.
EMBEDDING_MODEL = "text-embedding-ada-002"

# The key is read from the process environment (load_dotenv() has already run
# at this point); os.environ.get returns None when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY')


def get_key_words(element_content, prompt):
    """
        Returns keywords extracted from element content

        The prompt and the element content are joined with a single space and
        sent as one user message to the "gpt-3.5-turbo" chat model.

        Args:
            element_content: text describing the element.
            prompt: instructions placed in front of the element text.

        Returns:
            The message content of the first choice in the response, or ""
            when the request (or reading its response) raises.
    """
    # Presumably the token limit of the model; the character budget below
    # assumes roughly two characters per token, which is a heuristic, not an
    # exact token count.
    max_tokens = 4097
    max_chars = max_tokens * 2

    text = prompt + " " + element_content

    # Use the same limit for the check and for the cut. Previously the check
    # used 2.5 chars/token while the cut used 2 chars/token, so texts between
    # the two limits were sent untruncated even though longer ones were cut
    # down below them.
    if len(text) > max_chars:
        text = text[:max_chars]
        print("TRUNCATED: ", len(text))

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": text},
            ],
        )
        key_words = response["choices"][0]["message"]["content"]
    except Exception as e:
        # Deliberate best-effort: report the failure and fall back to an
        # empty result so a single failed request does not stop the notebook.
        print("ERROR: ", e)
        key_words = ""
    return key_words


def embedding_from_string(
    string: str,
    model: str = EMBEDDING_MODEL,
) -> list:
    """
        Computes the embedding of a string

        Thin wrapper: both arguments are forwarded positionally to
        get_embedding and its result is returned unchanged.
    """
    return get_embedding(string, model)
    
def get_element_info(element):
    """
        Returns the element's name and content as one dict-like string

        Args:
            element: mapping with the keys 'name' and 'content'.

        Returns:
            "{'name': <name>, 'description': <content>}". Only the keys are
            quoted; the values are inserted as plain text, so the result is
            not strict JSON.
    """
    # Both values are converted with str() so a non-string name (e.g. a
    # number) no longer raises TypeError; previously only 'content' was.
    return f"{{'name': {element['name']!s}, 'description': {element['content']!s}}}"
    



## Load Input Elements

In [5]:
# Load the pickled list of input elements; stay with an empty list on failure.
input_elements = []
try:
    with open(input_elements_file, 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        input_elements = pickle.load(handle)
except FileNotFoundError:
    print("No input file found")
except Exception as err:
    # Still best-effort, but report the real cause (corrupt pickle, missing
    # config name, permissions, ...) instead of claiming the file is missing.
    # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
    print("Could not load input elements: ", err)

print("INPUT ELEMENTS: ", input_elements)

INPUT ELEMENTS:  [{'id': 1, 'name': 'Machine Learning', 'content': 'Basic concepts of machine learning, machine learning techniques, main machine learning models, introduction to bots, etc.'}, {'id': 2, 'name': 'Data Analytics', 'content': 'Basic concepts, main analytical models, business intelligence, web page analytics, machine learning techniques, etc.'}]


## Get Keywords from Input Elements

### Define Prompt

In [6]:
# Kind of element being described; it is not interpolated into system_prompt
# in this cell.
element_type = "online test"

# Questions the user failed, kept as ONE string. The trailing backslashes
# continue the literal across lines, so the leading indentation of every
# continued line is part of the string value.
failed_questions = "In choosing an algorithm, what is the basis of the stability criterion?,\
                    Which would be a possible function to normalize in R?,\
                    Which tool displays the performance of an algorithm that is used in supervised learning?, \
                    The main problems of supervised learning in machine learning are... (select two), \
                    The unstructured learning model in machine learning that aims to structure data in groups according to its similarity is called..., \
                    What is the name of each of the data available for analysis in machine learning?, \
                    What is the purpose of Supervised Learning?, \
                    What is TensorFlow?, \
                    When we need to apply a model that results in a continuous variable, which is the one that tends to be used first?, \
                    In the data preparation phase, how are missing numeric vectors usually represented?, \
                    Select a trait of reinforcement learning."

# Instruction text that get_key_words places in front of the element info.
# failed_questions is embedded between triple backticks. The fourth line of
# the literal has no trailing backslash, so a newline is embedded at that
# point; the other line breaks are removed by the backslash continuations.
system_prompt = f"""You are an NLP AI that aims to generate keywords that summarize texts. In this case, you are given the information about the results of an online test done by a user. \
                You are given the name and description of the test, as well as the questions that the user has failed, delimited by triple backticks. failed questions: ```{failed_questions}```.\
                You must generate 5 words that describe as precisely as possible the topics that the user has to work on in order to get a better mark on the test. \
                It is VERY IMPORTANT that the first generated keyword describes the main topic of the test. The other keywords must be focused on the topics of the failed questions.
                You must write the 5 words in one single line. Here are the name and description of the test in JSON format:"""

### Get Keywords

In [7]:
# Use the first loaded input element; this raises IndexError when
# input_elements is empty (e.g. the input file could not be loaded).
input_element = input_elements[0]
element_info = get_element_info(input_element)

# One chat request: system_prompt followed by the formatted element info.
keywords = get_key_words(element_info, system_prompt)

print("KEYWORDS: ", keywords)

KEYWORDS:  Machine Learning, algorithm stability, R normalization, algorithm performance, supervised learning problems.


### Other option: One keyword per failed question

In [46]:
# Variant of the test prompt that asks for one keyword per failed question
# (after a first keyword for the main topic) instead of a fixed count of 5.
# failed_questions is embedded between triple backticks; the backslash
# continuations keep the whole literal on a single logical line.
system_prompt_2 = f"""You are an NLP AI that aims to generate keywords that summarize texts. In this case, you are given the information about the results of an online test done by a user. \
                You are given the name and description of the test, as well as the questions that the user has failed, delimited by triple backticks. failed questions: ```{failed_questions}```.\
                You must generate keywords that describe as precisely as possible the topics that the user has to work on in order to get a better mark on the test. \
                It is VERY IMPORTANT that the first generated keyword describes the main topic of the test. Then you must generate one keyword per failed question, so that the keyword describes the topic of the question.\
                You must write all keywords in one single line. Here are the name and description of the test in JSON format:"""

# Same input element as before (index 0); raises IndexError if the list is empty.
input_element = input_elements[0]
element_info = get_element_info(input_element)

# One chat request using the alternative prompt.
keywords_2 = get_key_words(element_info, system_prompt_2)

print("KEYWORDS 2: ", keywords_2)

KEYWORDS 2:  Machine Learning, Stability criterion, Normalization in R, Performance evaluation, Supervised learning problems, Unstructured learning model, Data analysis, Purpose of Supervised Learning, TensorFlow, Continuous variable model, Missing numeric vectors, Reinforcement learning


## Get Target Element Keywords

### Load Target Elements

In [14]:
# Load the pickled list of target elements; stay with an empty list on failure.
target_elements = []
try:
    with open(source_elements_file, 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        target_elements = pickle.load(handle)
except FileNotFoundError:
    print("No target file found")
except Exception as err:
    # Still best-effort, but report the real cause (corrupt pickle, missing
    # config name, permissions, ...) instead of claiming the file is missing.
    # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
    print("Could not load target elements: ", err)

print("TARGET ELEMENTS: ")
for target_element in target_elements:
    print(target_element)

TARGET ELEMENTS: 
{'id': 1, 'name': 'Data Analytics', 'content': 'Data Scientists try to make sense of the data that’s all around us. Taking a data science course can help you make informed decisions, create beautiful visualizations, and even try to predict future events through Machine Learning. If you’re curious about what you can learn about the world using the data produced every day, then data science might be for you!'}
{'id': 1, 'name': 'Databases', 'content': 'Learn database development from our best-in-class instructors. Our courses help you skill up in SQL, Python, NoSQL, Object relational mapping and more. Our authors show you how to gather the information needed, analyze the requirements, design a schema and implement the final solution.'}
{'id': 1, 'name': 'Cybersecurity', 'content': 'Today’s interconnected world makes everyone more susceptible to cyber-attacks. Whether you’re attracted to the relativity new world of cybersecurity as a professional, or just interested in p

### Define Prompt

In [80]:
# Kind of element the prompts talk about; interpolated into every prompt below.
element_type = "online course"

# Candidate prompts for extracting keywords from a target element.
# prompt_1, prompt_2, prompt_6 and prompt_7 are generic instructions meant to
# be followed by the element info. prompt_3, prompt_4 and prompt_5 hard-code
# the 'Machine Learning' course name and description inside the prompt text.
# The trailing backslashes continue each literal, so the indentation of the
# continued lines is part of the string value.
prompt_1 = f"You are an NLP AI that aims to generate keywords that summarize texts.\
        In this case, you are given the name and description of an {element_type}.\
        You must generate 5 keywords that describe as precisely as possible the main topic of the {element_type}.\
        It is VERY IMPORTANT that the first generated keyword describes the main topic of the {element_type}.\
        It is VERY IMPORTANT that each keyword relates specifically to the main topic of the {element_type}.\
        You must write the 5 keywords in one single line."

prompt_2 = f"You are an NLP AI that aims to generate keywords that summarize texts.\
        In this case, you are given the name and description of an {element_type}.\
        You must extract the topics treated by the {element_type}.\
        Then, you must generate one keyword per topic extracted so that the keyword summarizes that specific topic.\
        You must write all the final keywords in one single line."

prompt_3 = f"Please generate a list of keywords that summarize the topics covered in the {element_type} named 'Machine Learning' with the following description:\
        'Demand for machine learning skills is growing fast. With Pluralsight, you’ll learn everything from the fundamentals to advanced topics like neural networks, natural language processing, and unsupervised learning models. Master machine learning with help from real-world experts.'\
        You must write all the keywords in one single line."

prompt_4 = f"Please extract key technical concepts and topics from the {element_type} named 'Machine Learning' with the following description:\
        'Demand for machine learning skills is growing fast. With Pluralsight, you’ll learn everything from the fundamentals to advanced topics like neural networks, natural language processing, and unsupervised learning models. Master machine learning with help from real-world experts.'\
        Provide a list of keywords that best represent the core subject matter covered in the course.\
        You must write all the keywords in one single line."

# Fixed the misspelled course name ("Machine Learing" -> "Machine Learning")
# so it matches the name used in prompt_3 and prompt_4.
prompt_5 = f'Given the {element_type} "Machine Learning" described as follows:\
        "Demand for machine learning skills is growing fast. With Pluralsight, you’ll learn everything from the fundamentals to advanced topics like neural networks, natural language processing, and unsupervised learning models. Master machine learning with help from real-world experts.",\
        please identify and list the technical terms, concepts, and specific topics that are central to the content of the {element_type}.'

prompt_6 = f"You are an NLP AI that aims to generate keywords that summarize texts.\
        In this case, you are given the name and description of an {element_type}.\
        You must extract the key technical concepts and topics from the {element_type}.\
        Then, you must generate one keyword per topic extracted so that the keyword summarizes that specific topic.\
        All final keywords must be central to the content of the {element_type}.\
        You must write all the final keywords in one single line."

prompt_7 = f"You are an NLP AI that aims to generate keywords that summarize texts.\
        In this case, you are given the name and description of an {element_type}.\
        You must generate 5 keywords that summarize the key technical concepts and topics from the {element_type}.\
        It is VERY IMPORTANT that the first generated keyword describes the main topic of the {element_type}.\
        It is VERY IMPORTANT that all keywords are relevant to the main topic of the {element_type}.\
        You must write all the final keywords in one single line."


### Get Target Element Keywords

In [84]:
# Hard-coded pick of the third target element (index 2); raises IndexError
# when fewer than three target elements were loaded.
target_element = target_elements[2]

element_info = get_element_info(target_element)

# One chat request using prompt_7 (the 5-keyword technical-concepts prompt).
keywords_target = get_key_words(element_info, prompt_7)

print("TARGET ELEMENT: ", target_element["name"])
print("TARGET KEYWORDS: ", keywords_target)

TARGET ELEMENT:  Cybersecurity
TARGET KEYWORDS:  Cybersecurity, cyber trends, threats, personal privacy, challenges


## Calculate Distance From Input To Target

In [86]:
# Hand-written keyword lines: keywords_0 is the reference, keywords_1 and
# keywords_2 are the two candidates measured against it.
keywords_0 = "Machine Learning, neural networks, natural language processing, unsupervised learning models."
keywords_1 = "Machine Learning, Supervised learning, Stability criterion, Normalization in R"
keywords_2 = "Machine Learning, Stability criterion, Normalization in R, Performance evaluation, Supervised learning problems, Unstructured learning model, Data analysis, Purpose of Supervised Learning, TensorFlow, Continuous variable model, Missing numeric vectors, Reinforcement learning"

# Embed the three lines in order, reference first.
embs_0, embs_1, embs_2 = [
    embedding_from_string(keyword_line)
    for keyword_line in (keywords_0, keywords_1, keywords_2)
]

# Cosine distance from the reference embedding to each candidate, in order.
distances = distances_from_embeddings(
    embs_0,
    [embs_1, embs_2],
    distance_metric="cosine",
)
print(distances)

[0.16708727458543138, 0.12897739088965376]
