### 사용자 의도 파악 (Identifying user intent)

In [38]:
import getpass
import os

import openai
from openai import OpenAI
from sklearn.cluster import KMeans

# initialize openai
# Use the key already present in the environment and prompt only when it is
# missing or empty. Assigning a literal here would overwrite a real key
# exported in the shell and would store a secret in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [39]:
# Example utterances for the "politics" intent.
# NOTE(review): "presedent" in the second sentence is misspelled; it is kept
# verbatim because the string is data that gets embedded.
politics = [
    "What are the key policies of the main political parties in the upcoming election?",
    "Who do you vote for the next presedent?",
    "I love the current Democratic Party.",
    "What is your opinion on the president's current political move?",
    "I love politics. Don't you?",
]

# Example utterances for the "machine learning" intent.
ml = [
    "How does supervised learning differ from unsupervised learning in machine learning models?",
    "What are the ethical considerations of using machine learning in predictive policing?",
    "How do neural networks mimic the human brain in processing data and recognizing patterns?",
    "What are some examples of natural language processing?",
    "Can you describe how machine learning is being utilized in personalized medicine and healthcare?",
]

In [40]:
def create_embeddings(txt_list, model="text-embedding-3-small", client=None):
    """Embed one or more texts through the OpenAI embeddings endpoint.

    Args:
        txt_list: Texts to embed. A single string is also accepted and is
            treated as a one-element list.
        model: Name of the embedding model to request.
        client: Optional object exposing ``embeddings.create(input=, model=)``.
            When omitted, a new ``OpenAI()`` client is created, as before.

    Returns:
        A list holding one embedding per input text, taken from
        ``response.data`` in the order the response provides them. An empty
        input yields an empty list without contacting the API.
    """
    texts = [txt_list] if isinstance(txt_list, str) else list(txt_list)
    if not texts:
        # Nothing to embed, so skip the network round trip entirely.
        return []

    if client is None:
        client = OpenAI()

    response = client.embeddings.create(input=texts, model=model)
    return [item.embedding for item in response.data]

In [41]:
# Combine both intents into one corpus, politics sentences first, then ML.
# Despite its name, `embeddings` holds the raw sentences; the vectors are in `emb`.
embeddings = [*politics, *ml]
emb = create_embeddings(embeddings)

In [42]:
# One cluster per intent (politics, machine learning).
n_clusters = 2
# Pin the seed and the number of restarts so a fresh "Run All" produces the
# same clustering on every run and across scikit-learn versions (the n_init
# default changed between releases). Cluster ids are arbitrary labels: which
# integer ends up meaning "politics" can differ from previously saved outputs.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(emb)

In [43]:
# Show the cluster id assigned to each embedded sentence, in the order of `emb`.
clusters

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0], dtype=int32)

In [44]:
# Embed a new politics-flavoured query; the sentence is wrapped in a list
# because create_embeddings is called with a list and returns a list.
input_sentence = "I would like to have a talk about politics."
sent_emb = create_embeddings([input_sentence])

In [45]:
# Assign the query embedding to one of the fitted clusters.
kmeans.predict(sent_emb)

array([1], dtype=int32)

In [46]:
# Embed a machine-learning-flavoured query (overwrites the previous
# `input_sentence` / `sent_emb`).
input_sentence = "Tell me about machine learning."
sent_emb = create_embeddings([input_sentence])

In [47]:
# Assign the query embedding to one of the fitted clusters.
kmeans.predict(sent_emb)

array([0], dtype=int32)

### Similarity search

In [48]:
# Embed each intent's example sentences separately (politics first, then ML)
# so a query can be compared against one intent at a time.
politics_emb, ml_emb = (create_embeddings(group) for group in (politics, ml))

In [49]:
from numpy.linalg import norm
import numpy as np

def cosine_similarity(vector_a, vector_b):
    """Calculate the cosine similarity between two vectors.

    Args:
        vector_a: First vector (any sequence accepted by ``np.dot``).
        vector_b: Second vector, same length as ``vector_a``.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        If either vector has zero norm the similarity is undefined; 0.0 is
        returned instead of ``nan`` so that threshold comparisons stay
        well-behaved and no divide-by-zero warning is raised.
    """
    norm_product = norm(vector_a) * norm(vector_b)
    if norm_product == 0:
        return 0.0
    return np.dot(vector_a, vector_b) / norm_product

def route_selection(emb_list, query_emb, threshold=0.5):
    """Decide whether a query matches a route.

    Computes the cosine similarity between ``query_emb`` and every embedding
    in ``emb_list`` and returns True when at least one similarity is strictly
    greater than ``threshold``; otherwise (including an empty ``emb_list``)
    returns False.
    """
    scores = [cosine_similarity(candidate, query_emb) for candidate in emb_list]
    return any(score > threshold for score in scores)

In [50]:
# Route a politics-flavoured query against both intents using the default threshold.
input_sentence = "I would like to have a talk about politics."
sent_emb = create_embeddings([input_sentence])

query_emb = sent_emb[0]  # create_embeddings returns a list; take the only vector
is_politics = route_selection(politics_emb, query_emb)
is_ml = route_selection(ml_emb, query_emb)
print("{} for politics, {} for machine learning".format(is_politics, is_ml))

True for politics, False for machine learning


In [51]:
# Route an off-topic query against both intents using the default threshold.
input_sentence = "How is the weather today?"
sent_emb = create_embeddings([input_sentence])

query_emb = sent_emb[0]  # create_embeddings returns a list; take the only vector
is_politics = route_selection(politics_emb, query_emb)
is_ml = route_selection(ml_emb, query_emb)
print("{} for politics, {} for machine learning".format(is_politics, is_ml))

False for politics, False for machine learning


In [52]:
# Route a machine-learning query against both intents.
# NOTE(review): the ML route is checked with threshold=0.4 while the politics
# route keeps the default 0.5, so the two results are not directly comparable.
input_sentence = "What is the best way to learn machine learning?"
sent_emb = create_embeddings([input_sentence])

query_emb = sent_emb[0]  # create_embeddings returns a list; take the only vector
is_politics = route_selection(politics_emb, query_emb)
is_ml = route_selection(ml_emb, query_emb, threshold=0.4)
print("{} for politics, {} for machine learning".format(is_politics, is_ml))

False for politics, True for machine learning
