In [1]:
from few_shot_clustering.wrappers import LLMPairwiseClustering

from few_shot_clustering.dataloaders import load_clinc

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
cache_path = "./tmp/clinc_feature_cache.pkl"
features, labels, documents = load_clinc(cache_path)

prompt_suffix = "express the same general intent?"
text_type = "Utterance"

prompt = """You are tasked with clustering queries for a task-oriented dialog system based on whether they express the same general user intent. To do this, you will be given pairs of user queries and asked if they express the same general user need or intent.

Your task will be considered successful if the queries are clustered into groups that consistently express the same general intent.

Utterance #1: what's the spanish word for pasta
Utterance #2: how would they say butter in zambia

Given this context, do utterance #1 and utterance #2 likely express the same general intent? Yes

Utterance #1: roll those dice once
Utterance #2: can you roll an eight sided die and tell me what it comes up as

Given this context, do utterance #1 and utterance #2 likely express the same general intent? No

Utterance #1: how soon milk expires
Utterance #2: can you roll an eight sided die and tell me what it comes up as

Given this context, do utterance #1 and utterance #2 likely express the same general intent? Yes

Utterance #1: nice seeing you bye
Utterance #2: what was the date of my last car appointment

Given this context, do utterance #1 and utterance #2 likely express the same general intent? No"""


In [16]:
features = features[30:50]
labels = labels[:10]
documents = documents[:10]

In [20]:
labels[60:90]

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2]

In [13]:
import random
 

# Number of items to sample
sample_size = 100

# Take a sample from the original list
labels = random.sample(labels, sample_size)

# Sample indexes
indexes = random.sample(range(len(labels)), sample_size)

# Sample elements from each list using the sampled indexes
labels = [labels[i] for i in indexes]
features = [features[i] for i in indexes]
documents = [documents[i] for i in indexes]


In [4]:
features = features[:10]
labels = labels[:10]
documents = documents[:10]
cluster_assignments, constraints = LLMPairwiseClustering(features, documents, len(set(labels)) , prompt, text_type, prompt_suffix, max_feedback_given=10000, pckmeans_w=0.01, cache_file="tmp/clinc_cache_file.json", constraint_selection_algorithm="SimilarityFinder", kmeans_init="k-means++")

Collecting Constraints
9
PROMPT:
You are tasked with clustering queries for a task-oriented dialog system based on whether they express the same general user intent. To do this, you will be given pairs of user queries and asked if they express the same general user need or intent.

Your task will be considered successful if the queries are clustered into groups that consistently express the same general intent.

Utterance #1: what's the spanish word for pasta
Utterance #2: how would they say butter in zambia

Given this context, do utterance #1 and utterance #2 likely express the same general intent? Yes

Utterance #1: roll those dice once
Utterance #2: can you roll an eight sided die and tell me what it comes up as

Given this context, do utterance #1 and utterance #2 likely express the same general intent? No

Utterance #1: how soon milk expires
Utterance #2: can you roll an eight sided die and tell me what it comes up as

Given this context, do utterance #1 and utterance #2 likely e

ValueError: attempt to get argmin of an empty sequence

In [20]:
from few_shot_clustering.eval_utils import cluster_acc
import numpy as np

In [None]:

print(f"Accuracy: {cluster_acc(np.array(cluster_assignments), np.array(labels))}")

In [9]:
import os
from openai import OpenAI
client = OpenAI()
OpenAI.api_key = os.getenv('OPENAI_API_KEY')

completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
  ]
)

print(completion.choices[0].message.content)
message = completion.choices[0]

Hello! How can I assist you today?


In [7]:
x = "a"
len(x)

1

In [5]:
def some_function():
    x = 5
    y = 10
    z = x + y
    breakpoint()  # Debugger will be invoked here
    print(z)

some_function()

15


In [23]:
from few_shot_clustering.wrappers import LLMKeyphraseClustering
from InstructorEmbedding import INSTRUCTOR

from few_shot_clustering.dataloaders import load_clinc

# You can provide an optional file to cache the extracted features, 
# since these are a bit expensive to compute. Example:
# cache_path = "/tmp/clinc_feature_cache.pkl"
#
# This is not necessary, as shown below.


prompt = """I am trying to cluster task-oriented dialog system queries based on whether they express the same general user intent. To help me with this, for a given user query, provide a comprehensive set of keyphrases that could describe this query's intent. These keyphrases should be distinct from those that might describe queries with different intents. Generate the set of keyphrases as a JSON-formatted list.

Query: "how would you say fly in italian"

Keyphrases: ["translation", "translate"]

Query: "what does assiduous mean"

Keyphrases: ["definition", "define"]

Query: "find my cellphone for me!"

Keyphrases: ["location", "find", "locate", "tracking", "track"]"""

cache_path = "./tmp/clinc_feature_cache.pkl"
features, labels, documents = load_clinc(cache_path)

prompt_suffix = "express the same general intent?"
text_type = "Query"
encoder_model = INSTRUCTOR('hkunlp/instructor-large')

features = features[:100]
labels = labels[:100]
documents = documents[:100]

cluster_assignments = LLMKeyphraseClustering(features, documents, 150, prompt, text_type, encoder_model=encoder_model, prompt_for_encoder="Represent keyphrases for topic classification:", cache_file="./tmp/clinc_expansion_cache_file.json")

from few_shot_clustering.eval_utils import cluster_acc
import numpy as np
print(f"Accuracy: {cluster_acc(np.array(cluster_assignments), np.array(labels))}")



load INSTRUCTOR_Transformer
max_seq_length  512


0it [00:00, ?it/s]

PROMPT:
I am trying to cluster task-oriented dialog system queries based on whether they express the same general user intent. To help me with this, for a given user query, provide a comprehensive set of keyphrases that could describe this query's intent. These keyphrases should be distinct from those that might describe queries with different intents. Generate the set of keyphrases as a JSON-formatted list.

Query: "how would you say fly in italian"

Keyphrases: ["translation", "translate"]

Query: "what does assiduous mean"

Keyphrases: ["definition", "define"]

Query: "find my cellphone for me!"

Keyphrases: ["location", "find", "locate", "tracking", "track"]

Query: "how would you say fly in italian"

Keyphrases:


0it [2:12:43, ?it/s]


KeyboardInterrupt: 

In [12]:
import os
import pandas as pd
os.getcwd()

'C:\\Users\\Raouf\\Documents\\Studies\\M2 MLSD\\ppd\\code'

In [16]:

data = pd.read_json("../code/datasets/Tweets.txt", lines=True)
data

Unnamed: 0,text,cluster
0,brain fluid buildup delay giffords rehab,37
1,trailer talk week movie rite mechanic week opp...,14
2,rnc appoints chairman tampa convention effort ...,100
3,gbagbo camp futile cut ivory coast economy,110
4,chinese president lost translation powerful le...,61
...,...,...
2467,live moscow airport explosion,36
2468,supreme court refuse reinstate circuit global ...,89
2469,yemeni protester urged president ali abdullah ...,79
2470,indian navy coast guard rescue thai vessel pir...,107


In [10]:
import pickle

# Unpickle the list from the file
with open('../cluster-seed-words/clinc_clusters_seed_words.pkl', 'rb') as f:
    my_list = pickle.load(f)

print(my_list)  # Output: [1, 2, 3, 4, 5]

['Language Translation, Language Learning, Multilingual Communication', 'Financial, Banking, Transaction', 'Time Management,Productivity,Task Management', 'Definition, Define, Meaning', 'Philosophy,Existential,Meaning', 'Insurance, Policy, Update', 'Location Tracking, Lost Items, Assistance', 'Travel Advisory, Safety Information, Security Concerns', 'Vacation Request, Time Off, Assistance', 'Credit Score Improvement, Financial Impact, Creditworthiness', 'Information, Facts, Interesting', 'Language, Communication, Speak', 'Time Management,Payment Schedule, Salary and Paycheck', 'Delivery, Customer Service, Timeline', 'Time, Clock, Time Zone', 'Application Status, Update, Credit Card', 'Flight Status, Flight Update, Airlines', 'Random, Decision Making, Coin Flip', 'Name Change, Contact, Personalization', 'Origin, Location, Source', 'List Management, Shopping, Grocery', 'Assistance,Support,Capabilities', 'Decision Making,Uncertainty,Communications', 'Car Maintenance, Oil Change, Step-by-s

In [36]:
from gensim.models import KeyedVectors

# Load the pre-trained Word2Vec model
word2vec_path = './w2v-model/GoogleNews-vectors-negative300.bin.gz'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# List of keyphrases
og_keyphrases = ["Learning", "Deep Learning", "Neural Networks", "Natural Language Processing",
              "Computer Vision", "Reinforcement Learning", "Data Science", "Artificial Neural Networks",
              "Image Recognition", "Speech Recognition", "Chatbots", "Autonomous Vehicles", "Robotics",
              "Predictive Analytics", "Expert Systems"]
keyphrases = []
#keyphrases = [keyphrase.split(" ") if " " in keyphrase else keyphrase for keyphrase in og_keyphrases]
for keyphrase in og_keyphrases:
    if " " in keyphrase: 
        for k in keyphrase.split(" "):
            keyphrases.append(k)
    else:
        keyphrases.append(keyphrase)

# Get the vectors for each keyphrase
keyphrase_vectors = [word2vec_model[keyphrase] for keyphrase in keyphrases]

# Calculate the mean vector of all keyphrase vectors
mean_vector = sum(keyphrase_vectors) / len(keyphrase_vectors)

# Find the most similar keyphrases to the mean vector
similar_keyphrases = word2vec_model.similar_by_vector(mean_vector, topn=3)

# Extract the similar keyphrases
seed_words = [keyphrase for keyphrase, _ in similar_keyphrases]

print("Seed words for the cluster of keyphrases:")
print(seed_words)


KeyError: "Key 'Chatbots' not present"

In [1]:
from gensim.models import KeyedVectors
import numpy as np

# Load the pre-trained Word2Vec model
word2vec_path = './w2v-model/GoogleNews-vectors-negative300.bin.gz'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# List of keyphrases
keyphrases = ["Climate Change", "Global Warming", "Greenhouse Gas Emissions", "Renewable Energy", "Carbon Footprint", "Extreme Weather Events", "Sea Level Rise", "Deforestation", "Melting Ice Caps", "Sustainable Development", "Climate Action", "Mitigation Strategies", "Biodiversity Loss", "Ocean Acidification", "Paris Agreement"]


# Initialize a list to store keyphrase vectors
keyphrase_vectors = []

# Iterate through each keyphrase
for keyphrase in keyphrases:
    words = keyphrase.split()  # Split keyphrase into individual words
    valid_words = [word for word in words if word in word2vec_model]  # Filter out words not in vocabulary
    if valid_words:
        # Calculate the average vector for valid words
        keyphrase_vector = np.mean([word2vec_model[word] for word in valid_words], axis=0)
        keyphrase_vectors.append(keyphrase_vector)

# Calculate the mean vector of all valid keyphrase vectors
if keyphrase_vectors:
    mean_vector = sum(keyphrase_vectors) / len(keyphrase_vectors)

    # Find the most similar words to the mean vector
    similar_words = word2vec_model.similar_by_vector(mean_vector, topn=3)

    # Extract the similar words
    seed_words = [word for word, _ in similar_words]

    print("Seed words for the cluster of keyphrases:")
    print(seed_words)
else:
    print("No valid keyphrase vectors found. Unable to calculate seed words.")


Seed words for the cluster of keyphrases:
['Imaging_Techniques', 'Neural_Circuits', 'Single_Molecule']
