In [1]:
# Importing required libraries
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
import re

In [2]:
# Function for text preprocessing
def preprocess_text(text):
    """Normalize raw text before TF-IDF vectorization.

    The steps run in this order: lowercase the text, replace every non-word
    character with a space, drop stand-alone single ASCII letters, and
    collapse each run of whitespace into one space.

    Args:
        text: Raw input string.

    Returns:
        The normalized string. Leading/trailing whitespace is collapsed to a
        single space but is not stripped.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'\W', ' ', text)
    # Remove single characters. Lookarounds are used instead of matching the
    # surrounding whitespace so that the whitespace is not consumed: this lets
    # adjacent single letters ("s a" left over from "it's a") and single
    # letters at the start/end of the string all be removed.
    text = re.sub(r'(?<!\S)[a-z](?!\S)', ' ', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    return text

In [3]:
# Defining the documents
# D1-D4 are the four passages that make up the corpus to be clustered.
# Each one is written as several adjacent string literals inside parentheses,
# which Python joins into a single string; every fragment except the last ends
# with a trailing space so the sentences stay separated after joining.
D1 = ("Since OpenAI released its blockbuster bot ChatGPT in November, users have casually experimented with the tool, "
      "with even Insider reporters trying to simulate news stories or message potential dates. To older millennials who grew up with IRC chat rooms — a text instant message system — the personal tone of conversations with the bot can evoke the experience of chatting online. "
      "But ChatGPT, the latest in technology known as 'large language model tools,' doesn't speak with sentience and doesn't 'think' the way people do.")

D2 = ("Other tech companies like Google and Meta have developed their own large language model tools, which use programs that take in human prompts and devise sophisticated responses. "
      "OpenAI, in a revolutionary move, also created a user interface that is letting the general public experiment with it directly. Some recent efforts to use chat bots for real-world services have proved troubling — with odd results. "
      "The mental health company Koko came under fire this month after its founder wrote about how the company used GPT-3 in an experiment to reply to users.")

D3 = ("The founder of the controversial DoNotPay service, which claims its GPT-3-driven chat bot helps users resolve customer service disputes, also said an AI 'lawyer' would advise defendants in actual courtroom traffic cases in real time, though he later walked that back over concerns about its risks. "
      "Chat GPT is an AI Chatbot developed by Open AI. The chatbot has a language-based model that the developer fine-tunes for human interaction in a conversational manner. Effectively it’s a simulated chatbot primarily designed for customer service; people use it for various other purposes too though. "
      "These range from writing essays to drafting business plans, to generating code. But what is it and what can it really do?")

D4 = ("Chat GPT is an AI chatbot auto-generative system created by Open AI for online customer care. It is a pre-trained generative chat, which makes use of (NLP) Natural Language Processing. "
      "The source of its data is textbooks, websites, and various articles, which it uses to model its own language for responding to human interaction. "
      "The main feature of Chat GPT is generating responses like those humans would provide, in a text box. Therefore, it is suitable for chatbots, AI system conversations, and virtual assistants. However, it can also give natural answers to questions in a conversational tone and can generate stories, poems, and more. "
      "Moreover, it can: Write code, Write an article or blog post, Translate, Debug, Write a story/poem, Recommend chords and lyrics")

# Create a list of documents
# The list order (D1, D2, D3, D4) fixes the row order of anything computed
# per document from this list.
documents = [D1, D2, D3, D4]

In [4]:
# Normalize every document with the shared text-cleaning function
documents = list(map(preprocess_text, documents))


In [5]:
# Define the search query and clean it the same way as the documents
query = preprocess_text("OpenAI chatbot ChatGPT")

In [6]:
# Build the TF-IDF vectorizer, configured with the "english" stop-word setting
vectorizer = TfidfVectorizer(
    stop_words="english",
)

In [7]:
# Vectorize the documents and the query.
# The vectorizer is fitted on the documents only; the query is then projected
# into that same feature space with transform().
X = vectorizer.fit_transform(documents)
query_vector = vectorizer.transform([query])

# Determine the optimal number of clusters using silhouette analysis.
# silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1, so the
# candidate range is derived from the corpus size instead of being hard-coded
# (for the four documents above this is still 2 and 3).
n_documents = X.shape[0]
best_n_clusters = 2
best_silhouette_score = -1

for n_clusters in range(2, n_documents):  # Testing different cluster sizes
    model = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
    model.fit(X)
    cluster_labels = model.labels_
    silhouette_avg = silhouette_score(X, cluster_labels)

    # Strict '>' keeps the smallest cluster count when scores tie
    if silhouette_avg > best_silhouette_score:
        best_silhouette_score = silhouette_avg
        best_n_clusters = n_clusters

# Perform K-means clustering with the best number of clusters
# (this is a plain KMeans fit, not a bisecting variant)
model = KMeans(n_clusters=best_n_clusters, init="k-means++", random_state=42)
model.fit(X)


In [8]:
# Predict a cluster for the query; predict() returns one label per input row,
# and the query is the only row
predicted_clusters = model.predict(query_vector)
query_cluster = predicted_clusters[0]

In [9]:
# Keep only the documents whose cluster label equals the query's cluster
query_cluster_mask = model.labels_ == query_cluster
matching_documents = [
    document
    for document, in_query_cluster in zip(documents, query_cluster_mask)
    if in_query_cluster
]

In [10]:
# Show each retrieved document, numbered from 1
print("Retrieved documents:\n")
for rank, document in enumerate(matching_documents, start=1):
    print(f"{rank} => {document}\n")

Retrieved documents:

1 => the founder of the controversial donotpay service which claims its gpt 3 driven chat bot helps users resolve customer service disputes also said an ai lawyer would advise defendants in actual courtroom traffic cases in real time though he later walked that back over concerns about its risks chat gpt is an ai chatbot developed by open ai the chatbot has language based model that the developer fine tunes for human interaction in conversational manner effectively it a simulated chatbot primarily designed for customer service people use it for various other purposes too though these range from writing essays to drafting business plans to generating code but what is it and what can it really do 

2 => chat gpt is an ai chatbot auto generative system created by open ai for online customer care it is pre trained generative chat which makes use of nlp natural language processing the source of its data is textbooks websites and various articles which it uses to model 