# Document Clustering

![alt text](cluster.jpg "Intro")

In [4]:
!conda install hdbscan


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import os

import hdbscan
import requests

import numpy as np
import pandas as pd
import plotly.express as px

from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
#from sklearn.manifold import TSNE
from dotenv import load_dotenv

load_dotenv()
import pickle


## Get 200 news articles from Hacker News 

In [2]:
response = requests.get('https://hacker-news.firebaseio.com/v0/topstories.json')
article_ids = response.json()

recent_articles = []
for id in article_ids:
    article_response = requests.get(f'https://hacker-news.firebaseio.com/v0/item/{id}.json')
    recent_articles.append(article_response.json())

docs = [article['title'] for article in recent_articles if len(article["title"]) > 20]

KeyboardInterrupt: 

In [None]:
with open('hackernews.pkl', 'wb') as f:
    pickle.dump(docs, f)

In [7]:
with open('hackernews.pkl', 'rb') as f:
    docs = pickle.load(f)

In [8]:
len(docs)

480

In [13]:
docs[3]

"Fine, I'll run a regression analysis but it won't make you happy"

## Generate embeddings from articles

In [14]:
embeddings = OpenAIEmbeddings(chunk_size=1000, openai_api_key="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").embed_documents(docs)

In [16]:
len(embeddings)

480

## Cluster documents, plot results, and store them in a dataframe

In [44]:
hdb = hdbscan.HDBSCAN(gen_min_span_tree=True, min_samples=3, min_cluster_size=4).fit(embeddings)

In [45]:
#!pip install bhtsne

In [46]:
from bhtsne import tsne

In [47]:
#tsne = tsne(n_components=2, random_state=0)

df_tsne = (
    pd.DataFrame(tsne(np.array(embeddings)), columns=['x', 'y'])
    .assign(cluster=lambda df: hdb.labels_.astype(str))
    .query('cluster != "-1"')
    .sort_values(by='cluster')
)

fig = px.scatter(df_tsne, x='x', y='y', color='cluster')
fig.show()

In [55]:
df = pd.DataFrame({
    "title": docs,
    "cluster": hdb.labels_,
})
df = df.query("cluster != -1")
df[df["cluster"] == 9]

Unnamed: 0,title,cluster
26,Decentralized Artificial Intelligence,9
35,National Security Agency is starting an artifi...,9
149,AnyMAL: An Efficient and Scalable Any-Modality...,9
199,Farm robots inspired by ant brains,9
214,"Conversation with Zuckerberg, this time we tal...",9
334,Don’t Blame AI. Plagiarism Is Turning Digital ...,9
387,'Counterfeit people': The danger posed by Meta...,9
424,"Behind the AI boom, an army of overseas worker...",9
451,OpenAI and Jony Ive in talks to raise $1B from...,9
455,Rethinking Autonomous Driving with Large Langu...,9


## Create cluster topics from documents in each cluster

In [59]:
def get_prompt():
    system_template = "You're an expert tech journalist. You're helping me write short (4 or words max) but compelling topic title for groups of news articles."
    human_template = "Using the following articles, write a topic title that summarizes them.\n\nARTICLES:{articles}\n\nTOPIC TITLE:"

    return ChatPromptTemplate(
        messages=[
            SystemMessagePromptTemplate.from_template(system_template),
            HumanMessagePromptTemplate.from_template(human_template),
        ],
        input_variables=["articles"],
    )


articles_str = "\n\n".join(docs) 

prompt = get_prompt()

for c in df.cluster.unique():
    chain = LLMChain(
        llm=ChatOpenAI(openai_api_key="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",temperature=0, model_name="gpt-4"), prompt=prompt, verbose=False
    )
    articles_str = "\n".join(
        [
            f"{article['title']}\n"
            for article in df.query(f"cluster == {c}").to_dict(orient="records")
        ]
    )
    result = chain.run(
        {
            "articles": articles_str,
        }
    )
    df.loc[df.cluster == c, "topic_title"] = result

In [52]:
df["topic_title"].value_counts()

"Hacker News: AI, Tech Innovations & Discussions"                23
"AI Revolution: Opportunities, Challenges, and Controversies"    11
"Reviving Retro Tech: Raspberry Pi & Gaming"                      9
"Advancements & Controversies in Physics"                         8
"Digital Privacy: Global Controversies"                           6
"Google Jamboard: 2024 Shutdown"                                  6
"Rust Language: Rising & Revolutionizing"                         5
"Apple's Bing Acquisition Saga"                                   5
"Imminent Discovery: Alien Life"                                  4
"Climate Crisis: Water Woes"                                      4
"Web Browsers: Privacy & Predicaments"                            3
Name: topic_title, dtype: int64

In [60]:
df[df["cluster"]==9]

Unnamed: 0,title,cluster,topic_title
26,Decentralized Artificial Intelligence,9,"""AI Revolution: Opportunities, Challenges, and..."
35,National Security Agency is starting an artifi...,9,"""AI Revolution: Opportunities, Challenges, and..."
149,AnyMAL: An Efficient and Scalable Any-Modality...,9,"""AI Revolution: Opportunities, Challenges, and..."
199,Farm robots inspired by ant brains,9,"""AI Revolution: Opportunities, Challenges, and..."
214,"Conversation with Zuckerberg, this time we tal...",9,"""AI Revolution: Opportunities, Challenges, and..."
334,Don’t Blame AI. Plagiarism Is Turning Digital ...,9,"""AI Revolution: Opportunities, Challenges, and..."
387,'Counterfeit people': The danger posed by Meta...,9,"""AI Revolution: Opportunities, Challenges, and..."
424,"Behind the AI boom, an army of overseas worker...",9,"""AI Revolution: Opportunities, Challenges, and..."
451,OpenAI and Jony Ive in talks to raise $1B from...,9,"""AI Revolution: Opportunities, Challenges, and..."
455,Rethinking Autonomous Driving with Large Langu...,9,"""AI Revolution: Opportunities, Challenges, and..."


In [56]:
c = 9
with pd.option_context("display.max_colwidth", None):
    print(df.query(f"cluster == {c}").topic_title.values[0])
    display(df.query(f"cluster == {c}").head())

AttributeError: 'DataFrame' object has no attribute 'topic_title'