In [1]:
import pandas as pd
import numpy as np
import chromadb

In [None]:
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# LangChain embedding wrapper handed to the Chroma vector store below.
# Uses the same model name ('all-mpnet-base-v2') as the SentenceTransformer
# loaded in the Embeddings section.
embedding_function = SentenceTransformerEmbeddings(model_name='all-mpnet-base-v2')

In [None]:
# Connect over HTTP to a Chroma server at localhost:8000
# (the server must already be running for the following cells to work).
client = chromadb.HttpClient(host='localhost',port=8000)

In [None]:
# LangChain view of the "RAG-Child" collection, backed by the HTTP client
# and embedding with the wrapper created above.
vectorstore = Chroma(
    collection_name="RAG-Child",
    embedding_function=embedding_function,
    client=client,
)

In [None]:
# Raw chromadb handle to the 'RAG-Child' collection; the cells below read
# its metadata and documents directly through this object.
collection = client.get_collection(name='RAG-Child')

### Preprocessing

In [None]:
# Pull the per-document metadata once and split it into two parallel lists.
# NOTE: `type` shadows the Python builtin; the name is kept because the
# DataFrame cell below reads it.
metadatas = collection.get()['metadatas']

type = [entry['Type'] for entry in metadatas]
doc_id = [entry['doc_id'] for entry in metadatas]

In [None]:
# Build the working frame from ONE collection.get() call so that every
# document is paired with the metadata returned by the same fetch.
# Constructing from a dict of columns also avoids routing the data through
# a single NumPy array, which would coerce all three columns to one
# fixed-width string dtype padded to the longest summary.
records = collection.get()
df = pd.DataFrame(
    {
        'doc_id': [meta['doc_id'] for meta in records['metadatas']],
        'type': [meta['Type'] for meta in records['metadatas']],
        'summary': records['documents'],
    }
)

In [None]:
import re
import nltk
from nltk.corpus import stopwords

In [None]:
def preprocess_text(text):
    """Normalize a text for embedding.

    Lowercases the input, drops every character that is not an ASCII letter
    or whitespace, and removes English stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stop words (the set is loaded once and reused across calls)
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)


# Filled on first use so the stop-word list is not re-read on every call.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the cached set of NLTK English stop words, loading it on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not installed yet: fetch it once, then retry.
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _STOP_WORDS_CACHE['english'] = frozenset(words)
    return _STOP_WORDS_CACHE['english']

In [None]:
# Clean every summary and keep the result next to the raw text.
# (The column name's spelling is kept as-is: later cells read it.)
preprocessed_texts = list(map(preprocess_text, df['summary']))
df['pre_proecss_summary'] = preprocessed_texts

In [None]:
# Preview the first two rows to check the raw vs. preprocessed summaries.
df.head(2)

### Embeddings

In [None]:
# Step 2: Load the model
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)

In [None]:
# Per-document 'type' column, kept as a separate Series.
labels = df['type']

In [None]:
# Encode the preprocessed summaries. A plain list is passed instead of the
# Series because encode() may index its input positionally, which only
# matches a Series with a default RangeIndex.
embeddings = model.encode(df.pre_proecss_summary.tolist())

In [None]:
# Inspect the dimensions of the embedding matrix
# (presumably one row per summary — confirm against len(df)).
embeddings.shape

In [None]:
# Normalize the embeddings to unit length
corpus_embeddings = embeddings /  np.linalg.norm(embeddings, axis=1, keepdims=True)

### PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Reduce to 50 dimensions. random_state is fixed so the projection is
# reproducible when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=50, random_state=42)
# NOTE: PCA.fit returns the fitted estimator itself, so at this point
# `reduced_embeddings` is the PCA object, not the projected data. The binding
# is kept because the next cell reads `explained_variance_ratio_` through it.
reduced_embeddings = pca.fit(corpus_embeddings)

In [None]:
# Total share of variance retained by the fitted components. Read from
# `pca` directly: `reduced_embeddings` is rebound to the projected array
# further down, so going through it fails when this cell is re-run.
pca.explained_variance_ratio_.sum()

In [None]:
# Project with the PCA that was already fitted above instead of fitting a
# second time, so the projection comes from the same fit whose explained
# variance was just reported.
reduced_embeddings = pca.transform(corpus_embeddings)

In [None]:
# Inspect the dimensions of the PCA-projected embeddings.
reduced_embeddings.shape

### K-means

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-means for every candidate K and record the inertia
# (sum of squared distances to the closest centroid).
range_n_clusters = range(2,15)
kmeans_settings = dict(
    init='random', # or 'k-means++'
    n_init=15,
    max_iter=500,
    random_state=0,
)

SSE = []
for nclust in range_n_clusters:
    km = KMeans(n_clusters=nclust, **kmeans_settings)
    km.fit(reduced_embeddings)
    SSE.append(km.inertia_)

# Plot SSE against K to look for the "elbow"
plt.plot(range_n_clusters, SSE, marker='o')
plt.xlabel('Number of clusters K')
plt.ylabel('Sum of Squared Distances (SSE)')
plt.show()

In [None]:
# Final clustering with the chosen number of clusters
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters, random_state=42)
# fit_predict fits the model and returns its labels_ in one step
cluster_assignments = clustering_model.fit_predict(reduced_embeddings)

In [None]:
# TSNE is not imported anywhere else in the notebook; without this import
# the cell raises NameError on a fresh kernel.
from sklearn.manifold import TSNE

# Project the PCA-reduced embeddings to 2-D for plotting. random_state is
# fixed because t-SNE is stochastic and the layout otherwise changes per run.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne = TSNE(n_components=2, perplexity=30, n_iter=300, random_state=42)
tsne_embeddings = tsne.fit_transform(reduced_embeddings)

In [None]:
# Scatter the 2-D t-SNE coordinates, one series per K-means cluster
plt.figure(figsize=(8, 6))
for i in range(num_clusters):
    in_cluster = cluster_assignments == i
    plt.scatter(
        tsne_embeddings[in_cluster, 0],
        tsne_embeddings[in_cluster, 1],
        label=f'Cluster {i}',
    )

plt.title('t-SNE Visualization of Clusters')
plt.legend()
plt.show()

In [None]:
# Step 7: Analyze and print the clustering results
clusters = [[] for _ in range(num_clusters)]
for text, cluster_id in zip(df.summary, cluster_assignments):
    clusters[cluster_id].append(text)