# Clusterization test in books
### Objective: Test the formation of clusters in bigger documents (books) and then ask a question to identify the best cluster.

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [46]:
import os
import pandas as pd
import openai
from dotenv import load_dotenv
import PyPDF2  # Import PyPDF2

# Load environment variables
load_dotenv()

def extract_text_from_pdf(pdf_path,num_paginas = 100):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        pages_text = []
        for page_num in range(min(num_paginas, len(reader.pages))):  # Extract from the first num_paginas pages or total pages if less than 100
            page = reader.pages[page_num]
            page_text = page.extract_text() if page.extract_text() else ''  # Extract text or return empty string if None
            pages_text.append(page_text)
    return pages_text

def split_into_chunks(pages_text, chunk_size):
    chunks = []  # To hold chunks of concatenated page texts
    page_ranges = []  # To hold corresponding page ranges for each chunk

    current_chunk = ''  # Accumulates text for the current chunk
    chunk_start_page = 0  # Tracks the start page index of the current chunk
    for i, page_text in enumerate(pages_text):
        # Check if the current page fits into the current chunk
        if len(current_chunk) + len(page_text) <= chunk_size:
            # Add the page text to the chunk
            current_chunk += page_text
            # If it's the last page, close off the current chunk
            if i == len(pages_text) - 1:
                chunks.append(current_chunk)
                page_ranges.append((chunk_start_page, i))
        else:
            # If the current chunk is not empty, save it before starting a new one
            if current_chunk:
                chunks.append(current_chunk)
                page_ranges.append((chunk_start_page, i - 1))
            # Start a new chunk with the current page
            # But if the page itself exceeds the chunk size, handle it specially
            if len(page_text) > chunk_size:
                # Break down the large page into subchunks, if necessary
                for start in range(0, len(page_text), chunk_size):
                    end = start + chunk_size
                    chunks.append(page_text[start:end])
                    page_ranges.append((i, i))  # This subchunk still corresponds to the current page
                current_chunk = ''  # Reset for the next chunk after handling large page
                chunk_start_page = i + 1  # Next chunk will start from the following page
            else:
                # Otherwise, just start the new chunk normally
                current_chunk = page_text
                chunk_start_page = i
            # Handle case where this is the last page and its text hasn't been added to chunks
            if i == len(pages_text) - 1 and len(page_text) <= chunk_size:
                chunks.append(current_chunk)
                page_ranges.append((chunk_start_page, i))

    return chunks, page_ranges




def generate_embeddings_openai(text, openai_api_key=os.getenv('OPEN_AI_API_KEY')):
    openai.api_key = openai_api_key
    try:
        response = openai.Embedding.create(
            input=text,  # Ensure 'input' is correct according to the OpenAI API documentation
            model="text-embedding-ada-002"
        )
        embedding_list = response['data'][0]['embedding']
        return embedding_list
    except Exception as e:
        print(f"An error occurred: {e}")
        return []  # Return an empty list if there's an error

# Function to create the dataset
def create_dataset(book_paths, chunk_size=1000,num_paginas = 100):
    rows = []
    
    for book_path in book_paths:
        print(f"Processing {book_path}")
        pages_text = extract_text_from_pdf(book_path,num_paginas)
        chunks, page_ranges = split_into_chunks(pages_text, chunk_size)
        
        for chunk, (start_page, end_page) in zip(chunks, page_ranges):
            embedding = generate_embeddings_openai(chunk)
            rows.append({
                'document_name': os.path.basename(book_path),
                'chunk_text': chunk,
                'init_page': start_page + 1,  # Adding 1 to make page numbers human-readable
                'end_page': end_page + 1,    # Adding 1 to make page numbers human-readable
                'embedding': embedding
            })
    
    return pd.DataFrame(rows)

# List of your book paths
book_paths = [
    'livros/Harry Potter e a Pedra Filosofal.pdf',
    'livros/Harry Potter e a Câmara Secreta.pdf',
    'livros/Harry Potter e o prisioneiro de Azkaban.pdf',
    'livros/harry potter e a pedra filosofal.pdf',
    'livros/Harry Potter e o Cálice de Fogo.pdf',
    'livros/Harry Potter e a Ordem da Fênix.pdf',
    'livros/Harry Potter e o Enigma do Príncipe.pdf',
    'livros/Harry Potter e as Relíquias da Morte.pdf'    
]




In [47]:
# Create the dataset
dataset = create_dataset(book_paths)

# Save to CSV
dataset.to_csv('hp_data_set_1000.csv', index=False)


Processing livros/Harry Potter e a Pedra Filosofal.pdf
Processing livros/Harry Potter e a Câmara Secreta.pdf
Processing livros/Harry Potter e o prisioneiro de Azkaban.pdf
Processing livros/harry potter e a pedra filosofal.pdf
Processing livros/Harry Potter e o Cálice de Fogo.pdf
Processing livros/Harry Potter e a Ordem da Fênix.pdf
Processing livros/Harry Potter e o Enigma do Príncipe.pdf
Processing livros/Harry Potter e as Relíquias da Morte.pdf


In [48]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go

# Load the dataset
def embbed_and_plot(df):
    df['embedding'] = df['embedding'].apply(lambda x: eval(x))  # Convert stringified lists back to Python lists
    df['first_15_words'] = df['chunk_text'].apply(lambda x: ' '.join(str(x).split()[:15]))

    # Convert list of embeddings into a 2D numpy array and scale
    X = np.array(df['embedding'].tolist())
    scaler = StandardScaler().fit(X)  # Fit scaler to the data
    X_scaled = scaler.transform(X)  # Apply scaling

    # Perform PCA to reduce the embeddings to 2 dimensions
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(X_scaled)  # Apply PCA to scaled data

    # Add the principal components back to the dataframe
    df['principal component 1'] = principalComponents[:, 0]
    df['principal component 2'] = principalComponents[:, 1]

    # Plot initialization
    fig = go.Figure()
    colors = {
        'Harry Potter e a Pedra Filosofal.pdf': 'red',
        'Harry Potter e a Câmara Secreta.pdf': 'green',
        'Harry Potter e o prisioneiro de Azkaban.pdf': 'blue',
        'Harry Potter e o Cálice de Fogo.pdf': 'yellow',
        'Harry Potter e a Ordem da Fênix.pdf': 'cyan',
        'Harry Potter e o Enigma do Príncipe.pdf': 'magenta',
        'Harry Potter e as Relíquias da Morte.pdf': 'grey'
    }
    centroids = {}

    # Add traces for each book
    for book, color in colors.items():
        book_df = df[df['document_name'] == book]
        fig.add_trace(go.Scatter(
            x=book_df['principal component 1'], 
            y=book_df['principal component 2'],
            text=book_df['first_15_words'],  # Display on hover
            mode='markers',
            marker_color=color,
            name=book
        ))
        book_df = df[df['document_name'] == book]
        centroid = book_df[['principal component 1', 'principal component 2']].mean().values
        centroids[book] = centroid  # Store the centroid
        fig.add_trace(go.Scatter(
            x=[centroid[0]], 
            y=[centroid[1]],
            text=[book],  # Book name for centroid hover
            mode='markers+text',
            marker_symbol='x',
            marker_size=12,
            marker_color='black',
            showlegend=False,
            textposition="top center"
        ))

    # Customize layout
    fig.update_layout(
        title='PCA of Book Embeddings with Centroids',
        xaxis_title='Principal Component 1',
        yaxis_title='Principal Component 2',
        legend_title='Book Name'
    )

    # Show plot
    fig.show()
    
    return pca,df,fig,centroids,scaler





In [49]:
pca,df,fig,centroids,scaler = embbed_and_plot(pd.read_csv('hp_data_set_1000.csv'))

# 2 - Query

In [50]:
import os 
import openai
def generate_embeddings_openai(text, openai_api_key=os.getenv('OPEN_AI_API_KEY')):
    openai.api_key = openai_api_key
    response = openai.Embedding.create(
        input=text,
        model="text-embedding-ada-002"
    )
    embedding = response['data'][0]['embedding']
    return np.array(embedding)

In [51]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go

def query_and_plot(pca,fig,scaler,query_text = "harry você é um bruxo"):
    query_embedding = generate_embeddings_openai(query_text)  # Get embedding using OpenAI
    query_embedding_scaled = scaler.transform(np.array(query_embedding).reshape(1, -1))  # Scale using the same scaler
    query_pca = pca.transform(query_embedding_scaled)  # Apply PCA

    # Add the query to the plot for visualization
    fig.add_trace(go.Scatter(
        x=[query_pca[0, 0]], 
        y=[query_pca[0, 1]],
        text=query_text,  # Display the actual query text
        mode='markers+text',
        marker_symbol='star',  # Change marker symbol for better visibility
        marker_size=20,  # Increase marker size
        marker_color="lime",  # Use a bright color for visibility
        showlegend=False,
        textposition="top center",  # Adjust text position for clarity
        textfont=dict(  # Customize font properties for better visibility
            family="Arial, sans-serif",
            size=12,  # Increase text size
            color="blue",  # Choose a color that contrasts well with the marker color
        ),
    ))
    fig.show()
    return query_embedding,query_pca
    



In [52]:
query_embedding,query_pca = query_and_plot(pca,fig,scaler)

In [53]:
# Assuming your query's PCA coordinates are stored in query_pca

# Function to calculate Euclidean distance
def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((np.array(point1) - np.array(point2)) ** 2))

def query_to_centroid_distances(query_pca, centroids):
    distances = {}
    for book, centroid in centroids.items():
        dist = euclidean_distance(query_pca, centroid)
        distances[book] = dist
    return distances




In [54]:

# Print the distances
distances = query_to_centroid_distances(query_pca, centroids)
for book, distance in sorted(distances.items(), key=lambda item: item[1]):
    print(f"Distance from query to centroid of '{book}': {distance}")

Distance from query to centroid of 'Harry Potter e o Enigma do Príncipe.pdf': 4.0465532563859545
Distance from query to centroid of 'Harry Potter e o prisioneiro de Azkaban.pdf': 4.792477223510781
Distance from query to centroid of 'Harry Potter e a Câmara Secreta.pdf': 5.138529289471549
Distance from query to centroid of 'Harry Potter e a Ordem da Fênix.pdf': 19.074458224588508
Distance from query to centroid of 'Harry Potter e a Pedra Filosofal.pdf': 20.13236855117465
Distance from query to centroid of 'Harry Potter e o Cálice de Fogo.pdf': nan
Distance from query to centroid of 'Harry Potter e as Relíquias da Morte.pdf': nan


# 3 - Similarity of chunk heuristic

Heuristic for finding chunks:

1. Rank the distance from the query to the centroids of the documents.

2. Use a distance threshold:

    2.a. If the distance threshold is exceeded, only use that centroid; 

    2.b. if it's lower, use all books within the threshold for comparison.

In [55]:
def find_closest_books(query_embedding, centroids, distance_threshold=30):
    distances = {}
    for book, centroid in centroids.items():
        dist = euclidean_distance(query_embedding, centroid)
        distances[book] = dist
    sorted_distances = sorted(distances.items(), key=lambda item: item[1])
    books_to_compare = [book for book, distance in sorted_distances if distance - sorted_distances[0][1] <= distance_threshold]
    return books_to_compare

# Find the closest books to the query


In [56]:
closest_books = find_closest_books(query_pca[0], centroids,30)
print(f"The closest books to the query are: {' ; '.join(closest_books)}")

The closest books to the query are: Harry Potter e o Enigma do Príncipe.pdf ; Harry Potter e o prisioneiro de Azkaban.pdf ; Harry Potter e a Câmara Secreta.pdf ; Harry Potter e a Ordem da Fênix.pdf ; Harry Potter e a Pedra Filosofal.pdf


In [57]:
from scipy.spatial.distance import cosine
import numpy as np

def compare_similarity(query_embedding, chunks, book_name):
    # Ensure the query embedding is a 1-D numpy array
    query_vec = np.array(query_embedding).flatten()
    
    # Dictionary to store similarities
    similarities = {}

    # Calculate cosine similarity for each chunk
    for index, row in chunks.iterrows():
        chunk_embedding = np.array(row['embedding']).flatten()  # Convert chunk embedding to 1-D numpy array
        # Compute cosine similarity (cosine returns the distance, so subtract from 1)
        cos_sim = 1 - cosine(query_vec, chunk_embedding)
        # Store the similarity along with chunk text and book name
        similarities[(row['chunk_text'], book_name)] = cos_sim

    return similarities


In [58]:
def find_best_chunk(query_embedding, closest_books, df):
    best_similarity = float('-inf')  # Initialize with the lowest similarity
    best_chunk_index = None  # To store the index of the best chunk

    for book in closest_books:
        book_chunks = df[df['document_name'] == book]
        # Calculate similarities for all chunks in the current book
        similarities = compare_similarity(query_embedding, book_chunks, book)
        
        # Find the best chunk for the current book
        for (chunk_text, _), cos_sim in similarities.items():
            if cos_sim > best_similarity:
                best_similarity = cos_sim
                # Find the index of the chunk with the best similarity
                best_chunk_index = book_chunks[book_chunks['chunk_text'] == chunk_text].index[0]

    # Retrieve the entire row for the best chunk using the index
    if best_chunk_index is not None:
        best_chunk_info = df.loc[best_chunk_index]
        return best_chunk_info, best_similarity
    else:
        return None, None  # In case no best chunk was found


In [59]:
best_chunk_info, best_similarity = find_best_chunk(query_embedding, closest_books, df)
clean_chunk_text = best_chunk_info['chunk_text'].replace('\t', ' ')

from termcolor import colored

header = colored(f"{'Best Chunk Info':^60}", 'blue', 'on_white', attrs=['bold'])  # Blue text on white background, bold
divider = colored('-' * 60, 'red')  # Red divider line

# Format each row with left alignment, ensuring each piece of info is neatly aligned, with different colors
similarity_row = colored(f"{'Best Similarity:':<20}", 'yellow') + str(best_similarity)
document_row = colored(f"{'Document Name:':<20}", 'green') + best_chunk_info['document_name']
text_row = colored(f"{'Chunk Text:':<20}", 'cyan') + clean_chunk_text + "..."
init_page_row = colored(f"{'Initial Page:':<20}", 'magenta') + str(best_chunk_info['init_page'])
end_page_row = colored(f"{'End Page:':<20}", 'magenta') + str(best_chunk_info['end_page'])

# Print each part of the table
print(header)
print(divider)
print(similarity_row)
print(document_row)
print(text_row)
print(init_page_row)
print(end_page_row)

#print(f"First 15 Words: {best_chunk_info['first_15_words']}")
#print(f"Principal Component 1: {best_chunk_info['principal component 1']:.6f}")
#print(f"Principal Component 2: {best_chunk_info['principal component 2']:.6f}")




[1m[107m[34m                      Best Chunk Info                       [0m
[31m------------------------------------------------------------[0m
[33mBest Similarity:    [0m0.8774808917628497
[32mDocument Name:      [0mHarry Potter e a Câmara Secreta.pdf
[36mChunk Text:         [0mbruxo ou bruxa? Conseguiria até se alegrar com a visão do seu arqui-inimigo, Draco
Malfoy, só para ter certeza de que tudo não passara de um sonho...
Não que o ano todo em Hogwarts tivesse sido uma brincadeira. No finzinho do
último trimestre, Harry se vira frente a frente com Lorde Voldemort em pessoa. O
bruxo poderia ser um destroço do que fora, mas ainda inspirava terror, ainda era
astuto, ainda estava decidido a retomar o poder. Harry escorregara por entre as
garras de Voldemort uma segunda vez, mas fora por um triz, e mesmo agora, semanas
depois, Harry continuava a acordar à noite, encharcado de suor frio, imaginando
onde estaria Voldemort neste momento, lembrando-se do seu rosto lívido, dos se

In [17]:
import os

import csv
import json
from openai import OpenAI
client = OpenAI(api_key=os.getenv('OPEN_AI_API_KEY'))

# Load your API key from an environment variable or directly insert it here (not recommended for security reasons)



# Read the description from a text file
with open('assistant_description.txt', 'r', encoding='utf-8') as file:
    description_text = file.read()
    
print(description_text)
print(len(description_text))

# Convert your TSV data to the required JSONL format for the OpenAI file upload
with open('e.tsv', 'r', encoding='utf-8') as tsvfile, open('e.jsonl', 'w', encoding='utf-8') as jsonlfile:
    reader = csv.DictReader(tsvfile, delimiter='\t')
    for row in reader:
        json_line = {"prompt": row["original_text"], "completion": row["strong_text"]}
        jsonlfile.write(json.dumps(json_line) + '\n')

# Upload the file to OpenAI
file = client.files.create(
  file=open("e.jsonl", "rb"),
  purpose='fine-tune'
)

# Create an assistant using the OpenAI API with the uploaded file
assistant = client.beta.assistants.create(
  name="Data visualizer",
  description=description_text,
  model="gpt-4-turbo-preview"
  #tools=[{"type": "retrieval"}],
  #file_ids=[file.id]
)

# Print the ID of the newly created assistant
print(assistant.id)


siga estas regras:
Apostos:Manter enumerativos e recapitulativos. Separar especificadores.
Para Oracões:
Relativas:Separar explicativas e restritivas.
Adverbiais:Separar causais, comparativas, concessivas, consecutivas e finais. Manter condicionais e proporcionais. Inverter conformativas. Separar temporais.
Coordenadas:Dividir assindéticas; fazer o mesmo para sindéticas conforme tipo.
Reduzidas:Manter infinitivos, gerúndios, particípios.
Voz Passiva: Converter para ativa.
Use e.jsonl para exemplos
502
asst_miJ3JEtjtUtHoWP7CYG07Fzm


In [21]:
thread = client.beta.threads.create()

In [22]:
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id,
  instructions="Crie um texto em portugues e resuma com o assistente"
)

In [51]:
import time
df = pd.read_csv('hp_data_set_1000.csv')
def simplify_text(text):
    # Initialize the assistant run (replace with your actual initialization code)
    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant.id,
        instructions="Simplifique esse texto usando o assistente"
        )
  
    # Check the run status and wait for completion
    while run.status in ['queued', 'in_progress', 'cancelling']:
        time.sleep(1)  # Wait for 1 second
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id
        )
  
    # If completed, retrieve the messages
    if run.status == 'completed':
        messages_response = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        
        messages = client.beta.threads.messages.list(
        thread_id=thread.id
        )

        try:   
            assistant_response = messages.data[0].content[0].text.value
            return assistant_response
        except IndexError:
            return " "




# Apply the function to your DataFrame
df['simplified_text'] = df['chunk_text'].apply(simplify_text)

KeyboardInterrupt: 

In [None]:
df.to_csv('hp_data_set_1000_simplified.csv', index=False)

# 4 - Test with smaller chunks

In [60]:
dataset2 = create_dataset(book_paths,200)

# Save to CSV
dataset2.to_csv('hp_dataset_200.csv', index=False)


Processing livros/Harry Potter e a Pedra Filosofal.pdf
Processing livros/Harry Potter e a Câmara Secreta.pdf
Processing livros/Harry Potter e o prisioneiro de Azkaban.pdf
Processing livros/harry potter e a pedra filosofal.pdf
Processing livros/Harry Potter e o Cálice de Fogo.pdf
Processing livros/Harry Potter e a Ordem da Fênix.pdf
Processing livros/Harry Potter e o Enigma do Príncipe.pdf
Processing livros/Harry Potter e as Relíquias da Morte.pdf


In [61]:
df,fig,centroids,scaler = embbed_and_plot(df  = pd.read_csv('hp_dataset_200.csv'))

SyntaxError: invalid syntax. Perhaps you forgot a comma? (<string>, line 1)

In [None]:
query_embedding,query_pca = query_and_plot(fig,scaler,query_text='harry você é um bruxo')


In [None]:
closest_books = find_closest_books(query_pca[0], centroids)
print(f"The closest books to the query are: {' ; '.join(closest_books)}")


The closest books to the query are: perdido em marte.pdf


In [None]:
best_chunk_info, best_similarity = find_best_chunk(query_embedding, closest_books, df)
clean_chunk_text = best_chunk_info['chunk_text'].replace('\t', ' ')

from termcolor import colored

header = colored(f"{'Best Chunk Info':^60}", 'blue', 'on_white', attrs=['bold'])  # Blue text on white background, bold
divider = colored('-' * 60, 'red')  # Red divider line

# Format each row with left alignment, ensuring each piece of info is neatly aligned, with different colors
similarity_row = colored(f"{'Best Similarity:':<20}", 'yellow') + str(best_similarity)
document_row = colored(f"{'Document Name:':<20}", 'green') + best_chunk_info['document_name']
text_row = colored(f"{'Chunk Text:':<20}", 'cyan') + clean_chunk_text + "..."
init_page_row = colored(f"{'Initial Page:':<20}", 'magenta') + str(best_chunk_info['init_page'])
end_page_row = colored(f"{'End Page:':<20}", 'magenta') + str(best_chunk_info['end_page'])

# Print each part of the table
print(header)
print(divider)
print(similarity_row)
print(document_row)
print(text_row)
print(init_page_row)
print(end_page_row)

#print(f"First 15 Words: {best_chunk_info['first_15_words']}")
#print(f"Principal Component 1: {best_chunk_info['principal component 1']:.6f}")
#print(f"Principal Component 2: {best_chunk_info['principal component 2']:.6f}")




[1m[107m[34m                      Best Chunk Info                       [0m
[31m------------------------------------------------------------[0m
[33mBest Similarity:    [0m0.8626173606155557
[32mDocument Name:      [0mperdido em marte.pdf
[36mChunk Text:         [0mMarte não é desconhecido para mim. Estou aqui há muito tempo. Mas, até
hoje, o Hab nunca havia ficado fora do meu campo de visão. Talvez você
ache que isso não faz diferença, mas faz.
À medida que eu avançava rumo ao local em que o GTR estava
enterrado, me dei conta de que Marte é um deserto estéril e de que estou
completamente
 sozinho aqui. Eu já sabia disso, é claro. Mas existe uma
diferença entre saber e de fato vivenciar algo. À minha volta, não há nada,
apenas poeira, pedras e um deserto i...
[35mInitial Page:       [0m72
[35mEnd Page:           [0m72


# 5 - even smaller chunks

In [66]:
dataset=create_dataset(book_paths,chunk_size=1500,num_paginas = 100)
dataset.to_csv('hp_dataset_1500.csv', index=False)


Processing livros/Harry Potter e a Pedra Filosofal.pdf
Processing livros/Harry Potter e a Câmara Secreta.pdf
Processing livros/Harry Potter e o prisioneiro de Azkaban.pdf
Processing livros/harry potter e a pedra filosofal.pdf
Processing livros/Harry Potter e o Cálice de Fogo.pdf
Processing livros/Harry Potter e a Ordem da Fênix.pdf
Processing livros/Harry Potter e o Enigma do Príncipe.pdf
Processing livros/Harry Potter e as Relíquias da Morte.pdf


In [71]:
df,fig,centroids,scaler = embbed_and_plot(df  = pd.read_csv('hp_data_set_1000.csv'))


ValueError: too many values to unpack (expected 4)

In [None]:
query_embedding,query_pca = query_and_plot(fig,scaler)


In [None]:
closest_books = find_closest_books(query_pca[0], centroids)
print(f"The closest books to the query are: {' ; '.join(closest_books)}")


In [None]:
best_chunk_info, best_similarity = find_best_chunk(query_embedding, closest_books, df)
clean_chunk_text = best_chunk_info['chunk_text'].replace('\t', ' ')

from termcolor import colored

header = colored(f"{'Best Chunk Info':^60}", 'blue', 'on_white', attrs=['bold'])  # Blue text on white background, bold
divider = colored('-' * 60, 'red')  # Red divider line

# Format each row with left alignment, ensuring each piece of info is neatly aligned, with different colors
similarity_row = colored(f"{'Best Similarity:':<20}", 'yellow') + str(best_similarity)
document_row = colored(f"{'Document Name:':<20}", 'green') + best_chunk_info['document_name']
text_row = colored(f"{'Chunk Text:':<20}", 'cyan') + clean_chunk_text + "..."
init_page_row = colored(f"{'Initial Page:':<20}", 'magenta') + str(best_chunk_info['init_page'])
end_page_row = colored(f"{'End Page:':<20}", 'magenta') + str(best_chunk_info['end_page'])

# Print each part of the table
print(header)
print(divider)
print(similarity_row)
print(document_row)
print(text_row)
print(init_page_row)
print(end_page_row)

#print(f"First 15 Words: {best_chunk_info['first_15_words']}")
#print(f"Principal Component 1: {best_chunk_info['principal component 1']:.6f}")
#print(f"Principal Component 2: {best_chunk_info['principal component 2']:.6f}")




In [68]:
!pip install hdbscan

Collecting hdbscan
  Using cached hdbscan-0.8.33.tar.gz (5.2 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting cython<3,>=0.27 (from hdbscan)
  Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
Using cached Cython-0.29.37-py2.py3-none-any.whl (989 kB)
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (pyproject.toml): started
  Building wheel for hdbscan (pyproject.toml): finished with status 'done'
  Created wheel for hdbscan: filename=hdbscan-0.8.33-cp311-cp311-win_amd64.whl size=595848 sha256=3e85ea96574594c04f477bb719d65e93c375bc6e4b8d4d9350c3b9d56303a432
  Stored in directory: c:\users\cliente\appdata\local\pip\cache\wheels\4e\8c\6f\d0495e4e40cbd27

In [55]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import hdbscan  # Import HDBSCAN
import plotly.express as px
# Load the datasetimport pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import hdbscan  # Make sure you have HDBSCAN installed

# Function to embed text and plot clusters
def embed_and_plot_hdb(df):
    df['embedding'] = df['embedding'].apply(lambda x: eval(x))  # Convert stringified lists back to Python lists
    df['first_15_words'] = df['chunk_text'].apply(lambda x: ' '.join(str(x).split()[:15]))

    # Convert list of embeddings into a 2D numpy array and scale
    X = np.array(df['embedding'].tolist())
    scaler = StandardScaler().fit(X)
    X_scaled = scaler.transform(X)

    # Perform PCA to reduce the embeddings to 2 dimensions
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(X_scaled)

    # Add the principal components back to the dataframe
    df['principal component 1'] = principalComponents[:, 0]
    df['principal component 2'] = principalComponents[:, 1]

    # HDBSCAN Clustering
    clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
    df['cluster'] = clusterer.fit_predict(principalComponents)

    # Plot initialization
    fig = go.Figure()

    # Define a color palette
    color_palette = px.colors.qualitative.Set1  # You can change this to other palettes
    
    # Assign a color to each cluster, including noise
    cluster_colors = {cluster: color_palette[i % len(color_palette)] for i, cluster in enumerate(sorted(df['cluster'].unique()))}

    # Use 'lightgrey' for noise points (cluster label -1)
    cluster_colors[-1] = 'lightgrey'

    # Add traces for each cluster
    for cluster in sorted(df['cluster'].unique()):
        cluster_df = df[df['cluster'] == cluster]
        fig.add_trace(go.Scatter(
            x=cluster_df['principal component 1'], 
            y=cluster_df['principal component 2'],
            text=cluster_df['first_15_words'],  # Display on hover
            mode='markers',
            marker=dict(
                color=cluster_colors[cluster],  # Use assigned color
                size=8,  # You can adjust the size as needed
                line=dict(width=1, color='DarkSlateGrey')  # Border for the markers
            ),
            name=f'Cluster {cluster if cluster != -1 else "Noise"}'
        ))

    # Customize layout
    fig.update_layout(
        title='PCA of Book Embeddings with HDBSCAN Clusters',
        xaxis_title='Principal Component 1',
        yaxis_title='Principal Component 2',
        legend_title='Cluster'
    )

    # Show plot
    fig.show()

    return pca, df, fig, clusterer, scaler


In [56]:
embed_and_plot_hdb(pd.read_csv('hp_data_set_1000.csv'))


(PCA(n_components=2),
                                   document_name  \
 0          Harry Potter e a Pedra Filosofal.pdf   
 1          Harry Potter e a Pedra Filosofal.pdf   
 2          Harry Potter e a Pedra Filosofal.pdf   
 3          Harry Potter e a Pedra Filosofal.pdf   
 4          Harry Potter e a Pedra Filosofal.pdf   
 ...                                         ...   
 2290  Harry Potter e as Relíquias da Morte.pdf   
 2291  Harry Potter e as Relíquias da Morte.pdf   
 2292  Harry Potter e as Relíquias da Morte.pdf   
 2293  Harry Potter e as Relíquias da Morte.pdf   
 2294  Harry Potter e as Relíquias da Morte.pdf   
 
                                              chunk_text  init_page  end_page  \
 0     DADOS\tDE\tODINRIGHT\nSobre\ta\tobra:\nA\tpres...          2         2   
 1     conhecimento,\te\tnão\nmais\tlutando\tpor\tdin...          2         2   
 2     Converted\tby\t\nePubtoPDFDADOS\tDE\tCOPYRIGHT...          3         5   
 3     Título\tOriginal:\tHa