# Clusterization test in books
### Objective: Test the formation of clusters in bigger documents (books) and then ask a question to identify the best cluster.

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [86]:
import os
import pandas as pd
import openai
from dotenv import load_dotenv
import PyPDF2  # Import PyPDF2

# Load environment variables
load_dotenv()

# Function to extract text from the first 100 pages of a PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        pages_text = []
        for page_num in range(min(100, len(reader.pages))):  # Extract from the first 100 pages or total pages if less than 100
            page = reader.pages[page_num]
            page_text = page.extract_text() if page.extract_text() else ''  # Extract text or return empty string if None
            pages_text.append(page_text)
    return pages_text

def split_into_chunks(pages_text, chunk_size):
    chunks = []  # To hold chunks of concatenated page texts
    page_ranges = []  # To hold corresponding page ranges for each chunk

    current_chunk = ''  # Accumulates text for the current chunk
    chunk_start_page = 0  # Tracks the start page index of the current chunk
    for i, page_text in enumerate(pages_text):
        # Check if the current page fits into the current chunk
        if len(current_chunk) + len(page_text) <= chunk_size:
            # Add the page text to the chunk
            current_chunk += page_text
            # If it's the last page, close off the current chunk
            if i == len(pages_text) - 1:
                chunks.append(current_chunk)
                page_ranges.append((chunk_start_page, i))
        else:
            # If the current chunk is not empty, save it before starting a new one
            if current_chunk:
                chunks.append(current_chunk)
                page_ranges.append((chunk_start_page, i - 1))
            # Start a new chunk with the current page
            # But if the page itself exceeds the chunk size, handle it specially
            if len(page_text) > chunk_size:
                # Break down the large page into subchunks, if necessary
                for start in range(0, len(page_text), chunk_size):
                    end = start + chunk_size
                    chunks.append(page_text[start:end])
                    page_ranges.append((i, i))  # This subchunk still corresponds to the current page
                current_chunk = ''  # Reset for the next chunk after handling large page
                chunk_start_page = i + 1  # Next chunk will start from the following page
            else:
                # Otherwise, just start the new chunk normally
                current_chunk = page_text
                chunk_start_page = i
            # Handle case where this is the last page and its text hasn't been added to chunks
            if i == len(pages_text) - 1 and len(page_text) <= chunk_size:
                chunks.append(current_chunk)
                page_ranges.append((chunk_start_page, i))

    return chunks, page_ranges




def generate_embeddings_openai(text, openai_api_key=os.getenv('OPEN_AI_API_KEY')):
    openai.api_key = openai_api_key
    try:
        response = openai.Embedding.create(
            input=text,  # Ensure 'input' is correct according to the OpenAI API documentation
            model="text-embedding-ada-002"
        )
        embedding_list = response['data'][0]['embedding']
        return embedding_list
    except Exception as e:
        print(f"An error occurred: {e}")
        return []  # Return an empty list if there's an error

# Function to create the dataset
def create_dataset(book_paths, chunk_size=1000):
    rows = []
    
    for book_path in book_paths:
        print(f"Processing {book_path}")
        pages_text = extract_text_from_pdf(book_path)
        chunks, page_ranges = split_into_chunks(pages_text, chunk_size)
        
        for chunk, (start_page, end_page) in zip(chunks, page_ranges):
            embedding = generate_embeddings_openai(chunk)
            rows.append({
                'document_name': os.path.basename(book_path),
                'chunk_text': chunk,
                'init_page': start_page + 1,  # Adding 1 to make page numbers human-readable
                'end_page': end_page + 1,    # Adding 1 to make page numbers human-readable
                'embedding': embedding
            })
    
    return pd.DataFrame(rows)

# List of your book paths
book_paths = [
    'livros/duna.pdf',
    'livros/eu robo.pdf',
    'livros/guerra dos tronos.pdf',
    'livros/harry potter e a pedra filosofal.pdf',
    'livros/origem das especies.pdf',
    'livros/perdido em marte.pdf'
]




In [87]:
# Create the dataset
dataset = create_dataset(book_paths)

# Save to CSV
dataset.to_csv('book_chunks_dataset.csv', index=False)
print(len(dataset))

dataset2 = create_dataset(book_paths, chunk_size=500)


dataset2.to_csv('book_chunks_dataset500.csv', index=False)
print(len(dataset2))


Processing livros/duna.pdf
Processing livros/eu robo.pdf
Processing livros/guerra dos tronos.pdf
Processing livros/harry potter e a pedra filosofal.pdf
Processing livros/origem das especies.pdf
Processing livros/perdido em marte.pdf
1055
Processing livros/duna.pdf
Processing livros/eu robo.pdf
Processing livros/guerra dos tronos.pdf
Processing livros/harry potter e a pedra filosofal.pdf
Processing livros/origem das especies.pdf
Processing livros/perdido em marte.pdf
1888


In [32]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go

# Load the dataset
df = pd.read_csv('book_chunks_dataset.csv')

# Assuming 'embedding' column is stored as a string representation of a list, convert back to actual list
df['embedding'] = df['embedding'].apply(lambda x: eval(x))

# Extract the first 15 words from each chunk for the hover information
df['first_15_words'] = df['chunk_text'].apply(lambda x: ' '.join(str(x).split()[:15]))


# Convert list of embeddings into a 2D numpy array
X = np.array(df['embedding'].tolist())

# Standardizing the features (important for PCA)
X = StandardScaler().fit_transform(X)

# Perform PCA to reduce the embeddings to 2 dimensions
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)

# Add the principal components to the dataframe
df['principal component 1'] = principalComponents[:, 0]
df['principal component 2'] = principalComponents[:, 1]

# Plot using Plotly
fig = go.Figure()

# Color map for different books
colors = {'duna.pdf': 'red', 'eu robo.pdf': 'green', 'guerra dos tronos.pdf': 'blue', 
          'harry potter.pdf': 'yellow', 'origem das especies.pdf': 'cyan', 
          'perdido em marte.pdf': 'magenta'}

# Add traces for each book
for book, color in colors.items():
    book_df = df[df['document_name'] == book]
    fig.add_trace(go.Scatter(
        x=book_df['principal component 1'], 
        y=book_df['principal component 2'],
        text=book_df['first_15_words'],  # This will be shown on hover
        mode='markers',
        marker_color=color,
        name=book
    ))

# Customize layout
fig.update_layout(
    title='PCA of Book Embeddings',
    xaxis_title='Principal Component 1',
    yaxis_title='Principal Component 2',
    legend_title='Book Name'
)

# Show plot
fig.show()


In [33]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go

# Load the dataset
def embbed_and_plot(df  = pd.read_csv('book_chunks_dataset.csv')):
    df['embedding'] = df['embedding'].apply(lambda x: eval(x))  # Convert stringified lists back to Python lists
    df['first_15_words'] = df['chunk_text'].apply(lambda x: ' '.join(str(x).split()[:15]))

    # Convert list of embeddings into a 2D numpy array and scale
    X = np.array(df['embedding'].tolist())
    scaler = StandardScaler().fit(X)  # Fit scaler to the data
    X_scaled = scaler.transform(X)  # Apply scaling

    # Perform PCA to reduce the embeddings to 2 dimensions
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(X_scaled)  # Apply PCA to scaled data

    # Add the principal components back to the dataframe
    df['principal component 1'] = principalComponents[:, 0]
    df['principal component 2'] = principalComponents[:, 1]

    # Plot initialization
    fig = go.Figure()
    colors = {'duna.pdf': 'red', 'eu robo.pdf': 'green', 'guerra dos tronos.pdf': 'blue',
            'harry potter.pdf': 'yellow', 'origem das especies.pdf': 'cyan',
            'perdido em marte.pdf': 'magenta'}
    centroids = {}

    # Add traces for each book
    for book, color in colors.items():
        book_df = df[df['document_name'] == book]
        fig.add_trace(go.Scatter(
            x=book_df['principal component 1'], 
            y=book_df['principal component 2'],
            text=book_df['first_15_words'],  # Display on hover
            mode='markers',
            marker_color=color,
            name=book
        ))
        book_df = df[df['document_name'] == book]
        centroid = book_df[['principal component 1', 'principal component 2']].mean().values
        centroids[book] = centroid  # Store the centroid
        fig.add_trace(go.Scatter(
            x=[centroid[0]], 
            y=[centroid[1]],
            text=[book],  # Book name for centroid hover
            mode='markers+text',
            marker_symbol='x',
            marker_size=12,
            marker_color='black',
            showlegend=False,
            textposition="top center"
        ))

    # Customize layout
    fig.update_layout(
        title='PCA of Book Embeddings with Centroids',
        xaxis_title='Principal Component 1',
        yaxis_title='Principal Component 2',
        legend_title='Book Name'
    )

    # Show plot
    fig.show()
    
    return df,fig,centroids,scaler





In [34]:
df,fig,centroids,scaler = embbed_and_plot()

# 2 - Query

In [35]:
import os 
import openai
def generate_embeddings_openai(text, openai_api_key=os.getenv('OPEN_AI_API_KEY')):
    openai.api_key = openai_api_key
    response = openai.Embedding.create(
        input=text,
        model="text-embedding-ada-002"
    )
    embedding = response['data'][0]['embedding']
    return np.array(embedding)

In [36]:
def query_and_plot(fig,scaler,query_text = "estou perdido em marte"):
    query_embedding = generate_embeddings_openai(query_text)  # Get embedding using OpenAI
    query_embedding_scaled = scaler.transform(np.array(query_embedding).reshape(1, -1))  # Scale using the same scaler
    query_pca = pca.transform(query_embedding_scaled)  # Apply PCA

    # Add the query to the plot for visualization
    fig.add_trace(go.Scatter(
        x=[query_pca[0, 0]], 
        y=[query_pca[0, 1]],
        text=["Query: estou perdido em marte"],  # Display the actual query text
        mode='markers+text',
        marker_symbol='star',  # Change marker symbol for better visibility
        marker_size=20,  # Increase marker size
        marker_color="lime",  # Use a bright color for visibility
        showlegend=False,
        textposition="top center",  # Adjust text position for clarity
        textfont=dict(  # Customize font properties for better visibility
            family="Arial, sans-serif",
            size=12,  # Increase text size
            color="blue",  # Choose a color that contrasts well with the marker color
        ),
    ))
    fig.show()
    return query_embedding,query_pca
    



In [38]:
query_embedding,query_pca = query_and_plot(fig,scaler)

In [39]:
# Assuming your query's PCA coordinates are stored in query_pca

# Function to calculate Euclidean distance
def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((np.array(point1) - np.array(point2)) ** 2))

def query_to_centroid_distances(query_pca, centroids):
    distances = {}
    for book, centroid in centroids.items():
        dist = euclidean_distance(query_pca, centroid)
        distances[book] = dist
    return distances




In [40]:

# Print the distances
distances = query_to_centroid_distances(query_pca, centroids)
for book, distance in sorted(distances.items(), key=lambda item: item[1]):
    print(f"Distance from query to centroid of '{book}': {distance}")

Distance from query to centroid of 'eu robo.pdf': 5.080078119885639
Distance from query to centroid of 'perdido em marte.pdf': 5.98259611807344
Distance from query to centroid of 'duna.pdf': 15.263633769322736
Distance from query to centroid of 'guerra dos tronos.pdf': 27.255224358010715
Distance from query to centroid of 'harry potter.pdf': nan
Distance from query to centroid of 'origem das especies.pdf': 32.07970497655995


# 3 - Similarity of chunk heuristic

Heuristic for finding chunks:

1. Rank the distance from the query to the centroids of the documents.

2. Use a distance threshold:

    2.a. If the distance threshold is exceeded, only use that centroid; 

    2.b. if it's lower, use all books within the threshold for comparison.

In [41]:
def find_closest_books(query_embedding, centroids, distance_threshold=1.5):
    distances = {}
    for book, centroid in centroids.items():
        dist = euclidean_distance(query_embedding, centroid)
        distances[book] = dist
    sorted_distances = sorted(distances.items(), key=lambda item: item[1])
    books_to_compare = [book for book, distance in sorted_distances if distance - sorted_distances[0][1] <= distance_threshold]
    return books_to_compare

# Find the closest books to the query


In [42]:
closest_books = find_closest_books(query_pca[0], centroids)
print(f"The closest books to the query are: {' ; '.join(closest_books)}")

The closest books to the query are: eu robo.pdf ; perdido em marte.pdf


In [43]:
from scipy.spatial.distance import cosine
import numpy as np

def compare_similarity(query_embedding, chunks, book_name):
    # Ensure the query embedding is a 1-D numpy array
    query_vec = np.array(query_embedding).flatten()
    
    # Dictionary to store similarities
    similarities = {}

    # Calculate cosine similarity for each chunk
    for index, row in chunks.iterrows():
        chunk_embedding = np.array(row['embedding']).flatten()  # Convert chunk embedding to 1-D numpy array
        # Compute cosine similarity (cosine returns the distance, so subtract from 1)
        cos_sim = 1 - cosine(query_vec, chunk_embedding)
        # Store the similarity along with chunk text and book name
        similarities[(row['chunk_text'], book_name)] = cos_sim

    return similarities


In [99]:
def find_best_chunk(query_embedding, closest_books, df):
    best_similarity = float('-inf')  # Initialize with the lowest similarity
    best_chunk_index = None  # To store the index of the best chunk

    for book in closest_books:
        book_chunks = df[df['document_name'] == book]
        # Calculate similarities for all chunks in the current book
        similarities = compare_similarity(query_embedding, book_chunks, book)
        
        # Find the best chunk for the current book
        for (chunk_text, _), cos_sim in similarities.items():
            if cos_sim > best_similarity:
                best_similarity = cos_sim
                # Find the index of the chunk with the best similarity
                best_chunk_index = book_chunks[book_chunks['chunk_text'] == chunk_text].index[0]

    # Retrieve the entire row for the best chunk using the index
    if best_chunk_index is not None:
        best_chunk_info = df.loc[best_chunk_index]
        return best_chunk_info, best_similarity
    else:
        return None, None  # In case no best chunk was found


In [None]:
best_chunk, best_book, best_similarity, start_page, end_page = find_best_chunk(query_embedding, closest_books, df)
print(f"Best matching chunk (from {best_book}, pages {start_page} to {end_page}): {best_chunk[:100]}...")  # Showing first 100 characters for brevity
print(f"Similarity: {best_similarity}")

# 4 - Test with smaller chunks

In [111]:
dataset2 = create_dataset(book_paths,50)

# Save to CSV
dataset2.to_csv('book_chunks_dataset_dataset50.csv', index=False)


Processing livros/duna.pdf
Processing livros/eu robo.pdf
Processing livros/guerra dos tronos.pdf
Processing livros/harry potter e a pedra filosofal.pdf
Processing livros/origem das especies.pdf
Processing livros/perdido em marte.pdf
An error occurred: The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID req_2508e31341cdcc8fdd283badbf16beb8 in your message.) {
  "error": {
    "message": "The server had an error while processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID req_2508e31341cdcc8fdd283badbf16beb8 in your message.)",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'The server had an error while processing your request. Sorry about that! You can re

In [90]:
df,fig,centroids,scaler = embbed_and_plot(df  = pd.read_csv('book_chunks_dataset500.csv'))

In [91]:
query_embedding,query_pca = query_and_plot(fig,scaler)


In [92]:
closest_books = find_closest_books(query_pca[0], centroids)
print(f"The closest books to the query are: {' ; '.join(closest_books)}")


The closest books to the query are: perdido em marte.pdf


In [110]:
best_chunk_info, best_similarity = find_best_chunk(query_embedding, closest_books, df)
clean_chunk_text = best_chunk_info['chunk_text'].replace('\t', ' ')

from termcolor import colored

header = colored(f"{'Best Chunk Info':^60}", 'blue', 'on_white', attrs=['bold'])  # Blue text on white background, bold
divider = colored('-' * 60, 'red')  # Red divider line

# Format each row with left alignment, ensuring each piece of info is neatly aligned, with different colors
similarity_row = colored(f"{'Best Similarity:':<20}", 'yellow') + str(best_similarity)
document_row = colored(f"{'Document Name:':<20}", 'green') + best_chunk_info['document_name']
text_row = colored(f"{'Chunk Text:':<20}", 'cyan') + clean_chunk_text + "..."
init_page_row = colored(f"{'Initial Page:':<20}", 'magenta') + str(best_chunk_info['init_page'])
end_page_row = colored(f"{'End Page:':<20}", 'magenta') + str(best_chunk_info['end_page'])

# Print each part of the table
print(header)
print(divider)
print(similarity_row)
print(document_row)
print(text_row)
print(init_page_row)
print(end_page_row)

#print(f"First 15 Words: {best_chunk_info['first_15_words']}")
#print(f"Principal Component 1: {best_chunk_info['principal component 1']:.6f}")
#print(f"Principal Component 2: {best_chunk_info['principal component 2']:.6f}")




[1m[107m[34m                      Best Chunk Info                       [0m
[31m------------------------------------------------------------[0m
[33mBest Similarity:    [0m0.8626173606155557
[32mDocument Name:      [0mperdido em marte.pdf
[36mChunk Text:         [0mMarte não é desconhecido para mim. Estou aqui há muito tempo. Mas, até
hoje, o Hab nunca havia ficado fora do meu campo de visão. Talvez você
ache que isso não faz diferença, mas faz.
À medida que eu avançava rumo ao local em que o GTR estava
enterrado, me dei conta de que Marte é um deserto estéril e de que estou
completamente
 sozinho aqui. Eu já sabia disso, é claro. Mas existe uma
diferença entre saber e de fato vivenciar algo. À minha volta, não há nada,
apenas poeira, pedras e um deserto i...
[35mInitial Page:       [0m72
[35mEnd Page:           [0m72


# 5 - even smaller chunks

In [113]:
df,fig,centroids,scaler = embbed_and_plot(df  = pd.read_csv('book_chunks_dataset_dataset50.csv'))


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (16704,) + inhomogeneous part.

In [None]:
query_embedding,query_pca = query_and_plot(fig,scaler)


In [None]:
closest_books = find_closest_books(query_pca[0], centroids)
print(f"The closest books to the query are: {' ; '.join(closest_books)}")


In [None]:
best_chunk_info, best_similarity = find_best_chunk(query_embedding, closest_books, df)
clean_chunk_text = best_chunk_info['chunk_text'].replace('\t', ' ')

from termcolor import colored

header = colored(f"{'Best Chunk Info':^60}", 'blue', 'on_white', attrs=['bold'])  # Blue text on white background, bold
divider = colored('-' * 60, 'red')  # Red divider line

# Format each row with left alignment, ensuring each piece of info is neatly aligned, with different colors
similarity_row = colored(f"{'Best Similarity:':<20}", 'yellow') + str(best_similarity)
document_row = colored(f"{'Document Name:':<20}", 'green') + best_chunk_info['document_name']
text_row = colored(f"{'Chunk Text:':<20}", 'cyan') + clean_chunk_text + "..."
init_page_row = colored(f"{'Initial Page:':<20}", 'magenta') + str(best_chunk_info['init_page'])
end_page_row = colored(f"{'End Page:':<20}", 'magenta') + str(best_chunk_info['end_page'])

# Print each part of the table
print(header)
print(divider)
print(similarity_row)
print(document_row)
print(text_row)
print(init_page_row)
print(end_page_row)

#print(f"First 15 Words: {best_chunk_info['first_15_words']}")
#print(f"Principal Component 1: {best_chunk_info['principal component 1']:.6f}")
#print(f"Principal Component 2: {best_chunk_info['principal component 2']:.6f}")


