This code processes text from PDF documents: it splits the text into overlapping chunks, creates embeddings for the words in the text, stores these embeddings in a database, and then retrieves and uses these embeddings to answer queries.

MAIN STEPS:
- Fetch the text from the PDF using pdfminer
- Lowercase and clean the text, removing punctuation
- Divide the cleaned text into chunks of a specified size, with overlap
- Save the chunks to a file
- Embed each word and store the embeddings in a database
- Read the embeddings back from the database to respond to queries
- Clean, preprocess, and embed the query
- The queries are entered by the user

In [6]:
from pdfminer.high_level import extract_text
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import string
import sqlite3
import nltk
import os
import numpy as np

# Download the NLTK resources used below: 'punkt' for word_tokenize and
# 'stopwords' for the English stopword list. Runs every time this cell runs.
nltk.download('punkt')
nltk.download('stopwords')

def fetch_text_from_pdf(pdf_path):
    """Return the text that pdfminer's ``extract_text`` yields for *pdf_path*."""
    return extract_text(pdf_path)

def preprocess_text(text):
    """Lowercase and tokenize *text*, dropping stopwords and punctuation.

    Returns the surviving tokens as a list (not a joined string) so the
    result can be fed directly to Word2Vec training.
    """
    candidates = word_tokenize(text.lower())
    # Everything we filter out: English stopwords plus single punctuation
    # characters from string.punctuation.
    excluded = set(stopwords.words('english')) | set(string.punctuation)
    kept = []
    for candidate in candidates:
        if candidate in excluded:
            continue
        kept.append(candidate)
    return kept

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Either a string (split on whitespace) or an already tokenized
            sequence of words, such as the output of ``preprocess_text``.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of strings, each made of up to *chunk_size* words joined by a
        single space. Empty input yields an empty list.

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give range() a zero step, and a larger
        # overlap a negative step that silently produces no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split() if isinstance(text, str) else list(text)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This chunk already reaches the last word; any further chunk
            # would only repeat words contained in this one.
            break
    return chunks

def save_chunks_to_file(chunks, filename):
    """Write every chunk to *filename* (UTF-8), one chunk per line.

    The file is created or truncated; each chunk is followed by a newline.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(piece + '\n' for piece in chunks)

def save_embeddings_to_database(embeddings, database_path):
    """Store word embeddings in the ``embeddings`` table of a SQLite database.

    Args:
        embeddings: Mapping of word -> vector (any array-like of numbers).
        database_path: Path of the SQLite file; created if it does not exist.

    Each vector is serialized as raw float64 bytes. The dtype is forced
    explicitly because the reader in this file decodes blobs with
    ``np.float64``; writing a vector in a narrower dtype (for example
    float32) would be decoded into wrong values of the wrong length.

    Rows are appended; calling this twice on the same database inserts the
    words a second time.
    """
    # Serialize up front so a bad vector fails before the database is touched.
    rows = [
        (word, np.asarray(vector, dtype=np.float64).tobytes())
        for word, vector in embeddings.items()
    ]
    conn = sqlite3.connect(database_path)
    try:
        # "with conn" commits on success and rolls back if a statement raises.
        with conn:
            conn.execute('''CREATE TABLE IF NOT EXISTS embeddings (word TEXT, embedding BLOB)''')
            conn.executemany(
                "INSERT INTO embeddings (word, embedding) VALUES (?, ?)", rows
            )
    finally:
        # The connection context manager does not close; do it explicitly so
        # the handle is released even when an error occurs.
        conn.close()

def generate_word2vec_embeddings(tokenized_text):
    """Train a Word2Vec model on *tokenized_text* and return its vectors.

    The token list is handed to Word2Vec as one single training sentence.
    Returns a dict mapping each vocabulary word to its 100-dimensional
    vector (``vector_size=100``).
    """
    corpus = [tokenized_text]
    model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
    vectors = model.wv
    embeddings = {}
    for word in vectors.key_to_index:
        embeddings[word] = vectors[word]
    return embeddings

def process_pdfs(books=None, database_path="embeddings.db"):
    """Extract, preprocess and embed PDF text, then store the embeddings.

    Args:
        books: Optional iterable of dicts, each with a ``"pdf_path"`` key
            naming a PDF file. Defaults to the two sample PDFs under
            ``infos/``.
        database_path: SQLite file the embeddings are written to.

    The tokens of all PDFs are concatenated into one list, a Word2Vec model
    is trained on it, and every word vector is saved to *database_path*.
    """
    if books is None:
        books = [
            {"title": "PDF1", "pdf_path": "infos/pdf1.pdf"},
            {"title": "PDF2", "pdf_path": "infos/pdf2.pdf"},
        ]

    all_tokens = []
    for book in books:
        text = fetch_text_from_pdf(book["pdf_path"])
        all_tokens.extend(preprocess_text(text))

    # Use the list of all tokens for Word2Vec training
    embeddings = generate_word2vec_embeddings(all_tokens)
    save_embeddings_to_database(embeddings, database_path)
    print(f"Embeddings saved to {database_path}")

# Run the pipeline: extract the PDF text, preprocess it, train the
# embeddings and save them to embeddings.db.
process_pdfs()

ImportError: cannot import name 'triu' from 'scipy.linalg' (C:\Users\21264\OneDrive\Bureau\stage_code\.venv\lib\site-packages\scipy\linalg\__init__.py)

In [7]:
import sqlite3
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download the NLTK resources needed by preprocess_query: 'punkt' for
# word_tokenize and 'stopwords' for the English stopword list.
nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess the query
def preprocess_query(query):
    """Normalize *query* and return the kept tokens joined by single spaces.

    The query is lowercased and tokenized; English stopwords and single
    punctuation characters are removed.
    """
    candidates = word_tokenize(query.lower())
    excluded = set(stopwords.words('english')) | set(string.punctuation)
    kept = []
    for candidate in candidates:
        if candidate in excluded:
            continue
        kept.append(candidate)
    return " ".join(kept)

def load_embeddings_from_database(database_path):
    """Load all word embeddings from the ``embeddings`` table.

    Args:
        database_path: Path of the SQLite file to read.

    Returns:
        Dict mapping each word to a 1-D float64 NumPy array decoded from the
        stored blob. Rows whose blob is NULL are skipped. If a word occurs in
        several rows, the last row read wins.

    Raises:
        sqlite3.OperationalError: If the ``embeddings`` table does not exist.
            Note that ``sqlite3.connect`` creates an empty database file when
            *database_path* is missing, so a wrong path surfaces as this
            error rather than as a missing-file error.
    """
    conn = sqlite3.connect(database_path)
    try:
        cursor = conn.execute("SELECT word, embedding FROM embeddings")
        # Iterate the cursor directly instead of materializing every row.
        return {
            word: np.frombuffer(blob, dtype=np.float64)
            for word, blob in cursor
            if blob is not None
        }
    finally:
        # Release the connection even when the query fails.
        conn.close()

# Function to read embeddings
def read_embeddings():
    """Load the embeddings stored in ``embeddings.db`` and report their count.

    Prints how many embeddings were loaded and returns the word -> vector
    dict.
    """
    loaded = load_embeddings_from_database("embeddings.db")
    count = len(loaded)
    print("Number of embeddings loaded:", count)
    return loaded

# Load embeddings from database once, when this cell runs; main() reads this
# module-level dict when embedding the user's query.
embeddings = read_embeddings()

# Function to calculate embedding for the query
def get_embedding_for_query(query, embeddings):
    """Return the sum of the embeddings of the known words in *query*.

    Args:
        query: Raw query string; it is cleaned with ``preprocess_query``.
        embeddings: Dict mapping words to NumPy vectors of equal shape.

    Returns:
        A float array with the same shape as the stored vectors. It is all
        zeros when no query word has an embedding, and an empty array when
        *embeddings* itself is empty (there is no vector to take the shape
        from), so ``np.any(result)`` is False in both cases.
    """
    if not embeddings:
        # Without this guard next(iter(...)) raises StopIteration.
        return np.zeros(0)

    vector_shape = next(iter(embeddings.values())).shape
    query_embedding = np.zeros(vector_shape)
    for word in preprocess_query(query).split():
        word_embedding = embeddings.get(word)
        if word_embedding is not None:
            query_embedding += word_embedding
    return query_embedding

# Main function
def main():
    """Prompt for a query, embed it and print the result.

    Uses the module-level ``embeddings`` dict. When the query embedding is
    all zeros, a "not found" message is printed instead.
    """
    query = input("Enter your query: ")
    query_embedding = get_embedding_for_query(query, embeddings)

    if not np.any(query_embedding):
        print("Embedding not found for the query.")
        return

    print("Embedding for query:", query_embedding)
    print("The answer is:")
    print("The query was:", query)
    print("The query embedding was:", query_embedding)

# Call the main function only when this code runs as the top-level script
# (or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\21264\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\21264\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


OperationalError: no such table: embeddings