In [4]:
from pdfminer.high_level import extract_text
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import string
import sqlite3
import nltk
import os
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

def fetch_text_from_pdf(pdf_path):
    text = extract_text(pdf_path)
    return text

def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    tokens = [token for token in tokens if token not in stop_words and token not in punctuation]
    return " ".join(tokens)

def chunk_text(text, chunk_size=1000, overlap=200):
    chunks = []
    words = text.split()
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

def save_chunks_to_file(chunks, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        for chunk in chunks:
            file.write(chunk + '\n')

def save_embeddings_to_database(embeddings, database_path):
    conn = sqlite3.connect(database_path)
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS embeddings (word TEXT, embedding BLOB)''')
    for word, embedding in embeddings.items():
        c.execute("INSERT INTO embeddings (word, embedding) VALUES (?, ?)", (word, embedding))
    conn.commit()
    conn.close()

def process_pdfs():
    books = [
        {"title": "PDF1", "pdf_path": "infos/pdf1.pdf"},
        {"title": "PDF2", "pdf_path": "infos/pdf2.pdf"},
    ]

    all_text = ""
    for book in books:
        pdf_path = book["pdf_path"]
        text = fetch_text_from_pdf(pdf_path)
        all_text += text + " "

    preprocessed_text = preprocess_text(all_text)
    chunks = chunk_text(preprocessed_text)
    save_chunks_to_file(chunks, "chunked_text.txt")
    print("Text chunked and saved to chunked_text.txt")

    embeddings = {word: np.random.rand(100) for word in preprocessed_text.split()}
    save_embeddings_to_database(embeddings, "embeddings.db")
    print("Embeddings saved to embeddings.db")

process_pdfs()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\21264\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\21264\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text chunked and saved to chunked_text.txt
Embeddings saved to embeddings.db


In [1]:
import sqlite3
import numpy as np

def load_embeddings_from_database(database_path):
    conn = sqlite3.connect(database_path)
    c = conn.cursor()
    c.execute("SELECT word, embedding FROM embeddings")
    embeddings = {}
    for row in c.fetchall():
        word, embedding_bytes = row
        if embedding_bytes is not None:
            embedding = np.frombuffer(embedding_bytes, dtype=np.float64)  
            embeddings[word] = embedding
    conn.close()
    return embeddings

def read_embeddings():
    embeddings = load_embeddings_from_database("embeddings.db")
    print("Number of embeddings loaded:", len(embeddings))
    return embeddings

read_embeddings()

Number of embeddings loaded: 3912


{'european': array([0.7667253 , 0.37914913, 0.63054687, 0.02192171, 0.7893276 ,
        0.2519169 , 0.30949905, 0.42973265, 0.65453401, 0.42269404,
        0.12664327, 0.76415175, 0.10821781, 0.47422589, 0.8085451 ,
        0.21090245, 0.81918145, 0.5153964 , 0.41808456, 0.30631421,
        0.33267555, 0.18590099, 0.65270459, 0.96168945, 0.83278719,
        0.54367592, 0.71623253, 0.58638043, 0.51727186, 0.25068058,
        0.52295926, 0.77518796, 0.65681086, 0.55136519, 0.93500361,
        0.47638609, 0.63458183, 0.25073671, 0.16742038, 0.45393638,
        0.90450684, 0.03138465, 0.18778431, 0.0435912 , 0.62700905,
        0.10525234, 0.60218821, 0.11981057, 0.44835849, 0.63635964,
        0.9742632 , 0.60931691, 0.25765896, 0.13443516, 0.99679817,
        0.15999461, 0.44973089, 0.74775612, 0.48857702, 0.38732987,
        0.58598088, 0.31164296, 0.12810703, 0.92885858, 0.0281233 ,
        0.48759813, 0.80180388, 0.10612668, 0.3799141 , 0.99964721,
        0.1194703 , 0.80531051, 0.90

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string


# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')


def preprocess_query(query):
    query = query.lower()
    tokens = word_tokenize(query)
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    tokens = [token for token in tokens if token not in stop_words and token not in punctuation]
    return " ".join(tokens)

def get_embedding_for_query(query, embeddings):
    processed_query = preprocess_query(query)
    query_words = processed_query.split()
    query_embedding = np.zeros(next(iter(embeddings.values())).shape)
    for word in query_words:
        word_embedding = embeddings.get(word)
        if word_embedding is not None:
            query_embedding += word_embedding
    return query_embedding


def main():
    embeddings = read_embeddings()
    query = input("Enter your query: ")
    query_embedding = get_embedding_for_query(query, embeddings)
    if np.any(query_embedding):
        print("Embedding for query:", query_embedding)
    else:
        print("Embedding not found for the query.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\21264\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\21264\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of embeddings loaded: 3912
Embedding for query: [0.68655571 0.87979383 0.83424242 1.4036927  0.57161455 1.08477741
 0.26801851 1.10863881 0.39474407 1.72928008 0.73697307 1.53816068
 0.77853735 1.58554982 0.73840299 1.18564719 0.65912096 1.18233179
 1.19016737 1.16447212 0.63355192 0.5390645  1.61630406 0.82396819
 0.93704555 0.83847995 1.40592965 1.52211332 0.21416204 0.95796045
 0.82650326 1.23231543 1.79291469 0.54101719 0.82942987 1.47187648
 1.01891343 0.52251463 0.84245435 0.143063   1.59098547 0.87458322
 0.83808107 0.94878541 1.39443882 1.03178418 0.26345463 0.53846496
 1.77733234 1.15206486 1.36773421 1.11410331 1.83704288 0.70142429
 0.71890317 0.26001846 1.36438498 0.16125803 0.4417589  0.92759539
 1.5982139  1.48662475 0.26157246 1.29738814 0.35201741 0.96891651
 0.30507134 0.85047823 0.83558282 1.45249986 0.64673011 0.68037273
 0.91016793 0.74863336 0.88667717 1.77227496 1.24811577 1.16176187
 0.87718977 1.14494334 1.21766775 0.40506189 0.5817683  0.41687516
 1.3928

In [8]:
import sqlite3
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess the query
def preprocess_query(query):
    query = query.lower()
    tokens = word_tokenize(query)
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    tokens = [token for token in tokens if token not in stop_words and token not in punctuation]
    return " ".join(tokens)

# Function to load embeddings from the database
def load_embeddings_from_database(database_path):
    conn = sqlite3.connect(database_path)
    c = conn.cursor()
    c.execute("SELECT word, embedding FROM embeddings")
    embeddings = {}
    for row in c.fetchall():
        word, embedding_bytes = row
        if embedding_bytes is not None:
            embedding = np.frombuffer(embedding_bytes, dtype=np.float64)  
            embeddings[word] = embedding
    conn.close()
    return embeddings

# Function to read embeddings
def read_embeddings():
    embeddings = load_embeddings_from_database("embeddings.db")
    print("Number of embeddings loaded:", len(embeddings))
    return embeddings

# Load embeddings from database
embeddings = read_embeddings()

# Function to calculate embedding for the query
def get_embedding_for_query(query, embeddings):
    processed_query = preprocess_query(query)
    query_words = processed_query.split()
    query_embedding = np.zeros(next(iter(embeddings.values())).shape)
    for word in query_words:
        word_embedding = embeddings.get(word)
        if word_embedding is not None:
            query_embedding += word_embedding
    return query_embedding

# Main function
def main():
    query = input("Enter your query: ")
    query_embedding = get_embedding_for_query(query, embeddings)
    if np.any(query_embedding):
        print("Embedding for query:", query_embedding)
        # Use the query embedding to generate an answer with the Language Model
        # Replace this part with your actual LM implementation
        print("Placeholder answer generated by Language Model:")
        print("The query was:", query)
        print("The query embedding was:", query_embedding)
    else:
        print("Embedding not found for the query.")

# Call the main function
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\21264\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\21264\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of embeddings loaded: 3912
Embedding for query: [0.31878881 0.67145959 0.09491484 0.66150668 0.00367472 0.30194658
 0.12944304 0.35310358 0.0730894  0.7471159  0.56239595 0.54528684
 0.39939944 0.88154431 0.14869576 0.71813468 0.59243671 0.93249348
 0.57648225 0.78521805 0.41255318 0.40279258 0.85533805 0.01458796
 0.9155921  0.51738195 0.42922087 0.64569241 0.06331725 0.91313496
 0.58197437 0.83248099 0.79595754 0.22731882 0.66901746 0.4947503
 0.55395681 0.42622392 0.31982083 0.11728679 0.64862345 0.01726175
 0.48901826 0.20544044 0.86991393 0.73285846 0.08538066 0.03555209
 0.97930705 0.55265203 0.63748812 0.49962232 0.86732132 0.25664001
 0.24519888 0.13495361 0.8959084  0.11340226 0.32974604 0.26071801
 0.72715293 0.96221385 0.1208725  0.802721   0.10671672 0.62804241
 0.05636882 0.32548801 0.14610289 0.54979822 0.60365584 0.19138183
 0.40201631 0.49073447 0.26190799 0.99746458 0.76241343 0.37969107
 0.33505765 0.33441121 0.9835686  0.31350562 0.47718996 0.23323711
 0.94436

In [None]:
from safetensors import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2-medium"  # Choose a model size based on your resources
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Function to generate response
def generate_response(query):
    input_text = "Query: " + query + " Answer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate response
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, num_return_sequences=1)
    
    # Decode and return response
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Main function
def main():
    query = input("Enter your query: ")
    response = generate_response(query)
    print("Generated response:", response)

# Call the main function
if __name__ == "__main__":
    main()