In [2]:
%pip install PyPDF2

Collecting PyPDF2
  Obtaining dependency information for PyPDF2 from https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl.metadata
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ---------------------------------------- 0.0/232.6 kB ? eta -:--:--
   - -------------------------------------- 10.2/232.6 kB ? eta -:--:--
   ------ -------------------------------- 41.0/232.6 kB 495.5 kB/s eta 0:00:01
   ------------------------ --------------- 143.4/232.6 kB 1.1 MB/s eta 0:00:01
   ---------------------------------------- 232.6/232.6 kB 1.4 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [205]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np
import pandas as pd
import PyPDF2
import os
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\albin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
llm = OllamaLLM(model='gemma2:2b')

In [4]:
def extract_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = "".join([page.extract_text() for page in reader.pages])
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

def split_text_into_chunks(text, max_chunk_size=1000):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

In [5]:
encoder = SentenceTransformer('all-MiniLM-L6-v2')
pdf_directory = "./food/"
df_documents = pd.DataFrame(columns=['path', 'text_chunks', 'embeddings'])

for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        print(filename)
        pdf_path = os.path.join(pdf_directory, filename)
        text = extract_text_from_pdf(pdf_path)
        chunks = split_text_into_chunks(text)
        document_embeddings = encoder.encode(chunks)
        new_row = pd.DataFrame({'path': [pdf_path], 'text_chunks': [chunks], 'embeddings': [document_embeddings]})
        df_documents = pd.concat([df_documents, new_row], ignore_index=True)

df_documents
 

Detox_Diets_For_Dummies(r).pdf
Lifestyle_to_Health_Vegan_Cookbook_Recipes.pdf
Low-Calorie_Dieting_For_Dummies(r).pdf
The_Everythingr_Big_Book_of_Fat_Bombs_200_Irresistible_Low_ca.pdf
The_Vegan_Cookbook_for_Athletes_by_Nicolas_Benfatto_2019.pdf
Vegan Pressure Cooker Cookbook.pdf


Unnamed: 0,path,text_chunks,embeddings
0,./food/Detox_Diets_For_Dummies(r).pdf,"[Dr. Gerald Don Wootan, DO, M.Ed., Board-certi...","[[-0.030656353, 0.04133187, 0.03410362, 0.0546..."
1,./food/Lifestyle_to_Health_Vegan_Cookbook_Reci...,[Lifestyle\nto Health\nRecipesLifestyle\nto He...,"[[-0.042540703, -0.017190995, 0.010452322, 0.0..."
2,./food/Low-Calorie_Dieting_For_Dummies(r).pdf,"[by Susan McQuillan, MS, RDLow-Calorie\nDietin...","[[-0.076273605, -0.014882827, 0.013734911, 0.0..."
3,./food/The_Everythingr_Big_Book_of_Fat_Bombs_2...,"[Letter to the Reader\nDear Reader ,\nT wo yea...","[[0.005529354, 0.06740676, 0.06533004, 0.08768..."
4,./food/The_Vegan_Cookbook_for_Athletes_by_Nico...,[The V egan Cookbook For Athletes\n \n \nNico...,"[[-3.8143986e-05, -0.0020152247, -0.072555885,..."
5,./food/Vegan Pressure Cooker Cookbook.pdf,[Sri Satguru Jagjit Singh Ji eLibrary ...,"[[-0.056346007, 0.0012037745, -0.052860245, 0...."


In [82]:
all_embeddings = np.vstack(df_documents['embeddings'].tolist())
dimension = all_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(all_embeddings)

In [83]:
def find_most_similar_chunks(query, top_k=3):
    query_embedding = encoder.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    results = []
    total_chunks = sum(len(chunks) for chunks in df_documents['text_chunks'])
    for i, idx in enumerate(indices[0]):
        if idx < total_chunks:
            doc_idx = 0
            chunk_idx = idx
            while chunk_idx >= len(df_documents['text_chunks'].iloc[doc_idx]):
                chunk_idx -= len(df_documents['text_chunks'].iloc[doc_idx])
                doc_idx += 1
            results.append({
                'document': df_documents['path'].iloc[doc_idx],
                'chunk': df_documents['text_chunks'].iloc[doc_idx][chunk_idx],
                'distance': distances[0][i]
            })
    return results

In [99]:
def agent_question(context):
    prompt = """
    Role: You are a person who wants to eat right. 
    Context: {context}\n
    Task: You have to analyse an excerpt from context and generate a relevant and interesting question based on the provided context that user might ask. It has to be only 1 sentence.\n
    Question:"""
    prompt_gemma = ChatPromptTemplate.from_template(prompt)
    chain = prompt_gemma | llm
    question = chain.invoke({"context": context})
    return question.strip()


In [100]:
def agent_answer(question, context):
    prompt = """Context: {context}\n
        Role: You are a great expert in the field of vegetarianism and recipes for vegans. You can see how recipes are built, which flavor combinations are the most popular.\n
        Task: Answer the question based on the context provided. Your answer has to be short and logical \n
        Question: {question}\n
        Answer: """
    prompt_gemma = ChatPromptTemplate.from_template(prompt)
    chain = prompt_gemma | llm
    answer = chain.invoke({"context": context, "question": question})
    return answer.strip()

def query_documents(question):
    similar_chunks = find_most_similar_chunks(question)
    context = " ".join([result['chunk'].replace("\n", "") for result in similar_chunks])
    response = agent_answer(question, context)
    return response, similar_chunks

In [86]:
import random

In [134]:
num_chunks = sum(len(chunks) for chunks in df_documents['text_chunks'])
def generate_random_chunks(num):
    s = []
    for i in range(100):
        q = random.randint(5, num-2)
        if q not in s:
            s.append(q)
        else:
            while q in s:
                q = random.randint(0, num)
            s.append(q)
    return s
selected_chunks = generate_random_chunks(num_chunks)
first_chunk_portion = selected_chunks[:25]
second_chunk_portion = selected_chunks[25:50]
third_chunk_portion = selected_chunks[50:75]
fourth_chunk_portion = selected_chunks[75:]


In [135]:
def find_exact_chunk(num):
    for _ in df_documents['text_chunks']:
        if len(_) > num+1:
            return "\n".join([_[num], _[num+1]])
        else:
            num -= len(_)

In [137]:
def generate_synthetic_dataset(chunkster, num_samples):
    dataset = []
    for i in tqdm(range(num_samples)):
        # Генерация вопроса
        _ = chunkster[i]
        context = find_exact_chunk(_)
        question = agent_question(context)
        
        # Поиск ответа
        answer, relevant_chunks = query_documents(question)
        
        dataset.append({
            "question": question,
            "answer": answer,
            "relevant_chunks": relevant_chunks
        })
    return pd.DataFrame(dataset)

In [138]:
synth_data_1 = generate_synthetic_dataset(first_chunk_portion, 25)

100%|██████████| 25/25 [31:06<00:00, 74.67s/it]


In [140]:
synth_data_1.to_csv('synth1.csv', index=False)

In [141]:
synth_data_2 = generate_synthetic_dataset(second_chunk_portion, 25)

100%|██████████| 25/25 [32:14<00:00, 77.37s/it]


In [142]:
synth_data_2.to_csv('synth2.csv', index=False)

In [143]:
synth_data_3 = generate_synthetic_dataset(third_chunk_portion, 25)

100%|██████████| 25/25 [31:48<00:00, 76.35s/it]


In [144]:
synth_data_3.to_csv('synth3.csv', index=False)

In [145]:
synth_data_4 = generate_synthetic_dataset(fourth_chunk_portion, 25)

100%|██████████| 25/25 [30:38<00:00, 73.53s/it]


In [146]:
synth_data_4.to_csv('synth4.csv', index=False)

In [200]:
synthetic_dataset = pd.concat([synth_data_1, synth_data_2, synth_data_3, synth_data_4])

In [201]:
def del_quest(text):
    text = text.replace('Question:', '')
    return text.strip()

synthetic_dataset['question'] = synthetic_dataset['question'].apply(del_quest)
synthetic_dataset = synthetic_dataset.reset_index().drop(columns='index', axis=1)

In [266]:
synthetic_dataset.to_csv('synthetic_dataset.csv', index=False)
synthetic_dataset

Unnamed: 0,question,answer,relevant_chunks
0,How can I ensure my grocery shopping process c...,"While convenient, fast-paced grocery shopping ...",[{'document': './food/Detox_Diets_For_Dummies(...
1,How can I strategically use rewards throughout...,The text emphasizes rewarding yourself for pro...,[{'document': './food/Low-Calorie_Dieting_For_...
2,How can I incorporate more mindfulness practic...,The text suggests starting small by incorporat...,[{'document': './food/Low-Calorie_Dieting_For_...
3,What are the potential benefits of getting a h...,A hemoglobin A1c (HgbA1c) test measures your a...,[{'document': './food/Detox_Diets_For_Dummies(...
4,What is the significance of multiplying calori...,Multiplying calorie needs by activity levels a...,[{'document': './food/Low-Calorie_Dieting_For_...
...,...,...,...
95,How can I minimize my exposure to nitrogen dio...,"To minimize nitrogen dioxide exposure, priorit...",[{'document': './food/Detox_Diets_For_Dummies(...
96,How can adjusting my drinking habits and suppl...,Adjusting your drinking habits (like not drink...,[{'document': './food/Detox_Diets_For_Dummies(...
97,Why are high-protein diets so restrictive and ...,High-protein diets restrict foods high in carb...,[{'document': './food/Low-Calorie_Dieting_For_...
98,Given that arsenic is found in many common hou...,"To minimize your exposure to arsenic, prioriti...",[{'document': './food/Detox_Diets_For_Dummies(...


In [240]:
from sklearn.model_selection import train_test_split

In [251]:
llm = OllamaLLM(model='gemma2:2b')

In [252]:
X_train, X_test, y_train, y_test = train_test_split(synthetic_dataset['question'], synthetic_dataset['answer'], test_size=0.2, random_state=10)

In [253]:
before_finetuning_answers_test = []
for i in tqdm(X_test.values):
    prompt = """Below is an instruction that describes your role, paired with an input that provides question you need to answer. Write a response that appropriately completes the request.
        Instruction: You are a great expert in the field of vegetarianism and recipes for vegans. You can see how recipes are built, which flavor combinations are the most popular.\n
        Input: {question}
        Response: """
    prompt_gemma = ChatPromptTemplate.from_template(prompt)
    chain = prompt_gemma | llm
    answer = chain.invoke({"question": i})
    before_finetuning_answers_test.append(answer.strip())

100%|██████████| 20/20 [14:17<00:00, 42.86s/it]


In [260]:
mean_score = []
for i in range(20):
    reference_embeddings = encoder.encode(y_test.values[i], convert_to_tensor=True)
    generated_embeddings = encoder.encode(before_finetuning_answers_test[i], convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(generated_embeddings, reference_embeddings)
    mean_score.append(cosine_scores.diag().mean().item())

print('Mean score STS: ', np.array(mean_score).mean())

Mean score STS:  0.7004510939121247


In [264]:
for i,v in enumerate(before_finetuning_answers_test):
    print(i)
    print(v)
    print('------------------------------------------------------------------------------------------------')

0
This is a pretty tricky question to answer without more information!  Here's why, and what I need from you to give you the best response:

**Why it's tricky:**

* **No recipe provided:** I don't have access to specific recipes for vegan stuffed roasts. The time will vary drastically depending on the ingredients and style of stuffing.
* **Pressure cooker specifics:** Even with a general recipe, cook times in a pressure cooker are influenced by things like:
    *  The size of the roast
    *  Amount and type of filling (beans, grains, veggies all have different cooking times)
    *  Specific pressure cooker model 

**To give you an accurate answer, please provide:**

* **A link to a recipe:** If possible, find the exact vegan stuffed roast recipe you're interested in. 
* **Details about the filling:** What is the stuffing made of? (e.g., lentils and vegetables, wild rice with mushrooms)


**General Tips for pressure cooking:**

Even without a specific recipe, here's what I can tell you