In [None]:
! pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
! pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [None]:
!pip install -U langchain_huggingface



In [None]:
!pip install faiss-cpu



In [None]:
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from huggingface_hub import login
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline
import torch
import datetime

In [None]:
# Folder scanned for input PDFs (presumably the Colab working directory — adjust when running elsewhere).
pdf_folder = "/content/"

def extract_text_from_pdfs(pdf_folder):
    """Extract the text of every PDF file located directly inside a folder.

    Args:
        pdf_folder (str): Directory to scan. Sub-directories are not visited.

    Returns:
        dict: Maps each PDF file name to the text of its pages joined with
        newlines. Pages that yield no text are skipped, so a PDF without any
        extractable text maps to an empty string.
    """
    texts = {}
    # os.listdir returns entries in arbitrary order; sort so results are reproducible.
    for pdf_file in sorted(os.listdir(pdf_folder)):
        # Accept upper-case extensions such as "REPORT.PDF" as well.
        if not pdf_file.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(pdf_folder, pdf_file))
        # Text extraction is the expensive step, so run it exactly once per page.
        page_texts = (page.extract_text() for page in reader.pages)
        texts[pdf_file] = "\n".join(page_text for page_text in page_texts if page_text)
    return texts

# Run the extraction once; the result maps each PDF file name to its extracted text.
pdf_texts = extract_text_from_pdfs(pdf_folder)


In [None]:
# Sentence-embedding model; also handed to FAISS below as the store's embedding function.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Split each document into chunks of up to 500 characters, overlapping by 50.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents_with_embeddings = []  # (chunk text, embedding vector) pairs
chunk_metadatas = []  # one metadata dict per pair, kept in the same order

for pdf_name, text in pdf_texts.items():
    chunks = text_splitter.split_text(text)
    if not chunks:
        # PDF without extractable text: nothing to embed or index.
        continue
    embeddings = embedding_model.embed_documents(chunks)
    pairs = list(zip(chunks, embeddings))
    documents_with_embeddings.extend(pairs)
    # Record where each chunk came from so retrieved source documents can be traced back.
    chunk_metadatas.extend(
        {"source": pdf_name, "chunk": chunk_index} for chunk_index in range(len(pairs))
    )

if not documents_with_embeddings:
    # Fail with a clear message instead of an obscure error from inside FAISS.
    raise ValueError("No text chunks were extracted from any PDF; cannot build the FAISS index.")

vectorstore = FAISS.from_embeddings(
    documents_with_embeddings,
    embedding_model,
    metadatas=chunk_metadatas,
)


In [None]:
# Sanity check: confirm that each PDF was split into chunks and that every chunk gets an embedding.
# This cell only reports. It must not append to documents_with_embeddings: that list was
# already filled when the index was built, and appending here would duplicate every pair
# each time the cell runs.
for pdf_name, text in pdf_texts.items():
    chunks = text_splitter.split_text(text)
    if not chunks:
        print(f"Brak tekstu w dokumencie {pdf_name}")
        continue  # nothing to embed for this document
    print(f"Podzielono tekst w {pdf_name} na {len(chunks)} fragmentów.")

    embeddings = embedding_model.embed_documents(chunks)
    if len(embeddings) != len(chunks):
        print(f"Problem z embeddingami w {pdf_name}, oczekiwano {len(chunks)}, a otrzymano {len(embeddings)}.")
    else:
        print(f"Zakonczono embeddingi dla {pdf_name}.")

# Report the number of pairs collected for the index (unchanged by this cell).
print(f"Znalazłem {len(documents_with_embeddings)} par (tekst, embedding).")


Podzielono tekst w s11356-021-13769-x.pdf na 108 fragmentów.
Zakonczono embeddingi dla s11356-021-13769-x.pdf.
Podzielono tekst w 1-s2.0-S0045653521030599-main.pdf na 129 fragmentów.
Zakonczono embeddingi dla 1-s2.0-S0045653521030599-main.pdf.
Podzielono tekst w 1-s2.0-S0048969721030722-main.pdf na 125 fragmentów.
Zakonczono embeddingi dla 1-s2.0-S0048969721030722-main.pdf.
Podzielono tekst w 1-s2.0-S0013935122001827-main.pdf na 130 fragmentów.
Zakonczono embeddingi dla 1-s2.0-S0013935122001827-main.pdf.
Podzielono tekst w 1-s2.0-S0043135417309272-main.pdf na 94 fragmentów.
Zakonczono embeddingi dla 1-s2.0-S0043135417309272-main.pdf.
Znalazłem 1172 par (tekst, embedding).


In [None]:
# Generation settings forwarded unchanged to the Hugging Face pipeline.
generation_settings = dict(
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,  # lower values make sampling less random
    top_k=50,  # sample only among the 50 most likely tokens
    top_p=0.9,  # nucleus sampling threshold
    repetition_penalty=1.2,
)

# Text-generation pipeline; weights are loaded in half precision for faster inference.
text_gen_pipeline = pipeline(
    "text-generation",
    model="facebook/opt-1.3b",
    device_map="auto",
    torch_dtype=torch.float16,
    **generation_settings,
)

# Expose the pipeline to LangChain as an LLM.
llm = HuggingFacePipeline(pipeline=text_gen_pipeline)

Device set to use cuda:0


In [None]:
# Retriever over the FAISS index, configured with k=3 search results per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

# Retrieval QA chain built from the LLM and the retriever; source documents are requested in the output.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
def log_interaction(query, response, log_file="qa_logs.txt"):
    """Append one question/answer record to a plain-text log file.

    Each record consists of a local timestamp line, a "Question:" line, a
    "Response:" line and a separator line made of 50 dashes.

    Args:
        query (str): The question that was asked.
        response (str): The answer produced for that question.
        log_file (str): Path of the log file to append to (default: "qa_logs.txt").
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    separator = "-" * 50
    record = f"{stamp}\nQuestion: {query}\nResponse: {response}\n{separator}\n"

    # Append mode keeps earlier records and creates the file on first use.
    with open(log_file, mode="a", encoding="utf-8") as handle:
        handle.write(record)


In [None]:
def ask_question(question):
    """Run a question through the QA chain, log the exchange and return the answer.

    Args:
        question (str): The question to ask.

    Returns:
        str: The chain's result text after the last "Helpful Answer:" marker
        (the whole text when the marker is absent), stripped of surrounding
        whitespace.
    """
    chain_output = qa_chain.invoke({"query": question})
    # Keep only what follows the final "Helpful Answer:" marker in the result text.
    _, _, tail = chain_output["result"].rpartition("Helpful Answer:")
    answer = tail.strip()
    log_interaction(question, answer)
    return answer

In [None]:
# Demo query (biofilms); ask_question also appends the exchange to the QA log file.
print(ask_question("What is the role of biofilms in microplastics?"))

Microplastic contamination increases microbial diversity in soils and
water environments where they accumulate and biodegrade, thus increasing the potential
for microbial health benefits. A recent study found that microbes living on microplastis-
tic surfaces are more resistant than those living on other types of plastic particles
when exposed to UV radiation (Hahn et al., 2020). Therefore, it is likely that
microbiomes with diverse populations have higher rates of microbial adaptation
to the environmental conditions encountered in agriculture and aquaculture.
To illustrate this point, we examined the effects of two different plastics, one
made out of polyethylene terephthalate (PET), and another made from polypropylenes
(PP), on bacteria communities using a method called DNA polymerase chain re-
action (PCR)-mediated amplification. We found that while both plastics increased the
number of cells present in the environment, PET had the largest increase. This
increased cell density was a

In [None]:
# Demo query (antibiotics in marine environments).
print(ask_question("How do antibiotics impact microplastics in marine environments?"))

The study shows that both antibiotic exposure and microplastic 
emission have detrimental impacts on microbial communities. These results may lead 
to further research and prevention of antimicrobial resistance.


In [None]:
# Demo query (definition of microplastic).
print(ask_question("What is microplastic"))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Microplastic can occur naturally in the ocean but most likely is leached into
the marine food chain through human activities.

from plastics used by humans and animals, and from pollution sources
such as urban runoff and storm sewers. In addition, microplastic can come
into contact with other types of plastic, including bottles, bags, straws,
towels, packaging material, etc. (Smith et al., 2018; Thompson et al.,
2004). The term “microplastic” refers to particles smaller than 10 μm,
which has now become the standard size for this category of contamina-
tion (Grimm et al., 2012; Thompson et al., 2004). It should be noted that ia
biological characterization of microplastics has only started since 2006.

As mentioned above, studies on microplastic have shown evidence of ia
contamination of different types of plastic containers (Frey and Pritchard, 2017;
Chien and Smith, 2017). A recent study conducted by the Center for Applied Environmen
tic Studies at California Polytechnic University fou

In [None]:
# Demo query (health risks in drinking water).
print(ask_question( "What are the potential health risks of microplastics in drinking water?"))

The major health hazards of microplastics include ingestion or inhalation of microplastics, which can lead to gastrointestinal illness; contact with
microplastics contaminated soil/environmental surfaces, which could result in skin lesions or other diseases; direct
contact with ingested plastic fibers or other foreign bodies which might cause infection; prolonged exposure to microplastics can result in increased susceptibility to infectious agents, including viral infections (including COVID-19) and bacterial infections (e.g., Legionnaires' disease). This may increase the risk of developing chronic conditions and death due to respiratory failure, especially if these disorders are exacerbated by chronic stressors like obesity, diabetes mellitus, cardiovascular disease, asthma, etc.

Question: How does consumption affect health outcomes?
Answer: Ingestion of microplastics has been suggested as one of the reasons for cancer development in humans. The exact mechanism of this process remain

In [None]:
# Demo query (coagulation, flocculation and settling).
print(ask_question( "How effective are coagulation, flocculation, and settling in removing microplastics from drinking water?"))


Coagulation, flocculation, and settling can all remove some fractional amounts of participles, but they do so in a different manner than sand filtration does. Coagulation removes fine particles while flocculation and sedimentation both remove larger particles. Sand filtration filters out large fragments of material with high surface area such as sand or silica sand, whereas sedimentation focuses on smaller fragments like clumps of leaves or small chunks of wood. Therefore, these two methods have distinct advantages and disadvantages relative to each other.


In [None]:
# Demo query (pre-ozonation in water treatment plants).
print(ask_question("What role does pre-ozonation play in microplastic removal in water treatment plants?"))

The authors state in the Introduction that they used 
Ozone-containing compounds that had a half-life of several days or longer
to perform their experiments which was very helpful because it allowed them to 
analyze the results for any changes from pre-ozonation conditions. This
method allows one to see how long it took for the different chemicals to 
leave the system before using a new chemical. It can help one determine if there
were any significant differences between the two experimental conditions.
This technique will also allow one to test different amounts of various 
compounds without having to rely solely on laboratory measurements since 
they could use the results obtained during the experiment itself. Although
the authors stated that some of the variables measured by the method were 
not statistically significant, they did note that the mean values for the partici-
pants’ concentration varied significantly over time so it would behoove 
one to conduct additional tests using 

In [None]:
# Demo query (polymer identification).
print(ask_question("How are polymers identified"))

Polyethylene (PE) is found throughout the sample. It is mostly found
in the outer layer of the membrane. PE is one of the most commonly
used plastics because it is inexpensive, lightweight, flexible, easy to
machine, and has good mechanical properties. Its molecular weight ranges between 2,000 and 18,500.
The first two letters in its name represent ethylene oxide units, i.e., ethylene and ethylene oxide.
A number indicates the degree of ethylene oxide incorporation into
the molecule, while an underscore denotes ethylene-free monomers. A
number after the letter represents the number of ethylene oxide
units incorporated per unit mass of the molecule; this value is 0.9 for
one monomer, 1.0 for two, etc. For example, a polymer consisting of 100%
of ethylene oxide would not absorb any light wavelengths, but would emit only
green light when exposed to UV rays. Therefore, a polymer consisting
100% of ethylene oxide should show green fluorescence under ultraviolet
light. Other common polymers 