<h2 style="color:darkblue;"> <strong>RAG but better - MultiQueryRetriever</strong> 🚀</h2>

In [6]:
import os
import psycopg2
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

from langchain_community.document_loaders.pdf import PyPDFium2Loader

In [7]:
async def load_all_pdfs_from_folder(folder_path: str):
    folder = Path(folder_path)
    
    if not folder.is_dir():
        raise FileNotFoundError(f"The folder {folder_path} does not exist.")

    pdf_files = list(folder.glob("*.pdf"))
    
    if not pdf_files:
        raise FileNotFoundError("No PDF files found in the specified folder.")

    all_pages = []

    for pdf_file in pdf_files:
        loader = PyPDFium2Loader(str(pdf_file))
        
        title = pdf_file.stem
        
        async for page in loader.alazy_load():
            page.metadata['title'] = title
            all_pages.append(page)

    return all_pages


In [8]:
folder_path = "data"
pages = await load_all_pdfs_from_folder(folder_path)

In [9]:
print(type(pages[0]), len(pages))

<class 'langchain_core.documents.base.Document'> 99


In [10]:
{page.metadata.get('title') for page in pages}

{'AI Meets the Classroom: When Does ChatGPT Harm Learning?',
 'Generative AI Can Harm Learning'}

### **Chunk documents**

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

In [12]:
chunks = text_splitter.split_documents(pages)

In [13]:
chunks[0]

Document(metadata={'source': 'data/Generative AI Can Harm Learning.pdf', 'page': 0, 'title': 'Generative AI Can Harm Learning'}, page_content='Generative AI Can Harm Learning\r\nHamsa Bastani,1∗ Osbert Bastani,2∗ Alp Sungu,1∗†\r\nHaosen Ge,3 Ozge Kabakcı, ¨ 4 Rei Mariman\r\n1Operations, Information and Decisions, University of Pennsylvania\r\n2Computer and Information Science, University of Pennsylvania\r\n3Wharton AI & Analytics, University of Pennsylvania\r\n4Budapest British International School\r\n∗These authors (H.B., O.B., A.S.) contributed equally.\r\n†To whom correspondence should be addressed; E-mail: alpsungu@wharton.upenn.edu.\r\nGenerative artificial intelligence (AI) is poised to revolutionize how humans\r\nwork, and has already demonstrated promise in significantly improving hu\x02man productivity. However, a key remaining question is how generative AI af\x02fects learning, namely, how humans acquire new skills as they perform tasks.\r\nThis kind of skill learning is crit

In [18]:
len(chunks)

143

### **Embed chunks**

In [14]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

text = [chunk.page_content for chunk in chunks]

embeddings = model.encode(text)
print(embeddings.shape)

  from tqdm.autonotebook import tqdm, trange


In [29]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI

model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

text = [chunk.page_content for chunk in chunks]
embeddings = model.encode(text)

query = "How students are cheating with the help of AI"

query_embedding = model.encode([query])

similarities = cosine_similarity(query_embedding, embeddings)

# Find the most relevant chunk(s) (e.g., top 5)
top_k = 5 
top_k_indices = np.argsort(similarities[0])[-top_k:][::-1]  # Sorting by similarity

relevant_texts = [text[i] for i in top_k_indices]

context = "\n".join(relevant_texts)

In [30]:
client = OpenAI()

completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": f"Here you have a context that can help you answer the question: {context} The Question: {query} Answer:"
        }
    ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content="Students are increasingly leveraging AI, particularly large language models (LLMs) like ChatGPT, to their advantage in education. While AI can be a helpful tool for understanding difficult concepts and improving learning, it can also facilitate academic dishonesty. Here are some of the ways students might misuse AI for cheating:\n\n1. **Automated Completion of Assignments**: Students can use AI to generate responses or solve problems directly related to their homework or assignments. By inputting prompts or questions into an AI system, the model can generate text or code that students submit without engaging with the material themselves.\n\n2. **Math and Scientific Problem Solving**: In subjects like mathematics or physics, students can input complex problem descriptions into AI tools to get step-by-step solutions or final answers, bypassing the learning process required to solve these problems independently.\n\n3. **Plagiarism and Essay Writing**: Studen

### **Connect to PostgreSQL**

In [None]:
db_name = os.getenv("DB_NAME")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")

# Connect to PostgreSQL
try:
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    print("Connected to the database!")
except Exception as e:
    print(f"Error connecting to the database: {e}")
    exit()

cur = conn.cursor()

try:
    cur.execute("""
        SELECT table_name 
        FROM information_schema.tables 
        WHERE table_schema = 'public'
    """)
    tables = cur.fetchall()
    
    print("\nTables and their columns in the database:")
    for table in tables:
        table_name = table[0]
        print(f"\nTable: {table_name}")
        
        cur.execute(f"""
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = '{table_name}'
        """)
        columns = cur.fetchall()
        
        print("Columns:")
        for column in columns:
            print(f" - {column[0]}")
except Exception as e:
    print(f"Error fetching table or column names: {e}")

cur.close()
conn.close()