In [None]:
import os
from getpass import getpass

# Keep the secret out of the notebook source: reuse a key that is already in
# the environment, and otherwise prompt for it without echoing the input.
# (Assigning a literal here would also overwrite a correctly configured key.)
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

In [None]:
# Configure the `google.generativeai` SDK with the key stored in the
# GOOGLE_API_KEY environment variable (a KeyError is raised if it is unset).
import google.generativeai as genai
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# Printed as soon as configure() returns.
print("API configured successfully")

In [None]:
!pip install -q google-genai

In [None]:
from google import genai
import os

# Build a google-genai client authenticated with the GOOGLE_API_KEY env var.
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

# Smoke test: send one fixed prompt to the model and print the text of the reply.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain machine learning in simple words"
)

print(response.text)

In [None]:
# Print the name of every model returned by the client's model listing.
for m in client.models.list():
    print(m.name)

In [None]:
#Install Libraries
!pip install requests beautifulsoup4 sentence-transformers faiss-cpu openai

In [None]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [None]:
# Step 1: web scraping
url = "https://en.wikipedia.org/wiki/Machine_learning"

headers = {
    "User-Agent": "Mozilla/5.0"
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being indexed as if it were
# the article.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Extract only the main content area
content = soup.find("div", {"id": "mw-content-text"})
if content is None:
    # find() returns None when no matching element exists; fail with a clear
    # message instead of a TypeError on the next line.
    raise RuntimeError(f"Could not find the main content <div> in {url}")

# Remove tags whose text is not part of the article prose
for tag in content(["script", "style", "sup", "table"]):
    tag.decompose()

text = content.get_text(separator=" ")

# Preview the first 1000 characters of the extracted text
print(text[:1000])

In [None]:
# step2::text into chunks
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings in their original order. Every chunk holds
        exactly ``chunk_size`` characters except possibly the last one.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A zero step makes range() fail with an unrelated message, and a negative
    # step silently produces no chunks at all, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Chunk the scraped article text using the default chunk size.
chunks = chunk_text(text)

print("Total chunks:", len(chunks))
# Indexing chunks[0] requires that the scraped text was non-empty.
print("First chunk preview:\n", chunks[0])

In [None]:
# Step 3: generate embeddings

# Load the "all-MiniLM-L6-v2" sentence-transformers model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert chunks into embeddings; the next cell reads shape[1] of the result
# as the vector dimension.
embeddings = model.encode(chunks)

print("Embedding shape:", embeddings.shape)

In [None]:
# Step 4: create the FAISS vector database
# The vector dimension is the second axis of the embeddings array.
dimension = embeddings.shape[1]

# Create a flat FAISS index that compares vectors by L2 distance
index = faiss.IndexFlatL2(dimension)

# Add embeddings to index
index.add(np.array(embeddings))

# ntotal is the number of vectors currently held by the index.
print("Total vectors stored in FAISS:", index.ntotal)

In [None]:
# Step 5: ask a question and retrieve the most similar chunks
question = "What is machine learning?"

# Convert question to embedding (encode() takes a list, giving one row per query)
question_embedding = model.encode([question])

# Retrieve top 3 similar chunks
k = 3
distances, indices = index.search(np.array(question_embedding), k)

# FAISS pads the result with -1 when the index holds fewer than k vectors.
# Skip those slots so that -1 is never used as a Python index, which would
# silently return the last chunk.
retrieved_chunks = [chunks[i] for i in indices[0] if i >= 0]

print("Retrieved Chunks:\n")

for i, chunk in enumerate(retrieved_chunks):
    print(f"Chunk {i+1}:\n")
    print(chunk[:500])
    print("\n--------------------\n")

In [None]:
!pip install google-generativeai

In [None]:
# Echo the question that the generation cell below will answer.
print(question)

In [None]:
# Step 6: connect to Gemini and answer the question from the retrieved chunks
# NOTE: this import rebinds the name `genai`, which an earlier cell bound with
# `from google import genai`.
import google.generativeai as genai
import os

# Configure API with the key from the GOOGLE_API_KEY environment variable
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Load model
model_llm = genai.GenerativeModel("gemini-2.5-flash")

# Combine retrieved chunks into one space-separated context string
context = " ".join(retrieved_chunks)

# The prompt instructs the model to answer only from the retrieved context.
prompt = f"""
Answer the question using ONLY the context below.
If the answer is not in the context, say you don't know.

Context:
{context}

Question:
{question}
"""

response = model_llm.generate_content(prompt)

print("Final Answer:\n")
print(response.text)

In [None]:
# List the retrieved chunks by position only; the chunk text itself is not
# printed here (the loop variable `chunk` is unused).
print("Sources:")
for i, chunk in enumerate(retrieved_chunks):
    print(f"Source chunk {i+1}")

In [None]:
# Day 2:: full Retrieval-Augmented Generation (RAG) pipeline

In [None]:
# step1::install Required Dependencies
!pip install -q langchain langchain-community langchain-text-splitters faiss-cpu pypdf google-generativeai

In [None]:
import os
import faiss
import numpy as np
import google.generativeai as genai

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

In [None]:
from getpass import getpass

# Do not overwrite a key that is already configured, and never store the real
# key in the notebook: prompt for it (without echo) only when it is missing.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

print("API configured successfully")

In [None]:
# Step 2: load the PDF
# The path is hard-coded; the file must exist at /content/sample.pdf before
# this cell runs.
loader = PyPDFLoader("/content/sample.pdf")
documents = loader.load()

# The label below treats each loaded document as one page.
print("Number of pages loaded:", len(documents))

In [None]:
# Step 3: split the loaded documents into text chunks
# 800 and 100 are the splitter's chunk size and overlap settings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100
)

# NOTE: this rebinds `chunks`. Earlier it held plain strings from chunk_text();
# from here on later cells read `.page_content` from each element.
chunks = text_splitter.split_documents(documents)

print("Number of chunks created:", len(chunks))

In [None]:
# step4:: Install  Google GenAI SDK
!pip install -q google-genai

In [None]:
# Step 5:: Configure Gemini API Client and Embedding Model
from google import genai
import numpy as np
import os

# Set API key: keep any key already present in the environment and prompt
# (without echo) only when none is set, so no secret lives in the notebook.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

# Create client, passing the key explicitly like the first client cell does
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

# Embedding model used for both document chunks and queries
embedding_model = "models/gemini-embedding-001"

def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``client.models.embed_content`` with the module-level
    ``embedding_model`` and returns the ``values`` of the first embedding in
    the response.
    """
    result = client.models.embed_content(contents=text, model=embedding_model)
    first_embedding = result.embeddings[0]
    return first_embedding.values

In [None]:
# Step 6: generate embeddings for all document chunks
chunk_embeddings = []

# One get_embedding() call per chunk, in chunk order.
for chunk in chunks:
    emb = get_embedding(chunk.page_content)
    chunk_embeddings.append(emb)

# Convert the list of vectors to a float32 array; the FAISS cell reads
# shape[1] of this array as the vector dimension.
chunk_embeddings = np.array(chunk_embeddings).astype("float32")

print("Embedding shape:", chunk_embeddings.shape)

In [None]:
# Step 7: create the FAISS vector index for semantic search
import faiss

# The vector dimension is the second axis of the embedding array.
dimension = chunk_embeddings.shape[1]

# NOTE: this rebinds `index`, replacing the index built over the Wikipedia chunks.
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

print("Total vectors stored in FAISS:", index.ntotal)

In [None]:
#Step 8:: Implement Semantic Retrieval Function
def retrieve(query, k=3):
    """Return the text of the stored chunks most similar to ``query``.

    Args:
        query: Natural-language question to embed and search for.
        k: Maximum number of chunks to return.

    Returns:
        A list of at most ``k`` ``page_content`` strings, in the order the
        FAISS search returned them. Fewer than ``k`` are returned when the
        index holds fewer than ``k`` vectors.
    """
    # Create query embedding with the same model used for the chunks
    query_embedding = client.models.embed_content(
        model=embedding_model,
        contents=query
    ).embeddings[0].values

    # Wrap in a list so the search receives a 2-D float32 array (one row per query).
    query_embedding = np.array([query_embedding]).astype("float32")

    # Search in FAISS
    distances, indices = index.search(query_embedding, k)

    print("Retrieved chunk indices:", indices)

    # FAISS fills missing neighbours with -1 when fewer than k vectors are
    # stored. Drop them: chunks[-1] would silently return the last chunk.
    retrieved_texts = [chunks[i].page_content for i in indices[0] if i >= 0]

    return retrieved_texts

In [None]:
# Try the retriever on a sample question; retrieve() also prints the matched indices.
results = retrieve("What is the objective of the proposed model?")

In [None]:
# Step 9:: Implement Retrieval-Augmented Answer Generation
def generate_answer(query):
    """Answer ``query`` with the LLM, grounded in retrieved chunks.

    Calls ``retrieve(query)``, joins the returned texts with blank lines into
    a context block, asks the "models/gemini-2.5-flash" model to answer from
    that context only, and returns the text of the model's response.
    """
    passages = retrieve(query)
    context = "\n\n".join(passages)

    prompt = f"""
Answer the question using ONLY the context below.
If the answer is not in the context, say you don't know.

Context:
{context}

Question:
{query}
"""

    return client.models.generate_content(
        contents=prompt,
        model="models/gemini-2.5-flash",
    ).text

In [None]:
# Step 10: execute the full RAG pipeline with a user question
# NOTE: this rebinds `question`, which earlier cells also set.
question = "What are the future works suggested?"

answer = generate_answer(question)

print("Final Answer:\n")
print(answer)

In [None]:
# Day 3 progress:: structured and unstructured data handling

In [None]:
import pandas as pd

# Step 1: create a sample structured dataset (three models, one row each)
data = {
    "Model": ["CNN-A", "CNN-B", "CNN-C"],
    "Accuracy": [91.2, 94.8, 88.5],
    "Dataset": ["HAM10000", "ISIC", "HAM10000"]
}

df = pd.DataFrame(data)

# Save as CSV (index=False leaves the row index out of the file)
df.to_csv("structured_data.csv", index=False)

print("Sample CSV created successfully!")
# A bare last expression makes the notebook render the frame.
df

In [None]:
# Step 2: convert rows to text format
structured_texts = []

# Each row becomes one "col: value | col: value | ..." string, with columns in
# DataFrame order.
for _, row in df.iterrows():
    row_text = " | ".join([f"{col}: {row[col]}" for col in df.columns])
    structured_texts.append(row_text)

print("Example structured text:")
print(structured_texts[0])

In [None]:
# Step 3: generate structured embeddings
structured_embeddings = []

# One get_embedding() call per row text, in row order.
# NOTE: the loop variable rebinds `text`, which held the scraped article earlier.
for text in structured_texts:
    emb = get_embedding(text)
    structured_embeddings.append(emb)

# Convert to a float32 array; the next cell reads shape[1] as the vector dimension.
structured_embeddings = np.array(structured_embeddings).astype("float32")

print("Structured Embedding Shape:", structured_embeddings.shape)

In [None]:
# Step 4: create a FAISS index for the structured data
import faiss

# The vector dimension is the second axis of the embedding array.
structured_dimension = structured_embeddings.shape[1]

# Kept separate from `index`, which holds the PDF chunk vectors.
structured_index = faiss.IndexFlatL2(structured_dimension)
structured_index.add(structured_embeddings)

print("Structured vectors stored:", structured_index.ntotal)

In [None]:
# step5::Query Type Detection
def detect_query_type(query):
    """Classify ``query`` as "structured" or "unstructured" by keyword.

    The check is a case-insensitive substring test: the query is "structured"
    as soon as it contains any of the trigger phrases below, otherwise it is
    "unstructured".
    """
    triggers = (
        "accuracy", "percentage",
        "highest", "lowest",
        "greater than", "less than",
        "maximum", "minimum",
    )
    lowered = query.lower()

    if any(trigger in lowered for trigger in triggers):
        return "structured"
    return "unstructured"

In [None]:
def structured_answer(query):
    """Answer a highest/lowest accuracy question from the global ``df``.

    The query is lower-cased and must mention "accuracy" together with
    "highest" or "lowest" ("highest" wins when both appear). The matching row
    of ``df`` is reported as "<Model> has the ... accuracy of <Accuracy>%".
    Any other query returns "I don't know.".
    """
    lowered = query.lower()
    unknown = "I don't know."

    # Both supported questions must mention accuracy.
    if "accuracy" not in lowered:
        return unknown

    if "highest" in lowered:
        best = df.loc[df["Accuracy"].idxmax()]
        return f"{best['Model']} has the highest accuracy of {best['Accuracy']}%"

    if "lowest" in lowered:
        worst = df.loc[df["Accuracy"].idxmin()]
        return f"{worst['Model']} has the lowest accuracy of {worst['Accuracy']}%"

    return unknown

In [None]:
# step6::Hybrid Retrieve Function
def hybrid_retrieve(query, k=3):
    """Return context strings for ``query`` from the table or the vector index.

    Queries that ``detect_query_type`` classifies as "structured" are answered
    directly by ``structured_answer``; all others are embedded and matched
    against the FAISS ``index`` of document chunks.

    Args:
        query: Natural-language question.
        k: Maximum number of chunks to return for unstructured queries.

    Returns:
        A list of strings: a single computed answer for structured queries,
        otherwise at most ``k`` chunk texts in the order FAISS returned them.
    """
    query_type = detect_query_type(query)

    if query_type == "structured":
        print("Query Type: Structured")
        # Directly return structured answer instead of retrieval
        return [structured_answer(query)]

    else:
        print("Query Type: Unstructured")

        query_embedding = client.models.embed_content(
            model=embedding_model,
            contents=query
        ).embeddings[0].values

        # Wrap in a list so the search receives a 2-D float32 array (one row per query).
        query_embedding = np.array([query_embedding]).astype("float32")

        distances, indices = index.search(query_embedding, k)
        # FAISS pads with -1 when fewer than k vectors are stored. Drop those
        # slots: chunks[-1] would silently return the last chunk.
        retrieved = [chunks[i].page_content for i in indices[0] if i >= 0]

        return retrieved

In [None]:
# step7::Generate Final Answer
def generate_hybrid_answer(query):
    """Answer ``query`` with the LLM using context from ``hybrid_retrieve``.

    The strings returned by ``hybrid_retrieve(query)`` are joined with blank
    lines into a context block. The "models/gemini-2.5-flash" model is asked
    to answer from that context only, and the text of its response is returned.
    """
    context_parts = hybrid_retrieve(query)
    context = "\n\n".join(context_parts)

    prompt = f"""
Answer the question using ONLY the context below.
If numerical values are present, use exact values.
If answer is not in context, say you don't know.

Context:
{context}

Question:
{query}
"""

    return client.models.generate_content(
        contents=prompt,
        model="models/gemini-2.5-flash",
    ).text

In [None]:
# Step 8: test a structured query ("highest" and "accuracy" route it to the table lookup)
question = "Which model has highest accuracy?"

answer = generate_hybrid_answer(question)

print("Final Answer:\n")
print(answer)

In [None]:
# Step 9: test an unstructured query (no structured keyword, so it is answered from the FAISS index)
question = "What is the objective of the proposed model?"

answer = generate_hybrid_answer(question)

print("Final Answer:\n")
print(answer)

In [None]:
# Colab-only: open the browser upload dialog. The next cell expects
# amazon.csv.zip to be among the uploaded files.
from google.colab import files
uploaded = files.upload()



In [None]:
# Extract the uploaded archive; the following cells read amazon.csv from the
# working directory.
!unzip amazon.csv.zip

In [None]:
import pandas as pd

# NOTE: this rebinds `df`, replacing the sample accuracy table built earlier.
df = pd.read_csv("amazon.csv")
# Preview the first rows.
df.head()

In [None]:
import pandas as pd
import numpy as np

# ---------------- LOAD CSV ----------------
# Re-reads the same file as the preview cell, so this cell starts from the raw
# (uncleaned) columns each time it runs.
df = pd.read_csv("amazon.csv")

print("CSV Loaded Successfully")
print("Columns:", df.columns.tolist())


# ---------------- CLEAN NUMERIC COLUMNS ----------------
def clean_numeric_column(col):
    """Return column ``col`` of the global ``df`` parsed as numbers.

    The values are converted to strings, currency/thousands/percent symbols
    are removed, and the result is parsed with ``pd.to_numeric``.

    Args:
        col: Name of a column in ``df``.

    Returns:
        A numeric Series aligned with ``df``. Values that still cannot be
        parsed after cleaning (including missing values) become NaN.
    """
    cleaned = df[col].astype(str)

    # Strip the rupee sign both in its correct form (U+20B9) and in the
    # mojibake form that appears when its UTF-8 bytes were decoded as cp1252.
    # Handling only one form turns every price into NaN for the other.
    rupee = "\u20b9"
    rupee_mojibake = "\u00e2\u201a\u00b9"
    for symbol in (rupee, rupee_mojibake, ",", "%"):
        cleaned = cleaned.str.replace(symbol, "", regex=False)

    return pd.to_numeric(cleaned.str.strip(), errors="coerce")

# Columns to convert when they exist in the CSV; names that are not present
# are skipped by the check in the loop.
possible_numeric_cols = [
    "discounted_price",
    "actual_price",
    "discount_percentage",
    "rating",
    "rating_count"
]

for col in possible_numeric_cols:
    if col in df.columns:
        # Replaces the column on the global df with its numeric version.
        df[col] = clean_numeric_column(col)

print("\nNumeric conversion done.")
print(df.dtypes)


# ---------------- QUERY TYPE DETECTION ----------------
def detect_query_type(query):
    """Classify ``query`` as "structured" or "unstructured" by keyword.

    The check is a case-insensitive substring test, so a keyword also matches
    inside a longer word (for example "count" inside "discount"). The query is
    "structured" when any keyword occurs, otherwise "unstructured".
    """
    lowered = query.lower()
    keywords = (
        "highest", "lowest", "maximum", "minimum",
        "greater than", "less than", "average",
        "price", "rating", "count",
    )

    if any(keyword in lowered for keyword in keywords):
        return "structured"
    return "unstructured"


# ---------------- STRUCTURED ANSWER LOGIC ----------------
def structured_answer(query):
    """Answer a highest/lowest/average question from the global ``df``.

    The query is lower-cased, mapped to one numeric column by keyword, and
    then answered according to the aggregate word it contains
    ("highest"/"maximum", "lowest"/"minimum", or "average").

    Args:
        query: Natural-language question about the dataset.

    Returns:
        A sentence with the answer, "No numeric columns found in dataset."
        when ``df`` has no numeric columns, or "I don't know." when no column
        or aggregate is recognised or the selected column has no values.
    """
    query = query.lower()

    # "number" accepts every numeric dtype, not only the 64-bit ones.
    numeric_cols = df.select_dtypes(include="number").columns

    if len(numeric_cols) == 0:
        return "No numeric columns found in dataset."

    # Map simple words to actual column names. Order matters: the multi-word
    # phrases come first, because "actual price" also contains "price" and
    # "rating count" also contains "rating", so the shorter keys would win
    # and select the wrong column. The remaining keys keep their original
    # relative order.
    column_map = {
        "actual price": "actual_price",
        "rating count": "rating_count",
        "price": "discounted_price",
        "discount": "discount_percentage",
        "rating": "rating",
        "count": "rating_count"
    }

    selected_column = None

    for key in column_map:
        if key in query and column_map[key] in numeric_cols:
            selected_column = column_map[key]
            break

    if selected_column is None:
        return "I don't know."

    # idxmax/idxmin cannot pick a row from a column holding only NaN (which
    # happens when the numeric conversion failed for every value).
    values = df[selected_column].dropna()
    if values.empty:
        return "I don't know."

    # Highest
    if "highest" in query or "maximum" in query:
        row = df.loc[values.idxmax()]
        return f"{row['product_name']} has the highest {selected_column} of {row[selected_column]}"

    # Lowest
    if "lowest" in query or "minimum" in query:
        row = df.loc[values.idxmin()]
        return f"{row['product_name']} has the lowest {selected_column} of {row[selected_column]}"

    # Average
    if "average" in query:
        avg = values.mean()
        return f"The average {selected_column} is {round(avg, 2)}"

    return "I don't know."


# ---------------- HYBRID ANSWER ----------------
def generate_csv_rag_answer(query):
    """Route ``query`` to the CSV lookup when it is a structured question.

    Prints the detected query type. Structured queries are answered by
    ``structured_answer``; any other query gets a fixed "not supported" message.
    """
    qtype = detect_query_type(query)
    print("Query Type:", qtype)

    # Only structured questions can be answered from the CSV.
    if qtype != "structured":
        return "Unstructured queries not supported in this CSV-only demo."
    return structured_answer(query)


# ---------------- TEST ----------------
# "highest" and "price" both count as structured keywords, so this query is
# answered from the CSV.
question = "Which product has highest price?"
answer = generate_csv_rag_answer(question)

print("\nFinal Answer:\n")
print(answer)