In [111]:
import json
import os
from urllib.parse import parse_qs, urlparse

import chromadb
import google.generativeai as genai
import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup
from chromadb import Documents, EmbeddingFunction, Embeddings
from youtube_transcript_api import YouTubeTranscriptApi

In [112]:
# Read the Gemini API key from the environment so the secret never lives in the
# notebook source. The key that used to be hardcoded here has been exposed and
# should be revoked and replaced.
API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "No Gemini API key found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before running this notebook."
    )
genai.configure(api_key=API_KEY)

# FUNCTIONS NEEDED

In [113]:
# embedding function using Gemini API
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to ``genai.embed_content``."""

    def __init__(self):
        # Keep the module-level key on the instance and configure the SDK with it.
        self.api_key = API_KEY
        genai.configure(api_key=self.api_key)

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` with ``models/embedding-001`` and return the vectors.

        The request is always sent with task type ``retrieval_document`` and the
        fixed title ``"Custom query"``; the ``"embedding"`` entry of the
        response is returned unchanged.
        """
        response = genai.embed_content(
            model='models/embedding-001',
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

In [114]:
# create a new Chroma database
def create_chroma_db(documents, name):
    """Create a fresh Chroma collection called ``name`` holding ``documents``.

    Any existing collection with the same name on the client is deleted first,
    so every call starts from an empty collection. Each document is stored
    under its position in ``documents`` (``"0"``, ``"1"``, ...) and embedded
    with ``GeminiEmbeddingFunction``.

    Args:
        documents: Iterable of document strings to index.
        name: Name of the collection to (re)create.

    Returns:
        The newly created collection.
    """
    chroma_client = chromadb.Client()
    embedding_function = GeminiEmbeddingFunction()

    # Depending on the chromadb release, list_collections() yields either
    # Collection objects or plain name strings; accept both shapes.
    existing_names = {
        getattr(collection, "name", collection)
        for collection in chroma_client.list_collections()
    }
    # Ensure the collection is created afresh every session.
    if name in existing_names:
        chroma_client.delete_collection(name)
    db = chroma_client.create_collection(name=name, embedding_function=embedding_function)

    for i, d in enumerate(documents):
        db.add(
            documents=[d],
            ids=[str(i)]
        )
    return db



In [115]:
# get the most relevant passage based on a query
def get_relevant_passage(query, db):
    """Return the single best-matching document for ``query``, or ``None``.

    Args:
        query: Text to search for.
        db: Object exposing ``query(query_texts=..., n_results=...)`` and
            returning a mapping with a ``'documents'`` entry.

    Returns:
        The first document of the first result list, or ``None`` when the
        result holds no documents.
    """
    results = db.query(query_texts=[query], n_results=1)
    documents = results['documents']
    # Both the outer list and the first inner list may be missing or empty.
    if documents and documents[0]:
        return documents[0][0]
    return None




In [116]:
def peek_db(db, n=2):
    """Return whatever ``db.peek(n)`` yields (``n`` defaults to 2)."""
    preview = db.peek(n)
    return preview

In [117]:
# pdf scraping
def scrape_pdf_info(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        A dict with ``'title'`` (the file's base name) and ``'content'``.
        ``'content'`` is the page texts joined by newlines, or a message
        starting with ``"Error reading PDF file:"`` when anything fails.
    """
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # A page with no extractable text must not abort the whole
            # document, so a missing result is treated as an empty string.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        # Separate pages with a newline so the last word of one page does not
        # fuse with the first word of the next.
        content = "\n".join(page_texts)
    except Exception as e:
        content = f"Error reading PDF file: {str(e)}"

    return {
        'title': os.path.basename(file_path),
        'content': content
    }



In [118]:
# website scraping
def scrape_website_info(url, timeout=10):
    """Fetch a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so an
            unresponsive server cannot block the notebook indefinitely.

    Returns:
        A dict with ``'title'`` and ``'content'``. ``'content'`` is the text of
        all ``<p>`` elements joined by single spaces. On any failure, including
        an HTTP error status, ``'title'`` is ``'Error'`` and ``'content'``
        starts with ``"Error scraping the website:"``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of indexing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # The title's string can be None even when a <title> tag exists.
        title = (soup.title.string if soup.title else None) or 'No title'
        content = ' '.join(p.get_text() for p in soup.find_all('p'))
        return {
            'title': title,
            'content': content
        }
    except Exception as e:
        return {
            'title': 'Error',
            'content': f"Error scraping the website: {str(e)}"
        }



In [119]:
# youtube transcript scraping
def _extract_youtube_video_id(video_url):
    """Return the video id found in ``video_url`` or raise ``ValueError``."""
    parsed = urlparse(video_url)

    # Standard watch URLs carry the id in the "v" query parameter; any other
    # parameters (t=, list=, ...) are ignored.
    v_values = parse_qs(parsed.query).get('v')
    if v_values:
        return v_values[0]

    segments = [segment for segment in parsed.path.split('/') if segment]
    host = (parsed.hostname or '').lower()
    # Short links: https://youtu.be/<id>
    if host in ('youtu.be', 'www.youtu.be') and segments:
        return segments[0]
    # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    if len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live', 'v'):
        return segments[1]

    # Last resort for inputs that are not well-formed URLs but contain "v=".
    if 'v=' in video_url:
        return video_url.split('v=')[1].split('&')[0]
    raise ValueError(f"Could not find a video id in URL: {video_url}")


def scrape_youtube_transcript(video_url):
    """Fetch the transcript of a YouTube video as one string.

    Args:
        video_url: A YouTube URL (``watch?v=``, ``youtu.be/``, ``/embed/``,
            ``/shorts/`` ...). Extra query parameters are ignored.

    Returns:
        A dict with ``'title'`` (the URL as given) and ``'content'`` (the
        transcript entries' text joined by single spaces). On any failure,
        ``'title'`` is ``'Error'`` and ``'content'`` starts with
        ``"Error scraping the YouTube video:"``.
    """
    try:
        video_id = _extract_youtube_video_id(video_url)
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        content = ' '.join(entry['text'] for entry in transcript)
        return {
            'title': video_url,
            'content': content
        }
    except Exception as e:
        return {
            'title': 'Error',
            'content': f"Error scraping the YouTube video: {str(e)}"
        }


In [120]:
# summarization using gemini prompt
def summarize_text(content, query):
    """Ask the ``gemini-1.5-flash`` model to answer ``query`` from ``content``.

    The model is instructed to reply with a fixed "Impossible to answer..."
    sentence when the text is not relevant. Any reply containing
    "impossible to answer" (case-insensitive) is normalised to exactly that
    sentence; otherwise the stripped reply is returned.
    """
    no_answer = "Impossible to answer your question based on the provided text."
    prompt = f"""
    Here is some text from a research paper:
    {content}

    Based on this text, please answer the following question as accurately as possible:
    '{query}'
    If the text does not provide information relevant to the question, say "Impossible to answer your question based on the provided text."
    """

    reply = genai.GenerativeModel('gemini-1.5-flash').generate_content(prompt)
    answer = reply.text.strip()
    return no_answer if "impossible to answer" in answer.lower() else answer

In [134]:
# Running list of scraped texts; each source below appends its content here.
documents = list()

In [135]:
# Scrape a text-based PDF and queue its content for indexing.
pdf_file_path = 'examplepaper.pdf'  # Replace with your PDF file path
scraped_data = scrape_pdf_info(pdf_file_path)
if scraped_data['content'].startswith('Error'):
    # Failures come back as an "Error..." message in 'content'; show it.
    print(scraped_data['content'])
else:
    documents.append(scraped_data['content'])
    print("PDF content scraped and added to the list successfully!")



PDF content scraped and added to the list successfully!


In [136]:
# Scrape a website whose information is in its paragraph text.
url = 'https://en.wikipedia.org/wiki/Artificial_intelligence'  # Replace with your URL
scraped_data = scrape_website_info(url)
if scraped_data['content'].startswith('Error'):
    # Failures come back as an "Error..." message in 'content'; show it.
    print(scraped_data['content'])
else:
    documents.append(scraped_data['content'])
    print("Website content scraped and added to the list successfully!")

Website content scraped and added to the list successfully!


In [137]:

# Scrape the captions of any YouTube video that has a transcript.
youtube_url = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'  # Replace with your YouTube video URL
scraped_data = scrape_youtube_transcript(youtube_url)
if scraped_data['content'].startswith('Error'):
    # Failures come back as an "Error..." message in 'content'; show it.
    print(scraped_data['content'])
else:
    documents.append(scraped_data['content'])
    print("YouTube transcript scraped and added to the list successfully!")

YouTube transcript scraped and added to the list successfully!


In [138]:
# Build the vector database from everything collected in `documents`.
db = create_chroma_db(documents=documents, name="scraped_content_db")

In [139]:
# Look at the first three stored records.
db_peek = db.peek(3)
print("Peek structure:", db_peek, sep="\n")

Peek structure:
{'ids': ['0', '1', '2'], 'embeddings': [[-0.03408586606383324, -0.068120576441288, -0.02251533977687359, 0.0047503202222287655, 0.053028229624032974, 0.015221666544675827, 0.019352354109287262, -0.04553069546818733, -0.0035319242160767317, 0.008304882794618607, 0.028453832492232323, 0.019440192729234695, 0.020291930064558983, -0.014979593455791473, 0.02082965523004532, -0.04330810531973839, 0.02417522668838501, 0.012240082956850529, -0.02884220890700817, 0.0031806202605366707, -0.008140042424201965, -0.030458876863121986, 0.06082773581147194, -0.04826107248663902, -0.04235158860683441, -0.014093087986111641, 0.0123619819059968, -0.044229280203580856, -0.08352917432785034, 0.025012407451868057, -0.06131940707564354, 0.04970098286867142, -0.03265230357646942, 0.0046973638236522675, 0.017127204686403275, -0.04926910623908043, -0.012759883888065815, 0.022929999977350235, -0.0025623294059187174, 0.032063376158475876, -0.03631212189793587, -0.04786653816699982, 0.027719428762

In [140]:
# Pair every peeked id with its document text, one record per row.
peek_data = [
    dict(id=doc_id, content=doc_text)
    for doc_id, doc_text in zip(db_peek['ids'], db_peek['documents'])
]

In [141]:
# Tabular view of the peeked records (id + content).
df = pd.DataFrame(peek_data, columns=['id', 'content'])
print("Database Peek DataFrame:", df, sep="\n")

Database Peek DataFrame:
  id                                            content
0  0  Deep Learning Approaches for Sentiment Analysi...
1  1  \n Artificial intelligence (AI), in its broade...
2  2  [Music] we're no strangers to love you know th...


In [142]:
# relevant passage from the database
def get_relevant_passage(query, db):
    """Return the single best-matching document for ``query``, or ``None``.

    Args:
        query: Text to search for.
        db: Object exposing ``query(query_texts=..., n_results=...)`` and
            returning a mapping with a ``'documents'`` entry.

    Returns:
        The first document of the first result list, or ``None`` when the
        result holds no documents.
    """
    results = db.query(query_texts=[query], n_results=1)
    documents = results['documents']
    # An empty result can be shaped like [[]], which is truthy, so the inner
    # list has to be checked as well before indexing into it.
    if not documents or not documents[0]:
        return None
    return documents[0][0]

In [145]:
# Example: retrieve the stored passage closest to a sample question.
query = "summarize what is artifical intelligence"
relevant_passage = get_relevant_passage(query=query, db=db)

In [146]:
# Summarise the retrieved passage, or report that nothing matched.
if not relevant_passage:
    print("No relevant passage found in the database.")
else:
    summary = summarize_text(relevant_passage, query)
    print("Summary:", summary, sep="\n")

Summary:
Artificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particularly computer systems. It's a field of computer science that develops methods and software for machines to perceive their environment, learn, and act intelligently to maximize their chances of achieving defined goals. 

AI technology is widely used across various industries, governments, and scientific domains, with applications ranging from web search engines to autonomous vehicles, and even creative tools like ChatGPT. 

While many AI applications have become so integrated that they aren't perceived as AI anymore, the field itself continues to advance rapidly, with deep learning and generative AI models at the forefront of innovation. 

The ultimate goal of AI is to achieve general intelligence, enabling machines to perform any task humans can at least as well. To achieve this, AI researchers utilize techniques like search and mathematical optimization, formal logic, artificia