In [None]:
# pip install nltk scikit-learn  (the PyPI package is `scikit-learn`; `sklearn` is only the import name)
# python -m spacy download en_core_web_sm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

def fixed_size_chunking(content, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    The text is tokenized on whitespace, so runs of spaces, tabs and newlines
    are collapsed to single spaces in the returned chunks.

    Args:
        content: Text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings in document order; empty when ``content``
        contains no words.
    """
    tokens = content.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces



def semantic_chunking(content):
    """Split text into sentence-based chunks using NLTK's sentence tokenizer.

    Sentences are accumulated until one ends with a period, at which point the
    accumulated sentences are joined with a single space and emitted as one
    chunk. Sentences ending in other punctuation (for example ``?`` or ``!``)
    are therefore merged with the sentence(s) that follow them.

    The tokenizer data is downloaded only when NLTK reports it missing, rather
    than contacting the download server on every call.

    Args:
        content: Text to split.

    Returns:
        A list of chunk strings in document order.

    Raises:
        LookupError: If the tokenizer data is still unavailable after the
            download attempt.
    """
    try:
        sentences = nltk.sent_tokenize(content)
    except LookupError:
        # Tokenizer data is not installed yet: fetch it once and retry.
        # NOTE(review): recent NLTK releases look up 'punkt_tab' rather than
        # 'punkt', so both are requested -- confirm against the pinned version.
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        sentences = nltk.sent_tokenize(content)

    paragraphs = []
    current_paragraph = []

    for sent in sentences:
        current_paragraph.append(sent)
        # A trailing period (or a blank sentence) closes the current chunk.
        if sent.endswith('.') or not sent.strip():
            paragraphs.append(" ".join(current_paragraph))
            current_paragraph = []

    # Flush trailing sentences that never reached a closing period.
    if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))

    return paragraphs



def question_based_chunking(content, question, top_k=5):
    """Return the chunks of ``content`` most similar to ``question``.

    The content is split with ``semantic_chunking``; the question and every
    chunk are then embedded with a TF-IDF vectorizer fitted on all of them,
    and chunks are ranked by cosine similarity to the question.

    Args:
        content: Text to split and rank.
        question: Query the chunks are compared against.
        top_k: Maximum number of chunks to return (defaults to 5).

    Returns:
        Up to ``top_k`` chunk strings, most similar first. An empty list is
        returned when the content yields no chunks or ``top_k`` is not
        positive.
    """
    chunks = semantic_chunking(content)

    # Nothing to rank: bail out before building a similarity matrix with no
    # chunk rows, which the similarity call cannot handle.
    if not chunks or top_k <= 0:
        return []

    # Row 0 is the question; rows 1..n line up with ``chunks``.
    tfidf = TfidfVectorizer().fit_transform([question] + chunks)
    vectors = tfidf.toarray()

    cosine_similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

    # argsort is ascending, so reverse it to get the best matches first.
    ranked = cosine_similarities.argsort()[::-1][:top_k]
    return [chunks[i] for i in ranked]

In [None]:
from bs4 import BeautifulSoup

def clean_content(html_content):
    """Extract the text content of an HTML document.

    The markup is parsed with BeautifulSoup using the ``html.parser`` backend
    and the result of ``get_text()`` is returned as-is. No additional
    filtering (ads, navigation, boilerplate) is applied here.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The text extracted from the parsed document.
    """
    return BeautifulSoup(html_content, 'html.parser').get_text()

In [None]:
import requests

def call_firework_api(chunk, question, api_key, timeout=60):
    """Send one chunk and the question to the generation endpoint.

    Args:
        chunk: Text chunk supplied as context.
        question: Question to answer from the chunk.
        api_key: Bearer token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error payload is never mistaken for an answer.
        requests.RequestException: On connection failures or timeouts.
    """
    url = "https://api.firework.com/generate"  # replace with actual API endpoint
    headers = {"Authorization": f"Bearer {api_key}"}
    payload = {
        "chunk": chunk,
        "question": question
    }
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [None]:
import requests

def get_url_content(url, api_key, timeout=30):
    """Fetch the content for ``url`` through the SerpApi search endpoint.

    The target URL and API key are passed as query parameters via ``params``
    so that ``requests`` percent-encodes them; interpolating the raw URL into
    the query string breaks as soon as it contains ``&``, ``?`` or ``#``.

    Args:
        url: Page URL to look up.
        api_key: SerpApi key.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is not handed on as page content.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(
        "https://serpapi.com/search",
        params={"url": url, "api_key": api_key},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.text

In [None]:
from firework_api import call_firework_api
from serp_api import get_url_content
from content_preprocessor import clean_content
from chunking_algorithms import fixed_size_chunking, semantic_chunking, question_based_chunking


def _select_best_answer(answers):
    # Sort answers by relevance and return the most relevant one
    best_answer = max(answers, key=lambda x: x.get('relevance', 0))
    return best_answer['text'] if 'text' in best_answer else "No relevant answer found"

def generate_answer(url, question, api_key_serp, api_key_firework):
    """Answer ``question`` from the content behind ``url``.

    The page is fetched and reduced to text, chunked with all three chunking
    strategies, each distinct chunk is sent to the generation API, and the
    most relevant response is returned.

    Args:
        url: Page to fetch.
        question: Question to answer.
        api_key_serp: Key passed to ``get_url_content``.
        api_key_firework: Key passed to ``call_firework_api``.

    Returns:
        The text chosen by ``_select_best_answer``, or
        ``"No relevant answer found"`` when the page yields no text.
    """
    # Retrieve and clean content
    raw_content = get_url_content(url, api_key_serp)
    clean_text = clean_content(raw_content)

    # Blank pages produce no chunks; stop before chunking/ranking an empty
    # document and before selecting from an empty answer list.
    if not clean_text.strip():
        return "No relevant answer found"

    # Apply chunking methods
    fixed_chunks = fixed_size_chunking(clean_text)
    semantic_chunks = semantic_chunking(clean_text)
    question_chunks = question_based_chunking(clean_text, question)

    # The question-based chunks are drawn from the semantic chunks, so the
    # plain concatenation repeats them. dict.fromkeys drops duplicates while
    # keeping first-seen order, avoiding redundant API calls.
    all_chunks = list(dict.fromkeys(fixed_chunks + semantic_chunks + question_chunks))

    # Send each distinct chunk to the Firework API and collect responses
    answers = [
        call_firework_api(chunk, question, api_key_firework)
        for chunk in all_chunks
    ]

    # Evaluate and select the best answer
    return _select_best_answer(answers)

In [None]:
from rag_tool import generate_answer

def test_chunking_methods(urls, questions, api_key_serp, api_key_firework):
    for url, question in zip(urls, questions):
        answer = generate_answer(url, question, api_key_serp, api_key_firework)
        print(f"URL: {url}, Question: {question}, Answer: {answer}")