In [None]:
!python -m pip install aiohttp==3.9.5 beautifulsoup4==4.12.3 faiss_cpu==1.8.0 mistralai==0.4.0 nest_asyncio==1.6.0 numpy==1.26.4 pandas==2.2.2 python-dotenv==1.0.1 requests==2.32.3

In [4]:
import os 

# Read the Mistral API key from the environment (None when the variable is unset).
# The constant's spelling is kept as-is because a later cell reads it by this name.
MISTRALL_API_KEY = os.environ.get('MISTRAL_API_KEY')

## Scraper Definitions

In [None]:
import aiohttp 
import asyncio 
import nest_asyncio
from bs4 import BeautifulSoup 
from concurrent.futures import ThreadPoolExecutor 
import requests 
import re 
import pandas as pd 
import faiss 
import numpy as np 
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# Patch asyncio via nest_asyncio before any coroutine in this notebook is run.
nest_asyncio.apply()

# --- Tunables -------------------------------------------------------------
# Number of search hits to collect before the search loop stops.
total_results_to_fetch = 10
# Size threshold handed to split_text_into_chunks.
chunk_size = 1000

# --- Output artefacts -----------------------------------------------------
dataframe_out_path = "temp.csv"
faiss_index_path = "faiss_index.index"

# --- HTTP -----------------------------------------------------------------
# Sent with every outgoing request (search pages and result pages alike).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

# --- Mistral client -------------------------------------------------------
mistral_api_key = MISTRALL_API_KEY
client = MistralClient(api_key=mistral_api_key)

async def fetch(session, url, params=None):
    """GET ``url`` through ``session`` and return the response body as text.

    Args:
        session: Object whose ``get(...)`` returns an async context manager
            yielding the response (an ``aiohttp.ClientSession`` in this notebook).
        url: Address to request.
        params: Optional query parameters forwarded to ``session.get``.

    Returns:
        The result of awaiting ``response.text()``.
    """
    request = session.get(url, params=params, headers=headers)
    async with request as response:
        body = await response.text()
    return body

async def fetch_page(session, params, page_num, results):
    """Fetch one Google results page and append its hits to ``results``.

    Args:
        session: HTTP session handed on to ``fetch``.
        params: Query parameters for the search request. Must contain ``"num"``;
            its ``"start"`` entry is overwritten in place with this page's offset.
        page_num: 1-based page number; the offset is ``(page_num - 1) * params["num"]``.
        results: List that receives ``{"title": ..., "link": ...}`` dicts. Appending
            stops once it holds ``total_results_to_fetch`` entries.

    Returns:
        None. ``results`` (and ``params["start"]``) are mutated in place.
    """
    print(f"Fetching page: {page_num}")
    params["start"] = (page_num - 1) * params["num"]
    html = await fetch(session, "https://www.google.com/search", params=params)
    soup = BeautifulSoup(html, "html.parser")

    for result in soup.select(".tF2Cxc"):
        if len(results) >= total_results_to_fetch:
            break

        title_tag = result.select_one(".DKV0Md")
        link_tag = result.select_one(".yuRUbf a")
        # select_one returns None when nothing matches; skip incomplete hits
        # instead of crashing the whole page on .text / ["href"].
        if title_tag is None or link_tag is None or not link_tag.get("href"):
            continue

        results.append({
            "title": title_tag.text,
            "link": link_tag["href"]
        })

async def fetch_content(session, url):
    """Download ``url`` through ``session`` and return the body as text.

    The request carries the module-level ``headers`` and a timeout value of 30.

    Args:
        session: Object whose ``get(...)`` returns an async context manager
            yielding the response.
        url: Address to download.

    Returns:
        The result of awaiting ``response.text()``.
    """
    request = session.get(url, headers=headers, timeout=30)
    async with request as response:
        page_text = await response.text()
    return page_text
    
async def fetch_all_content(urls):
    """Download every URL in ``urls`` concurrently over one shared session.

    Args:
        urls: Iterable of addresses passed one by one to ``fetch_content``.

    Returns:
        List of page bodies in the same order as ``urls``.
    """
    async with aiohttp.ClientSession() as session:
        pending = []
        for url in urls:
            pending.append(fetch_content(session, url))
        bodies = await asyncio.gather(*pending)
    return bodies
    
def get_all_text_from_url(url):
    """Download ``url`` synchronously and return its visible text.

    ``<script>`` and ``<style>`` elements are removed before text extraction so
    that JavaScript and CSS source do not end up in the result. The remaining
    text is normalised: each line is stripped, split on double spaces, and the
    non-empty fragments are joined with newlines.

    Args:
        url: Address of the page to download.

    Returns:
        The cleaned page text, one fragment per line.
    """
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Drop non-content elements ("style", not "soup", is the tag that holds CSS).
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    fragments = (phrase.strip() for line in lines for phrase in line.split("  "))
    return "\n".join(fragment for fragment in fragments if fragment)

def split_text_into_chunks(text, chunk_size):
    """Split ``text`` into chunks of whole sentences up to ``chunk_size`` characters.

    Sentences are detected by splitting on runs of spaces that follow ``.``,
    ``!`` or ``?``. Consecutive sentences are packed into a chunk (joined by a
    single space) for as long as the joined chunk stays within ``chunk_size``;
    the next sentence then starts a new chunk.

    Args:
        text: Text to split.
        chunk_size: Maximum length, in characters, of a joined chunk. A single
            sentence longer than this is not cut; it becomes its own oversized chunk.

    Returns:
        List of chunk strings in original order. Empty sentences are skipped,
        so empty input yields ``[]``.
    """
    sentences = re.split(r"(?<=[.!?]) +", text)
    chunks = []
    current_chunk = []
    current_length = 0  # sum of sentence lengths in current_chunk (separators excluded)

    for sentence in sentences:
        if not sentence:
            continue

        # Length of the chunk if this sentence were added: existing text, one
        # joining space per sentence already present, plus the new sentence.
        projected = current_length + len(current_chunk) + len(sentence)
        if current_chunk and projected > chunk_size:
            # Flush the finished chunk to the output, then start a new one.
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length += len(sentence)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

async def process_text_content(texts, chunk_size):
    """Chunk every text in ``texts`` on the event loop's default executor.

    Args:
        texts: Iterable of strings, each passed to ``split_text_into_chunks``.
        chunk_size: Size argument forwarded unchanged to ``split_text_into_chunks``.

    Returns:
        List with one ``split_text_into_chunks`` result per input text, in input order.
    """
    event_loop = asyncio.get_event_loop()
    pending = []
    for text in texts:
        future = event_loop.run_in_executor(None, split_text_into_chunks, text, chunk_size)
        pending.append(future)
    chunked = await asyncio.gather(*pending)
    return chunked

async def get_embedding_from_mistral(client, text_chunk):
    """Embed ``text_chunk`` with the ``mistral-embed`` model.

    The client call is not awaited, so the request runs synchronously inside
    this coroutine.

    Args:
        client: Object exposing ``embeddings(model=..., input=...)`` whose
            response has a ``data`` sequence of items with an ``embedding`` attribute.
        text_chunk: Text (or list of texts) to embed.

    Returns:
        List of embedding vectors, one per item in the response's ``data``.
        An empty ``text_chunk`` returns ``[]`` without contacting the API.
    """
    # Nothing to embed: skip the request rather than sending empty input.
    if not text_chunk:
        return []

    response = client.embeddings(model="mistral-embed", input=text_chunk)
    return [item.embedding for item in response.data]

async def fetch_and_process_data(search_query):
    """Search, download, chunk, embed and index web pages for ``search_query``.

    Pipeline:
      1. Page through search results with ``fetch_page`` until
         ``total_results_to_fetch`` hits are collected or a page adds nothing new.
      2. Download each hit's text with ``get_all_text_from_url`` on a thread pool.
      3. Split every text with ``process_text_content`` using ``chunk_size``.
      4. Embed each page's chunks with ``get_embedding_from_mistral``.
      5. Write one row per chunk to ``dataframe_out_path`` (CSV) and store all
         embeddings in a FAISS ``IndexFlatL2`` written to ``faiss_index_path``.

    Args:
        search_query: Query string sent as the ``q`` search parameter.

    Returns:
        None. The results are persisted to the two output files.

    Raises:
        RuntimeError: If the search produced no results, or no chunk could be embedded.
    """
    params = {
        "q": search_query,
        "num": 100,
        "gl": "uk",
        "hl": "en",
        "start": 0
    }

    results = []
    async with aiohttp.ClientSession() as session:
        page_num = 0
        while len(results) < total_results_to_fetch:
            page_num += 1
            count_before = len(results)
            await fetch_page(session, params, page_num, results)
            if len(results) == count_before:
                # A page that contributes nothing would otherwise keep this
                # loop requesting pages forever.
                print(f"Error: No new results on page {page_num}; stopping search")
                break

    if not results:
        raise RuntimeError(f"No search results could be collected for query: {search_query!r}")

    # fetch_page stores each hit's URL under the "link" key.
    urls = [result["link"] for result in results]

    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=10) as executor:
        downloads = await asyncio.gather(
            *[loop.run_in_executor(executor, get_all_text_from_url, url) for url in urls],
            return_exceptions=True,  # one unreachable site must not abort the whole run
        )

    texts = []
    for url, outcome in zip(urls, downloads):
        if isinstance(outcome, BaseException):
            print(f"Error: Could not fetch {url}: {outcome}")
            texts.append("")
        else:
            texts.append(outcome)

    chunks_list = await process_text_content(texts, chunk_size)
    # Blank chunks carry no content to embed; drop them up front.
    chunks_list = [[chunk for chunk in chunks if chunk.strip()] for chunks in chunks_list]

    embeddings_list = []
    for chunks in chunks_list:
        embeddings = await get_embedding_from_mistral(client, chunks) if chunks else []
        embeddings_list.append(embeddings)

    data = []
    for i, (result, chunks, embeddings) in enumerate(zip(results, chunks_list, embeddings_list)):
        for j, chunk in enumerate(chunks):
            if j >= len(embeddings):
                print(f"Error: No embedding returned for chunk {j} of result {i}")
                continue
            data.append({
                "title": result["title"],
                "link": result["link"],
                "chunk": chunk,
                "embedding": embeddings[j]
            })

    if not data:
        raise RuntimeError("No embeddings were produced; nothing to save or index")

    df = pd.DataFrame(data)
    df.to_csv(dataframe_out_path, index=False)

    # FAISS indexing: rows of the matrix line up with rows of the CSV.
    embedding_matrix = np.array([entry["embedding"] for entry in data], dtype=np.float32)
    index = faiss.IndexFlatL2(embedding_matrix.shape[1])
    index.add(embedding_matrix)

    faiss.write_index(index, faiss_index_path)

# Run the whole pipeline for a sample query. The bare top-level `await` relies on
# the notebook kernel allowing await at cell level; it is not valid in a plain .py script.
await fetch_and_process_data("What is the latest news on Mistral AI?")