In [1]:
!python -m pip install aiohttp==3.9.5 beautifulsoup4==4.12.3 faiss_cpu==1.8.0 mistralai==0.4.0 nest_asyncio==1.6.0 numpy==1.26.4 pandas==2.2.2 python-dotenv==1.0.1 requests==2.32.3

Collecting mistralai==0.4.0
  Using cached mistralai-0.4.0-py3-none-any.whl.metadata (1.8 kB)
Collecting python-dotenv==1.0.1
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting requests==2.32.3
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting sniffio (from httpx<0.26,>=0.25->mistralai==0.4.0)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Using cached mistralai-0.4.0-py3-none-any.whl (19 kB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Downloading requests-2.32.3-py3-none-any.whl (64 kB)
   ---------------------------------------- 0.0/64.9 kB ? eta -:--:--
   ------ --------------------------------- 10.2/64.9 kB ? eta -:--:--
   ------------------ --------------------- 30.7/64.9 kB 435.7 kB/s eta 0:00:01
   ---------------------------------------- 64.9/64.9 kB 588.0 kB/s eta 0:00:00
Downloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Installing collected packages: sniffio, requests, python-dotenv, mis

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index-readers-file 0.1.22 requires striprtf<0.0.27,>=0.0.26, which is not installed.
spacy 3.7.4 requires cymem<2.1.0,>=2.0.2, which is not installed.
spacy 3.7.4 requires langcodes<4.0.0,>=3.2.0, which is not installed.
spacy 3.7.4 requires murmurhash<1.1.0,>=0.28.0, which is not installed.
spacy 3.7.4 requires setuptools, which is not installed.
spacy 3.7.4 requires smart-open<7.0.0,>=5.2.1, which is not installed.
spacy 3.7.4 requires spacy-legacy<3.1.0,>=3.0.11, which is not installed.
spacy 3.7.4 requires spacy-loggers<2.0.0,>=1.0.0, which is not installed.
src 0.8.5 requires flask<4.0.0,>=3.0.3, which is not installed.
weasel 0.3.4 requires cloudpathlib<0.17.0,>=0.7.0, which is not installed.
weasel 0.3.4 requires smart-open<7.0.0,>=5.2.1, which is not installed.
yfinance 0.2.38 requires peewee>=3.16.2

In [2]:
import os 

# Read the Mistral API key from the environment; os.getenv returns None when the
# variable is unset. The constant is spelled MISTRALL_API_KEY (double "L") while
# the environment variable is MISTRAL_API_KEY; later cells reference the constant
# by this exact spelling.
MISTRALL_API_KEY = os.getenv('MISTRAL_API_KEY')

## Scraper Definitions

In [6]:
import aiohttp 
import asyncio 
import nest_asyncio
from bs4 import BeautifulSoup 
from concurrent.futures import ThreadPoolExecutor 
import requests 
import re 
import pandas as pd 
import faiss 
import numpy as np 
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# Patch asyncio so that event loops can be nested; asyncio.run() is called later
# in this notebook while the kernel's own loop is already running.
nest_asyncio.apply()

# Browser-like User-Agent sent with every HTTP request made by the helpers below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

# Upper bound on the number of search results collected per query.
total_results_to_fetch = 10 
# Size budget (in characters) handed to split_text_into_chunks.
chunk_size = 1000 

# Output files: the chunk/embedding table and the FAISS index built from it.
dataframe_out_path = "temp.csv"
faiss_index_path = "faiss_index.index"

# Shared Mistral client used for embedding calls in fetch_and_process_data.
mistral_api_key = MISTRALL_API_KEY 
client = MistralClient(api_key=mistral_api_key)

async def fetch(session, url, params=None):
    """Perform a GET request and return the response body as text.

    Args:
        session: Object whose ``get`` returns an async context manager
            (an ``aiohttp.ClientSession`` in this notebook).
        url: Address to request.
        params: Optional query-string parameters forwarded to ``session.get``.

    Returns:
        The value of ``await response.text()``.
    """
    pending_request = session.get(url, headers=headers, params=params)
    async with pending_request as reply:
        return await reply.text()

async def fetch_page(session, params, page_num, results):
    """Fetch one Google results page and append its hits to ``results``.

    Args:
        session: HTTP session passed through to ``fetch``.
        params: Query parameters for the search request. ``params["start"]`` is
            overwritten in place with the offset for ``page_num``.
        page_num: 1-based page number; the offset is ``(page_num - 1) * params["num"]``.
        results: List that receives ``{"title": ..., "link": ...}`` dicts. It is
            mutated in place and never grows beyond ``total_results_to_fetch``.

    Returns:
        None. The output is the mutation of ``results``.
    """
    print(f"Fetching page: {page_num}")
    params["start"] = (page_num - 1) * params["num"]
    html = await fetch(session, "https://www.google.com/search", params=params)
    soup = BeautifulSoup(html, "html.parser")

    for result in soup.select(".tF2Cxc"):
        if len(results) >= total_results_to_fetch:
            break
        title_tag = result.select_one(".DKV0Md")
        link_tag = result.select_one(".yuRUbf a")
        # select_one returns None when nothing matches; skip incomplete result
        # cards instead of raising AttributeError/TypeError on them.
        if title_tag is None or link_tag is None or not link_tag.get("href"):
            continue

        results.append({
            "title": title_tag.text,
            "link": link_tag["href"]
        })

async def fetch_content(session, url):
    """Download ``url`` through ``session`` and return the body as text.

    The request carries the module-level ``headers`` and a timeout value of 30.
    """
    async with session.get(url, timeout=30, headers=headers) as page:
        return await page.text()
    
async def fetch_all_content(urls): 
    """Download every URL in ``urls`` concurrently over one aiohttp session.

    Returns:
        The list produced by ``asyncio.gather``: one response body per URL,
        in the same order as ``urls``.
    """
    async with aiohttp.ClientSession() as http:
        downloads = []
        for address in urls:
            downloads.append(fetch_content(http, address))
        return await asyncio.gather(*downloads)
    
def get_all_text_from_url(url):
    """Download a page synchronously and return its text content.

    Args:
        url: Address of the page to fetch (requested with the module-level
            ``headers`` and a 30-second timeout).

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed,
        whitespace-trimmed, and non-empty fragments joined by newlines.
    """
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    # Remove non-visible content. The second tag used to be "soup", which is not
    # an HTML element, so CSS from <style> blocks ended up in the extracted text.
    for tag in soup(["script", "style"]):
        tag.extract()
    text = soup.get_text()
    # Trim each line, split on double spaces, and drop fragments that are empty.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = "\n".join(chunk for chunk in chunks if chunk)
    return text

def split_text_into_chunks(text, chunk_size):
    """Split ``text`` into chunks of whole sentences under a size budget.

    Sentences are delimited by ``.``, ``!`` or ``?`` followed by one or more
    spaces. Sentences are accumulated while the sum of their lengths stays below
    ``chunk_size``; when the next sentence would reach the budget, the current
    chunk is emitted and a new one is started.

    Args:
        text: Text to split.
        chunk_size: Budget for the summed sentence lengths of one chunk (the
            joining spaces are not counted).

    Returns:
        A list of chunk strings, each made of sentences joined by a single
        space. A sentence that alone reaches ``chunk_size`` becomes its own
        chunk rather than being cut. An empty ``text`` yields ``[""]``.
    """
    sentences = re.split(r"(?<=[.!?]) +", text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Flush before adding a sentence that would reach the budget. The
        # previous logic had the two branches swapped, never flushed inside the
        # loop, and therefore returned at most one chunk.
        if current_chunk and current_length + len(sentence) >= chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += len(sentence)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

async def process_text_content(texts, chunk_size):
    """Chunk every text in ``texts`` using the loop's default executor.

    Each text is passed to ``split_text_into_chunks`` together with
    ``chunk_size``.

    Returns:
        A list with one chunk list per input text, in input order.
    """
    event_loop = asyncio.get_event_loop()
    pending = []
    for text in texts:
        future = event_loop.run_in_executor(None, split_text_into_chunks, text, chunk_size)
        pending.append(future)
    return await asyncio.gather(*pending)

async def get_embedding_from_mistral(client, text_chunk):
    """Embed ``text_chunk`` with the ``mistral-embed`` model.

    Args:
        client: Object exposing ``embeddings(model=..., input=...)``.
        text_chunk: Input forwarded unchanged as ``input``.

    Returns:
        A list holding the ``embedding`` attribute of every item in the
        response's ``data``.

    Note:
        ``client.embeddings`` is called directly (not awaited), so the call
        runs to completion before this coroutine yields.
    """
    embed_response = client.embeddings(model="mistral-embed", input=text_chunk)
    vectors = []
    for item in embed_response.data:
        vectors.append(item.embedding)
    return vectors

async def fetch_and_process_data(search_query):
    """Search Google, embed the result pages, and persist a FAISS index.

    Steps:
        1. Collect up to ``total_results_to_fetch`` results via ``fetch_page``.
        2. Download each result's text in a thread pool (``get_all_text_from_url``).
        3. Split the texts into chunks and embed them with the Mistral client.
        4. Write one row per chunk to ``dataframe_out_path`` (CSV) and a flat L2
           FAISS index of the embeddings to ``faiss_index_path``. CSV row order
           matches the order in which vectors are added to the index.

    Args:
        search_query: Query string sent to Google.

    Raises:
        RuntimeError: If the search yields no results, or no chunk could be
            embedded, so there is nothing to index.
    """
    params = {
        "q": search_query,
        "num": 100,
        "gl": "uk",
        "hl": "en",
        "start": 0
    }

    async with aiohttp.ClientSession() as session:
        page_num = 0
        results = []
        while len(results) < total_results_to_fetch:
            page_num += 1
            count_before = len(results)
            await fetch_page(session, params, page_num, results)
            if len(results) == count_before:
                # The page added nothing (results exhausted or request
                # blocked); stop rather than requesting pages forever.
                break

        print(results)
        if not results:
            raise RuntimeError(f"No search results found for query: {search_query!r}")
        urls = [result["link"] for result in results]

        # Page downloads use blocking `requests`, so run them in worker threads.
        with ThreadPoolExecutor(max_workers=10) as executor:
            loop = asyncio.get_event_loop()
            texts = await asyncio.gather(
                *[loop.run_in_executor(executor, get_all_text_from_url, url) for url in urls]
            )

        chunks_list = await process_text_content(texts, chunk_size)

        embeddings_list = []
        for chunks in chunks_list:
            # Skip the API call when a page produced no chunks.
            embeddings = await get_embedding_from_mistral(client, chunks) if chunks else []
            embeddings_list.append(embeddings)

        data = []
        for i, result in enumerate(results):
            if i >= len(embeddings_list):
                print(f"Error: No embeddings returned for result {i}")
                continue
            for j, chunk in enumerate(chunks_list[i]):
                if j >= len(embeddings_list[i]):
                    print(f"Error: No embedding returned for chunk {j} of result {i}")
                    continue
                data.append({
                    "title": result["title"],
                    "link": result["link"],
                    "chunk": chunk,
                    "embedding": embeddings_list[i][j]
                })

        if not data:
            raise RuntimeError(f"No text chunks could be embedded for query: {search_query!r}")

        df = pd.DataFrame(data)
        df.to_csv(dataframe_out_path, index=False)

        # FAISS indexing. The dimension comes from the stacked matrix, so it
        # does not depend on the first result having any chunks.
        embeddings = np.array([entry["embedding"] for entry in data], dtype=np.float32)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)

        faiss.write_index(index, faiss_index_path)

# Top-level await: runs the whole scrape/embed/index pipeline for a sample
# query, writing the CSV and FAISS index files configured above.
await fetch_and_process_data("What is the latest news on Mistral AI?")

Fetching page: 1
[{'title': 'News | Mistral AI | Frontier AI in your hands', 'link': 'https://mistral.ai/news/'}, {'title': 'Mistral AI raises 600 mln euros in latest funding round', 'link': 'https://www.reuters.com/technology/artificial-intelligence/mistral-ai-raises-600-mln-euros-latest-funding-round-2024-06-11/'}, {'title': 'Mistral AI warns of lack of data centres and training ...', 'link': 'https://www.euronews.com/next/2024/06/14/mistal-ai-warns-of-lack-of-data-centres-training-capacity-in-europe'}, {'title': 'News | Mistral AI | Frontier AI in your hands | Page 2', 'link': 'https://mistral.ai/news/page/2/'}, {'title': "Mistral AI, France's Startup Darling, Takes Aim at the US ...", 'link': 'https://www.bloomberg.com/news/articles/2024-05-28/mistral-ai-france-s-startup-darling-takes-aim-at-the-us-market'}, {'title': 'French AI start-up Mistral AI raises £500M in funding', 'link': 'https://www.techerati.com/news-hub/french-ai-start-up-mistral-ai-raises-500m-in-funding/'}, {'title'

## Query embeddings and vector store lookup

In [9]:
def query_vector_store(query_embeddings, k=5):
    """Return the stored chunks nearest to a query embedding.

    Loads the FAISS index from ``faiss_index_path`` and the chunk table from
    ``dataframe_out_path``. Row positions in the CSV are used as the lookup key
    for the indices FAISS returns.

    Args:
        query_embeddings: A single embedding (1-D sequence/array) or a 2-D
            batch; only the neighbours of the first row are returned.
        k: Number of nearest neighbours to request.

    Returns:
        A list of up to ``k`` dicts with ``title``, ``link`` and ``chunk``
        keys, ordered as returned by the index search.
    """
    index = faiss.read_index(faiss_index_path)

    # Always coerce to float32 (also when an ndarray of another dtype is
    # passed) and promote a single vector to a one-row matrix.
    query_matrix = np.asarray(query_embeddings, dtype=np.float32)
    if query_matrix.ndim == 1:
        query_matrix = np.expand_dims(query_matrix, axis=0)

    _, indices = index.search(query_matrix, k)

    df = pd.read_csv(dataframe_out_path)

    results = []
    for idx in indices[0]:
        if idx < 0:
            # FAISS fills missing neighbours with -1 when the index holds fewer
            # than k vectors; iloc[-1] would silently return the last row.
            continue
        row = df.iloc[idx]
        results.append({
            "title": row["title"],
            "link": row["link"],
            "chunk": row["chunk"]
        })
    return results

def query_embeddings(texts):
    """Embed one or more query strings with the ``mistral-embed`` model.

    Args:
        texts: A single string, or an iterable of strings.

    Returns:
        A list of embeddings, one per input string, in input order. For a
        single string the result is a one-element list.
    """
    client = MistralClient(api_key=MISTRALL_API_KEY)
    # The argument itself must be sent: the previous code passed the literal
    # string "texts", so every query produced the same embedding.
    inputs = [texts] if isinstance(texts, str) else list(texts)
    response = client.embeddings(model="mistral-embed", input=inputs)
    return [embedding.embedding for embedding in response.data]

# Smoke test: embed a sample query and display the k=5 nearest stored chunks.
# Relies on the CSV and FAISS index files written by fetch_and_process_data.
embeddings = query_embeddings("AGI")
result = query_vector_store(embeddings[0], k=5)
result

[{'title': 'News | Mistral AI | Frontier AI in your hands',
  'link': 'https://mistral.ai/news/',
  'chunk': 'News | Mistral AI | Frontier AI in your hands\nDevelopersDevelopers\nDocsLa PlateformeTechnologyTechnology\nModelsCodestralCustomizationDeploymentPricesBusinessBusiness\nUse casesCustomer storiesAbout UsAbout Us\nMissionCareersNewsLe Chat\nLog InLe Chat\nLog InMistral AIlatest updatesMy Tailor is MistralJun 5, 2024By Mistral AI teamFine-tune and deploy your custom Mistral models using Mistral fine-tuning API and SDK.Read MoreCodestral: Hello, World!May 29, 2024By Mistral AI teamEmpowering developers and democratising coding with Mistral AI.Read MoreThe Mistral AI Non-Production LicenseMay 29, 2024By Mistral AI teamMistral AI introduces the MNPL to promote sustainable openness in AI.Read MoreCheaper, Better, Faster, StrongerApr 17, 2024By Mistral AI teamContinuing to push the frontier of AI and making it accessible to all.Read MoreAu LargeFeb 26, 2024By Mistral AI teamMistral La

### Tools definition

In [10]:
# Allow nested event loops; mistral_web_search below calls asyncio.run().
nest_asyncio.apply()

# Tool schema passed to client.chat(..., tools=tools). It declares a single
# function, "mistral_web_search", taking one required string argument
# "search_query"; the name must match a key in names_to_functions.
tools = [
    {
        "type": "function",
        "function": {
            "name": "mistral_web_search",
            "description": "Fetch and process data from Google search based on a query, store results in FAISS vector store, and retrieve results.",
            "parameters": {
                "type": "object",
                "properties": {
                    "search_query": {
                        "type": "string",
                        "description": "The search query to use for fetching data from Google."
                    }
                },
                "required": ["search_query"]
            }
        }
    }
]

def mistral_web_search(search_query: str):
    """Run a web search for ``search_query`` and return the best-matching chunks.

    The pipeline rebuilds the vector store via ``fetch_and_process_data``,
    embeds the query with ``query_embeddings``, and looks up the 5 nearest
    chunks with ``query_vector_store``.

    Returns:
        The list of result dicts produced by ``query_vector_store``.
    """
    async def _search_then_retrieve():
        await fetch_and_process_data(search_query)
        query_vector = query_embeddings(search_query)[0]
        return query_vector_store(query_vector, k=5)

    pipeline = _search_then_retrieve()
    return asyncio.run(pipeline)

# Demo: run the full search tool once and print the retrieved chunks.
# `search_query` is reused by a later cell.
search_query = "mistral and openai"
results = mistral_web_search(search_query)
print(results)

Fetching page: 1
[{'title': 'AI race heats up as OpenAI, Google and Mistral release ...', 'link': 'https://www.theguardian.com/technology/2024/apr/10/ai-race-heats-up-as-openai-google-and-mistral-release-new-models'}, {'title': 'Mistral-Large versus GPT-4-Turbo? - API', 'link': 'https://community.openai.com/t/mistral-large-versus-gpt-4-turbo/655508'}, {'title': 'Microsoft strikes deal with Mistral in push beyond OpenAI', 'link': 'https://www.ft.com/content/cd6eb51a-3276-450f-87fd-97e8410db9eb'}, {'title': 'How Mistral AI, an OpenAI competitor, rocketed to $2Bn in ...', 'link': 'https://www.bensbites.com/case-study/how-mistral-ai-an-openai-competitor-rocketed-to-2bn-in-12-months'}, {'title': 'OpenAI Competitor Mistral AI Raising At $6B Valuation', 'link': 'https://news.crunchbase.com/ai/openai-anthropic-competitor-mistral-fundraise/'}, {'title': 'Microsoft partners with Mistral in second AI deal beyond ...', 'link': 'https://www.theverge.com/2024/2/26/24083510/microsoft-mistral-partners

In [11]:
def tools_to_str(tools_output: list) -> str:
    """Concatenate the ``'chunk'`` text of every tool result, one per line.

    Args:
        tools_output: List of dicts, each containing a ``'chunk'`` string.

    Returns:
        The chunk strings joined with newline characters (empty string for an
        empty list).
    """
    chunk_texts = []
    for hit in tools_output:
        chunk_texts.append(hit['chunk'])
    return '\n'.join(chunk_texts)


# Demo: runs a fresh web search for `search_query` and shows the retrieved
# chunks flattened into the single string that is sent back to the model.
tools_to_str(mistral_web_search(search_query))

Fetching page: 1
[{'title': 'AI race heats up as OpenAI, Google and Mistral release ...', 'link': 'https://www.theguardian.com/technology/2024/apr/10/ai-race-heats-up-as-openai-google-and-mistral-release-new-models'}, {'title': 'Mistral-Large versus GPT-4-Turbo? - API', 'link': 'https://community.openai.com/t/mistral-large-versus-gpt-4-turbo/655508'}, {'title': 'Microsoft strikes deal with Mistral in push beyond OpenAI', 'link': 'https://www.ft.com/content/cd6eb51a-3276-450f-87fd-97e8410db9eb'}, {'title': 'How Mistral AI, an OpenAI competitor, rocketed to $2Bn in ...', 'link': 'https://www.bensbites.com/case-study/how-mistral-ai-an-openai-competitor-rocketed-to-2bn-in-12-months'}, {'title': 'Microsoft partners with Mistral in second AI deal beyond ...', 'link': 'https://www.theverge.com/2024/2/26/24083510/microsoft-mistral-partnership-deal-azure-ai'}, {'title': 'OpenAI Competitor Mistral AI Raising At $6B Valuation', 'link': 'https://news.crunchbase.com/ai/openai-anthropic-competitor-m

"Please include what you were doing when this page came up and the Cloudflare Ray ID found at the bottom of this page.\nCloudflare Ray ID: 89b457afdc2acb15\n•\nYour IP:\nClick to reveal\n87.122.16.210\n•\nPerformance & security by Cloudflare\nDuring all the turmoil, Microsoft managed to get a nonvoting observer seat on the nonprofit board that controls OpenAI, providing the software giant with more visibility into OpenAI’s inner workings but with no voting power on big decisions.CommentsMost PopularMost PopularUber will pay you $1,000 to ditch your car for five weeksMore YouTube Premium plans are comingYouTube is stopping Dr Disrespect’s channel from making moneyValve reveals the most-played Steam Deck gamesPerplexity’s grand theft AIVerge Deals / Sign up for Verge Deals to get deals on products we've tested sent to your inbox weekly.Email (required)Sign upBy submitting your email, you agree to our Terms and Privacy Notice. This site is protected by reCAPTCHA and the Google Privacy Pol

In [15]:
import functools
import json

# Dispatch table from the tool name reported by the model to the Python
# callable that implements it; invoked with the decoded tool arguments as kwargs.
names_to_functions = {"mistral_web_search": mistral_web_search}

### Chat

In [None]:
model = "mistral-large-latest"
client = MistralClient(api_key=MISTRALL_API_KEY)

# Conversation history shared across turns.
messages = []

while True:
    input_ = input("Ask: ")
    # Provide a way out of the loop: blank input, "exit" or "quit" ends the chat.
    if input_.strip().lower() in {"", "exit", "quit"}:
        break

    # First call: tool_choice="any" asks the model to respond with a tool call.
    messages.append(ChatMessage(role="user", content=input_))
    response = client.chat(model=model, messages=messages, tools=tools, tool_choice="any")
    assistant_message = response.choices[0].message
    messages.append(assistant_message)
    print(assistant_message.content)

    # Guard against a reply without tool calls instead of failing on [0].
    if not assistant_message.tool_calls:
        continue

    tool_call = assistant_message.tool_calls[0]
    function_name = tool_call.function.name
    function_params = json.loads(tool_call.function.arguments)

    # Execute the requested tool and feed its output back as a "tool" message.
    function_result_raw = names_to_functions[function_name](**function_params)
    print("sources: ", [f"{source['title']} - {source['link']}" for source in function_result_raw])
    function_result_text = tools_to_str(function_result_raw)
    messages.append(ChatMessage(role="tool", name=function_name, content=function_result_text))

    # Second call: let the model answer using the tool output. The answer is
    # stored in the history so the next turn sees a complete exchange
    # (user -> assistant tool call -> tool -> assistant).
    response = client.chat(model=model, messages=messages)
    final_message = response.choices[0].message
    messages.append(final_message)
    final_response = final_message.content
    print(final_response)