## Mistral AI search engine

![image info](../../images/mistral-search-graph.png)

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()  # load environment variables from .env file

# API keys are read from the process environment; os.getenv returns None for
# any variable that is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
# The chat cells further down build MistralClient(api_key=MISTRAL_API_KEY),
# so this key has to be loaded together with the others.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

## Scraper Definitions

In [2]:
import requests

async def google_search(query, results, num_results=10):
    """
    Perform a Google Custom Search and append the hits to the provided list.

    The API key comes from the module-level ``GOOGLE_API_KEY`` and the Custom
    Search Engine ID is hard-coded in the request parameters; neither is a
    parameter of this function.

    NOTE: although declared ``async``, the request is made with the blocking
    ``requests`` library, so nothing else runs on the event loop while the
    call is in flight.

    :param query: The search query.
    :param results: The list to append the results to. Each appended entry is
        a dict with the keys ``'title'`` and ``'links'`` (one URL string).
    :param num_results: The number of results to fetch.
    :raises requests.HTTPError: If the API answers with an HTTP error status.
    """
    print("Searching for:", query)
    search_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        'q': query,
        'key': GOOGLE_API_KEY,
        'cx': "a2991ece49c8941ec",
        'num': num_results
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(search_url, params=params, timeout=30)
    # Surface quota/auth failures instead of silently appending nothing.
    response.raise_for_status()
    search_results = response.json()

    for item in search_results.get('items', []):
        print("adding item", item['title'])
        results.append({
            'title': item['title'],
            'links': item['link']
        })

# Example usage
# NOTE: the bare top-level ``await`` relies on the notebook's running event
# loop; a plain script would have to wrap the call (e.g. in asyncio.run).
results = []
await google_search("latest news about apple and openai", results)
print(results)

Searching for: latest news about apple and openai
adding item OpenAI and Apple announce partnership | OpenAI
adding item New download version on Mac not working? - ChatGPT - OpenAI ...
adding item Apple Intelligence - Apple
adding item Compatibility Issues with Desktop App on Mac - ChatGPT - OpenAI ...
adding item Apple not investing in OpenAI after all, new report says | 9to5Mac : r ...
adding item Apple drops out of talks to join OpenAI investment round, WSJ ...
adding item The complicated partnership between Apple and OpenAI | CNN ...
adding item Apple drops out of talks to join OpenAI investment round, WSJ reports
adding item OpenAI and Apple Announce Partnership | Hacker News
adding item Apple partners with OpenAI to roll out new artificial intelligence system
[{'title': 'OpenAI and Apple announce partnership | OpenAI', 'links': 'https://openai.com/index/openai-and-apple-announce-partnership/'}, {'title': 'New download version on Mac not working? - ChatGPT - OpenAI ...', 'links': 

In [None]:
import aiohttp
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import requests
import re
import pandas as pd
import faiss
import numpy as np
from openai import OpenAI

# Apply the nest_asyncio patch
nest_asyncio.apply()

# HTTP headers passed to the page requests in fetch() and get_all_text_from_url().
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}

total_results_to_fetch = 3  # total number of results to fetch
chunk_size = 1000  # size limit, in characters, used when splitting page text into chunks

# Files written by fetch_and_process_data() and read back by query_vector_store().
dataframe_out_path = 'temp.csv'
faiss_index_path = 'faiss_index.index'

openai_api_key = OPENAI_API_KEY  # taken from the environment variable loaded in the first cell
client = OpenAI(api_key=openai_api_key)

async def fetch(session, url, params=None):
    """
    GET ``url`` through ``session`` and return the response body as text.

    :param session: Object whose ``get(...)`` yields an async context manager
        (presumably an ``aiohttp.ClientSession`` - confirm with callers).
    :param url: Address to request.
    :param params: Optional query parameters forwarded to ``session.get``.
    :return: The awaited result of ``response.text()``.
    """
    pending_request = session.get(url, params=params, headers=headers, timeout=30)
    async with pending_request as response:
        body = await response.text()
    return body

async def google_search(query, results, num_results=total_results_to_fetch):
    """
    Perform a Google Custom Search and append the hits to the provided list.

    The API key comes from the module-level ``GOOGLE_API_KEY`` and the Custom
    Search Engine ID is hard-coded in the request parameters; neither is a
    parameter of this function.

    NOTE: although declared ``async``, the request is made with the blocking
    ``requests`` library, so nothing else runs on the event loop while the
    call is in flight.

    :param query: The search query.
    :param results: The list to append the results to. Each appended entry is
        a dict with the keys ``'title'`` and ``'links'`` (one URL string).
    :param num_results: The number of results to fetch.
    :raises requests.HTTPError: If the API answers with an HTTP error status.
    """
    print("Searching for:", query)
    search_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        'q': query,
        'key': GOOGLE_API_KEY,
        'cx': "a2991ece49c8941ec",
        'num': num_results
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(search_url, params=params, timeout=30)
    # Surface quota/auth failures instead of silently appending nothing.
    response.raise_for_status()
    search_results = response.json()

    for item in search_results.get('items', []):
        print("adding item", item['title'])
        results.append({
            'title': item['title'],
            'links': item['link']
        })

def get_all_text_from_url(url):
    """
    Download a page and return its text content, one fragment per line.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Every line is stripped and split on double spaces; fragments
    that end up empty are dropped.

    :param url: Address of the page to download.
    :return: The remaining fragments joined with newlines.
    """
    page = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.extract()

    fragments = []
    for raw_line in soup.get_text().splitlines():
        for phrase in raw_line.strip().split("  "):
            cleaned = phrase.strip()
            if cleaned:
                fragments.append(cleaned)
    return '\n'.join(fragments)

def split_text_into_chunks(text, chunk_size):
    """
    Group the sentences of ``text`` into chunks of at most ``chunk_size`` characters.

    Sentences are split after ``.``, ``!`` or ``?`` followed by one or more
    spaces and re-joined with a single space inside each chunk. A sentence is
    never cut, so one that is longer than ``chunk_size`` becomes a chunk of
    its own (the only case in which a chunk exceeds the limit).

    :param text: The text to split.
    :param chunk_size: Maximum chunk length in characters, joining spaces included.
    :return: List of non-empty chunk strings; empty for empty ``text``.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = []
    current_length = 0  # length of ' '.join(current_chunk), kept incrementally

    for sentence in sentences:
        if not sentence:
            # re.split yields '' for empty input; never emit empty chunks.
            continue
        # A joining space is only needed when the chunk already has content.
        added_length = len(sentence) + (1 if current_chunk else 0)
        if current_chunk and current_length + added_length > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length += added_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

async def process_text_content(texts, chunk_size):
    """
    Split every text into chunks, running each split on the default executor.

    :param texts: Iterable of page texts.
    :param chunk_size: Chunk size forwarded to ``split_text_into_chunks``.
    :return: One list of chunks per input text, in input order.
    """
    event_loop = asyncio.get_event_loop()
    pending = []
    for page_text in texts:
        future = event_loop.run_in_executor(None, split_text_into_chunks, page_text, chunk_size)
        pending.append(future)
    return await asyncio.gather(*pending)

async def get_embeddings_from_openai(client, text_chunks, max_chunks=30):
    """
    Embed the leading chunks of one document with ``text-embedding-3-small``.

    All selected chunks are sent in a single embeddings request instead of
    one request per chunk. Chunks beyond ``max_chunks`` are NOT embedded, so
    the returned list can be shorter than ``text_chunks``.

    NOTE: the client call is synchronous; this coroutine does not yield to
    the event loop while the request is running.

    :param client: Client exposing ``embeddings.create(input=..., model=...)``.
    :param text_chunks: Chunk strings of one document.
    :param max_chunks: Maximum number of leading chunks to embed.
    :return: List of embedding vectors, aligned with ``text_chunks[:max_chunks]``.
    """
    print("Getting embeddings for text chunks", len(text_chunks), "total chunks")
    selected_chunks = list(text_chunks[:max_chunks])
    if not selected_chunks:
        # Nothing to embed; avoid sending an empty input list to the API.
        return []
    response = client.embeddings.create(input=selected_chunks, model="text-embedding-3-small")
    # Each returned item carries the position of its input; sort on it so the
    # output order is guaranteed to match the chunk order.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered_items]

async def fetch_and_process_data(search_query):
    """
    Search, scrape, chunk, embed and index the pages found for ``search_query``.

    Steps:
      1. Run ``google_search`` to collect result titles and URLs.
      2. Download every URL in a thread pool with ``get_all_text_from_url``.
      3. Split each page text into chunks of ``chunk_size``.
      4. Embed the chunks of each page with the module-level ``client``.
      5. Write one row per embedded chunk to ``dataframe_out_path`` (CSV) and
         the vectors to a FAISS ``IndexFlatL2`` stored at ``faiss_index_path``.
         Row order in the CSV matches vector order in the index.

    :param search_query: The query passed to ``google_search``.
    :raises RuntimeError: If the search returns no results or no chunk could
        be embedded; the output files are left untouched in that case.
    """
    # One request is enough: google_search defaults to asking for
    # total_results_to_fetch hits. Repeating the identical query until the
    # list is "full" only appends duplicates, and never terminates when the
    # search returns nothing.
    results = []
    await google_search(search_query, results)
    if not results:
        raise RuntimeError(f"Google search returned no results for {search_query!r}")

    urls = [result['links'] for result in results]

    # Page downloads are blocking, so they run in worker threads.
    with ThreadPoolExecutor(max_workers=10) as executor:
        loop = asyncio.get_event_loop()
        texts = await asyncio.gather(
            *[loop.run_in_executor(executor, get_all_text_from_url, url) for url in urls]
        )

    chunks_list = await process_text_content(texts, chunk_size)
    print(chunks_list)

    embeddings_list = []
    for chunks in chunks_list:
        embeddings = await get_embeddings_from_openai(client, chunks)
        embeddings_list.append(embeddings)

    data = []
    for i, result in enumerate(results):
        if i >= len(embeddings_list):
            print(f"Error: No embeddings returned for result {i}")
            continue
        for j, chunk in enumerate(chunks_list[i]):
            # get_embeddings_from_openai may return fewer vectors than chunks.
            if j >= len(embeddings_list[i]):
                print(f"Error: No embedding returned for chunk {j} of result {i}")
                continue
            data.append({
                'title': result['title'],
                'url': result['links'],
                'chunk': chunk,
                'embedding': embeddings_list[i][j]
            })

    if not data:
        raise RuntimeError(f"No text chunks could be embedded for {search_query!r}")

    df = pd.DataFrame(data)
    df.to_csv(dataframe_out_path, index=False)

    # FAISS indexing. The dimension is taken from the stacked matrix rather
    # than embeddings_list[0][0], which does not exist when the first page
    # produced no chunks.
    embeddings = np.array([entry['embedding'] for entry in data], dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, faiss_index_path)

await fetch_and_process_data("What is the latest news about apple and openai?")



Searching for: What is the latest news about apple and openai?
adding item OpenAI and Apple announce partnership | OpenAI
adding item Apple drops out of talks to join OpenAI investment round, WSJ ...
adding item Apple Intelligence - Apple
adding item The complicated partnership between Apple and OpenAI | CNN ...
adding item Microsoft, Apple Drop OpenAI Board Plans as Scrutiny Grows : r ...
adding item OpenAI and Apple Announce Partnership | Hacker News
adding item Apple partners with OpenAI to roll out new artificial intelligence system
adding item Apple to 'pay' OpenAI for ChatGPT through distribution, not cash ...
adding item Apple not investing in OpenAI after all, new report says | 9to5Mac : r ...
adding item Microsoft and Apple drop OpenAI seats amid antitrust scrutiny
Getting embeddings for text chunks 1 total chunks
Getting embeddings for text chunks 1 total chunks
Getting embeddings for text chunks 14 total chunks
Getting embeddings for text chunks 19 total chunks
Getting embed

## Embedding and vector store query helpers

In [4]:
def query_vector_store(query_embedding, k=5):
    """
    Query the FAISS vector store and return the text results along with metadata.

    The index is read from ``faiss_index_path`` and the chunk table from
    ``dataframe_out_path`` on every call; row ``i`` of the table is taken to
    describe vector ``i`` of the index.

    :param query_embedding: The embedding to query with (sequence of floats or
        numpy array; a 1-D input is treated as a single query).
    :param k: Number of nearest neighbors to retrieve.
    :return: List of dictionaries with the keys ``'title'``, ``'url'`` and
        ``'chunk'`` for the nearest neighbors found. Holds fewer than ``k``
        entries when the index stores fewer than ``k`` vectors.
    """
    # Load the index
    index = faiss.read_index(faiss_index_path)

    # FAISS needs a 2-D float32 array; convert unconditionally so that
    # float64 numpy input is handled as well as plain lists.
    query_embedding = np.asarray(query_embedding, dtype=np.float32)
    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)

    # Query the index
    _distances, indices = index.search(query_embedding, k)

    # Load the dataframe
    df = pd.read_csv(dataframe_out_path)

    # Retrieve the text results and metadata
    results = []
    for idx in indices[0]:
        # FAISS pads missing neighbors with -1; df.iloc[-1] would silently
        # return the last row instead of "no result".
        if idx < 0:
            continue
        row = df.iloc[idx]
        results.append({
            'title': row['title'],
            'url': row['url'],
            'chunk': row['chunk']
        })

    return results

def query_embeddings(text):
    """
    Embed ``text`` with ``text-embedding-3-small`` using the module-level ``client``.

    :param text: The text to embed.
    :return: The ``embedding`` attribute of the first item in the response data.
    """
    embedding_response = client.embeddings.create(model="text-embedding-3-small", input=text)
    first_item = embedding_response.data[0]
    return first_item.embedding


# Smoke test: embed a short probe text and look up its 5 nearest chunks in
# the index written by fetch_and_process_data().
embedding = query_embeddings("AGI")
results = query_vector_store(embedding, k=5)
results


[{'title': 'OpenAI and Apple Announce Partnership | Hacker News',
  'url': 'https://news.ycombinator.com/item?id=40636980',
  'chunk': "linear scaling in place of quadratic, without loss of capabilities?\nwg0 7 months ago\n| root | parent | next [–]\nI'm pretty sure that statistical foundations of AI where a thing just been shy of 0.004 of the threshold value out of a million dimensional space can get miscategrized as something else will not deliver AGI or any useable and reliable AI for that matter other than that sequence of sequence mapping (voice to text,\ntext to voice etc.) applications.As for money and reputation, that's a lot behind gold making too in medieval times and look where that lead too.Scientific optimism is a thinking distortion and a fallacy too.\nnativeit 7 months ago\n| root | parent | prev | next [–]\nTool seems like a strong term for whatever ChatGPT is right now. Absurdly overhyped curiosity? Insanely overengineered autocorrect? Dystopian MadLibs?"},
 {'title': 

## Create an OpenAI call to analyze this information

In [None]:
# Rebuild the CSV and FAISS index for a new query (the output paths are fixed,
# so the previous files are overwritten), then retrieve the 5 chunks closest
# to a shorter probe text.
await fetch_and_process_data("what are the best skincare products for dark spots?")
embedding = query_embeddings("skincare product")
results = query_vector_store(embedding, k=5)
print(results)

Searching for: what are the best skincare products for dark spots?
adding item How to Get Rid of Dark Spots on Face, According to Dermatologists ...
adding item The Best Dark Spot Correctors 2024 | The Strategist
adding item 12 Best Dark Spot Correctors 2025, Tested and Reviewed | Glamour
adding item Which TJs beauty products actually work for you? : r/traderjoes
adding item The 10 Best Dark Spot Correctors, Tested by Real People
adding item What is the best dark spot treatment you use/tried? : r/AsianBeauty
adding item 14 Best Dark Spot Correctors of 2024, Tested by Experts
adding item Dark Spot Treatment | SHISEIDO
adding item 10 tips for clearing acne in darker skin tones
adding item 7 dark spot correctors for brightening skin
[['How to Get Rid of Dark Spots on Face, According to Dermatologists | VogueSkip to main contentSearchSearchFashionBeautyCultureLivingWeddingsRunwayShoppingChevronVideoVogue ClubPhotoVogueFashionBeautyCultureLivingWeddingsRunwayMoreChevronOpen Navigation MenuM

# IGNORE EVERYTHING BELOW IT IS IRRELEVANT

## tools definition

In [9]:

nest_asyncio.apply()

# Tool schema advertised to the chat model. The "name" must be a key of the
# names_to_functions dispatch table, because the name returned in the model's
# tool call is looked up there verbatim.
tools = [
    {
        "type": "function",
        "function": {
            "name": "mistral_web_search",
            "description": "Fetch and process data from Google search based on a query, store results in FAISS vector store, and retrieve results.",
            "parameters": {
                "type": "object",
                "properties": {
                    "search_query": {
                        "type": "string",
                        "description": "The search query to use for fetching data from Google search."
                    }
                },
                "required": ["search_query"]
            },
        },
    },
]




def mistral_web_search(search_query: str):
    """
    Run the search -> index -> retrieve pipeline for ``search_query``.

    The pages found for the query are fetched and indexed with
    ``fetch_and_process_data``; the query itself is then embedded and used to
    retrieve the 5 nearest chunks from the freshly written index.

    NOTE: ``asyncio.run`` is called here; inside a notebook, where a loop is
    already running, this presumably depends on the ``nest_asyncio`` patch
    applied earlier - confirm before using it elsewhere.

    :param search_query: The query to search for and to rank chunks against.
    :return: List of dicts with ``'title'``, ``'url'`` and ``'chunk'``.
    """
    async def run_search():
        await fetch_and_process_data(search_query)
        # query_embeddings returns ONE embedding vector (a flat list of
        # floats). It must be passed whole: indexing it with [0] would hand a
        # single float to the vector store.
        query_embedding = query_embeddings(search_query)
        return query_vector_store(query_embedding, k=5)

    return asyncio.run(run_search())

# Run the whole pipeline once for a sample query and show the retrieved chunks.
search_query = "mistral and openai"
results = mistral_web_search(search_query)
print(results)



Fetching page: 1
Fetching page: 2
[{'title': "Microsoft partners with France's Mistral AI, an OpenAI rival", 'url': 'https://apnews.com/article/mistral-ai-lechat-microsoft-openai-cbd6f5604fa577a0d1e7f9047708b718', 'chunk': 'Mistral has also previously said it is teaming up with other big cloud providers including Amazon and Google.\nRELATED COVERAGE\nUS antitrust enforcers will investigate leading AI companies Microsoft, Nvidia and OpenAI\nAI ‘gold rush’ for chatbot training data could run out of human-written text\nFormer OpenAI employees lead push to protect whistleblowers flagging artificial intelligence risks\nMistral made a big splash by attracting big amounts of investor funding to give it a multibillion-dollar valuation just months after it was founded last spring. It was started by three French former researchers from Google and Meta: CEO Arthur Mensch, Chief Scientist Guillaume Lample and Chief Technology Officer Timothee Lacroix.\nIt has advertised an “open-source” approach t

In [10]:
""" little helper function to extract only the texts """
def tools_to_str(tools_output: list) -> str:
    """
    Join the ``'chunk'`` text of every entry with newlines.

    :param tools_output: List of dicts that each carry a ``'chunk'`` key.
    :return: The chunk texts separated by ``\\n``; empty string for an empty list.
    """
    chunk_texts = []
    for entry in tools_output:
        chunk_texts.append(entry['chunk'])
    return '\n'.join(chunk_texts)


tools_to_str(mistral_web_search(search_query))

Fetching page: 1
Fetching page: 2


'Mistral has also previously said it is teaming up with other big cloud providers including Amazon and Google.\nRELATED COVERAGE\nUS antitrust enforcers will investigate leading AI companies Microsoft, Nvidia and OpenAI\nAI ‘gold rush’ for chatbot training data could run out of human-written text\nFormer OpenAI employees lead push to protect whistleblowers flagging artificial intelligence risks\nMistral made a big splash by attracting big amounts of investor funding to give it a multibillion-dollar valuation just months after it was founded last spring. It was started by three French former researchers from Google and Meta: CEO Arthur Mensch, Chief Scientist Guillaume Lample and Chief Technology Officer Timothee Lacroix.\nIt has advertised an “open-source” approach to developing AI that involves publicly releasing key components of some AI systems, in contrast to companies such as OpenAI that closely guard them.\nSomething went wrong while submitting the form.PricingLog inSign upCase s

In [11]:
import functools

# Dispatch table: tool name requested by the model -> Python callable.
# The function name from a tool call is used as the lookup key, so every key
# here has to equal a "name" declared in `tools`.
# NOTE(review): functools.partial is called without any bound arguments, so
# the entry is called exactly like mistral_web_search itself.
names_to_functions = {
    'mistral_web_search': functools.partial(mistral_web_search),
}

## chat

In [12]:
# Conversation history sent to the chat model; starts with the user question.
# NOTE(review): ChatMessage is not imported in any visible cell - confirm
# where it is expected to come from.
messages = [
    ChatMessage(role="user", content="What happend during apple WWDC 2024?"),
]



In [13]:
model = "mistral-large-latest"

# NOTE(review): this rebinds the global `client`, which query_embeddings() and
# fetch_and_process_data() use for `client.embeddings.create(...)`. After this
# cell those helpers no longer talk to the OpenAI client created earlier -
# consider a separate name for the chat client (all later `client.chat` calls
# would need the same rename).
# NOTE(review): MistralClient is not imported in any visible cell - confirm.
client = MistralClient(api_key=MISTRAL_API_KEY)
# First round: the tool schema is sent along so the model can request a search.
response = client.chat(model=model, messages=messages, tools=tools, tool_choice="any")
response

ChatCompletionResponse(id='ed2e245edaf04b3e96fea1a914e9e97f', object='chat.completion', created=1718097895, model='mistral-large-latest', choices=[ChatCompletionResponseChoice(index=0, message=ChatMessage(role='assistant', content='', name=None, tool_calls=[ToolCall(id='wq5uJdILV', type=<ToolType.function: 'function'>, function=FunctionCall(name='mistral_web_search', arguments='{"search_query": "apple WWDC 2024"}'))], tool_call_id=None), finish_reason=<FinishReason.tool_calls: 'tool_calls'>)], usage=UsageInfo(prompt_tokens=121, total_tokens=156, completion_tokens=35))

In [14]:
messages.append(response.choices[0].message)

In [15]:
import json

# Take the first tool call of the model's reply.
tool_call = response.choices[0].message.tool_calls[0]
function_name = tool_call.function.name
# The arguments arrive as a JSON string; decode them into a dict that is
# later unpacked as keyword arguments.
function_params = json.loads(tool_call.function.arguments)


print("\nfunction_name: ", function_name, "\nfunction_params: ", function_params)


function_name:  mistral_web_search 
function_params:  {'search_query': 'apple WWDC 2024'}


In [16]:
function_result = tools_to_str(names_to_functions[function_name](**function_params))
function_result

Fetching page: 1
Fetching page: 2


"Apple WWDC 2024: the 13 biggest announcements - The VergeSkip to main contentThe VergeThe Verge logo.The Verge homepageThe Verge homepageThe VergeThe Verge logo./Tech/Reviews/Science/Entertainment/AI/MoreMenuExpandThe VergeThe Verge logo.MenuExpandWWDC 2024/Apple/TechApple WWDC 2024: the 13 biggest announcementsApple WWDC 2024: the 13 biggest announcements / Apple’s WWDC keynote had a lot to do with AI.By\nEmma Roth, a news writer who covers the streaming wars, consumer tech, crypto, social media, and much more. Previously, she was a writer and editor at MUO.\nJun 10, 2024, 6:57 PM UTCShare this storyApple’s Worldwide Developers Conference keynote has come to a close — and the company had a whole lot to share.\nFifty Distinguished Winners, who are recognised for outstanding submissions, will be invited to Cupertino for a three-day experience.\nApple will share additional conference information in advance of WWDC24 through the Apple Developer app and website.\nShare article\nMedia\nTex

In [17]:
# Feed the tool output back as a role="tool" message tied to the tool call id.
messages.append(ChatMessage(role="tool", name=function_name, content=function_result, tool_call_id=tool_call.id))

# Second round: no tools are passed, so the model answers from the history.
response = client.chat(model=model, messages=messages)
response.choices[0].message.content



"Apple's Worldwide Developers Conference (WWDC) took place on June 10, 2024. The keynote focused mainly on artificial intelligence, with several announcements made. Unfortunately, I don't have real-time information, so I can't provide the specific details of the 13 biggest announcements mentioned in the article from The Verge.\n\nHowever, some general expectations before the event included the possibility of new apps for iPadOS such as a native calculator and new features for Apple Pencil 3. For macOS, it was anticipated that there would be AI features and smart tools integrated into native apps like Pages, Keynote, and Xcode.\n\nThe event also included video sessions and opportunities for developers to engage with Apple designers and engineers. There was an in-person experience at Apple Park for selected developers to watch the keynote, meet with Apple team members, and participate in special activities. Apple also continued its tradition of supporting the next generation of developer

## Chat in a chain (cleaner user experience)

In [None]:
messages = []

# Interactive loop: every question triggers one tool call whose output is fed
# back to the model for the final answer. The history in `messages` is kept
# across questions.
# Leave the loop with an empty line, "exit" or "quit" (or EOF / Ctrl-C).
while True:
    try:
        input_ = input("Ask: ")
    except (EOFError, KeyboardInterrupt):
        break
    if input_.strip().lower() in {"", "exit", "quit"}:
        break

    messages.append(ChatMessage(role="user", content=input_))
    response = client.chat(model=model, messages=messages, tools=tools, tool_choice="any")
    assistant_message = response.choices[0].message
    messages.append(assistant_message)
    print(assistant_message.content)

    if not assistant_message.tool_calls:
        # No tool was requested; the content printed above is the answer.
        continue

    tool_call = assistant_message.tool_calls[0]
    function_name = tool_call.function.name
    function_params = json.loads(tool_call.function.arguments)

    # Run the requested tool and show which sources the answer is based on.
    function_result_raw = names_to_functions[function_name](**function_params)
    print("sources: ", [f"{source['title']} - {source['url']}" for source in function_result_raw])
    function_result_text = tools_to_str(function_result_raw)
    messages.append(ChatMessage(role="tool", name=function_name, content=function_result_text, tool_call_id=tool_call.id))

    # Second round without tools: the model answers from the tool output.
    response = client.chat(model=model, messages=messages)
    final_response = response.choices[0].message.content
    print(final_response)
