## Queries

In [18]:
import pandas as pd

# Read the query set once; later cells iterate over its "Query" and "Prompt Type" columns.
queries_path = "../Data/Queries_IT_SIGIR.csv"
queries = pd.read_csv(queries_path)
queries.head()

Unnamed: 0,Query,Prompt Type
0,Chi era il padre di Micerino ?,General
1,Il figlio di Chefren,General
2,Il padre di Micerino,General
3,Micerino,General
4,Quanto e alta la tomba di Cheope?,General


## Bing

In [19]:
import json
from tqdm.notebook import tqdm
import httpx
from datetime import datetime

# Load the Bing subscription key and endpoint from the local key file.
# The context manager closes the handle even if the JSON is malformed or
# a lookup below were to be moved inside the block and fail.
with open("API_keys.json") as f:
    data = json.load(f)

key1 = data["bing"]["key1"]
SERP_endpoint = data["bing"]["SERP_endpoint"]
location = data["bing"]["location"]

# Headers sent with every search request: the subscription key plus
# browser-like Accept/User-Agent headers.
headers = {
    'Ocp-Apim-Subscription-Key': key1,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}

# Accumulator for the result rows built by the query loop.
SERP_results = []
# Date stamp stored with every row, formatted as YYYY_MM_DD.
today = datetime.now().strftime("%Y_%m_%d")

# Query Bing once per row and record the top results as
# [query, prompt type, system label, "title. snippet", rank, date] rows.
for _, row in tqdm(queries.iterrows(), total=len(queries)):
    query = row["Query"]
    prompt_type = row["Prompt Type"]
    params = {
        'q': query,
        'count': 5, # number of results to be displayed
        'setLang': 'it-IT',
        'mkt':'it-IT'
    }

    # Fall back to the query we sent, so the failure row below never reads an
    # unbound name (first iteration) or a stale value from a previous query.
    asked_query = query
    try:
        # The request itself is inside the try so that a transport error on one
        # query does not abort the whole run and discard the collected rows.
        SERP_response = httpx.get(url=SERP_endpoint, headers=headers, params=params)
        # Surface HTTP errors (bad key, quota, ...) explicitly instead of as a
        # confusing KeyError while parsing the error payload.
        SERP_response.raise_for_status()
        SERP_result_set = SERP_response.json()
        asked_query = SERP_result_set['queryContext']['originalQuery']
        # Build this query's rows first; they are only kept if every result parses,
        # so a failure never leaves partial rows followed by a placeholder row.
        query_rows = [
            [asked_query, prompt_type, "Bing", result["name"] + ". " + result["snippet"], rank, today]
            for rank, result in enumerate(SERP_result_set['webPages']['value'], start=1)
        ]
    except Exception as error:
        # Best-effort: report the failure and keep a placeholder row for this query.
        print(f"Bing request failed for query {query!r}: {error!r}")
        query_rows = [[asked_query, prompt_type, "Bing", None, None, today]]

    SERP_results.extend(query_rows)

SERP_df = pd.DataFrame(SERP_results, columns=["Query", "Prompt Type", "IAS", "Resp", "Rank", "date_generated"])
SERP_df.to_csv("../Data/Bing_resp.csv", index=False)

  0%|          | 0/176 [00:00<?, ?it/s]

## ChatGPT

In [2]:
from openai import OpenAI
import json
from tqdm.notebook import tqdm
from datetime import date

# Load the OpenAI API key from the local key file; the context manager
# closes the handle even if parsing the JSON raises.
with open("API_keys.json") as f:
    data = json.load(f)

API_KEY = data["chatgpt"]["api_key"]

# Client used by get_gpt_resp; the key is passed explicitly.
client = OpenAI(
    api_key=API_KEY,
)

# Chat model requested by get_gpt_resp.
model_id = 'gpt-3.5-turbo'

def get_gpt_resp(query):
    """Send one query to the chat model and return the reply text.

    Each request is prefixed with a fixed system message instructing the
    model to answer in Italian only (English allowed only inside code).
    Uses the module-level ``client`` and ``model_id``.

    Args:
        query (str): The user message to send.

    Returns:
        The ``content`` of the first choice of the completion.
    """
    system_msg = {
        "role": "system",
        "content": "Follow these two instructions in all your responses: 1. Use Italian language only; 2. Do not use English except in programming language if any.",
    }
    user_msg = {"role": "user", "content": query}

    completion = client.chat.completions.create(
        model=model_id,
        messages=[system_msg, user_msg],
    )
    return completion.choices[0].message.content

In [4]:
# One chat-model call per query; each record is
# [query, prompt type, system label, response text, generation date].
# Rows are appended as they arrive so partial results remain in LLM_resp
# if the run is interrupted.
LLM_resp = []

query_pairs = zip(queries["Query"], queries["Prompt Type"])
for query, prompt_type in tqdm(query_pairs, total=len(queries)):
    answer = get_gpt_resp(query)
    LLM_resp.append([query, prompt_type, "GPT", answer, date.today()])

LLM_resp_df = pd.DataFrame(
    LLM_resp,
    columns=["Query", "Prompt Type", "IAS", "Resp", "date_generated"],
)
LLM_resp_df.to_csv("../Data/GPT_resp.csv", index=False)

LLM_resp_df

  0%|          | 0/110 [00:00<?, ?it/s]

Unnamed: 0,Original Query,Reformed Query,Gender,GPT,date_generated
0,Chi era il padre di Micerino ?,La seguente domanda è rivolta da un bambino di...,male,Il padre di Micerino era Chefren.,2024-10-15
1,Chi era il padre di Micerino ?,La seguente domanda è rivolta da un bambino di...,female,Il padre di Micerino era Chefren. Chefren è co...,2024-10-15
2,Il figlio di Chefren,La seguente domanda è rivolta da un bambino di...,male,"Mi dispiace, ma non posso fornire risposte a d...",2024-10-15
3,Il figlio di Chefren,La seguente domanda è rivolta da un bambino di...,female,Il figlio di Chefren è Micerino. Chefren è sta...,2024-10-15
4,Il padre di Micerino,La seguente domanda è rivolta da un bambino di...,male,Il padre di Micerino si chiamava Chefren ed er...,2024-10-15
...,...,...,...,...,...
105,tornado velocita mostro nero,La seguente domanda è rivolta da una persona d...,neutral,Ciao! Posso aiutarti a capire meglio di cosa h...,2024-10-27
106,velocita tornado,La seguente domanda è rivolta da una persona d...,neutral,"Mi dispiace, posso offrirti informazioni solo ...",2024-10-27
107,vulcani tipologia di eruzioni danni,La seguente domanda è rivolta da una persona d...,neutral,I vulcani possono avere diversi tipi di eruzio...,2024-10-27
108,vulcano attivo antico,La seguente domanda è rivolta da una persona d...,neutral,Un vulcano attivo antico potrebbe essere il Ve...,2024-10-27


## Google Gemma

In [1]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub (renders the login widget in the cell output).
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Hub identifier of the checkpoint to load.
model_name = "google/gemma-2b-it"

# Loading options: automatic device placement and bfloat16 weights.
# NOTE(review): trust_remote_code=True permits repository-supplied Python to run
# while loading — confirm it is actually required for this checkpoint.
model_load_kwargs = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Text-generation pipeline wrapping the model and tokenizer; used by get_gemma_resp.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

def get_gemma_resp(query):
  """Generate a reply to a single query with the module-level ``pipe`` pipeline.

  The query is sent as the last turn of a fixed three-turn chat: an empty
  user turn, an assistant turn carrying the Italian-only instructions, and
  finally the query itself.

  Args:
    query (str): The user message to answer.

  Returns:
    str: The generated text of the first output (``return_full_text`` is
      disabled, so only the generated part is requested).
  """
  italian_only = "Follow these two instructions in all your responses: 1. Use Italian language only; 2. Do not use English except in programming language if any."
  chat = [
      {"role": "user", "content": ""},
      {"role": "assistant", "content": italian_only},
      {"role": "user", "content": query},
  ]

  # "max_new_tokens": 256 could be passed here to cap the response length (currently unset).
  generated = pipe(chat, return_full_text=False)
  return generated[0]['generated_text']

# get_gemma_resp("Hi!")

In [15]:
from tqdm.notebook import tqdm
from datetime import date
import pandas as pd

def create_resp_file():
  """Collect a Gemma response for every query and return them as a DataFrame.

  Reads the module-level ``queries`` frame (its "Query" and "Prompt Type"
  columns) and calls ``get_gemma_resp`` once per row. Despite the name,
  nothing is written to disk here; the caller saves the returned frame.

  Returns:
    pandas.DataFrame: One row per query with the columns "Query",
      "Prompt Type", "IAS" (always "Gemma"), "Resp" and "date_generated".
  """
  query_pairs = zip(queries["Query"], queries["Prompt Type"])
  records = [
      [query, prompt_type, "Gemma", get_gemma_resp(query), date.today()]
      for query, prompt_type in tqdm(query_pairs, total=len(queries))
  ]

  return pd.DataFrame(
      records,
      columns=["Query", "Prompt Type", "IAS", "Resp", "date_generated"],
  )

In [None]:
# Run the Gemma generation over all queries and write the responses to CSV (index omitted).
file_name = "../Data/Gemma_resp.csv"
df = create_resp_file()
df.to_csv(file_name, index=False)