In [15]:
# from langchain_ollama import OllamaLLM
from langchain_ollama import ChatOllama

# llm = OllamaLLM(model='deepseekmini')
llm = ChatOllama(model='dsq4km', temperature=0.7)

In [16]:
from langchain.embeddings.base import Embeddings
from transformers import AutoTokenizer, AutoModel
import torch

class MiniLM(Embeddings):
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", device="cpu"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)
        self.device = device

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, texts):
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        return self._embed(text)

    def _embed(self, text):
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            model_output = self.model(**inputs)

        embedding = self.mean_pooling(model_output, inputs["attention_mask"])
        return embedding[0].cpu().numpy().tolist()

  from scipy.sparse import csr_matrix, issparse


In [17]:
embedding_fn = MiniLM(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    device="cuda"  # or "cpu"
)

In [18]:
try:
    from langchain_qdrant import QdrantVectorStore, RetrievalMode
    from qdrant_client import QdrantClient

    client = QdrantClient(url="http://localhost:6333")

    qdrant = QdrantVectorStore(
        client=client,
        collection_name="wikipedia",
        embedding=embedding_fn,
        retrieval_mode=RetrievalMode.DENSE,
    )
except Exception as e:
    print(e)

[WinError 10061] No connection could be made because the target machine actively refused it


In [19]:
from pydantic import BaseModel, Field
from langchain_core.prompts import PromptTemplate

class RetrievalResponse(BaseModel):
    response: str = Field(..., title="Retrieval Necessity Judgment", 
                         description="Whether retrieval is necessary for the query. Answer only 'Yes' or 'No'.")

retrieval_prompt = PromptTemplate(
    input_variables=["query"],
    template="""Determine if the following user query requires factual, real-world knowledge to answer. This includes questions about events, people, places, companies, historical facts, scientific concepts, or recent news.

Query: {query}

Answer only 'Yes' or 'No'. Do not output anything else."""
)

class RelevanceResponse(BaseModel):
    response: str = Field(..., title="Relevance Judgment", 
                         description="Whether the retrieved context is relevant to the query. Answer only 'Relevant' or 'Irrelevant'.")

relevance_prompt = PromptTemplate(
    input_variables=["query", "context"],
    template="Given the query '{query}' and the retrieved context '{context}', determine if the context is relevant to the query and provides useful information to complete the task. Only answer 'Relevant' or 'Irrelevant'. Do not output anything else."
)

class GenerationResponse(BaseModel):
    response: str = Field(..., title="Generated Response", 
                         description="The response generated based on the query and context.")

generation_prompt = PromptTemplate(
    input_variables=["query", "context"],
    template="""You are a helpful AI assistant. Generate a response to the query based on the provided context. If the context is relevant, use it to answer the query accurately. If the context is irrelevant, rely on your own knowledge but indicate any uncertainties or lack of information. Be concise and informative. If you are presented a multiple choice, only answer with the letter, do not include anything else.

Query: {query}
Context: {context}

Response:"""
)

class SupportResponse(BaseModel):
    response: str = Field(..., title="Support Judgment", 
                         description="Whether the response is supported by the context. Answer only 'Fully supported', 'Partially supported', or 'No support'.")

support_prompt = PromptTemplate(
    input_variables=["response", "context"],
    template="""Given the response '{response}' to a query, and the information provided '{context}' for response generation, determine if the response is supported by the information. 

Use the following entailment scale to generate a score:
- Fully supported: All information in output is supported by the evidence, or extraction from the evidence. This is only applicable when the output and part of the evidence are almost identical.
- Partially supported: The output is supported by the evidence to some extent, but there is major information in the output that is not discussed in the evidence.
- No support: The output completely ignores evidence, is unrelated to the evidence, or contradicts the evidence.

Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check whether the output is supported by the evidence.

Only answer 'Fully supported', 'Partially supported', or 'No support'. Do not output anything else."""
)

class UtilityResponse(BaseModel):
    response: int = Field(..., title="Utility Score", 
                         description="The utility score of the response from 1 to 5, where 5 is highest utility.")

utility_prompt = PromptTemplate(
    input_variables=["query", "response"],
    template="""Given the query '{query}' and the response '{response}', rate the perceived utility score of the response from 1 (lowest) to 5 (highest). 

The detailed criterion is as follows:
5: The response provides a complete, highly detailed, and informative response to the query, fully satisfying the information needs.
4: The response mostly fulfills the need in the query, while there can be some minor improvements such as discussing more detailed information, having better structure of the response, or improving coherence.
3: The response is acceptable, but some major additions or improvements are needed to satisfy users' needs.
2: The response still addresses the main request, but it is not complete or not relevant to the query.
1: The response is barely on-topic or completely irrelevant.

Only answer a single number between 1 and 5. Do not output anything else."""
)

In [20]:
retrieval_chain = retrieval_prompt | llm.with_structured_output(RetrievalResponse)
relevance_chain = relevance_prompt | llm.with_structured_output(RelevanceResponse)
generation_chain = generation_prompt | llm.with_structured_output(GenerationResponse)
support_chain = support_prompt | llm.with_structured_output(SupportResponse)
utility_chain = utility_prompt | llm.with_structured_output(UtilityResponse)

In [21]:
def score_response(response_tuple):
    response, support, utility = response_tuple
    support_score = 3 if support == 'fully supported' else (2 if support == 'partially supported' else 1)
    return support_score * 10 + utility

In [22]:
def self_rag(query, vectorstore, top_k=5):
    log = []  # collect logs here
    
    def log_msg(message):
        log.append(message)
    
    # Step 1: Determine if retrieval is necessary
    input_data = {"query": query}
    retrieval_decision = retrieval_chain.invoke(input_data).response.strip().lower()
    log_msg(f"Retrieval decision: {retrieval_decision}")
    
    if "yes" in retrieval_decision:
        # Step 2: Retrieve relevant documents
        docs = vectorstore.similarity_search(query, k=top_k)
        contexts = [doc.page_content for doc in docs]
        
        # Step 3: Evaluate relevance of retrieved documents
        relevant_contexts = []
        for i, context in enumerate(contexts):
            input_data = {"query": query, "context": context}
            relevance = relevance_chain.invoke(input_data).response.strip().lower()
            log_msg(f"Document {i+1} relevance: {relevance}")
            if relevance == 'relevant':
                relevant_contexts.append(context)
        
        log_msg(f"No of relevant context: {len(relevant_contexts)}")
        
        # If no relevant contexts found, generate without retrieval
        if not relevant_contexts:
            log_msg("No relevant contexts found. Generating without retrieval...")
            input_data = {"query": query, "context": "No relevant context found. Answer the query anyway."}
            return generation_chain.invoke(input_data).response, "\n".join(log)
        
        # Step 4: Generate response using relevant contexts
        responses = []
        for i, context in enumerate(relevant_contexts):
            input_data = {"query": query, "context": context}
            response = generation_chain.invoke(input_data).response
            
            # Step 5: Assess support
            input_data = {"response": response, "context": context}
            support = support_chain.invoke(input_data).response.strip().lower()
            log_msg(f"Support assessment for response {i+1}: {support}")
            
            # Step 6: Evaluate utility
            input_data = {"query": query, "response": response}
            utility = int(utility_chain.invoke(input_data).response)
            log_msg(f"Utility score for response {i+1}: {utility}")
            
            responses.append((response, support, utility))
        
        # Select the best response based on support and utility
        best_response = max(responses, key=score_response)
        response, support, utility = best_response

        log_msg(f"Best response support: {support}, utility: {utility}")

        if len(context) > 100:
            log_msg(f"Used context: {context[:100]}...")
        else:
            log_msg(f"Used context: {context}")
        
        return response, "\n".join(log)
    else:
        # Generate without retrieval
        log_msg("Generating without retrieval...")
        input_data = {"query": query, "context": "No retrieval necessary as the query can be answered with general knowledge."}
        return generation_chain.invoke(input_data).response, "\n".join(log)


In [23]:
from langchain.chains import RetrievalQA

# Create a prompt template that includes context
prompt_template = """Use the following pieces of context to answer the question. If you don't know the answer based on the context, or context is irrelevant, say so and answer with general knowledge.

Context: {context}

Question: {question}

Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)
try:
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=qdrant.as_retriever(search_kwargs={"k": 5}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )
except Exception as e:
    print(e)

name 'qdrant' is not defined


In [24]:
import os
from pathlib import Path

data_dir = Path.cwd().parent / "data"

triviaqa_dir = data_dir / "triviaqa" / "triviaqa-unfiltered" / "test.json"
popqa_dir = data_dir / "popqa" / "test.json"
arc_dir = data_dir / "arc" / "arc.json"
result_dir = data_dir / "results" / "deepseek"

In [25]:
def parse_arc_response(response):
    """
    Parses the response string to extract a single letter (A, B, C, D).
    Returns the extracted letter or an empty string if not found.
    """
    resp = response.strip()
    # Check for single letter
    if len(resp) == 1 and resp.upper() in ["A", "B", "C", "D"]:
        return resp.upper()
    # Check for letter followed by a period
    for prefix in ["A", "B", "C", "D"]:
        if resp.startswith(f"{prefix}.") or f" {prefix}." in resp:
            return prefix
    return ""  # No valid pattern found

In [26]:
import json
import random
import os

def evaluate(benchmark, mode="selfrag", name="default", limit=None, shuffle=False):
    """
    Evaluate benchmark questions with different modes.
    
    mode options:
        - "selfrag": uses self_rag(query, qdrant)
        - "norag":   uses llm.invoke(query)
        - "rag":     uses qa_chain.invoke({"query": query})
    """

    response_file = result_dir / (name + "-r.jsonl")
    log_file = result_dir / (name + "-l.jsonl")

    # load benchmark data
    with open(benchmark, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if shuffle:
        random.shuffle(data)

    if limit is None or limit > len(data) or limit < 0:
        limit = len(data)

    # --- Resume support: find last processed index ---
    last_idx = -1
    if response_file.exists():
        with open(response_file, 'r', encoding='utf-8') as rf:
            for line in rf:
                try:
                    record = json.loads(line)
                    last_idx = max(last_idx, record.get("id", -1))
                except json.JSONDecodeError:
                    continue
        mode_write = "a"  # append
    else:
        mode_write = "w"  # new file

    print(f"Resuming from idx {last_idx+1} out of {limit}")

    # open response and log files
    with open(response_file, mode_write, encoding='utf-8') as rf, open(log_file, mode_write, encoding='utf-8') as lf:
        for idx, item in enumerate(data[:limit]):
            if idx <= last_idx:
                continue

            query = item['Question']
            answer = item['Answer']
            original_response = None
            max_retries = 5  # Maximum number of retries for empty responses
            attempt = 0

            while attempt <= max_retries:
                # --- Run according to mode ---
                if mode == "selfrag":
                    response, log = self_rag(query, qdrant)
                    parsed_response = response
                    break  # No retry for selfrag mode
                elif mode == "norag":
                    if benchmark == arc_dir:
                        formatted_query = query + "Output only the corresponding letter, do not output anything else."
                    else:
                        formatted_query = query
                    original_response = llm.invoke(formatted_query).content
                    parsed_response = parse_arc_response(original_response) if benchmark == arc_dir else original_response
                    log = original_response
                    # Retry only for ARC if response is empty
                    if benchmark != arc_dir or parsed_response.strip() != "":
                        break
                elif mode == "rag":
                    if benchmark == arc_dir:
                        formatted_query = query + "Output only the corresponding letter, do not output anything else."
                    else:
                        formatted_query = query
                    result = qa_chain.invoke({"query": formatted_query})
                    original_response = result['result']
                    parsed_response = parse_arc_response(original_response) if benchmark == arc_dir else original_response
                    docs = result.get('source_documents', [])
                    log = {
                        "log_answer": original_response,
                        "source_documents": [doc.page_content for doc in docs]
                    }
                    # Retry only for ARC if response is empty
                    if benchmark != arc_dir or parsed_response.strip() != "":
                        break
                else:
                    raise ValueError(f"Unknown mode: {mode}")

                attempt += 1
                if attempt > max_retries:
                    print(f"Warning: Max retries reached for query ID {idx}. Using empty response.")
                    break

            # Determine effective response for correctness check
            effective_response = parsed_response if benchmark == arc_dir and mode in ["rag", "norag"] else response

            # --- Check correctness ---
            responselower = effective_response.lower()
            anslower = [ans.lower() for ans in answer]
            correct = 1 if any(ans in responselower for ans in anslower) else 0

            # --- Write response JSONL ---
            json.dump({
                'id': idx,
                'query': query,
                'response': effective_response,
                'answer': answer,
                'correct': correct
            }, rf, ensure_ascii=False)
            rf.write('\n')

            # --- Write log JSONL ---
            json.dump({
                'id': idx,
                'query': query,
                'log': log
            }, lf, ensure_ascii=False)
            lf.write('\n')

            if (idx + 1) % 10 == 0:
                print(f"Processed {idx + 1} / {limit} questions.")

In [27]:
def compute_accuracy(response_file):
    with open(response_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    total = len(lines)
    correct = sum(1 for line in lines if json.loads(line)['correct'] == 1)
    
    accuracy = correct / total if total > 0 else 0
    return accuracy

In [28]:
def analyze(log_file, response_file):
    # Track IDs by retrieval decision
    no_retrieval_ids = set()
    yes_retrieval_ids = set()
    
    with open(log_file, 'r', encoding='utf-8') as f:
        for line in f:
            log_entry = json.loads(line)
            if "Retrieval decision: no" in log_entry['log']:
                no_retrieval_ids.add(log_entry['id'])
            elif "Retrieval decision: yes" in log_entry['log']:
                yes_retrieval_ids.add(log_entry['id'])
                
    # Initialize counters
    no_correct = no_wrong = 0
    yes_correct = yes_wrong = 0
    total_correct = total_wrong = 0
    no_answer = 0
    with open(response_file, 'r', encoding='utf-8') as f:
        for line in f:
            response_entry = json.loads(line)
            rid = response_entry['id']
            correct = response_entry['correct'] == 1
            empty_response = response_entry['response'] == ""
            # Update totals
            if correct:
                total_correct += 1
            else:
                total_wrong += 1
            
            # Update retrieval-specific counts
            if rid in no_retrieval_ids:
                if correct:
                    no_correct += 1
                else:
                    no_wrong += 1
            elif rid in yes_retrieval_ids:
                if correct:
                    yes_correct += 1
                else:
                    yes_wrong += 1
            if empty_response:
                no_answer += 1

    
    total_answers = total_correct + total_wrong
    
    return {
        "total_answers": total_answers,
        "total_correct": total_correct,
        "total_wrong": total_wrong,
        "no_answer": no_answer,
        "no_retrieval_correct": no_correct,
        "no_retrieval_wrong": no_wrong,
        "yes_retrieval_correct": yes_correct,
        "yes_retrieval_wrong": yes_wrong,
    }

In [29]:
def count_inconsistent_retrieval(log_file):
    inconsistent_count = 0
    with open(log_file, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            log_entry = json.loads(line)
            if "Retrieval decision:" in log_entry['log']:
                if ("Retrieval decision: no" not in log_entry['log'] and
                    "Retrieval decision: yes" not in log_entry['log']):
                    inconsistent_count += 1
                    # print what comes after "Retrieval decision:"
                    print(log_entry['log'].split("Retrieval")[1].strip())
    return inconsistent_count


In [30]:
def accuracy_analysis(name):
    response_file = result_dir / f"{name}-r.jsonl"
    log_file = result_dir / f"{name}-l.jsonl"

    accuracy = compute_accuracy(response_file)
    try:
        analysis = analyze(log_file, response_file)
        inconsistent_count = count_inconsistent_retrieval(log_file)
    except Exception:
        analysis = {
            "total_answers": 0,
            "total_correct": 0,
            "total_wrong": 0,
            "no_answer": 0,
            "no_retrieval_correct": 0,
            "no_retrieval_wrong": 0,
            "yes_retrieval_correct": 0,
            "yes_retrieval_wrong": 0,
        }
        inconsistent_count = 0

    print(f"Overall Accuracy: {accuracy:.2%}")
    print(f"Total Answers: {analysis['total_answers']}")
    print(f"Total Correct: {analysis['total_correct']}")
    print(f"Total Wrong: {analysis['total_wrong']}")
    print(f"No Answer: {analysis['no_answer']}")
    print(f"No Retrieval - Correct: {analysis['no_retrieval_correct']}, Wrong: {analysis['no_retrieval_wrong']}")
    print(f"Yes Retrieval - Correct: {analysis['yes_retrieval_correct']}, Wrong: {analysis['yes_retrieval_wrong']}")
    print(f"Inconsistent Retrieval Decisions: {inconsistent_count}")


In [None]:
# evaluate(popqa_dir, mode="norag", name="popqa-default", limit=-1, shuffle=False)

Resuming from idx 0 out of 1399
Processed 10 / 1399 questions.
Processed 20 / 1399 questions.
Processed 30 / 1399 questions.
Processed 40 / 1399 questions.
Processed 50 / 1399 questions.
Processed 60 / 1399 questions.
Processed 70 / 1399 questions.
Processed 80 / 1399 questions.
Processed 90 / 1399 questions.
Processed 100 / 1399 questions.
Processed 110 / 1399 questions.
Processed 120 / 1399 questions.
Processed 130 / 1399 questions.
Processed 140 / 1399 questions.
Processed 150 / 1399 questions.
Processed 160 / 1399 questions.
Processed 170 / 1399 questions.
Processed 180 / 1399 questions.
Processed 190 / 1399 questions.
Processed 200 / 1399 questions.
Processed 210 / 1399 questions.
Processed 220 / 1399 questions.
Processed 230 / 1399 questions.
Processed 240 / 1399 questions.
Processed 250 / 1399 questions.
Processed 260 / 1399 questions.
Processed 270 / 1399 questions.
Processed 280 / 1399 questions.
Processed 290 / 1399 questions.
Processed 300 / 1399 questions.
Processed 310 / 1

In [None]:
# accuracy_analysis("popqa-default")

Overall Accuracy: 23.02%
Total Answers: 0
Total Correct: 0
Total Wrong: 0
No Retrieval - Correct: 0, Wrong: 0
Yes Retrieval - Correct: 0, Wrong: 0
Inconsistent Retrieval Decisions: 0


In [26]:
evaluate(arc_dir, mode="norag", name="arc-deepseek", limit=-1, shuffle=False)

Resuming from idx 0 out of 2590
Processed 10 / 2590 questions.
Processed 20 / 2590 questions.
Processed 30 / 2590 questions.
Processed 40 / 2590 questions.
Processed 50 / 2590 questions.
Processed 60 / 2590 questions.
Processed 70 / 2590 questions.
Processed 80 / 2590 questions.
Processed 90 / 2590 questions.
Processed 100 / 2590 questions.
Processed 110 / 2590 questions.
Processed 120 / 2590 questions.
Processed 130 / 2590 questions.
Processed 140 / 2590 questions.
Processed 150 / 2590 questions.
Processed 160 / 2590 questions.
Processed 170 / 2590 questions.
Processed 180 / 2590 questions.
Processed 190 / 2590 questions.
Processed 200 / 2590 questions.
Processed 210 / 2590 questions.
Processed 220 / 2590 questions.
Processed 230 / 2590 questions.
Processed 240 / 2590 questions.
Processed 250 / 2590 questions.
Processed 260 / 2590 questions.
Processed 270 / 2590 questions.
Processed 280 / 2590 questions.
Processed 290 / 2590 questions.
Processed 300 / 2590 questions.
Processed 310 / 2

In [10]:
accuracy_analysis("arc-deepseek")

Overall Accuracy: 35.33%
Total Answers: 2590
Total Correct: 915
Total Wrong: 1675
No Answer: 579
No Retrieval - Correct: 0, Wrong: 0
Yes Retrieval - Correct: 0, Wrong: 0
Inconsistent Retrieval Decisions: 0


In [39]:
import json
from pathlib import Path

def revise_arc_responses(name, max_retries=3):
    """
    Create revised versions of response and log files by retrying empty responses
    for ARC benchmark in norag mode.
    
    Args:
        name: Base name of the files to revise
        max_retries: Maximum number of retry attempts for empty responses
    """
    # Define file paths
    response_file = result_dir / (name + "-r.jsonl")
    log_file = result_dir / (name + "-l.jsonl")
    revised_response_file = result_dir / (name + "-revised-r.jsonl")
    revised_log_file = result_dir / (name + "-revised-l.jsonl")
    
    # Check if files exist
    if not response_file.exists() or not log_file.exists():
        print(f"Error: Files for {name} not found")
        return
    
    # Read all lines from original files
    with open(response_file, 'r', encoding='utf-8') as rf:
        response_lines = rf.readlines()
    
    with open(log_file, 'r', encoding='utf-8') as lf:
        log_lines = lf.readlines()
    
    # Process each line
    with open(revised_response_file, 'w', encoding='utf-8') as rrf, \
         open(revised_log_file, 'w', encoding='utf-8') as rlf:
        
        for resp_line, log_line in zip(response_lines, log_lines):
            try:
                # Parse JSON from both files
                resp_data = json.loads(resp_line)
                log_data = json.loads(log_line)
                
                # Check if response is empty and needs retry
                if (resp_data.get('response', '').strip() == '' and 
                    resp_data.get('id') == log_data.get('id')):
                    
                    print(f"Retrying empty response for ID {resp_data['id']}")
                    query = resp_data['query']
                    answer = resp_data['answer']
                    
                    # Format query for ARC
                    formatted_query = query + "Output only the corresponding letter, do not output anything else."
                    
                    # Retry up to max_retries times
                    for attempt in range(max_retries):
                        original_response = llm.invoke(formatted_query).content
                        parsed_response = parse_arc_response(original_response)
                        
                        if parsed_response.strip() != "":
                            # Success - update response and log
                            responselower = parsed_response.lower()
                            anslower = [ans.lower() for ans in answer]
                            correct = 1 if any(ans in responselower for ans in anslower) else 0
                            
                            resp_data['response'] = parsed_response
                            resp_data['correct'] = correct
                            log_data['log'] = original_response
                            
                            print(f"Success on attempt {attempt + 1}")
                            break
                        
                        print(f"Empty response on attempt {attempt + 1}")
                    
                    # If still empty after retries, keep original empty response
                    if parsed_response.strip() == "":
                        print(f"All retries failed for ID {resp_data['id']}")
                
                # Write the (possibly updated) data to revised files
                json.dump(resp_data, rrf, ensure_ascii=False)
                rrf.write('\n')
                json.dump(log_data, rlf, ensure_ascii=False)
                rlf.write('\n')
                
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON for line: {e}")
                # Write original lines if there's an error
                rrf.write(resp_line)
                rlf.write(log_line)
    
    print(f"Revision complete. Revised files saved as:")
    print(f"  - {revised_response_file}")
    print(f"  - {revised_log_file}")

In [32]:
revise_arc_responses("arc-deepseek", max_retries=5)

Retrying empty response for ID 0
Success on attempt 1
Retrying empty response for ID 3
Success on attempt 1
Retrying empty response for ID 4
Empty response on attempt 1
Success on attempt 2
Retrying empty response for ID 7
Empty response on attempt 1
Success on attempt 2
Retrying empty response for ID 8
Success on attempt 1
Retrying empty response for ID 10
Empty response on attempt 1
Empty response on attempt 2
Empty response on attempt 3
Empty response on attempt 4
Empty response on attempt 5
All retries failed for ID 10
Retrying empty response for ID 11
Success on attempt 1
Retrying empty response for ID 16
Success on attempt 1
Retrying empty response for ID 20
Success on attempt 1
Retrying empty response for ID 25
Success on attempt 1
Retrying empty response for ID 36
Success on attempt 1
Retrying empty response for ID 47
Success on attempt 1
Retrying empty response for ID 50
Success on attempt 1
Retrying empty response for ID 54
Empty response on attempt 1
Empty response on attemp

In [None]:
accuracy_analysis("arc-deepseek")
accuracy_analysis("arc-deepseek-revised")
accuracy_analysis("arc-deepseek-revised-revised")

Overall Accuracy: 35.33%
Total Answers: 2590
Total Correct: 915
Total Wrong: 1675
No Answer: 579
No Retrieval - Correct: 0, Wrong: 0
Yes Retrieval - Correct: 0, Wrong: 0
Inconsistent Retrieval Decisions: 0
Overall Accuracy: 43.55%
Total Answers: 2590
Total Correct: 1128
Total Wrong: 1462
No Answer: 56
No Retrieval - Correct: 0, Wrong: 0
Yes Retrieval - Correct: 0, Wrong: 0
Inconsistent Retrieval Decisions: 0
Overall Accuracy: 43.78%
Total Answers: 2590
Total Correct: 1134
Total Wrong: 1456
No Answer: 32
No Retrieval - Correct: 0, Wrong: 0
Yes Retrieval - Correct: 0, Wrong: 0
Inconsistent Retrieval Decisions: 0


In [40]:
revise_arc_responses("arc-deepseek-revised", max_retries=10)

Retrying empty response for ID 10
Success on attempt 1
Retrying empty response for ID 84
Empty response on attempt 1
Empty response on attempt 2
Empty response on attempt 3
Empty response on attempt 4
Success on attempt 5
Retrying empty response for ID 103
Empty response on attempt 1
Empty response on attempt 2
Empty response on attempt 3
Empty response on attempt 4
Empty response on attempt 5
Empty response on attempt 6
Empty response on attempt 7
Empty response on attempt 8
Success on attempt 9
Retrying empty response for ID 128
Empty response on attempt 1
Empty response on attempt 2
Empty response on attempt 3
Empty response on attempt 4
Empty response on attempt 5
Empty response on attempt 6
Empty response on attempt 7
Empty response on attempt 8
Empty response on attempt 9
Empty response on attempt 10
All retries failed for ID 128
Retrying empty response for ID 158
Empty response on attempt 1
Success on attempt 2
Retrying empty response for ID 226
Empty response on attempt 1
Empty