In [35]:
import os
import random
from collections import Counter
from datasets import load_dataset
from together import Together
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Self-consistency implementation with majority-vote decision, compared against greedy decoding

In [36]:
# Initialize the Together client.
# SECURITY: the API key must be supplied through the environment and never be
# written into the notebook. Any key that was previously committed in this cell
# should be treated as leaked and revoked.
if not os.environ.get("TOGETHER_API_KEY"):
    raise RuntimeError(
        "TOGETHER_API_KEY is not set; export it before starting the kernel."
    )
# Together() is called without arguments, so it is expected to pick the key up
# from the TOGETHER_API_KEY environment variable checked above.
client = Together()

total_questions = 500
SAMPLE_SEED = 42  # fixed seed so a kernel restart draws the same questions

# Load the CommonsenseQA validation split and draw a reproducible random subset
# of `total_questions` items. The size is capped at the split length because
# random.sample() raises ValueError when asked for more items than exist.
dataset = load_dataset("commonsense_qa", split="validation")
sampled_dataset = random.Random(SAMPLE_SEED).sample(
    list(dataset), min(total_questions, len(dataset))
)

def format_prompt(question, choices):
    """Render a multiple-choice question as a single prompt string.

    Args:
        question: The question text.
        choices: Mapping with parallel sequences under 'label' and 'text';
            they are paired positionally into "<label>. <text>" lines.

    Returns:
        The prompt: question line, "Choices:" header, one line per option,
        and a closing instruction to answer with the letter.
    """
    option_lines = []
    for label, text in zip(choices['label'], choices['text']):
        option_lines.append(f"{label}. {text}")
    rendered_options = "\n".join(option_lines)
    return (
        f"Question: {question}\n"
        "Choices:\n"
        f"{rendered_options}\n"
        "Answer with the correct letter"
    )

def get_response(prompt, temperature=0.0):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the user message.
        temperature: Sampling temperature forwarded to the completion call
            (default 0.0).

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        messages=chat_messages,
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def sc_most_common(question, choices, n_samples=3):
    """Self-consistency by exact-string majority vote.

    Samples `n_samples` responses at temperature 0.7 for the formatted prompt
    and picks the response string that occurs most often.

    Returns:
        A tuple (most_frequent_response, all_responses).
    """
    prompt = format_prompt(question, choices)

    answers = []
    for _ in range(n_samples):
        answers.append(get_response(prompt, temperature=0.7))

    tally = Counter(answers)
    top_entries = tally.most_common(1)
    winner = top_entries[0][0]
    return winner, answers

# Evaluate self-consistency: module-level tallies, all initialised to zero.
# NOTE(review): run_tests() works with its own local values and does not appear
# to read these globals -- confirm whether they are still needed.
self_consistency_accuracy = greedy_accuracy = 0
difference = better = worse = 0


In [37]:
# Results are cached in a CSV so an interrupted run can be resumed.
csv_file_path = "mistral_7b_instruct_output.csv"
RESULT_COLUMNS = ['Question', 'Answer Match Greedy', 'Answer Match Self-consistency', 'Correct Answer', 'Greedy Answer', 'Self Answer','Approaches Agree']

if os.path.exists(csv_file_path):
    df = pd.read_csv(csv_file_path)
    # A CSV written together with its index comes back with an extra
    # "Unnamed: 0" column (and "Unnamed: 0.1", ... after repeated save/load
    # cycles). Those columns carry no results, so drop them on load.
    index_artifacts = df.columns.astype(str).str.startswith("Unnamed:")
    df = df.loc[:, ~index_artifacts]
else:
    # No cache yet: start from an empty frame with the expected columns.
    df = pd.DataFrame(columns=RESULT_COLUMNS)

In [38]:
def extract_option(answer):
    """Extract the option letter ('A'-'E') from a model answer.

    The previous substring checks returned the first letter in A..E that
    happened to start the text or precede a space/period, so "Answer: B" was
    read as 'A' and "Based on ... the answer is D" as 'B'. Matching is now done
    on standalone uppercase letters, tried from most to least specific:

    1. the answer starts with the letter, optionally parenthesised, followed
       by '.', ')', ':' or the end of the text ("B", "B. bank", "(B) bank");
    2. an explicit "answer is X" / "Answer: X" phrase;
    3. the first standalone letter followed by '.', ')' or ':';
    4. the first standalone letter anywhere.

    Args:
        answer: The raw answer text. Non-string values yield None.

    Returns:
        The option letter as a one-character string, or None if no standalone
        option letter is found.
    """
    import re  # local import so the block does not depend on a new top-level import

    if not isinstance(answer, str):
        return None

    text = answer.strip()
    patterns = (
        r"^\(?([A-E])\)?\s*(?:[.):]|$)",
        r"[Aa]nswer(?:\s+is)?\s*:?\s*\(?([A-E])\b",
        r"\b([A-E])[.):]",
        r"\b([A-E])\b",
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return None  # Return None if no valid option is found

def run_tests(sampled_dataset, self_consistent_answer, n_samples=6, output_path=None):
    """Compare greedy decoding with a self-consistency strategy and report accuracy.

    For every item whose question is not yet recorded in the module-level
    ``df``, this queries the model once greedily (temperature 0.0), calls
    ``self_consistent_answer`` for the sampled answer, scores both against the
    answer key, appends one row to ``df`` and rewrites the results CSV. After
    the loop it prints accuracy and disagreement statistics over all of ``df``.

    Args:
        sampled_dataset: Iterable of items with "question", "choices"
            (mapping with 'label' and 'text' lists) and "answerKey".
        self_consistent_answer: Callable
            ``(question, choices, n_samples=...) -> (prediction, answers)``.
        n_samples: Number of samples requested from ``self_consistent_answer``
            (default 6, the value previously hard-coded in the loop).
        output_path: CSV path to write; defaults to the module-level
            ``csv_file_path``.

    Returns:
        None. Results are stored in ``df``, written to CSV and printed.
    """
    if output_path is None:
        output_path = csv_file_path

    # New rows get labels after the existing ones. Using the enumerate counter
    # as the row label overwrote rows that had been loaded from the CSV.
    next_row = 0 if len(df) == 0 else int(df.index.max()) + 1
    seen_questions = set(df['Question'].values)

    for idx, item in enumerate(sampled_dataset, 1):
        question = item["question"]
        if question in seen_questions:
            print(f"Skipping question {idx} as it is already in the dataframe.")
            continue

        print(f"\n--- Question {idx} ---")

        # Do all model calls BEFORE touching df: if a call fails or the run is
        # interrupted, no half-filled row is left behind to be skipped later.
        greedy_answer = get_response(format_prompt(question, item["choices"]), temperature=0.0)
        prediction, _answers = self_consistent_answer(question, item["choices"], n_samples=n_samples)

        answer_key = item["answerKey"]
        key_position = item['choices']['label'].index(answer_key)
        correct_answer = answer_key + ". " + item['choices']['text'][key_position]

        # Extract options from greedy_answer and prediction
        greedy_option = extract_option(greedy_answer)
        self_option = extract_option(prediction)

        record = {
            'Question': question,
            'Greedy Answer': greedy_answer,
            'Self Answer': prediction,
            'Correct Answer': correct_answer,
            'Answer Match Greedy': answer_key == greedy_option,
            'Answer Match Self-consistency': answer_key == self_option,
            'Approaches Agree': greedy_option == self_option,
        }
        for column, value in record.items():
            df.loc[next_row, column] = value
        seen_questions.add(question)
        next_row += 1

        # index=False: writing the index adds an "Unnamed: 0" column on reload.
        df.to_csv(output_path, index=False)

    print("--- Evaluation ---")
    # Rows appended one cell at a time can leave these columns as object dtype;
    # cast to float so the mean is a plain fraction of True values.
    greedy_accuracy = df['Answer Match Greedy'].astype(float).mean() * 100  # Percentage of correct greedy answers
    self_consistency_accuracy = df['Answer Match Self-consistency'].astype(float).mean() * 100  # Percentage of correct self-consistent answers

    # Calculate differences
    evaluated = len(df)
    disagree = df['Approaches Agree'] == False  # noqa: E712 (element-wise comparison)
    disagreements = df[disagree].shape[0]  # Count of disagreements
    better_self = df[disagree & (df['Answer Match Self-consistency'] == True)].shape[0]  # noqa: E712
    worse_self = df[disagree & (df['Answer Match Greedy'] == True)].shape[0]  # noqa: E712

    # Print results
    print(f"Total Questions: {evaluated}")
    print(f"Greedy Accuracy: {greedy_accuracy:.2f}%")
    print(f"Self-Consistency Accuracy: {self_consistency_accuracy:.2f}%")
    print(f"Disagreements: {disagreements}")
    print(f"Better Self-Consistency: {better_self}")
    print(f"Worse Self-Consistency: {worse_self}")

In [39]:
def cluster_and_majority_vote(answers, threshold=0.7):
    """
    Cluster answers based on cosine similarity and apply majority vote within clusters.

    Answers are embedded with TF-IDF and grouped greedily: each not-yet-assigned
    answer seeds a cluster and absorbs every later unassigned answer whose
    cosine similarity to the seed is at least ``threshold``. The most frequent
    answer string in the largest cluster is returned (the first cluster wins
    ties in size).

    Single-character tokens are kept when vectorising. The default
    ``token_pattern`` only keeps tokens of two or more characters, which
    discards the option letters themselves: bare answers such as "B" became
    all-zero vectors that could never cluster, and a batch made only of bare
    letters raised "empty vocabulary".

    Args:
        answers (list): List of answers generated by the model.
        threshold (float): Cosine similarity threshold to form clusters.

    Returns:
        str: The most common answer from the largest cluster.

    Raises:
        ValueError: If ``answers`` is empty.
    """
    answers = list(answers)
    if not answers:
        raise ValueError("cluster_and_majority_vote() requires at least one answer")

    # When every answer is the same string the result is that string no matter
    # how the clusters fall, so skip vectorisation entirely.
    if len(set(answers)) == 1:
        return answers[0]

    # Convert answers to TF-IDF vectors
    try:
        tfidf_matrix = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b").fit_transform(answers)
    except ValueError:
        # No usable tokens at all (e.g. only punctuation or empty strings):
        # fall back to a plain majority vote over the raw strings.
        return Counter(answers).most_common(1)[0][0]
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Clustering based on similarity threshold
    clusters = []
    visited = set()
    for i in range(len(answers)):
        if i in visited:
            continue
        cluster = [i]
        visited.add(i)
        for j in range(len(answers)):
            if j not in visited and similarity_matrix[i, j] >= threshold:
                cluster.append(j)
                visited.add(j)
        clusters.append(cluster)

    # Find the largest cluster and apply majority vote
    largest_cluster = max(clusters, key=len)
    clustered_answers = [answers[i] for i in largest_cluster]
    most_common_answer = Counter(clustered_answers).most_common(1)[0][0]

    return most_common_answer

# Update the self-consistency function to use clustering
def sc_with_clustering(question, choices, n_samples=3, threshold=0.7):
    """
    Self-consistency where the vote is taken inside the largest similarity cluster.

    Args:
        question (str): The question text.
        choices (dict): The answer choices.
        n_samples (int): Number of responses to sample (temperature 0.7).
        threshold (float): Cosine similarity threshold passed to the clustering step.

    Returns:
        tuple: (final_answer, answers) -- the answer selected by
        cluster_and_majority_vote and the list of all sampled responses.
    """
    prompt = format_prompt(question, choices)

    sampled = []
    for _ in range(n_samples):
        sampled.append(get_response(prompt, temperature=0.7))

    return cluster_and_majority_vote(sampled, threshold=threshold), sampled

# Run the greedy vs. self-consistency comparison on the sampled questions,
# using the clustering-based voter as the self-consistency strategy.
run_tests(sampled_dataset,sc_with_clustering)


--- Question 1 ---

--- Question 2 ---

--- Question 3 ---

--- Question 4 ---

--- Question 5 ---

--- Question 6 ---

--- Question 7 ---

--- Question 8 ---

--- Question 9 ---

--- Question 10 ---

--- Question 11 ---

--- Question 12 ---

--- Question 13 ---

--- Question 14 ---

--- Question 15 ---

--- Question 16 ---

--- Question 17 ---

--- Question 18 ---

--- Question 19 ---

--- Question 20 ---

--- Question 21 ---

--- Question 22 ---

--- Question 23 ---

--- Question 24 ---

--- Question 25 ---

--- Question 26 ---

--- Question 27 ---

--- Question 28 ---
Skipping question 29 as it is already in the dataframe.

--- Question 30 ---

--- Question 31 ---

--- Question 32 ---

--- Question 33 ---
Skipping question 34 as it is already in the dataframe.

--- Question 35 ---

--- Question 36 ---

--- Question 37 ---

--- Question 38 ---

--- Question 39 ---

--- Question 40 ---

--- Question 41 ---

--- Question 42 ---

--- Question 43 ---
Skipping question 44 as it is alread

KeyboardInterrupt: 