In [None]:
# Progress message emitted before the import statements below are executed.
print("Importing packages...")
from openicl import DatasetReader, PromptTemplate, BM25Retriever, CoTInferencer, TopkRetriever, BaseRetriever, RandomRetriever
from QKPRetriever import EmbeddingDistanceRetriever
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance
import openai
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np

# Configure the PyTorch CUDA allocator through its environment variable.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"

# Secrets must come from the process environment or a local, git-ignored .env
# file (loaded here) -- never from a literal in the notebook.
# NOTE(review): a key was previously committed in this cell; it should be
# treated as compromised and revoked/rotated.
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
if not openai.api_key:
    # Fail fast with an actionable message instead of an authentication error
    # deep inside the inference call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a "
        ".env file next to this notebook."
    )

In [None]:
# Dataset + prompt template setup
print("Loading dataset...")
data = DatasetReader(
    'gsm8k',
    name='main',
    input_columns=['question'],
    output_column='answer',
    ds_size=25,  # presumably limits how many examples are loaded -- confirm in openicl
)

# Report the size of each split, one number per line.
print(len(data['train']), len(data['test']), sep='\n')

# Prompt layout: '</E>' is the placeholder for in-context examples, while
# '</Q>' and '</A>' are the placeholders mapped to the question/answer columns.
template = PromptTemplate(
    '</E> Question: </Q> \n Answer: </A>',
    {'question': '</Q>', 'answer': '</A>'},
    ice_token='</E>',
)


In [None]:
# Progress message for this cell.
# NOTE(review): only the retriever is created in this cell; the inferencer is
# constructed in a later cell, so the wording is slightly ahead of itself.
print("Initiating retriever and inferencer...")

def roberta_encoding(dataset_reader: DatasetReader, index_split, test_split,
                     index_limit=5, test_limit=25,
                     model_name='sentence-transformers/all-mpnet-base-v2'):
    """Embed the leading rows of the index and test splits with a sentence encoder.

    For each row, the values of ``dataset_reader.input_columns`` are joined
    with single spaces and the resulting strings are passed to
    ``SentenceTransformer.encode``.

    NOTE(review): despite the function name, the default model is an MPNet
    checkpoint, not RoBERTa. The name is kept so existing callers still work.

    Args:
        dataset_reader: Reader exposing ``.dataset[split]`` (iterable of rows)
            and ``.input_columns``.
        index_split: Key of the split the in-context examples are drawn from.
        test_split: Key of the split holding the queries.
        index_limit: Maximum number of index rows to embed (``None`` = all).
            Defaults to 5, the value that used to be hard-coded.
            NOTE(review): this caps the candidate pool the retriever can pick
            from -- confirm that such a small pool is intended.
        test_limit: Maximum number of test rows to embed (``None`` = all).
            Defaults to 25, the value that used to be hard-coded.
        model_name: Sentence-Transformers model identifier to load.

    Returns:
        Tuple ``(prompt_embeddings, test_embeddings)`` as returned by
        ``model.encode`` for the index and test texts respectively.
    """
    from itertools import islice

    def _joined_inputs(split, limit):
        # islice stops after `limit` rows instead of walking the whole split
        # only to discard everything past the cap.
        rows = islice(dataset_reader.dataset[split], limit)
        return [" ".join(row[c] for c in dataset_reader.input_columns) for row in rows]

    index_texts = _joined_inputs(index_split, index_limit)
    test_texts = _joined_inputs(test_split, test_limit)

    model = SentenceTransformer(model_name)
    prompt_embeddings = model.encode(index_texts)
    test_embeddings = model.encode(test_texts)
    return prompt_embeddings, test_embeddings

def euclidian_distance(a, b):
    """Return the L2 norm of ``a - b`` taken along the first axis.

    Both arguments are converted with ``np.array`` and must yield 2-D arrays
    (the einsum subscripts require exactly two dimensions). The squared
    differences are summed over axis 0, so the result has one entry per
    column.

    NOTE(review): if rows are meant to be the vectors, the reduction axis
    would need to be the second one instead -- confirm the intended layout.
    """
    delta = np.subtract(np.array(a), np.array(b))
    squared_sums = np.einsum('ij,ij->j', delta, delta)
    return np.sqrt(squared_sums)

def cos_sim(a, b):
    """Return the cosine similarity between two vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)`` after converting both inputs
    with ``np.array``. Note this is a similarity (1 for identical direction),
    not a distance.

    If either input has zero norm the division is by zero and NumPy's usual
    floating-point result for that case is returned.
    """
    a = np.array(a)
    b = np.array(b)
    # The original computed this expression but never returned it, so every
    # call yielded None.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    

# Build the retriever from the encoder defined above and SciPy's Euclidean
# distance, asking for 4 in-context examples, then run retrieval.
retriever = EmbeddingDistanceRetriever(
    data,
    encoding_method=roberta_encoding,
    distance_method=distance.euclidean,
    ice_num=4,
)
idxs = retriever.retrieve()

# Sanity check: show the test question at position 1, then the training
# questions whose indices were returned for it.
print(data['test'][1]['question'])
print('=================')
print(
    "\n---------\n".join(
        [data['train'][idx]['question'] for idx in idxs[1]]
    )
)


In [None]:
import numpy as np
# Scratch check of np.outer on two length-3 vectors; the displayed value is
# not consumed by any other cell visible in this notebook.
arr1, arr2 = np.array([0, 0, 1]), np.array([1, 1, 1])

np.outer(arr1, arr2)

In [None]:
# Chain-of-Thought inference setup: the two prompt fragments handed to the
# inferencer, in order.
cot_list = [
    "Let's think step by step.",
    "\nTherefore, the answer (arabic numerals) is",
]

inferencer = CoTInferencer(
    cot_list=cot_list,
    api_name="gpt3",
    call_api=True,
)

In [None]:
# NOTE(review): the message mentions scoring, but this cell only runs
# inference; no score is computed here.
print("Running and calculating score...")
predictions = inferencer.inference(
    retriever,
    ice_template=template,
)

# Bare expression so the notebook renders the predictions as the cell output.
predictions

In [None]:
# Put test questions, reference answers and model outputs side by side.
df = pd.DataFrame({
    'question': data['test']['question'],
    'real_answer': data['test']['answer'],
    'model_answer': predictions,
})
# Keep only the text after the last '####' marker of the reference answer.
# The surrounding whitespace is stripped so the extracted value is e.g. "42"
# rather than " 42" and can be compared against the model output directly.
df['answer_number'] = df['real_answer'].apply(lambda x: x.split('####')[-1].strip())
display(df)
# NOTE(review): this installs an unpinned package from inside an analysis cell
# and re-runs on every execution; consider moving it to a dedicated setup cell
# with a pinned version. The install has to stay ahead of the import below.
%pip install tabulate
from tabulate import tabulate
# Plain-text rendering of df using the 'psql' table format, with the column
# names as headers.
print(tabulate(df, headers='keys', tablefmt='psql'))