In [14]:
import os
import json
import operator
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
def load_sentence_transformer(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create a SentenceTransformer embedding model.

    Args:
        model_name: Identifier passed straight to ``SentenceTransformer``.
            Defaults to the MiniLM checkpoint this notebook was written
            against, so existing zero-argument calls are unaffected.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(model_name)

In [16]:
# Build the embedding model once; most_similar_items reads this module-level `model`.
model = load_sentence_transformer()

In [18]:
def load_dataset_for_few_shot(path):
    """Read the "Question" field of every record in a JSON dataset file.

    Args:
        path: Location of a JSON file whose top level is a list of objects.

    Returns:
        list: The "Question" values in file order; records where the key is
        missing or null are skipped.
    """
    # JSON is UTF-8 by specification; naming the encoding keeps non-ASCII
    # questions readable regardless of the platform's locale default.
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)
    candidates = (record.get("Question") for record in data)
    return [question for question in candidates if question is not None]

In [19]:
import numpy as np
from sentence_transformers import util


In [20]:
def cos_sim(element, model, labels_sim, threshold=2):
    """Return the indices of the ``threshold`` best-scoring candidates.

    ``element`` is embedded with ``model.encode`` and scored against every
    row of ``labels_sim`` with ``util.dot_score`` (a plain dot product; it
    only equals cosine similarity when the embeddings are unit-length).

    Args:
        element: Single item to embed (passed to ``model.encode`` in a list).
        model: Object exposing ``encode``.
        labels_sim: Pre-computed candidate embeddings, one row per candidate.
        threshold: How many top candidates to return. Values larger than the
            number of candidates are clamped to that number.

    Returns:
        numpy.ndarray: Candidate indices ordered by ascending score, so the
        best match is last.

    Raises:
        ValueError: If ``threshold`` is smaller than 1.
    """
    if threshold < 1:
        raise ValueError(f"threshold must be >= 1, got {threshold}")

    query_embedding = model.encode([element])
    scores = util.dot_score(query_embedding, labels_sim)
    # Flatten rather than squeeze so a single candidate stays a 1-D array,
    # and copy to host memory first so a tensor living on an accelerator
    # still converts to numpy.
    scores = scores.detach().cpu().numpy().reshape(-1)

    # argpartition rejects a k larger than the array, so clamp it.
    top_k = min(threshold, scores.size)
    if top_k == 0:
        return np.array([], dtype=np.intp)

    top = np.argpartition(scores, -top_k)[-top_k:]
    # argpartition leaves the selected slice unordered; sort it by score.
    return top[np.argsort(scores[top])]

In [21]:
def most_similar_items(question, questions, threshold=1):
    """Look up the entries of ``questions`` that score highest against ``question``.

    Every entry of ``questions`` is embedded with the module-level ``model``
    and ranked through ``cos_sim``.

    Args:
        question: Query text.
        questions: Sequence of candidate texts.
        threshold: Number of candidates requested from ``cos_sim``.

    Returns:
        The matching entry itself when exactly one index comes back,
        otherwise a list of the matching entries in the order ``cos_sim``
        returned them.
    """
    candidate_embeddings = model.encode(questions)
    top_indexes = cos_sim(question, model, candidate_embeddings, threshold=threshold)
    picked = operator.itemgetter(*top_indexes)(questions)
    # itemgetter yields a bare item for one index and a tuple for several.
    return picked if len(top_indexes) == 1 else list(picked)

In [24]:
# Dataset to draw few-shot examples from, and the project directory that
# holds its data/ folder. The directory can be overridden through the
# HYBRIDQA_APP_DIR environment variable; the fallback is the original
# machine-specific location. (A previous `path = os.getcwd()` assignment was
# dropped because it was overwritten before ever being read.)
dataset = 'compmix'
path = os.environ.get("HYBRIDQA_APP_DIR", "/home/preetam19/app_hybrid/HybridQA/app")
questions = load_dataset_for_few_shot(f"{path}/data/{dataset}.json")
# With most_similar_items' default threshold of 1 this is a single question.
fetching_ques = most_similar_items('What actress portrayed Bella Swan?', questions)

In [46]:
# Echo the loaded questions for inspection (this displays the whole list).
questions

["What was the name of lucille ball's character on i love lucy?",
 'Who was the narrator of the novel moby-dick ?',
 'What is the central theme of the breaking bad television series?',
 'When was the author of mirrorshades born?',
 'What is the release year of game of thrones?',
 'Who directed raiders of the lost ark?',
 'Who was the creator of sherlock holmes?',
 'Creator of the the wheel of time?',
 'Who wrote the immortals of meluha?',
 'What is the original language of the movie lord of the rings?',
 'What was the narrative location of the book "wuthering heights "?',
 'What is the position played by franz beckenbauer?',
 'Who created the riverdale television series?',
 'What is the total number of emmy awards received by amy sherman-palladino?',
 'What is the country of origin of the tv series coupling?',
 'How many seasons of the emily in paris series?',
 'Who was the author for pride and prejudice book?',
 'Who is the director tv series alice in borderland?',
 'How many filmed e

In [40]:
from enchant.utils import levenshtein

In [None]:
def levenshtein_distance(ques, final_list):
    """Pick entries by greedily maximizing a log-determinant objective.

    Despite the name, no edit distance is computed in this function;
    ``final_list`` is consumed as given.

    Args:
        ques: Collection handed to ``most_similar_items`` as the candidates.
        final_list: Combined with ``q`` as ``q * final_list * q`` to form the
            kernel ``K`` — presumably a square pairwise-similarity matrix;
            TODO confirm.

    Returns:
        tuple: The first component of each pair returned by
        ``obj_log_det.maximize`` (called with ``budget=3``) — presumably the
        selected item indices; verify against the optimizer's documentation.
    """

    # NOTE(review): the query text is fixed here rather than taken from an
    # argument. With most_similar_items' default threshold of 1 the result is
    # a single entry of `ques` (a question string), not a numeric vector —
    # confirm what q is meant to hold.
    q = most_similar_items("When was the author of mirrorshades born?", ques)

    # NOTE(review): with q as a string this product cannot yield a numeric
    # kernel; it looks like a per-item relevance weighting was intended — verify.
    K = q * final_list * q
    # NOTE(review): LogDeterminantFunction is not imported in any visible
    # cell, and n=20 is hard-coded — presumably it has to match the size of
    # K; TODO confirm.
    obj_log_det = LogDeterminantFunction(n=20,
                                        mode="dense",
                                        lambdaVal=0,
                                        sijs=K)
    greedy_indices_and_scores = obj_log_det.maximize(budget=3,
                                                        optimizer='NaiveGreedy',
                                                        stopIfZeroGain=False,
                                                        stopIfNegativeGain=False,
                                                        verbose=False)

    # Split the returned pairs and keep only the first component of each.
    greedy_indices, greedy_scores = zip(*greedy_indices_and_scores)
    return greedy_indices

In [44]:
# Load the dataset, score every pair of action sequences by normalized
# Levenshtein similarity, and build a three-example prompt from the entries
# chosen by levenshtein_distance.
with open(f"{path}/data/{dataset}.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# The file is only needed for parsing, so everything below runs after it closes.
action_sequence = ""
final_template = ""
action_sequence_list = [entry.get("Action_Sequence") for entry in data]
ques = [entry.get("Question") for entry in data]

# Build the matrix once over the complete list: rebuilding it after every
# appended record produced the same final matrix at cubic cost.
# max(..., 1) keeps two empty sequences from dividing by zero (they score 1.0).
final_list = np.array([
    [1 - levenshtein(a, b) / max(len(a), len(b), 1) for b in action_sequence_list]
    for a in action_sequence_list
])

indices = levenshtein_distance(ques, final_list)
final_template = f"Example 1:{data[indices[0]].get('One_Shot')}\n\nExample 2:\n\n{data[indices[1]].get('One_Shot')}\n\nExample 3:\n\n{data[indices[2]].get('One_Shot')}"


In [43]:
# Peek at the first record to see which fields each entry carries.
data[0]

{'Question': "What was the name of lucille ball's character on i love lucy?",
 'Dataset': 'Compmix',
 'Action_Sequence': '\tWikiSearch\tGetWikidataID\tGenerateSparql\tRunSparql\tRunSparql\tGetLabel',
 'One_Shot': "Question:What was the name of lucille ball's character on i love lucy?\n\n\nThought:A good first step is to retrieve information about I love lucy.\n\nAction:WikiSearch\n\nAction Input: i love lucy\n\nObservation: Page: I Love Lucy\nSummary: I Love Lucy is an American television sitcom that aired from 1951 to 1957, starring Lucille Ball, Desi Arnaz, Vivian Vance, and William Frawley. The series follows the life of Lucy Ricardo (Ball), a young, middle-class housewife living in New York City, who often concocts plans with her best friends and landlords, Ethel and Fred Mertz (Vance and Frawley), to appear alongside her bandleader husband, Ricky Ricardo (Arnaz), in his nightclub. The show was highly popular and influential, winning five Emmy Awards and being voted the 'Best TV Sh

In [35]:
# NOTE(review): this cell looks lifted from a class method — it reads
# self.model, self.cos, self.div and self.cos_sim_least, none of which are
# defined at notebook scope, so it stops with the NameError shown below the
# cell. `random` is also used without a visible import.
#
# Locate the record whose question equals fetching_ques, remember its action
# sequence, and seed final_template with that record's One_Shot text.
# NOTE(review): action_sequence_list is appended to, not rebuilt, so running
# this after a cell that already filled it (or re-running this cell)
# accumulates entries — confirm that is intended.
for idx, x in enumerate(data):
    if x.get("Question").strip() == fetching_ques.strip():
        action_sequence = x.get("Action_Sequence").strip().strip("\t")
        final_template = (
            f"Example 1: \n\n{final_template}{x.get('One_Shot')}"
        )
    action_sequence_list.append(
        x.get("Action_Sequence").strip().strip("\t")
    )
# Embed every action sequence so the matched one can be compared against them.
similar_sequences = self.model.encode(action_sequence_list)
# Three selection modes, chosen by the self.cos / self.div flags.
if self.cos:
    print(f"doing cos max")
    indexes = self.cos_sim_least(
        action_sequence, self.model, similar_sequences, 3, most_similar=True
    )
    # NOTE(review): when the loop above found a match, final_template already
    # starts with "Example 1: ", so this adds the label a second time — confirm.
    final_template = f"Example 1:{final_template}\n\nExample 2:\n\n{data[indexes[1]].get('One_Shot')}\n\nExample 3:\n\n{data[indexes[2]].get('One_Shot')}"
elif self.div:
    print(f"doing cos min")
    indexes = self.cos_sim_least(
        action_sequence,
        self.model,
        similar_sequences,
        10,
        most_similar=False,
    )
    # Draw three of the ten returned indexes at random; no seed is set here,
    # so the choice differs between runs.
    selected_indices = random.sample(list(indexes), 3)
    final_template = f"Example 1:{data[selected_indices[0]].get('One_Shot')}\n\nExample 2:\n\n{data[selected_indices[1]].get('One_Shot')}\n\nExample 3:\n\n{data[selected_indices[2]].get('One_Shot')}"
else:
    print("cos general")
    indexes = self.cos_sim_least(
        action_sequence, self.model, similar_sequences, 2
    )
    # Keeps the matched record as the first example and appends two more.
    final_template = f"{final_template}\n\nExample 2:\n\n{data[indexes[0]].get('One_Shot')}\n\nExample 3:\n\n{data[indexes[1]].get('One_Shot')}"


NameError: name 'self' is not defined