In [1]:
import os
import sys
from typing import Tuple
from llm_orchestration import *
from experts.pipeline.api import PipelineApi, PipelineTask
# sys.path.insert(0, "/notebooks/nebula3_experiments")
# from vg_eval import VGEvaluation, get_sc_graph, spice_get_triplets, tuples_from_sg
from movie.movie_db import MOVIE_DB
from sentence_transformers import SentenceTransformer
from transformers import pipeline as transformer_pipeline, set_seed, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from langchain import HuggingFaceHub, OpenAI
from langchain.model_laboratory import ModelLaboratory

def test_pipeline_task(pipeline_id):
    class LlmTask(PipelineTask):
        def __init__(self):
            self.llm_task = LlmTaskInternal()
            print("LlmTask Initialized successfully.")

        def process_movie(self, movie_id: str) -> Tuple[bool, str]:
            print (f'LlmTask: handling movie: {movie_id}')

            output = self.llm_task.process_movie(movie_id)

            print("LlmTask: Finished handling movie.")
            print(output)
            return output
        def get_name(self) -> str:
            return "llm"

    pipeline = PipelineApi(None)
    task = LlmTask()
    pipeline.handle_pipeline_task(task, pipeline_id, stop_on_failure=True)


ImportError: cannot import name 'DBBase' from 'database.arangodb' (/usr/local/lib/python3.9/dist-packages/database/arangodb.py)

In [None]:
# Notebook-level PipelineApi handle; a later cell calls ppl.get_new_movies(...) on it.
# NOTE(review): the meaning of the None constructor argument is not visible here — confirm.
ppl = PipelineApi(None)

class LLMBase(ABC):
    """Abstract interface for the text-completion backends in this notebook.

    Concrete subclasses must implement ``completion``.
    """

    @abstractmethod
    def completion(self, prompt_template: str, *args, **kwargs):
        """Produce completions for a prompt.

        Args:
            prompt_template: Template string; implementations in this file
                fill it with ``prompt_template.format(*args)``.
            *args: Positional values substituted into the template.
            **kwargs: Backend-specific generation options.

        Returns:
            Implementations in this file return a list of strings.
        """

class HuggingFaceLLM(LLMBase):
    """Completion backend built from a model object and its tokenizer.

    The model must expose ``generate`` and the tokenizer must be callable and
    expose ``decode``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def completion(self, prompt_template: str, *args, **kwargs):
        """Generate completions for ``prompt_template.format(*args)``.

        Args:
            prompt_template: ``str.format`` template for the prompt.
            *args: Positional values substituted into the template.
            **kwargs: Forwarded unchanged to ``self.model.generate``.

        Returns:
            One decoded string per sequence returned by ``generate``. Decoding
            uses the tokenizer's default ``decode`` settings.
        """
        prompt = prompt_template.format(*args)
        # Send the inputs to the device the model lives on instead of always
        # assuming CUDA; fall back to "cuda" (the previous hard-coded value)
        # when the model object exposes no ``device`` attribute.
        device = getattr(self.model, "device", "cuda")
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        outputs = self.model.generate(input_ids, **kwargs)
        return [self.tokenizer.decode(sequence) for sequence in outputs]
        

class OptLLM(LLMBase):
    """Completion backend wrapping a callable text-generation model.

    ``model`` is called as ``model(prompt, **options)`` and must return an
    iterable of dicts that each carry a ``'generated_text'`` entry.
    """

    def __init__(self, model):
        self.model = model

    def completion(self, prompt_template: str, *args, **kwargs):
        """Generate completions for ``prompt_template.format(*args)``.

        Args:
            prompt_template: ``str.format`` template for the prompt.
            *args: Positional values substituted into the template.
            **kwargs: Extra options for the model call. These now override the
                defaults below instead of raising a duplicate-keyword
                ``TypeError`` when ``max_new_tokens`` or ``max_length`` is
                supplied by the caller.

        Returns:
            The stripped ``'generated_text'`` of every returned item. The
            prompt is not removed from the text here.
        """
        prompt = prompt_template.format(*args)
        # Defaults kept from the original call; note ``len(prompt)`` counts
        # characters of the prompt string, not tokens.
        generation_args = {"max_new_tokens": 256, "max_length": len(prompt) + 256}
        generation_args.update(kwargs)
        response = self.model(prompt, **generation_args)
        return [item['generated_text'].strip() for item in response]

def gpt_execute(prompt_template, *args, **kwargs):
    """Request completions from ``openai.Completion.create`` for a prompt.

    Args:
        prompt_template: ``str.format`` template for the prompt.
        *args: Positional values substituted into the template.
        **kwargs: Extra options forwarded to ``openai.Completion.create``.
            ``max_tokens`` defaults to 256 and may now be overridden by the
            caller (previously that raised a duplicate-keyword ``TypeError``).

    Returns:
        The stripped ``'text'`` of every entry in ``response['choices']``.
    """
    prompt = prompt_template.format(*args)
    request_args = {"max_tokens": 256}
    request_args.update(kwargs)
    response = openai.Completion.create(prompt=prompt, **request_args)
    return [choice['text'].strip() for choice in response['choices']]
def opt_execute(prompt_template, *args, **kwargs):
    """Generate continuations with the notebook-level ``opt_generator``.

    Args:
        prompt_template: ``str.format`` template for the prompt.
        *args: Positional values substituted into the template.
        **kwargs: Extra options forwarded to ``opt_generator``. They now
            override the ``max_new_tokens`` / ``max_length`` defaults instead
            of raising a duplicate-keyword ``TypeError``.

    Returns:
        For every returned item, ``'generated_text'`` with the first
        ``len(prompt)`` characters removed and the remainder stripped.
    """
    prompt = prompt_template.format(*args)
    # Defaults kept from the original call; ``len(prompt)`` is a character
    # count of the prompt string, not a token count.
    generation_args = {"max_new_tokens": 256, "max_length": len(prompt) + 256}
    generation_args.update(kwargs)
    response = opt_generator(prompt, **generation_args)
    print('Prompt length is {}'.format(len(prompt)))
    # Slice by prompt length so only the text after the prompt is returned.
    return [item['generated_text'][len(prompt):].strip() for item in response]

In [None]:
nebula_db = NEBULA_DB()
os.environ["OPENAI_API_KEY"] = nebula_db.get_llm_key()
# SECURITY: a Hugging Face Hub API token used to be hard-coded in this cell.
# Secrets must not live in the notebook; the token that was committed here
# should be treated as leaked and revoked. Provide the token through the
# environment before starting the kernel.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; export it in the environment "
        "before running this notebook."
    )
# nebula_db.change_db("nebula_playground")

In [None]:
# Build the LlmTaskInternal instance that later cells use through `task`
# (task.nebula_db, task.prompt_obj, task.process_target_id, task.process_movie).
task = LlmTaskInternal()

In [None]:
# Identifier passed to the database lookup cell below via image_id_as_dict(mid).
# NOTE(review): the second argument (90) is presumably a frame number — confirm against MovieImageId.
mid = MovieImageId("Movies/-3103202934810463453",90)

In [None]:
# task.nebula_db.get_image_id_from_collection(2369414)
# Print the `pg_db` attribute of the task's database handle.
print(task.nebula_db.pg_db)

In [None]:
# Look up the document for `mid` in the 's4_visual_clues' collection; the
# returned value is shown as the cell output.
# nebula_db.get_doc_by_key(image_id_as_dict(mid),'s4_visual_clues')
nebula_db.get_doc_by_key(image_id_as_dict(mid),'s4_visual_clues')
# nebula_db.get_movie_frame_from_collection(mid)

In [None]:
# Scratch write of a small test document to the 'giltest' collection with overwrite=True.
# NOTE(review): key_list=['gil'] presumably names the field(s) identifying the document — confirm.
nebula_db.write_doc_by_key({'gil': 5, 'dan': 15, 'tali': 20},collection_name='giltest', overwrite = True, key_list=['gil'])

In [None]:
# Materialise every document matching an empty filter in 'giltest' so the cell output shows the collection contents.
list(nebula_db.db.collection('giltest').find({}))

In [None]:
# Query the pipeline API for new movies.
# NOTE(review): the arguments look like (pipeline id, task name) — confirm against PipelineApi.get_new_movies.
ppl.get_new_movies("2bda2110-bcb8-4a6d-a334-455a1cf30c6c","llm")

In [None]:
# Run the LLM pipeline task (defined in the first cell) for this pipeline id.
test_pipeline_task("0cb4accc-14ff-46f7-bbb5-55b085afabeb")

In [None]:
# Rebind `mid` to another movie id and fetch its frame document; later cells
# reuse both `mid` and `mobj`.
mid = MovieImageId("Movies/-6295549713179447550",0)
mobj = task.nebula_db.get_movie_frame_from_collection(mid)
# Cell output: the 'url' entry of the fetched document.
mobj['url']

In [None]:
# Show what the task's prompt object returns for id 2369414.
task.prompt_obj.get_prompt(2369414)

In [None]:
# Process `mid` with the frame URL from `mobj` and keep the result in `rc`.
# NOTE(review): n=5 is presumably the number of candidates to generate — confirm.
rc = task.process_target_id(mid,image_url=mobj['url'],n=5)

In [None]:
# Display the current value of `rc`.
rc

In [None]:
# Store `rc` for `mid` in the collection named by LLM_OUTPUT_COLLECTION.
task.nebula_db.write_movie_frame_doc_to_collection(mid,rc,LLM_OUTPUT_COLLECTION)

In [None]:
# Process a whole movie by id, passing n=5 to the task.
task.process_movie("Movies/8477229371170297745",n=5)

In [None]:
# LangChain wrappers: a Hugging Face Hub model (google/flan-t5-xl) and an
# OpenAI LLM constructed with its default arguments.
hf = HuggingFaceHub(repo_id="google/flan-t5-xl")
openai_llm = OpenAI()

In [None]:
# Load the google/flan-t5-xl tokenizer and seq2seq model in bfloat16; the
# notebook-level names `tokenizer` and `model` are reused by later cells.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
# model = T5ForConditionalGeneration.from_pretrained("google/ul2", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to("cuda")
# model = AutoModelForSeq2SeqLM.from_pretrained("google/ul2", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) # google/flan-t5-xl
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) 

# Move the model to the GPU.
model.cuda()

# set_seed(14)
# ul2_generator = transformer_pipeline('text-generation', model="google/ul2", do_sample=True)

In [None]:
# Smoke test: generate from a fixed passage that ends with the <extra_id_0>
# sentinel and print the first decoded output sequence.
input_string = "[NLG] Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, solid man wiht a bald head. Mrs. Dursley was thin and blonde and more than the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbours. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. <extra_id_0>"
# add_special_tokens=False: the tokenizer is asked not to add its special tokens to the input.
inputs = tokenizer(input_string, return_tensors="pt", add_special_tokens=False).input_ids.to("cuda")
outputs = model.generate(inputs, max_length=300)
print(tokenizer.decode(outputs[0]))

In [None]:
# Draw 3 ids from task.s3_ids; they are passed to generate_prompt in the next cell.
# NOTE(review): np.random.choice samples with replacement by default and no
# seed is set here, so ids may repeat and differ between runs — confirm this is intended.
train_ids = np.random.choice(task.s3_ids,3)

In [None]:
# Build a prompt from the sampled ids and `mid`. This rebinds `rc`, which the
# following generation cells use as their input text.
rc = task.prompt_obj.generate_prompt(train_ids, mid)
print(rc)

In [None]:
# Generate from the prompt in `rc` with sampling enabled (do_sample=True) and
# print the first decoded output sequence.
# rc = "What would an American in France find really weird?"
# input_string = "[NLG] "+rc+" <extra_id_0>"
# input_string = "[S2S] " + rc
input_string = rc#+" <extra_id_0>"
inputs = tokenizer(input_string, return_tensors="pt", add_special_tokens=False).input_ids.to("cuda")
outputs = model.generate(inputs, max_length=300, do_sample=True)
print(tokenizer.decode(outputs[0]))

In [None]:
# LLMs handed to ModelLaboratory for comparison, one entry per line.
llms = [
    HuggingFaceHub(repo_id="gpt2"),
    OpenAI(temperature=0.2),
    HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature": 0.7}),
    HuggingFaceHub(repo_id="facebook/opt-30b", model_kwargs={"temperature": 0.7}),
]
model_lab = ModelLaboratory.from_llms(llms)

In [None]:
# Run the model comparison on the prompt currently held in `rc`.
model_lab.compare(rc)

In [None]:
# Seed from the wall clock, so sampled outputs differ from run to run.
set_seed(int(time.time()))
# Text-generation pipeline for facebook/opt-2.7b with sampling enabled; opt_execute reads this global.
opt_generator = transformer_pipeline('text-generation', model="facebook/opt-2.7b", do_sample=True)

In [None]:
# Generate a continuation of `rc` with opt_execute; no format arguments are passed.
opt_execute(rc)

In [None]:
# Rebind `tokenizer` and `model` to facebook/opt-30b (causal LM, bfloat16, moved to CUDA).
# These replace the flan-t5-xl objects bound under the same names in an earlier cell.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-30b")
# model = T5ForConditionalGeneration.from_pretrained("google/ul2", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to("cuda")
# model = AutoModelForSeq2SeqLM.from_pretrained("google/ul2", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) # google/flan-t5-xl
model = AutoModelForCausalLM.from_pretrained("facebook/opt-30b", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) .to("cuda")


In [None]:
# Tokenise `rc` without added special tokens and generate up to 256 new tokens.
inputs = tokenizer(rc, return_tensors="pt", add_special_tokens=False).input_ids.to("cuda")
outputs = model.generate(inputs, max_new_tokens=256)

In [None]:
# Decode the first generated sequence to text.
rc1 = tokenizer.decode(outputs[0])

In [None]:
# Print the decoded text after its first len(rc) characters.
# NOTE(review): this assumes the decoded output starts with exactly `rc` — confirm.
print(rc1[len(rc):])
# print(rc)

In [None]:
# Load the IPC JSON from IPC_PATH; `with` closes the file handle, which the
# previous bare open() call never did.
with open(IPC_PATH, 'r') as ipc_file:
    ipc_data = json.load(ipc_file)

In [None]:
# Number of entries loaded from the IPC file.
len(ipc_data)

In [None]:
import wget

In [None]:
# Shell escape: create the image directory and any missing parents; -p makes this a no-op if it already exists.
!mkdir -p /storage/vg_data/ipc_images

In [None]:
# Directory for downloaded IPC images (the same path the mkdir cell creates).
download_path = "/storage/vg_data/ipc_images"

In [None]:
def download_ipc_images(n=100, download_path="/storage/vg_data/ipc_images"):
    """Download the images referenced by the first ``n`` IPC entries.

    Reads the JSON list stored at ``IPC_PATH`` and, for each of its first
    ``n`` entries, downloads ``entry['url']`` into ``download_path`` unless a
    file with the URL's final path component already exists there.

    Args:
        n: Number of leading entries to process.
        download_path: Target directory. Defaults to the previously
            hard-coded location, so existing calls behave as before.
    """
    # Close the JSON file deterministically instead of leaking the handle.
    with open(IPC_PATH, 'r') as ipc_file:
        ipc_data = json.load(ipc_file)
    # Make sure the target is an existing directory before downloading into it.
    os.makedirs(download_path, exist_ok=True)
    for obj in ipc_data[:n]:
        url = obj['url']
        print("Downloading "+url)
        # The expected local file name is the last component of the URL.
        if os.path.exists(os.path.join(download_path, os.path.split(url)[1])):
            print("Already exists")
        else:
            wget.download(url, out=download_path)

In [None]:
# Download the images for the first 50 IPC entries.
download_ipc_images(50)

In [None]:
# len() of each entry's 'paragraph' value, in file order.
z = [len(entry['paragraph']) for entry in ipc_data]

In [None]:
# Histogram of the paragraph lengths in `z`, using np.histogram's default binning.
np.histogram(z)

In [None]:
# Map each entry's 'image_id' to its 'paragraph'.
z1 = {entry['image_id']: entry['paragraph'] for entry in ipc_data}

In [None]:
# Paragraph lengths restricted to the ids listed in task.s3_ids.
z2 = [len(z1[image_id]) for image_id in task.s3_ids]

In [None]:
# Histogram of the paragraph lengths in `z2`, using np.histogram's default binning.
np.histogram(z2)