In [1]:
import sys
sys.path.insert(0, "/notebooks/pipenv")
sys.path.insert(0, "/notebooks/nebula3_vlm")
sys.path.insert(0, "/notebooks/nebula3_database")
sys.path.insert(0, "/notebooks/")
import os
import math
import random
import bisect
import pickle
import time
import itertools
import numpy as np

In [2]:
import openai

from typing import List, Tuple
from operator import itemgetter 
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification
from database.arangodb import DatabaseConnector
from config import NEBULA_CONF
from movie_db import MOVIE_DB


# BASE_DIR = os.path.abspath(os.getcwd()+'/../..')  # /home/gil/dev/NEBULA2/
# os.chdir(os.getcwd()+'/../..')
# Read the OpenAI API key from the first line of the key file and hand it to
# the openai client, so the secret itself never appears in the notebook.
with open('/storage/keys/openai.key', 'r') as key_file:
    raw_key_line = key_file.readline()
OPENAI_API_KEY = raw_key_line.strip()
openai.api_key = OPENAI_API_KEY


In [3]:
class PIPELINE:
    """Bundle of the database handles used throughout this notebook.

    Attributes set on construction:
        db_host:  value returned by ``NEBULA_CONF().get_database_host()``.
        database: value returned by ``NEBULA_CONF().get_playground_name()``.
        gdb:      a ``DatabaseConnector`` instance.
        db:       result of ``gdb.connect_db(database)``; other cells run AQL
                  through ``db.aql.execute``.
    """

    def __init__(self):
        nebula_conf = NEBULA_CONF()
        self.db_host = nebula_conf.get_database_host()
        self.database = nebula_conf.get_playground_name()
        connector = DatabaseConnector()
        self.gdb = connector
        self.db = connector.connect_db(self.database)

# Notebook-wide singletons used by the cells below.
pipeline = PIPELINE()  # provides pipeline.db, the handle AQL queries run on
mdb = MOVIE_DB()  # used by process_scene to look up already-stored results

In [4]:
# Model id passed to the OpenAI completion endpoint for sentence fusion
# (the "ft" in the name suggests a fine-tuned davinci model).
fusion_model = "davinci:ft-personal:fusion-2022-03-29-21-07-19"

# Prompt layout: first slot is the base sentence, second slot is the
# "; "-joined list of candidate (expert) sentences.
fusion_prompt_template = "Original: {}\nCandidates: {}\n\n###\n\n"


def count_words(s):
    """Return the number of whitespace-separated words in ``s``."""
    return len(s.split())


def flatten(lst):
    """Concatenate the items of every sub-iterable of ``lst`` into one list."""
    merged = []
    for sub in lst:
        merged.extend(sub)
    return merged


def gpt_execute(prompt_template, *args, **kwargs):
    """Fill ``prompt_template`` with ``args`` and request a completion.

    The positional ``args`` are substituted with ``str.format``. ``kwargs``
    are forwarded unchanged to ``openai.Completion.create`` together with a
    fixed ``max_tokens=256``. Returns the raw API response.
    """
    return openai.Completion.create(
        prompt=prompt_template.format(*args),
        max_tokens=256,
        **kwargs
    )

def gpt_fusion_ft(base, experts, **kwargs):
    """Fuse one base sentence with a single group of expert sentences.

    Builds the fusion prompt from ``base`` and the "; "-joined ``experts``,
    queries ``fusion_model`` (stopping at the first newline) and returns the
    stripped text of every returned choice as a list of strings. Extra
    ``kwargs`` are forwarded to the completion call.
    """
    candidates = '; '.join(experts)
    response = gpt_execute(
        fusion_prompt_template, base, candidates,
        stop=["\n"], model=fusion_model, **kwargs
    )
    texts = []
    for choice in response['choices']:
        texts.append(choice['text'].strip())
    return texts


def gpt_batch_fusion(base, all_expert_combs, **kwargs):
    """Run the fusion model on a batch of expert combinations in one request.

    Args:
        base: the base sentence placed in the "Original" slot of every prompt.
        all_expert_combs: iterable of expert-sentence groups; each group is
            "; "-joined into the "Candidates" slot of its own prompt.
        **kwargs: forwarded to ``openai.Completion.create`` (e.g. ``n``).

    Returns:
        ``(sentences, num_words)`` where ``sentences`` is the stripped text of
        every completion, ordered by the ``index`` field of the response
        choices, and ``num_words`` is the whitespace word count of all prompts
        plus all completions.
    """
    prompts = [fusion_prompt_template.format(base, '; '.join(exp)) for exp in all_expert_combs]
    if not prompts:
        # Nothing to ask the model; avoid sending an empty batch to the API.
        return [], 0
    rc = openai.Completion.create(prompt=prompts, max_tokens=256, stop=["\n"], model=fusion_model, **kwargs)
    # A batched response is not guaranteed to list choices in prompt order, and
    # callers pair results with prompts positionally, so order by 'index'
    # (falling back to the received position if a choice carries no index).
    ordered_choices = sorted(
        enumerate(rc['choices']),
        key=lambda pair: pair[1].get('index', pair[0]),
    )
    rc_sentences = [choice['text'].strip() for _, choice in ordered_choices]
    words_prompt = sum(count_words(p) for p in prompts)
    words_completion = sum(count_words(s) for s in rc_sentences)
    return rc_sentences, words_prompt + words_completion

In [5]:
# Maximum number of prompts sent in one batched completion request
# (gpt_process_fusion slices the expert combinations into chunks of this size).
MAX_PROMPT_NUM = 10
# Throttle threshold compared against the estimated tokens-per-minute rate.
# NOTE(review): presumably a 150000 tokens/min quota divided by 5 — confirm.
MAX_RATE = 150000 / 5

def _tokens_per_minute(total_tokens, elapsed_seconds):
    """Average throughput in tokens per minute, safe for a zero elapsed time."""
    if elapsed_seconds <= 0:
        # A coarse clock can report no elapsed time on the first measurement;
        # treat "work done in zero time" as an unbounded rate, otherwise idle.
        return float('inf') if total_tokens > 0 else 0.0
    return (total_tokens / elapsed_seconds) * 60


def gpt_process_fusion(base, all_experts, **kwargs):
    """Fuse ``base`` with every 3- and 2-element combination of experts.

    Args:
        base: the base sentence used as the "Original" of every prompt.
        all_experts: mapping whose values are lists of expert sentences; all
            values are flattened into a single pool before combining.
        **kwargs: forwarded to ``gpt_batch_fusion`` (and on to the API).

    Returns:
        ``(results, exp_combinations)``: the fused sentences collected from
        all batches, and the list of expert combinations (triples first, then
        pairs) in the order they were submitted.

    The combinations are sent in chunks of ``MAX_PROMPT_NUM`` prompts. Before
    each chunk the average token rate since the start is estimated and the
    function sleeps while it exceeds ``MAX_RATE``. A failing batch is retried
    indefinitely, with a 10 second pause between attempts.
    """
    rc = []
    total_words = 0
    start_time = time.time()
    flattened_experts = flatten(all_experts.values())
    print("Total number of experts: {}".format(len(flattened_experts)))
    # Triples first, then pairs; callers rely on this order.
    # (Pairs-only alternative: itertools.combinations(flattened_experts, 2).)
    exp_combinations = [
        list(comb)
        for size in (3, 2)
        for comb in itertools.combinations(flattened_experts, size)
    ]
    print("Total number of expert combinations: {}".format(len(exp_combinations)))
    chunked_experts = [
        exp_combinations[i:i + MAX_PROMPT_NUM]
        for i in range(0, len(exp_combinations), MAX_PROMPT_NUM)
    ]
    for experts in chunked_experts:
        # Token estimate: 100 tokens for every 75 words processed so far.
        total_tokens = (total_words / 75) * 100
        curr_time = time.time() - start_time
        curr_rate = _tokens_per_minute(total_tokens, curr_time)
        print("Total words, tokens, time, rate: {}/{}/{}/{}".format(total_words,total_tokens,curr_time,curr_rate))
        while curr_rate > MAX_RATE:
            print("Rate too high, going to sleep")
            time.sleep(5)
            curr_time = time.time() - start_time
            curr_rate = _tokens_per_minute(total_tokens, curr_time)
        # Retry the batch until it succeeds; errors are reported, not raised.
        while True:
            try:
                fusion_results, num_words = gpt_batch_fusion(base, experts, **kwargs)
                break
            except Exception as e:
                print('Error, re-trying')
                print(e)
                time.sleep(10)
        rc.extend(fusion_results)
        total_words += num_words

    return rc, exp_combinations

def process_scene(doc, n=1, output_collection='s1_pipeline_results_phase2', **kwargs):
    """Generate fused sentences for one scene and store them in the database.

    Args:
        doc: mapping with keys ``'movie_id'``, ``'scene_element'``, ``'base'``
            and ``'experts'``.
        n: number of completions requested per prompt (forwarded as ``n``).
        output_collection: name of the collection the result is inserted into.
        **kwargs: forwarded to ``gpt_process_fusion``.

    Returns:
        None. If the output collection already holds a non-empty
        ``'combined_sentences'`` for this movie/scene element, nothing is
        generated. Otherwise a document with ``'movie_id'``,
        ``'scene_element'``, ``'combined_sentences'`` and ``'ordered_experts'``
        is inserted.

    Generation is retried indefinitely (20 second pause) on any exception.
    """
    mid = doc['movie_id']
    elem = doc['scene_element']
    existing = mdb.get_scene_from_collection(mid, elem, output_collection)
    if existing and len(existing['combined_sentences']):
        print("Results already exist for {}/{}".format(mid,elem))
        print("Length of output is {}".format(len(existing['combined_sentences'])))
        return
    print("Going forward with {}/{}".format(mid,elem))
    while True:
        try:
            sentences, expert_combinations = gpt_process_fusion(doc['base'], doc['experts'], n=n, **kwargs)
            break
        except Exception as e:
            print('process_scene: Error, re-trying')
            # Show the cause so a persistent failure is visible in the output.
            print(e)
            time.sleep(20)
    rc_doc = {
        'movie_id': doc['movie_id'],
        'scene_element': doc['scene_element'],
        'combined_sentences': sentences,
        # Each combination is repeated n times, presumably so that entry i
        # names the experts behind combined_sentences[i] — confirm.
        'ordered_experts': [val for val in expert_combinations for _ in range(n)]
    }
    # Pass the document and collection as bind parameters instead of formatting
    # a Python repr into the query text: generated sentences may contain quotes
    # or escapes that are not valid AQL literals.
    pipeline.db.aql.execute(
        "INSERT @doc INTO @@collection",
        bind_vars={'doc': rc_doc, '@collection': output_collection},
    )


In [10]:
# Fetch every document of the s2_clsmdc collection and order the resulting
# list by each document's 'index' field.
query = 'FOR doc IN s2_clsmdc RETURN doc'
cursor = pipeline.db.aql.execute(query)
all_docs = list(cursor)
all_docs.sort(key=itemgetter('index'))

In [7]:
# Collection that receives the fusion results written by process_scene.
s2_collection_name = 's2_pipeline_after_gpt'
# Single document picked for manual experiments in the cells below.
# NOTE(review): 119 is a hard-coded position in all_docs — confirm it is still the intended scene.
doc = all_docs[119]

In [None]:
# Manual run of the fusion step on the selected document (two completions per
# prompt); nothing is written to the database here.
rc, expert_combinations = gpt_process_fusion(doc['base'],doc['experts'], n=2)

In [None]:
# Number of documents loaded from the input collection.
len(all_docs)

In [11]:
# Process every loaded document in order, writing results to s2_collection_name.
# process_scene itself skips scenes whose results are already stored.
for i,doc in enumerate(all_docs):
    process_scene(doc,n=2, output_collection=s2_collection_name)
    print('Finished with video #{} (index: {})'.format(i,doc['index']))    

Results already exist for Movies/222509634/0
Length of output is 40
Finished with video #0 (index: 0)
Results already exist for Movies/222509634/1
Length of output is 70
Finished with video #1 (index: 1)
Results already exist for Movies/222509721/0
Length of output is 70
Finished with video #2 (index: 2)
Results already exist for Movies/222509721/1
Length of output is 70
Finished with video #3 (index: 3)
Results already exist for Movies/222509721/2
Length of output is 112
Finished with video #4 (index: 4)
Results already exist for Movies/222509721/3
Length of output is 168
Finished with video #5 (index: 5)
Results already exist for Movies/222509721/4
Length of output is 70
Finished with video #6 (index: 6)
Results already exist for Movies/222509721/5
Length of output is 330
Finished with video #7 (index: 7)
Results already exist for Movies/222509820/0
Length of output is 70
Finished with video #8 (index: 8)
Results already exist for Movies/222509820/1
Length of output is 168
Finished w

In [None]:
# Re-run a single scene (position 119 in all_docs) on its own.
process_scene(all_docs[119],n=2, output_collection=s2_collection_name)

In [None]:
# Display the document at position 120 for inspection.
all_docs[120]