In [1]:
import sys
sys.path.insert(0, "/notebooks/pipenv")
sys.path.insert(0, "/notebooks/nebula3_vlm")
sys.path.insert(0, "/notebooks/nebula3_database")
sys.path.insert(0, "/notebooks/")
import os
import math
import random
import bisect
import pickle
import time
import numpy as np


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import urllib
import subprocess
import re
import tempfile
import itertools
import torch
import spacy
# import amrlib
# import penman

from typing import List, Tuple
from operator import itemgetter 
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification
from database.arangodb import DatabaseConnector
from config import NEBULA_CONF
from movie_db import MOVIE_DB


In [3]:
class PIPELINE:
    """Bundle of database handles used by the notebook cells.

    Attributes set by the constructor:
        db_host:  value returned by ``NEBULA_CONF().get_database_host()``.
        database: value returned by ``NEBULA_CONF().get_playground_name()``.
        gdb:      a ``DatabaseConnector`` instance.
        db:       result of ``gdb.connect_db(database)``; later cells call
                  ``db.aql.execute(...)`` on it.
    """

    def __init__(self):
        conf = NEBULA_CONF()
        self.db_host = conf.get_database_host()
        self.database = conf.get_playground_name()
        connector = DatabaseConnector()
        self.gdb = connector
        self.db = connector.connect_db(self.database)

# Notebook-wide handles shared by the cells below: the database pipeline,
# the movie-DB accessor and the CLIP wrapper (constructed with the 'vit' argument).
pipeline = PIPELINE()
mdb = MOVIE_DB()
from vlm.clip_api import CLIP_API
clip=CLIP_API('vit')
# Collection names. In the visible cells only the "relaxed" results, the
# compatibility-scores and the compatibility-results names are referenced.
s2_collection_name = 's2_pipeline_after_gpt'
s2_results_orig_collection_name = 's2_pipeline_optim_orig'
s2_results_relaxed_collection_name = 's2_pipeline_optim_relaxed'
s2_compatibility_collection_name = 's2_pipeline_compatibility_scores'
s2_with_compat_collection_name = 's2_pipeline_compatibility_results'


In [4]:
def flatten(lst):
    """Return one flat list holding the items of every sub-iterable in ``lst``, in order."""
    flat = []
    for sub in lst:
        flat.extend(sub)
    return flat

def compute_batch_scores(video_emb: torch.Tensor, texts: List[str], normalize=True, **kwargs) -> List[float]:
    """Score every text against a video embedding with a per-row dot product.

    Args:
        video_emb: embedding that is expanded to the shape of the text batch.
        texts: strings handed to ``clip.clip_batch_encode_text``.
        normalize: accepted but never read by this function.
        **kwargs: forwarded unchanged to ``clip.clip_batch_encode_text``.

    Returns:
        The per-text scores as a NumPy array (the result of ``.cpu().numpy()``),
        one entry per row of the encoded text batch.
    """
    text_embs = clip.clip_batch_encode_text(texts, **kwargs)
    tiled_video = video_emb.expand_as(text_embs)
    per_text = torch.sum(tiled_video * text_embs, dim=1)
    return per_text.cpu().numpy()


def compute_concat_score(image_emb: torch.Tensor, texts: List[str], join_on=',') -> float:
    """Score the concatenation of several texts against an image embedding.

    Each text is stripped, loses one trailing period if present, and is
    followed by ``join_on`` plus a space. The pieces are concatenated, the
    combined string is printed, and its stripped form is encoded and
    multiplied with ``image_emb`` via ``torch.matmul``.

    Texts that are empty after stripping are skipped; previously they raised
    ``IndexError`` on the ``t[-1]`` lookup.

    Args:
        image_emb: embedding used as the left operand of ``torch.matmul``.
        texts: sentences to combine.
        join_on: separator appended after every sentence.

    Returns:
        The result of ``torch.matmul`` (presumably a scalar tensor - TODO confirm).
    """
    pieces = []
    for raw in texts:
        t = raw.strip()
        if not t:
            # Nothing to contribute; avoids indexing into an empty string.
            continue
        if t.endswith('.'):
            t = t[:-1]
        pieces.append(t + join_on + ' ')
    combined_text = "".join(pieces)
    print("Combined: "+combined_text)
    # NOTE(review): `mdmmt` is not defined anywhere in the visible notebook, so this
    # line raises NameError unless it is provided elsewhere - confirm which encoder
    # is intended here.
    return torch.matmul(image_emb,mdmmt.encode_text(combined_text.strip()) )

In [5]:
def transform_concept(c):
    """Strip an optional numeric sense suffix from a concept name.

    A concept made of ASCII letters, optionally followed by ``-`` and/or
    digits (e.g. ``"run-02"``), is reduced to its letters (``"run"``).
    Anything else is returned unchanged.

    The character class used to be ``[a-zA-z]``, which also accepts the
    punctuation between ``Z`` and ``a`` in ASCII; it is now ``[a-zA-Z]``.
    """
    m = re.match(r"^([a-zA-Z]+)-?(\d*)$", c)
    return m.group(1) if m else c

class ConceptManager:
    """Thin wrapper exposing concept grounding."""

    def __init__(self):
        pass

    @staticmethod
    def ground_concept(concept):
        """Return ``transform_concept(concept)``.

        Declared as a static method so it works both as
        ``ConceptManager.ground_concept(c)`` and on an instance; without the
        decorator an instance call passed the instance itself as ``concept``.
        """
        return transform_concept(concept)

In [6]:
class SimilarityManager:
    """Token-level similarity between a source phrase and a target sentence."""

    # Part-of-speech tags of source tokens that take part in the score.
    _CONTENT_POS = ('NOUN', 'ADJ', 'ADV', 'VERB', 'PROPN', 'ADP')

    def __init__(self):
        # Loading the spaCy model happens on every construction.
        self.nlp = spacy.load('en_core_web_lg')

    def similarity(self, src, target):
        """Average, over source tokens, of the best match found in ``target``.

        Both strings are run through ``self.nlp``. For every source token whose
        ``pos_`` is in ``_CONTENT_POS`` (or for the only token when the source
        has a single token) the maximum ``token.similarity`` against all target
        tokens is taken; the mean of those maxima is returned.

        Returns 0.0 when the target has no tokens or when no source token
        qualifies. Previously the first case raised ``ValueError`` from
        ``max([])`` and the second returned NaN from ``np.mean([])``.
        """
        src_doc = self.nlp(src)
        target_doc = self.nlp(target)
        if len(target_doc) == 0:
            return 0.0

        # A single-token source is never filtered by part of speech.
        keep_all = len(src_doc) <= 1
        best_matches = [
            max(tok.similarity(other) for other in target_doc)
            for tok in src_doc
            if keep_all or tok.pos_ in self._CONTENT_POS
        ]
        if not best_matches:
            return 0.0
        return np.mean(best_matches)
        
# Module-level SimilarityManager instance; constructing it loads the spaCy model.
smanager = SimilarityManager()


In [7]:
def softmax(x):
    """Softmax of ``x`` along its first axis.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but avoids the overflow (``inf / inf -> nan``)
    that the plain ``exp(x) / sum(exp(x))`` form hits for large inputs.
    An empty input yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        return np.exp(x)
    exps = np.exp(x - np.max(x))
    return exps / exps.sum(axis=0)
def normalize(x):
    """Standardize ``x`` to zero mean and unit standard deviation.

    When the standard deviation is below 1e-5 an array of ones with the
    shape of ``x`` is returned instead, avoiding a division by (almost) zero.
    """
    epsilon = 0.00001
    spread = np.std(x)
    if spread < epsilon:
        return np.ones(x.shape)
    centered = x - np.mean(x)
    return centered / spread

class SubsetOptimization:
    """Simulated-annealing search over subsets of candidate sentences.

    A *state* is a sorted list of indices into ``candidates_strings``. Its score
    mixes (a) how strongly the chosen sentences cover the ``experts`` strings,
    read from ``coverage_matrix`` (rows: experts, columns: candidates), and
    (b) the mean video/text similarity of the chosen sentences.

    Randomness comes from both ``random`` and ``np.random``; neither is seeded here.
    """

    def __init__(self, video_emb, experts: List, candidates_strings: List[str], coverage_matrix = None, coverage_threshold=1.5, **kwargs):
        """Precompute candidate similarities and, if not supplied, the coverage matrix.

        Args:
            video_emb: embedding passed to ``compute_batch_scores``.
            experts: strings that the selected sentences should cover.
            candidates_strings: the candidate sentences.
            coverage_matrix: optional precomputed ``len(experts) x len(candidates)``
                matrix; when omitted every entry is computed with
                ``concept_similarity``.
            coverage_threshold: mean coverage at which the removal probability
                reaches 1 (see ``prob_to_remove``).
            **kwargs: accepted and ignored.
        """
        self.video_emb = video_emb
        self.initial_temp = 10
        self.final_temp = .001
        self.alpha = 0.01
        self.theta = 0.9                        # weight of similarity vs. coverage in get_score
        self.theta1 = 0.5                       # same trade-off for per-sentence fit scores
        self.remove_prob_temprature = 5.
        self.expert_to_cover_temp = 1.
        self.candidate_to_add_cover_temp = 1.
        self.candidate_to_add_vlm_temp = 1.
        self.reset_every = 5000
        self.experts = experts
        self.coverage_threshold = coverage_threshold
        self.candidates_strings = candidates_strings
        self.max_size_coeff = 3.5
        print("Computing batch similarity...")
        self.candidates_similarity = compute_batch_scores(self.video_emb, self.candidates_strings)
        print("Done")
        self.opt_results = []                   # (state, score) for every accepted move
        self.smanager = SimilarityManager()

        if coverage_matrix is not None:
            self.coverage_matrix = coverage_matrix
        else:
            self.coverage_matrix = np.full([len(self.experts), len(self.candidates_strings)], np.nan)
            for i in range(len(experts)):
                for j in range(len(candidates_strings)):
                    self.coverage_matrix[i][j] = self.concept_similarity(self.experts[i], self.candidates_strings[j])
        self.max_size = int(len(self.experts)*self.max_size_coeff)

    def concept_similarity(self, concept, sent):
        """Similarity between an expert string and a sentence, via ``self.smanager``."""
        return self.smanager.similarity(concept,sent)

    def get_coverage(self,i,j):
        """Return coverage of expert ``i`` by candidate ``j``, computing it if the cell is NaN."""
        if np.isnan(self.coverage_matrix[i][j]):
            self.coverage_matrix[i][j] = self.concept_similarity(self.experts[i],self.candidates_strings[j])
        return self.coverage_matrix[i][j]

    def get_expert_coverage(self,state):
        """Return, for each expert, the sum total of how much it is covered by the state."""
        return self.coverage_matrix[:,state].sum(axis=1)

    def get_state_coverage(self,state) -> float:
        """Mean over experts of the per-expert coverage sums."""
        return np.mean(self.get_expert_coverage(state))

    def get_score(self, state: List[int]) -> float:
        """Objective value of a state; an empty state scores 0."""
        if not state:
            return 0
        coverage_score = self.get_state_coverage(state)
        similarity_score = self.candidates_similarity[state].mean().item()
        return (1-self.theta)*coverage_score + self.theta*similarity_score

    def prob_to_remove(self, state):
        """Probability of shrinking the state: (coverage / threshold clipped to [0, 1]) ** 4."""
        cover = self.get_state_coverage(state)
        relative_cover = min(max(cover / self.coverage_threshold,0),1)
        return np.power(relative_cover,4)

    def candidate_to_add_probs(self, expert_to_cover, anti_state):
        """Sampling distribution over ``anti_state`` for the candidate to add.

        Combines the normalized coverage of the chosen expert with the
        normalized video similarity and passes the sum through a softmax.
        """
        add_cover = normalize(self.coverage_matrix[expert_to_cover][anti_state])
        add_similarity = normalize(self.candidates_similarity[anti_state])
        return softmax(add_cover*self.candidate_to_add_cover_temp + add_similarity*self.candidate_to_add_vlm_temp)

    def _fit_scores(self, idxs):
        """Per-sentence fit: mix of mean coverage over experts and video similarity."""
        mean_coverage = self.coverage_matrix.mean(axis=0)[idxs]
        return (1-self.theta1)*mean_coverage + self.theta1*self.candidates_similarity[idxs]

    # state here is assumed (and guaranteed on return) to be -sorted-
    def get_candidate(self, state: List[int]) -> List[int]:
        """Propose a neighbouring state by removing or adding exactly one sentence.

        * Empty state: a single uniformly random candidate.
        * State at (or above) ``max_size``: the worst-fitting sentence is dropped.
          The check is ``>=`` so an oversized initial state is trimmed as well.
        * Otherwise a sentence is removed with probability ``prob_to_remove``
          (always, if every candidate is already selected); else an expert is
          sampled, favouring poorly covered ones, and a candidate outside the
          state is sampled to cover it.

        The input list is not modified.
        """
        n_candidates = len(self.candidates_strings)
        if not state:
            return [random.randint(0, n_candidates-1)]

        rc = state.copy()
        s = np.array(state)
        s_fitscore = self._fit_scores(s)

        if len(state) >= self.max_size:
            del rc[np.argmin(s_fitscore)]
            return rc

        remove_sentence = random.random()<self.prob_to_remove(state) or len(state)==n_candidates
        if remove_sentence:
            # Lower fit score -> higher chance of being removed.
            probs = softmax(-s_fitscore*self.remove_prob_temprature)
            idx = np.random.multinomial(1,probs).argmax()
            del rc[idx]
        else:
            selected = set(state)               # O(1) membership tests
            anti_state = [i for i in range(n_candidates) if i not in selected]
            s1 = np.array(anti_state)
            # Experts with lower total coverage are more likely to be picked.
            expert_coverage = self.get_expert_coverage(s)
            probs = softmax(-expert_coverage*self.expert_to_cover_temp)
            expert_to_cover = np.random.multinomial(1,probs).argmax()
            probs = self.candidate_to_add_probs(expert_to_cover,s1)
            idx_to_add = np.random.multinomial(1,probs).argmax()
            bisect.insort(rc,anti_state[idx_to_add])    # keeps the state sorted

        return rc

    def temp_schedule(self,i):
        """Temperature at iteration ``i``: piecewise-linear between fixed breakpoints.

        Constant 0.5 before iteration 5000 and ``final_temp`` from iteration 45000 on.
        """
        schedule = [(5000,0.5), (15000,0.1), (25000,0.01), (35000,0.005), (45000,self.final_temp)]
        if i<schedule[0][0]:
            return schedule[0][1]
        if i>=schedule[-1][0]:
            return schedule[-1][1]
        for (start, start_val), (end, end_val) in zip(schedule, schedule[1:]):
            if i<end:
                return ((i-start)/(end-start))*(end_val-start_val)+start_val
        return schedule[-1][1]                  # not reachable given the guards above

    def get_scored_permutations(self, k):
        """Score every ordered ``k``-tuple of candidate indices (exhaustive).

        Previously read the nonexistent ``self.candidates`` attribute.
        """
        n = len(self.candidates_strings)
        return [(x,self.get_score(list(x))) for x in itertools.permutations(range(n),k)]

    def reset(self):
        """Best state recorded so far; raises ``ValueError`` if nothing was recorded."""
        return max(self.opt_results,key=lambda x:x[1])[0]

    def simulated_annealing(self, initial_state, clear_prev = False, reset_every = None):
        """Run the annealing loop and return the best accepted state.

        Args:
            initial_state: sorted list of candidate indices to start from.
            clear_prev: drop results recorded by earlier runs first.
            reset_every: every this many iterations the best recorded state is
                proposed as the next candidate; defaults to ``self.reset_every``.

        Returns:
            The highest-scoring accepted state, or the current state if no move
            was ever accepted (this case used to raise ``ValueError``).
        """
        current_temp = self.initial_temp
        i = 0
        if clear_prev:
            self.opt_results = []
        if not reset_every:
            reset_every = self.reset_every

        current_state = initial_state
        curr_score = self.get_score(initial_state)

        while current_temp > self.final_temp:
            # Only jump back to the best state once there is one to jump to.
            if i % reset_every == 0 and i>0 and self.opt_results:
                next_cand = self.reset()
                print("Reset to state: {}".format(next_cand))
            else:
                next_cand = self.get_candidate(current_state)
            next_score = self.get_score(next_cand)

            score_diff = next_score - curr_score
            if score_diff > 0:
                # Better candidates are always accepted.
                move = True
            else:
                # Worse (or equal) ones are accepted with probability e^(diff/temp).
                move = random.uniform(0, 1) < math.exp(score_diff / current_temp)
            if move:
                current_state = next_cand
                curr_score = next_score
                self.opt_results.append((current_state,curr_score))

            current_temp = self.temp_schedule(i)
            i += 1
            if i % 1000 == 0:
                print("i: {}".format(i))

        return self.reset() if self.opt_results else current_state



In [109]:
def powerset(iterable):
    """Iterate over all non-empty subsets of ``iterable`` as tuples, smallest size first."""
    items = list(iterable)
    sizes = range(1, len(items) + 1)
    per_size = map(lambda size: itertools.combinations(items, size), sizes)
    return itertools.chain.from_iterable(per_size)


def optimize_sents(emb_video, experts, sents, compat_scores, use_ordered_scores=False):
    """Pick the best-scoring non-empty subset of ``sents`` by exhaustive search.

    Every subset is scored as
    ``1*coverage + 9*similarity + 9*compatibility`` where
      * coverage is the mean over experts of the best expert/sentence
        similarity inside the subset,
      * similarity is the mean video/text score of the subset,
      * compatibility is the mean of ``compat_scores`` over the subset, or of
        their rank divided by the number of sentences when
        ``use_ordered_scores`` is true.

    All ``2**len(sents) - 1`` subsets are evaluated, so this is only practical
    for short sentence lists.

    Args:
        emb_video: video embedding passed to ``compute_batch_scores``.
        experts: strings the chosen sentences should cover.
        sents: candidate sentences.
        compat_scores: one compatibility score per sentence.
        use_ordered_scores: use rank-based instead of raw compatibility scores.

    Returns:
        ``(chosen_sentences, mean_similarity)``; ``([], 0.0)`` for empty ``sents``
        (this case used to raise ``IndexError``).
    """
    theta_similarity = 9
    theta_coverage = 1
    theta_compat = 9

    if len(sents) == 0:
        return [], 0.0

    compat_scores = np.array(compat_scores)
    # Rank of every score (0 = lowest), scaled by the number of scores.
    order_scores = compat_scores.argsort().argsort() / len(compat_scores)
    print(order_scores)

    candidates_similarity = compute_batch_scores(emb_video, sents)

    # Reuse the module-level SimilarityManager instead of reloading the spaCy
    # model on every call.
    coverage_matrix = np.empty([len(experts), len(sents)])
    for i, expert in enumerate(experts):
        for j, sent in enumerate(sents):
            coverage_matrix[i][j] = smanager.similarity(expert, sent)

    compat_per_sent = order_scores if use_ordered_scores else compat_scores

    def get_score(state: List[int]) -> float:
        coverage_score = np.mean(coverage_matrix[:, state].max(axis=1))
        similarity_score = candidates_similarity[state].mean().item()
        compat_score = compat_per_sent[state].mean().item()
        return theta_coverage*coverage_score + theta_similarity*similarity_score + theta_compat*compat_score

    pset = [list(subset) for subset in powerset(range(len(sents)))]
    pset_scores = [get_score(state) for state in pset]
    best_cand = pset[np.argmax(pset_scores)]
    # Index explicitly: itemgetter with a single index returns a bare string,
    # which list() would then split into characters.
    return [sents[i] for i in best_cand], candidates_similarity[best_cand].mean()

def optimize_scene(doc,mat=None, emb_video=None, **kwargs):
    """Run ``optimize_sents`` on every sentence group of one scene document.

    Args:
        doc: mapping with ``'movie_id'``, ``'scene_element'`` and ``'sentences'``
            (a sequence of sentence lists).
        mat: unused.
        emb_video: optional precomputed video embedding. It is now honoured;
            before, it was always overwritten by a fresh
            ``clip.clip_encode_video`` call.
        **kwargs: forwarded to ``optimize_sents`` (e.g. ``use_ordered_scores``).

    Returns:
        ``(rc_sents, mean_scores)``: two lists with one entry per sentence group.
    """
    mid = doc['movie_id']
    elem = doc['scene_element']
    if emb_video is None:
        emb_video = clip.clip_encode_video(mid,elem)
    all_sents = doc['sentences']

    experts_doc = mdb.get_scene_from_collection(mid,elem,'s2_clsmdc')
    experts = flatten(experts_doc['experts'].values())
    compat_doc = mdb.get_scene_from_collection(mid,elem,s2_compatibility_collection_name)
    all_compat_scores = compat_doc['compat_scores']

    rc_sents = []
    mean_scores = []
    for i, sents in enumerate(all_sents):
        chosen, mean_score = optimize_sents(emb_video,experts,sents,all_compat_scores[i], **kwargs)
        rc_sents.append(chosen)
        mean_scores.append(mean_score)

    return rc_sents, mean_scores
    
def run_pipeline(all_docs, target_collection_name=s2_with_compat_collection_name, **kwargs):
    """Optimize every scene in ``all_docs`` and store the result in the target collection.

    Scenes for which ``mdb.get_scene_from_collection`` already returns a result
    in ``target_collection_name`` are skipped.

    The insert uses AQL bind variables instead of formatting the Python dict
    into the query text, so quotes in sentences, ``None`` values and NumPy
    scalars cannot corrupt the query. Scores are converted to plain floats so
    the document is JSON-serializable.

    Args:
        all_docs: iterable of scene documents (see ``optimize_scene``).
        target_collection_name: collection receiving the result documents.
        **kwargs: forwarded to ``optimize_scene``.
    """
    for doc in all_docs:
        mid = doc['movie_id']
        elem = doc['scene_element']
        existing = mdb.get_scene_from_collection(mid,elem,target_collection_name)
        if existing:
            print("Results already exist for {}/{}".format(mid,elem))
            continue
        print("Going forward with {}/{}".format(mid,elem))

        rc_sents, sim_scores = optimize_scene(doc,**kwargs)
        rc_doc = {
            'movie_id': mid,
            'scene_element': elem,
            'sentences': rc_sents,
            'mean_scores': [float(score) for score in sim_scores],
        }
        pipeline.db.aql.execute(
            "INSERT @doc INTO @@collection",
            bind_vars={'doc': rc_doc, '@collection': target_collection_name},
        )

In [9]:
# Load every document of the "relaxed" results collection and sort them by
# the string "<movie_id>/<scene_element>" so the processing order is stable.
query = 'FOR doc IN {} RETURN doc'.format(s2_results_relaxed_collection_name)
cursor = pipeline.db.aql.execute(query)
all_docs = sorted(list(cursor), key=lambda x:"{}/{}".format(x['movie_id'],x['scene_element']))


In [None]:
# Process all loaded scenes; scenes that already have a stored result are skipped.
run_pipeline(all_docs)

In [111]:
# Optimize the same scene twice: first with the raw compatibility scores,
# then with the rank-based (ordered) scores, for comparison.
rc, sim_scores = optimize_scene(all_docs[1])
rc1, sim_scores1 = optimize_scene(all_docs[1], use_ordered_scores=True)

http://ec2-18-159-140-240.eu-central-1.compute.amazonaws.com:7000/static/dataset1/concatenated_mp4_200/0026_The_Big_Fish_01.27.55.748-01.28.12.028.mp4
24.0
Movie info: {'arango_id': 'Movies/222509634', 'source': 'lsmdc', 'fps': 24, 'width': 1920, 'height': 1080, 'scenes': [[0, 542]], 'mdfs': [[3, 317, 517], [523, 534, 538]], 'scene_elements': [[0, 520], [520, 542]]}
fn path: /tmp/file.mp4
/tmp/file.mp4
Scene:  1
[0.76923077 0.15384615 0.         0.30769231 0.38461538 0.23076923
 0.84615385 0.53846154 0.61538462 0.07692308 0.92307692 0.46153846
 0.69230769]
[0.         0.07692308 0.23076923 0.30769231 0.46153846 0.38461538
 0.69230769 0.84615385 0.61538462 0.15384615 0.76923077 0.92307692
 0.53846154]
[0.84615385 0.         0.07692308 0.46153846 0.69230769 0.30769231
 0.23076923 0.15384615 0.61538462 0.38461538 0.92307692 0.53846154
 0.76923077]
[0.15384615 0.30769231 0.46153846 0.07692308 0.69230769 0.84615385
 0.         0.76923077 0.61538462 0.23076923 0.92307692 0.38461538
 0.538461

In [77]:
# Display the selected sentences and their mean similarity scores.
# NOTE(review): this cell's execution count (77) is lower than that of the cell
# that assigns `rc` above (111), so the shown output may come from an earlier run.
rc, sim_scores

([['an old woman with curly hair is watching you in a room',
   'A blonde woman in a blue dress is watching another woman with a ring in her hand',
   'An old woman in a blue dress is standing in a room and watching you',
   'A woman in a blue dress is standing in a room and watches as the door begins to ring'],
  ['a blonde woman in a blue dress is watching you starting to talk to you',
   'an old woman with curly hair is watching you in a room',
   'An old woman with curly hair is watching in a room',
   'A blonde woman in a blue dress is watching another woman with a ring in her hand',
   'An old woman in a blue dress is standing in a room and watching you',
   'a woman with curly hair in a blue dress is watching somebody through the door'],
  ['a blonde woman in a blue dress is watching you starting to talk to you',
   'an old woman with curly hair is watching you in a room',
   'An old woman with curly hair is watching in a room',
   'A woman in a blue dress is standing in a room 