In [1]:
import sys
sys.path.insert(0, "/notebooks/pipenv")
sys.path.insert(0, "/notebooks/nebula3_vlm")
sys.path.insert(0, "/notebooks/nebula3_database")
sys.path.insert(0, "/notebooks/")
import os
import math
import random
import bisect
import pickle
import time
import numpy as np


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import urllib
import subprocess
import re
import tempfile
import itertools
import torch
import spacy
# import amrlib
# import penman

from typing import List, Tuple
from operator import itemgetter 
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification
from database.arangodb import DatabaseConnector
from config import NEBULA_CONF
from movie_db import MOVIE_DB


In [3]:
class PIPELINE:
    def __init__(self):
        config = NEBULA_CONF()
        self.db_host = config.get_database_host()
        self.database = config.get_playground_name()
        self.gdb = DatabaseConnector()
        self.db = self.gdb.connect_db(self.database)

pipeline = PIPELINE()
mdb = MOVIE_DB()
from vlm.clip_api import CLIP_API
clip=CLIP_API('vit')
s2_collection_name = 's2_pipeline_after_gpt'
s2_results_orig_collection_name = 's2_pipeline_optim_orig'
s2_results_relaxed_collection_name = 's2_pipeline_optim_relaxed'
s2_compatibility_collection_name = 's2_pipeline_compatibility_scores'
s2_with_compat_collection_name = 's2_pipeline_compatibility_results'


In [4]:
def flatten(lst): return [x for l in lst for x in l]

def compute_batch_scores(video_emb: torch.Tensor, texts: List[str], normalize=True, **kwargs) -> List[float]:    
    emb_batch = clip.clip_batch_encode_text(texts, **kwargs)                           
    return (video_emb.expand_as(emb_batch)*emb_batch).sum(dim=1).cpu().numpy()


def compute_concat_score(image_emb: torch.Tensor, texts: List[str], join_on=',') -> float:
    combined_text = ""
    for t in [x.strip() for x in texts]:
        if t[-1]=='.':
            t = t[:-1]       
        t+=join_on
        t+=' '
        combined_text+=t
    print("Combined: "+combined_text)
    return torch.matmul(image_emb,mdmmt.encode_text(combined_text.strip()) )       

In [5]:
# def transform_concept(c):
#     exp = re.compile(r"^([a-zA-z]+)-?(\d*)$")
#     r = exp.match(c)
#     return r.group(1) if r else c

# class ConceptManager:
#     def __init__(self):
#         pass
#     def ground_concept(concept):
#         return transform_concept(concept)

In [6]:
class SimilarityManager:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_lg')

    def similarity(self, src, target):
        rc = []
        s1 = self.nlp(src)
        s2 = self.nlp(target)
        for w in s1:
            # if not w or not w.vector_norm:
            #     print('Argghhh 1, bad word:')
            #     print(w.text)
            if w.pos_ not in ['NOUN', 'ADJ', 'ADV', 'VERB', 'PROPN', 'ADP'] and len(s1)>1:
                continue
            # for tok in s2:
            #     if not tok or not tok.vector_norm:
            #         print('Argghhh 2, bad word:')
            #         print(tok.text)
            #         print(s2.text)
            rc.append(max([w.similarity(x) for x in s2]))
        return np.mean(rc)
        
smanager = SimilarityManager()


In [7]:
softmax = lambda x: np.exp(x)/sum(np.exp(x))
def normalize(x):
    epsilon = 0.00001
    if np.std(x) < epsilon:
        return np.ones(x.shape)
    return (x - np.mean(x)) / np.std(x)


In [50]:
def powerset(iterable):
    s = list(iterable)
    return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(1,len(s)+1))


def optimize_sents(emb_video, experts, sents, compat_scores, use_ordered_scores=False):
    smanager = SimilarityManager()
    compat_scores = np.array(compat_scores)
    as_compat = compat_scores.argsort()
    # print(compat_scores)
    graded_scores = sorted(list(zip(as_compat,range(len(as_compat)))),key = lambda x:x[0])
    # print(list(zip(as_compat,range(len(as_compat)))))
    # print(graded_scores)
    order_scores = np.array(list(zip(*graded_scores))[1]) / len(graded_scores)
    print(order_scores)
    # print(list(zip(compat_scores[as_compat],as_compat)))
    candidates_similarity = normalize(compute_batch_scores(emb_video, sents))
    coverage_matrix = np.zeros([len(experts),len(sents)])
    coverage_matrix[:] = np.nan
    for i in range(len(experts)):
        for j in range(len(sents)):
            coverage_matrix[i][j]=smanager.similarity(experts[i],sents[j])
        coverage_matrix[i] = normalize(coverage_matrix[i])

    def get_score(state: List[int]) -> float:
        theta_similarity = 1.
        theta_coverage = 1.
        theta_compat = 1.
        if not state:
            return 0
        coverage_score = get_state_coverage(state)   
        similarity_score = candidates_similarity[state].mean().item()
        if use_ordered_scores:            
             compat_score = order_scores[state].mean().item()
        else:            
             compat_score = compat_scores[state].mean().item()
        return theta_coverage*coverage_score + theta_similarity*similarity_score + theta_compat*compat_score

    def get_expert_coverage(state):
        # return self.coverage_matrix[:,state].sum(axis=1)
        return coverage_matrix[:,state].max(axis=1)
          
    def get_state_coverage(state) -> float:
        # print("State coverage for {}:".format(state))
        # print(get_expert_coverage(state))
        return np.mean(get_expert_coverage(state))


    superset = list(range(len(sents)))
    pset = [list(x) for x in powerset(superset)]
    pset_scores = [get_score(x) for x in pset]
    best_cand = pset[np.argmax(pset_scores)]
    print("Best candidates:")
    print(best_cand)
    rc = list(itemgetter(*best_cand)(sents)), candidates_similarity[best_cand].mean()
    if type(rc)==int:
        return [rc]
    else:
        return rc

def optimize_scene(doc,mat=None, emb_video=None, **kwargs):
    mid = doc['movie_id']
    elem = doc['scene_element']
    emb_video = clip.clip_encode_video(mid,elem)
    all_sents = doc['sentences']
    rc = mdb.get_scene_from_collection(mid,elem,'s2_clsmdc')    
    experts = flatten(rc['experts'].values())
    rc = mdb.get_scene_from_collection(mid,elem,s2_compatibility_collection_name)  
    all_compat_scores = rc['compat_scores']
    n = len(all_sents)
    rc_sents = n*[None]
    mean_scores = n*[None]
    for i in range(n):
        rc_sents[i], mean_scores[i] = optimize_sents(emb_video,experts,all_sents[i],all_compat_scores[i], **kwargs)

    return rc_sents, mean_scores
    
def run_pipeline(all_docs, target_collection_name=s2_with_compat_collection_name, **kwargs):
    for doc in all_docs:
        mid = doc['movie_id']
        elem = doc['scene_element']
        rc = mdb.get_scene_from_collection(mid,elem,target_collection_name)
        if rc:
            print("Results already exist for {}/{}".format(mid,elem))
            continue
        print("Going forward with {}/{}".format(mid,elem))

        rc_sents, sim_scores = optimize_scene(doc,**kwargs)
        rc_doc = {
            'movie_id': mid,
            'scene_element': elem,
            'sentences': rc_sents,
            'mean_scores': sim_scores,
        }
        query = "INSERT {} INTO {}".format(rc_doc,target_collection_name)
        cursor = pipeline.db.aql.execute(query)  

In [49]:
type((3,2))==int

False

In [9]:
query = 'FOR doc IN {} RETURN doc'.format(s2_results_relaxed_collection_name)
cursor = pipeline.db.aql.execute(query)
all_docs = sorted(list(cursor), key=lambda x:"{}/{}".format(x['movie_id'],x['scene_element']))


In [None]:
run_pipeline(all_docs)

In [47]:
flatten([[itemgetter(*[2,3])([1,2,3,4,5,6,7,8,9])]])

[(3, 4)]

In [18]:
rc, sim_scores = optimize_scene(all_docs[1])
# rc1, sim_scores1 = optimize_scene(all_docs[1], use_ordered_scores=True)

http://ec2-18-159-140-240.eu-central-1.compute.amazonaws.com:7000/static/dataset1/concatenated_mp4_200/0026_The_Big_Fish_01.27.55.748-01.28.12.028.mp4
24.0
Movie info: {'arango_id': 'Movies/222509634', 'source': 'lsmdc', 'fps': 24, 'width': 1920, 'height': 1080, 'scenes': [[0, 542]], 'mdfs': [[3, 317, 517], [523, 534, 538]], 'scene_elements': [[0, 520], [520, 542]]}
fn path: /tmp/file.mp4
/tmp/file.mp4
Scene:  1
[0.76923077 0.15384615 0.         0.30769231 0.38461538 0.23076923
 0.84615385 0.53846154 0.61538462 0.07692308 0.92307692 0.46153846
 0.69230769]
Best candidates:
[10]
[0.         0.07692308 0.23076923 0.30769231 0.46153846 0.38461538
 0.69230769 0.84615385 0.61538462 0.15384615 0.76923077 0.92307692
 0.53846154]
Best candidates:
[0, 7, 9, 10]
[0.84615385 0.         0.07692308 0.46153846 0.69230769 0.30769231
 0.23076923 0.15384615 0.61538462 0.38461538 0.92307692 0.53846154
 0.76923077]
Best candidates:
[1, 10]
[0.15384615 0.30769231 0.46153846 0.07692308 0.69230769 0.8461538

In [19]:
rc, sim_scores

([['A',
   'n',
   ' ',
   'o',
   'l',
   'd',
   ' ',
   'w',
   'o',
   'm',
   'a',
   'n',
   ' ',
   'i',
   'n',
   ' ',
   'a',
   ' ',
   'b',
   'l',
   'u',
   'e',
   ' ',
   'd',
   'r',
   'e',
   's',
   's',
   ' ',
   'i',
   's',
   ' ',
   's',
   't',
   'a',
   'n',
   'd',
   'i',
   'n',
   'g',
   ' ',
   'i',
   'n',
   ' ',
   'a',
   ' ',
   'r',
   'o',
   'o',
   'm',
   ' ',
   'a',
   'n',
   'd',
   ' ',
   'w',
   'a',
   't',
   'c',
   'h',
   'i',
   'n',
   'g',
   ' ',
   'y',
   'o',
   'u'],
  ['a blonde woman in a blue dress is watching you starting to talk to you',
   'An old woman with curly hair is watching in a room',
   'A blonde woman in a blue dress is watching another woman with a ring in her hand',
   'An old woman in a blue dress is standing in a room and watching you'],
  ['a blonde woman in a blue dress is watching you starting to talk to you',
   'An old woman with curly hair is watching in a room'],
  ['A',
   'n',
   ' ',
   'o',
