In [1]:
from sentence_transformers import SentenceTransformer, util

import numpy as np
import tensorflow as tf
import spacy

from glob import glob
import csv
import json
import os

# Lọc danh sách objects để tăng tốc độ xử lí

- Chỉ lấy 1 object mỗi frame và chỉ khi object có điểm >= 0.7

- Chạy một lần thôi, để tạo thư mục objects_filtered.

In [35]:
os.makedirs('./objects_filtered', exist_ok=True)
objects_paths = glob('./objects\\*\\*.json')

for obj_path in objects_paths:
    with open(obj_path, 'r') as r_stream:
        keyframe_filename = obj_path.split('\\')[-2] + '/' + obj_path.split('\\')[-1]
        obj = json.load(r_stream)
        obj_filtered = {
            "Objects": [],
            "Scores": []
        }
        idx = 0
        for score in obj["detection_scores"][:1]:
            if float(score) < 0.7:
                break
            else:
                obj_filtered["Scores"].append(score)
                obj_filtered["Objects"].append(obj["detection_class_entities"][idx])
                idx += 1
        os.makedirs('./objects_filtered/' + keyframe_filename.split('/')[0], exist_ok=True)
        with open('./objects_filtered/' + keyframe_filename, 'w') as w_stream:
            json.dump(obj_filtered, w_stream)

# Tải model xử lí ngôn ngữ tự nhiên en_core_web_lg từ spacy

- Nếu đã tải rồi thì thôi

In [3]:
# !python -m spacy download en_core_web_lg

# Load nlp model

In [2]:
nlp = spacy.load('en_core_web_lg')

# CLIP vectors

- Đọc và tạo mảng gồm tất cả các CLIP vectors để đưa vào model.

In [8]:
# Tạo list tên video và tên keyframe để lát đổi chiếu
objects_paths = glob('./objects_filtered\\*\\*.json')

keyframe_names = []
for path in objects_paths:
    vid_frameid = [path.split('\\')[-2], path.split('\\')[-1].split('.')[0]]
    keyframe_names.append(vid_frameid)

In [19]:
keyframe_names[1]

['L01_V001', '0002']

In [13]:
len(keyframe_names)

199110

In [14]:
clip_paths = glob('./clip-features-vit-b32\\*.npy')
clip_embeddings = np.load(clip_paths[0])

In [15]:
#Đọc và tạo numpy array embeddings CLIP
for path in clip_paths[1:]:
    c_vector = np.load(path)
    clip_embeddings = np.append(clip_embeddings, c_vector, axis=0)
    #clip_embeddings = np.concatenate(clip_embeddings, c_vector)

In [16]:
clip_embeddings = clip_embeddings.astype(np.float32)

In [21]:
clip_embeddings

array([[ 0.03004456,  0.03555298,  0.01282501, ...,  0.05236816,
        -0.03994751, -0.01226044],
       [ 0.00272942, -0.03497314,  0.01190186, ...,  0.09503174,
        -0.01295471,  0.00335312],
       [-0.00409698, -0.02764893, -0.0124054 , ...,  0.08166504,
         0.00548172,  0.00435638],
       ...,
       [-0.04769897,  0.0136261 , -0.0019722 , ...,  0.15551758,
        -0.06311035, -0.01803589],
       [-0.03121948,  0.01737976, -0.02980042, ...,  0.08605957,
        -0.02941895, -0.01332855],
       [ 0.00219727,  0.00026584, -0.00503922, ...,  0.06530762,
         0.00248718, -0.01753235]], dtype=float32)

In [17]:
len(clip_embeddings)

202148

# CLIP model

In [28]:
# Chuyển numpy array sang tensor của tensorflow
# img_emb = tf.convert_to_tensor(clip_embeddings)

# Các hàm predict

## Hàm áp dụng model clip-ViT-B-32

In [7]:
# Tìm bằng CLIP vectors
def clip_search(query, img_embs, k=5):
    #Initialize model
    clip_model = SentenceTransformer('clip-ViT-B-32')
    
    query_emb = clip_model.encode([query], convert_to_tensor=True, show_progress_bar=False)
    
    hits = util.semantic_search(query_emb, img_embs, top_k=k)[0]
    
    results = []
    for hit in hits:
        result = keyframe_names[hit['corpus_id']]
        results.append(result)
    return results

## Hàm áp dụng objects để tiếp tục lọc

- B1: Ta lọc ra các chủ ngữ, đối tượng trong query.

- B2: Ta tính điểm cho từng keyframes, nếu frame nào ko có objects thì coi như điểm = 0
+ Cách thức tính điểm: **SUM(similarity * confidence)**

similarity là điểm tương đồng giữa tên object và tên của từng chủ ngữ, đối tượng, ta tính xong lấy max

confidence là điểm confidence của object đó

In [8]:
# Lọc qua objects
def objects_filtering(clip_results, query, nlp_model, k=3):
    # Extract subjects + objects from query
    query_objs = []
    doc = nlp_model(query)
    # for word in doc:
    #     if word.dep_ == "nsubj" or word.dep_ == "iobj" or word.dep_ == "dobj":
    #         query_objs.append(word)
    for chunk in doc.noun_chunks:
        query_objs.append(chunk)
    
    #Scoring
    scores = []
    for kf in clip_results:
        obj_path = './objects_filtered/' + kf[0] + '/' + kf[1] + '.json'
        with open(obj_path, 'r') as r_stream:
            obj = json.load(r_stream)
            score = 0
            
            for object, confidence in zip(obj["Objects"], obj["Scores"]):
                sims = []
                for query_obj in query_objs:
                    sims.append(nlp_model(object).similarity(query_obj))
                similarity = max(sims)
                score += similarity * float(confidence) #Score is sum of the similarity of the object to the query * confidence score
                
            if len(obj["Objects"]) == 0: # Remain impartial to frames with no objects ?
                score = 0.5
        scores.append(score)
        
    #Get top k frames
    top_k = np.argsort(scores)[-k:]
    return [clip_results[x] for x in top_k]
            

# Lọc concepts

In [9]:
def concepts_filtering(clip_results, query, nlp_model, k=3):
    # Extract subjects + objects from query
    query_objs = []
    doc = nlp_model(query)
    for chunk in doc.noun_chunks:
        query_objs.append(chunk)
    
    scores = []
    for kf in clip_results:
        concept_path = './concepts/' + kf[0] + '/' + kf[1] + '.json'
        with open(concept_path, 'r') as r_stream:
            concept = json.load(r_stream)
            score = 0
            sims = []
            for query_obj in query_objs:
                sims.append(nlp_model(concept).similarity(query_obj))
            score = max(sims)
        scores.append(score)
    
    #Get top k frames
    top_k = np.argsort(scores)[-k:]
    return [clip_results[x] for x in top_k]

# Hàm predict cuối

- B1: Lọc bằng CLIP vectors

- B2: Lọc tiếp bằng objects

- B?? ... (có thể cải tiến thêm)

In [10]:
def predict(query, clip_embeds, nlp_model, k=5):
    
    clip_results = clip_search(query, clip_embeds, k*3)
    
    concept_results = concepts_filtering(clip_results, query, nlp_model, k*2)
    
    objects_results = objects_filtering(concept_results, query, nlp_model, k)
    
    #Mapping results to actual frame numbers
    final_results = []
    for result in objects_results:
        path = './map-keyframes/' + result[0] + '.csv'
        with open(path, 'r') as r_stream:
            csvreader = csv.reader(r_stream)
            next(csvreader) #Skip columns names
            
            for row in csvreader:
                if int(row[0]) == int(result[1]):
                    # final_results.append([result[0], int(row[3]) + np.random.randint(-150,150)])
                    final_results.append([result[0], int(row[3]) + np.random.randint(-2,2)])
                    break
    
    #upload to csv file
    print("Prediction complete! Writing to csv file...")
    os.makedirs('./results', exist_ok=True)
    with open('./results/newest_result.csv', 'w', newline='') as w_stream:
        csvwriter = csv.writer(w_stream)
        csvwriter.writerows(final_results)
    print("Complete")
    return final_results

## Predict test

In [11]:
query = 'The video shows a woman in a yellow shirt putting trash in a trash can. The trash can is dark green and the lid is red. The garbage that was putting in the bin said it was 1kg baby spinach.'

results = predict(query, clip_embeddings, nlp, k=100)

print(results)

Prediction complete! Writing to csv file...
Complete
[['L09_V014', 11763], ['L09_V008', 10019], ['L05_V022', 4626], ['L10_V010', 3777], ['L10_V013', 30168], ['L10_V015', 29365], ['L08_V014', 26287], ['L08_V010', 25295], ['L01_V006', 14188], ['L02_V016', 14189], ['L06_V019', 23554], ['L08_V018', 29513], ['L01_V020', 10623], ['L01_V014', 6587], ['L07_V013', 10283], ['L04_V017', 22488], ['L08_V009', 21151], ['L10_V014', 32577], ['L01_V008', 12596], ['L08_V019', 23093], ['L02_V022', 9069], ['L09_V021', 15627], ['L10_V013', 35050], ['L07_V015', 13189], ['L10_V030', 5634], ['L08_V009', 21236], ['L06_V028', 20217], ['L04_V014', 9948], ['L06_V014', 2427], ['L06_V020', 3801], ['L05_V004', 3502], ['L08_V015', 14239], ['L04_V016', 19232], ['L10_V013', 34921], ['L03_V007', 9089], ['L05_V027', 11067], ['L07_V019', 3027], ['L04_V006', 26949], ['L06_V009', 9372], ['L04_V002', 28156], ['L10_V018', 21661], ['L07_V019', 16528], ['L10_V005', 21401], ['L01_V008', 12439], ['L05_V027', 11586], ['L03_V021', 