In [None]:
!pip install Flask-RESTful

In [None]:
!pip install Flask

In [None]:
import torch
import clip
import cv2
import numpy as np
from PIL import Image
from flask import Flask
from flask_restful import Resource, Api, reqparse
import os

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "ViT-B/32" CLIP model on that device, together with the image
# preprocessing callable that clip.load returns alongside it.
model, preprocess = clip.load("ViT-B/32", device=device)

In [3]:
# Length, in frames, of the sliding window that check() averages over.
NUM_FRAMES = 5
# Minimum windowed mean similarity for check() to count a window as a hit.
THRESHOLD = 0.27

In [15]:
def extract_frames(video_path):
    """Decode every frame of a video file into a list of RGB PIL images.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A list of ``PIL.Image`` objects, one per frame in decode order. The
        list is empty when the first read does not succeed (for example when
        the file cannot be opened).
    """
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    try:
        success, image = vidcap.read()
        while success:
            # OpenCV decodes frames in BGR channel order, whereas
            # Image.fromarray interprets a 3-channel array as RGB. Convert
            # first so the colours seen downstream are not swapped.
            rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(rgb))
            success, image = vidcap.read()
    finally:
        # Always free the underlying capture handle, even if decoding fails.
        vidcap.release()
    return frames

In [4]:
def check(score, num_frames=None, threshold=None):
    """Count sliding windows of consecutive scores whose mean reaches a threshold.

    A window covers ``num_frames`` consecutive entries of ``score`` and is
    advanced one entry at a time using a running sum.

    Args:
        score: Indexable sequence of per-frame scores, in frame order.
        num_frames: Window length. Defaults to the module-level ``NUM_FRAMES``.
        threshold: Minimum value of ``window_sum / num_frames`` for a window
            to be counted. Defaults to the module-level ``THRESHOLD``.

    Returns:
        The number of windows that reached the threshold. With a positive
        threshold an empty ``score`` yields 0.

    Raises:
        ValueError: If ``num_frames`` is smaller than 1.

    Note:
        When ``score`` has fewer than ``num_frames`` entries, the single
        partial window is still divided by ``num_frames`` (not by the number
        of entries present), so short inputs are scored conservatively.
    """
    if num_frames is None:
        num_frames = NUM_FRAMES
    if threshold is None:
        threshold = THRESHOLD
    if num_frames < 1:
        raise ValueError("num_frames must be at least 1")

    total = len(score)

    # Sum of the first window (or of everything, if the input is shorter).
    # Plain `a = a + b` is used instead of `+=` so array-like score entries
    # are never modified in place.
    window_sum = 0
    for idx in range(min(num_frames, total)):
        window_sum = window_sum + score[idx]

    cnt = 1 if window_sum / num_frames >= threshold else 0

    # Slide the window: add the entering score, drop the one that leaves.
    for idx in range(num_frames, total):
        window_sum = window_sum + score[idx] - score[idx - num_frames]
        if window_sum / num_frames >= threshold:
            cnt += 1
    return cnt

In [5]:
def process_videos(video_paths):
    """Encode every frame of every video with the CLIP image encoder.

    Args:
        video_paths: Iterable of video file paths; each is decoded with
            ``extract_frames``.

    Returns:
        A list with one entry per video, in input order. Each entry is a list
        holding one feature tensor per frame; every tensor is divided by its
        norm along the last dimension and stays on ``device``.
    """
    def _embed(frame):
        # Add a leading batch dimension of size 1 and move the input to the device.
        batch = preprocess(frame).unsqueeze(0).to(device)
        with torch.no_grad():
            features = model.encode_image(batch).float()
            features /= features.norm(dim=-1, keepdim=True)
        return features

    return [
        [_embed(frame) for frame in extract_frames(one_path)]
        for one_path in video_paths
    ]

In [44]:
def search_preprocessed_data(data, text):
    """Find the videos whose frame embeddings match a text query.

    Args:
        data: One list of normalized frame feature tensors per video, as
            produced by ``process_videos``.
        text: Query text, passed to ``clip.tokenize``.

    Returns:
        A ``(res, all_score)`` tuple. ``res`` lists the indices into ``data``
        of the videos for which ``check`` reported at least one matching
        window. ``all_score`` holds, for every video, the list of per-frame
        similarity values (NumPy arrays resulting from the matrix product of
        the text features with the transposed frame features).
    """
    tokens = clip.tokenize(text).to(device)
    with torch.no_grad():
        text_features = model.encode_text(tokens).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # The query does not change between frames, so convert it to a host-side
    # NumPy array once instead of repeating the transfer for every frame.
    text_np = text_features.cpu().numpy()

    res = []
    all_score = []
    for vid_index, vid in enumerate(data):
        score = [text_np @ frame.cpu().numpy().T for frame in vid]
        if check(score):
            res.append(vid_index)
        all_score.append(score)
    return res, all_score

In [45]:
# Create the Flask application and the Flask-RESTful Api bound to it;
# resources are registered on `api` further down in the notebook.
app = Flask(__name__)
api = Api(app)

In [26]:
# The videos are expected in a "dataset" folder inside the current working
# directory. os.path.join avoids the invalid '\d' escape sequence and the
# hard-coded Windows separators of manual string concatenation.
path = os.path.join(os.getcwd(), 'dataset')
video_paths = [
    os.path.join(path, file_name)
    for file_name in ('bird.mp4', 'cat.mp4', 'fish.mp4', 'flower.mp4')
]
# Encode all frames once up front so that searches only need to encode the query text.
data = process_videos(video_paths)

In [27]:
# Source page of each video, listed in the same order as `video_paths`.
links = [
    "https://pixabay.com/videos/budgerigar-bird-parrot-pet-animal-2471/",
    "https://pixabay.com/videos/cute-cat-funny-cat-kitten-domestic-3092/",
    "https://pixabay.com/videos/fishes-small-aquarium-small-fish-16166/",
    "https://pixabay.com/videos/blue-tit-blossom-tree-blue-tit-3063/",
]
# Map video index -> source link; built without leaving loop variables behind
# in the notebook namespace.
d = dict(enumerate(links))

In [46]:
class clipSearch(Resource):
    """REST resource that returns the indexed videos matching a text query."""

    def get(self):
        """Search the pre-encoded videos for the request's ``text`` argument.

        ``text`` is declared as required on the request parser. Every video
        index returned by ``search_preprocessed_data`` is reported with its
        source link (from ``d``) and its local file path (from
        ``video_paths``).

        Returns:
            A ``(body, status)`` tuple: ``{'data': [...]}`` and HTTP 200.
        """
        parser = reqparse.RequestParser()
        parser.add_argument('text', required=True)
        args = parser.parse_args()

        # Only the matching indices are needed here; the raw scores are ignored.
        matches, _scores = search_preprocessed_data(data, args['text'])
        ans = [{'link': d[vid], 'path': video_paths[vid]} for vid in matches]

        # Return an explicit status code. A bare trailing comma here would
        # turn the result into a 1-tuple, which ends up serialized as a JSON
        # array wrapping the object instead of the object itself.
        return {'data': ans}, 200


api.add_resource(clipSearch, '/clipsearch')

In [None]:
# Start serving `app` with Flask's default settings (no host/port are passed).
# NOTE(review): this call presumably blocks the cell until the server is stopped.
app.run()