In [1]:
import os
import torch
import cv2
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np

# Base directory for the relative paths built in the cells below; this is the
# process's working directory at the moment this cell is executed.
cur_dir = os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-trained CLIP weights and the matching processor.
# Both are pulled from the same checkpoint identifier so they stay in sync.
clip_checkpoint = "openai/clip-vit-base-patch16"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

# Function to extract frames from a video
def extract_frames(video_path):
    """Decode every frame of a video into an in-memory list.

    Args:
        video_path: Path passed straight to ``cv2.VideoCapture``.

    Returns:
        list: The frames in decode order, exactly as returned by
        ``cap.read()``. The list is empty when the capture is not opened
        or the very first read fails.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was returned: stop reading.
                break
            frames.append(frame)
    finally:
        # Release the capture handle even if reading raises part-way through.
        cap.release()
    return frames

  return self.fget.__get__(instance, owner)()
2024-11-21 14:35:32.775198: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# CLIP text-image cosine similarity between each source prompt and the
# corresponding input first frame.
text_prompt_dir = os.path.join(cur_dir, '../configs/src_text_prompts/')
videos_dir = os.path.join(cur_dir, '../outputs/first_frame/input')

cos_sim_all = {}
# List `videos_dir` (the directory the images are opened from below) so the
# cell does not depend on a variable assigned in another cell.
for filename in os.listdir(videos_dir):
    # The prompt file shares the image's name with the extension swapped to .txt.
    prompt_name = filename.replace(".jpg", ".txt").replace(".png", ".txt")
    with open(os.path.join(text_prompt_dir, prompt_name), "r") as file:
        text_input = file.read().strip()  # Remove any surrounding whitespace or newline characters
    # Drop the boilerplate quality phrase and punctuation before scoring.
    text_input = text_input.replace("Amazing quality, masterpiece", "")
    text_input = text_input.replace('.', '').replace(',', '')

    # Use the image as a context manager so its file handle is closed once
    # the processor has converted it to tensors.
    with Image.open(os.path.join(videos_dir, filename)) as image_input:
        inputs = processor(
            text=text_input,
            images=image_input,
            return_tensors="pt",
        )

    # Forward pass through the model; inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    text_features = outputs.text_embeds
    image_features = outputs.image_embeds

    # Normalise out-of-place so the model's output tensors are not mutated.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

    cos_sim = torch.nn.functional.cosine_similarity(text_features, image_features)
    cos_sim_all[filename.split('.')[0]] = cos_sim


for k, v in cos_sim_all.items():
    print(f"Cosine Similarity of {k}: {v}")
if cos_sim_all:
    avg = sum(cos_sim_all.values()) / len(cos_sim_all)
    print(f"Averaged Cosine Similarity: {avg}")
else:
    # Avoid a ZeroDivisionError when the directory contained nothing to score.
    print(f"No images found in {videos_dir}; nothing to average.")



In [9]:
# CLIP text-image cosine similarity between each target prompt and the frames
# stored under <results>/<name>/result_frames, averaged per result directory.
text_prompt_dir = os.path.join(cur_dir, 'configs/tgt_text_prompts/')
first_frame_dir = os.path.join(cur_dir, 'results')

cos_sim_all = {}
for filedir in os.listdir(first_frame_dir):
    frames_dir = os.path.join(first_frame_dir, filedir, 'result_frames')
    if not os.path.isdir(frames_dir):
        continue

    try:
        with open(os.path.join(text_prompt_dir, filedir+".txt"), "r") as file:
            text_input = file.read().strip()  # Remove any surrounding whitespace or newline characters
    except OSError:
        # Prompt file missing or unreadable: report the directory name and skip it.
        # Catching OSError (not a bare except) lets KeyboardInterrupt and real bugs surface.
        print(filedir)
        continue
    # Drop the boilerplate quality phrase and punctuation before scoring.
    text_input = text_input.replace("Amazing quality, masterpiece", "")
    text_input = text_input.replace('.', '').replace(',', '')

    cos_sim_frames = []
    for filename in os.listdir(frames_dir):
        # Entries whose name contains 'rf_inv_latents' are not scored.
        if 'rf_inv_latents' in filename:
            continue

        # Use the image as a context manager so its file handle is closed
        # once the processor has converted it to tensors.
        with Image.open(os.path.join(frames_dir, filename)) as image_input:
            inputs = processor(
                text=text_input,
                images=image_input,
                return_tensors="pt",
            )

        # Forward pass through the model; inference only, so skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(**inputs)
        text_features = outputs.text_embeds
        image_features = outputs.image_embeds

        # Normalise out-of-place so the model's output tensors are not mutated.
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        cos_sim = torch.nn.functional.cosine_similarity(text_features, image_features)
        cos_sim_frames.append(cos_sim)

    if not cos_sim_frames:
        # Nothing was scored for this directory; skip it instead of dividing by zero.
        print(f"No frames to score in {frames_dir}; skipping.")
        continue

    cos_sim_all[filedir.split('.')[0]] = sum(cos_sim_frames)/len(cos_sim_frames)

for k, v in cos_sim_all.items():
    print(f"Cosine Similarity of {k}: {v}")
if cos_sim_all:
    avg = sum(cos_sim_all.values()) / len(cos_sim_all)
    print(f"Averaged Cosine Similarity: {avg}")
else:
    # Avoid a ZeroDivisionError when no result directory could be scored.
    print(f"No results scored under {first_frame_dir}; nothing to average.")



car-turn_car
wolf_elphant
car-turn_tank_frames40
car-turn_lion_frames40
car-turn_spaceship_frames24
Cosine Similarity of locomotive_car_snow: tensor([0.2653])
Cosine Similarity of car-turn_cartoon: tensor([0.3462])
Cosine Similarity of woman-running_sculpture: tensor([0.2874])
Cosine Similarity of eiffel-flyover_style: tensor([0.3403])
Cosine Similarity of bus_car: tensor([0.2561])
Cosine Similarity of dog_cat: tensor([0.3300])
Cosine Similarity of blackswan_duck: tensor([0.2998])
Cosine Similarity of car-turn_elephant: tensor([0.3067])
Cosine Similarity of eiffel-flyover_object_multiple: tensor([0.3140])
Cosine Similarity of camel_giraffe: tensor([0.3114])
Cosine Similarity of locomotive_car: tensor([0.2953])
Cosine Similarity of wolf_silver_robotic: tensor([0.2266])
Cosine Similarity of eiffel-flyover_bg: tensor([0.3144])
Cosine Similarity of aircraft_object: tensor([0.2926])
Cosine Similarity of car-turn_tank: tensor([0.3052])
Cosine Similarity of bear_giraffe: tensor([0.3161])
Cosi

In [None]:
# CLIP text-image cosine similarity between each target prompt and every frame
# of the corresponding video, averaged per video.
text_prompt_dir = os.path.join(cur_dir, '../configs/tgt_text_prompts/')
videos_dir = os.path.join(cur_dir, '../outputs/all_frames/motion')

cos_sim_all = {}
for filedir in os.listdir(videos_dir):
    # The prompt file shares the video's name with .mp4 swapped for .txt.
    with open(os.path.join(text_prompt_dir, filedir.replace(".mp4", ".txt")), "r") as file:
        text_input = file.read().strip()  # Remove any surrounding whitespace or newline characters
    # Drop the boilerplate quality phrase and punctuation before scoring.
    text_input = text_input.replace("Amazing quality, masterpiece", "")
    text_input = text_input.replace('.', '').replace(',', '')

    frames = extract_frames(os.path.join(videos_dir, filedir))
    if not frames:
        # extract_frames returns an empty list when nothing could be read;
        # skip the entry rather than passing an empty image batch onward.
        print(f"Skipping {filedir}: no frames could be decoded.")
        continue

    # Process the text and frames with CLIP processor
    # (frames are converted from OpenCV's BGR channel order to RGB first).
    inputs = processor(
        text=text_input,
        images=[Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) for frame in frames],
        return_tensors="pt"
    )

    # Forward pass through the model; inference only, so skip autograd
    # bookkeeping for the whole batch of frames.
    with torch.no_grad():
        outputs = model(**inputs)
    text_features = outputs.text_embeds
    image_features = outputs.image_embeds

    # Calculate cosine similarities between the text and each frame
    cosine_similarities = torch.nn.functional.cosine_similarity(text_features, image_features)

    # Average the per-frame similarities into one score for this video
    avg_score = torch.mean(cosine_similarities).item()
    print(f"The averaged similarity is {avg_score}.")
    cos_sim_all[filedir.split('.')[0]] = avg_score

for k, v in cos_sim_all.items():
    print(f"Cosine Similarity of {k}: {v}")
if cos_sim_all:
    avg = sum(cos_sim_all.values()) / len(cos_sim_all)
    print(f"Averaged Cosine Similarity: {avg}")
else:
    # Avoid a ZeroDivisionError when no video could be scored.
    print(f"No videos scored under {videos_dir}; nothing to average.")
