In [1]:
%cd ../

/mnt/dog/data/indraroy15/2025-MatchCut


In [2]:
import json
from typing import Set

import pandas as pd

from data.matchcut.matchcut.data import Dataset
import matplotlib.pyplot as plt

In [3]:
df_titles = pd.read_csv('data/matchcut/data/imdb-title-set.csv')
df_titles

Unnamed: 0,IMDB ID,title,genres,country,split
0,tt0050706,Mon Oncle (1958),Comedy,France,train
1,tt0059592,Pierrot le Fou (1965),"Crime, Drama, Romance",France,train
2,tt0061722,The Graduate (1967),"Comedy, Drama, Romance",USA,train
3,tt0061781,The Firemen's Ball (1967),"Comedy, Drama",Czechoslovakia,validation
4,tt0066921,A Clockwork Orange (1971),"Crime, Drama, Sci-Fi",UK,train
...,...,...,...,...,...
95,tt5827496,At Cafe 6 (2016),"Comedy, Romance",Taiwan,test
96,tt5866930,The Adventurers (2017),"Action, Adventure, Crime, Drama",China,validation
97,tt6157626,Legend of the Demon Cat (2017),"Drama, Fantasy, Horror, Mystery, Romance",China,validation
98,tt6298600,The Miracles of the Namiya General Store (2017),Drama,Japan,train


In [14]:
source_path = "../OpenDataLab___MovieNet/raw/240P" # NOTE: has to be downloaded separately 
target_path = "data/matchcut/data/movienet_netflix"

#if target path is no there, xreate it 
import os
if not os.path.exists(target_path):
    os.makedirs(target_path)

# for each IMDB ID, copy the .tar file form source path into target path 
import shutil 
for idx, row in df_titles.iterrows():
    imdb_id = row['IMDB ID']
    tar_file = f"{imdb_id}.tar"
    source_file = f"{source_path}/{tar_file}"
    target_file = f"{target_path}/{tar_file}"
    if os.path.exists(source_file):
        shutil.copyfile(source_file, target_file)
        print(f"copied {source_file} to {target_file}")
    else:
        print(f"source file {source_file} does not exist")

copied ../OpenDataLab___MovieNet/raw/240P/tt0050706.tar to data/matchcut/data/movienet_netflix/tt0050706.tar
copied ../OpenDataLab___MovieNet/raw/240P/tt0059592.tar to data/matchcut/data/movienet_netflix/tt0059592.tar
copied ../OpenDataLab___MovieNet/raw/240P/tt0061722.tar to data/matchcut/data/movienet_netflix/tt0061722.tar
copied ../OpenDataLab___MovieNet/raw/240P/tt0061781.tar to data/matchcut/data/movienet_netflix/tt0061781.tar
copied ../OpenDataLab___MovieNet/raw/240P/tt0066921.tar to data/matchcut/data/movienet_netflix/tt0066921.tar
copied ../OpenDataLab___MovieNet/raw/240P/tt0070245.tar to data/matchcut/data/movienet_netflix/tt0070245.tar
copied ../OpenDataLab___MovieNet/raw/240P/tt0070246.tar to data/matchcut/data/movienet_netflix/tt0070246.tar
copied ../OpenDataLab___MovieNet/raw/240P/tt0071315.tar to data/matchcut/data/movienet_netflix/tt0071315.tar
copied ../OpenDataLab___MovieNet/raw/240P/tt0079182.tar to data/matchcut/data/movienet_netflix/tt0079182.tar
copied ../OpenDataL

In [15]:
# create dir movienet_netflix_untar where ewach tar inside is untarred in separate dir
import tarfile
import os
import shutil
import glob

target_path_untar = "data/matchcut/data/movienet_netflix_untar"
if not os.path.exists(target_path_untar):
    os.makedirs(target_path_untar)

for tar_file in glob.glob(f"{target_path}/*.tar"):
    with tarfile.open(tar_file, 'r') as tar:
        tar.extractall(target_path_untar)
        print(f"untarred {tar_file} to {target_path_untar}")

untarred data/matchcut/data/movienet_netflix/tt0208092.tar to data/matchcut/data/movienet_netflix_untar
untarred data/matchcut/data/movienet_netflix/tt0387898.tar to data/matchcut/data/movienet_netflix_untar
untarred data/matchcut/data/movienet_netflix/tt1560747.tar to data/matchcut/data/movienet_netflix_untar
untarred data/matchcut/data/movienet_netflix/tt4273292.tar to data/matchcut/data/movienet_netflix_untar
untarred data/matchcut/data/movienet_netflix/tt1220719.tar to data/matchcut/data/movienet_netflix_untar
untarred data/matchcut/data/movienet_netflix/tt1974419.tar to data/matchcut/data/movienet_netflix_untar
untarred data/matchcut/data/movienet_netflix/tt0780504.tar to data/matchcut/data/movienet_netflix_untar
untarred data/matchcut/data/movienet_netflix/tt3501416.tar to data/matchcut/data/movienet_netflix_untar
untarred data/matchcut/data/movienet_netflix/tt3672840.tar to data/matchcut/data/movienet_netflix_untar
untarred data/matchcut/data/movienet_netflix/tt0118799.tar to da

In [4]:
def get_df(task: str, source: str = None) -> pd.DataFrame:
    """
    Read a dataframe where `task` is one of frame or motion.
    if `source` is not provided all the data is returned,
    if it is then the data is filtered down to the pairs
    retrieved by one of the 4 heuristics.
    """
    ds = Dataset(
        task=task,
        split="train",  # ignored
        encoder_name=None,
        agg_name=None,
        source=source,
    )
    return pd.DataFrame(p.__dict__ for p in ds.pairs_labeled_all)

In [5]:
def get_stats(df: pd.DataFrame, task: str):
    return {
        "Task": task,
        "Annotated pairs": len(df),
        "Positive pairs (majority label)": df.label.sum(),
        "Positive rate": df.label.mean(),

    }
df_frame = get_df('frame')
df_motion = get_df('motion')

df_stats = pd.DataFrame([
    get_stats(df_frame, "frame"),
    get_stats(df_motion, "motion"),
])
df_stats = df_stats.set_index("Task").transpose().round(3)
df_stats = df_stats.assign(overall=df_stats.frame + df_stats.motion)
df_stats.loc['Positive rate', 'overall'] = df_stats.loc['Positive pairs (majority label)', 'overall'] / df_stats.loc['Annotated pairs', 'overall']
df_stats

Task,frame,motion,overall
Annotated pairs,9985.0,9320.0,19305.0
Positive pairs (majority label),867.0,927.0,1794.0
Positive rate,0.087,0.099,0.092929


In [6]:
df_frame_true = df_frame[df_frame.label == True]

In [7]:
df_frame_true

Unnamed: 0,imdb_id,shot1_idx,shot2_idx,label,source_info
50,tt0118799,48,539,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
54,tt0118799,101,439,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
55,tt0118799,101,607,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
59,tt0118799,125,219,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
61,tt0118799,135,441,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
...,...,...,...,...,...
9949,tt6788942,366,1433,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
9959,tt6788942,622,1154,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
9963,tt6788942,658,1012,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
9969,tt6788942,815,1366,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."


In [10]:
import os
import shutil
import glob

target_path_untar = "data/matchcut/data/movienet_netflix_untar"


#contains only the positive pairs for motions
new_target_path = "data/matchcut/data/movienet_netflix_filtered"
if not os.path.exists(new_target_path):
    os.makedirs(new_target_path)

    
def get_sub_df(task: str, label: str) -> pd.DataFrame:
    df = get_df(task)
    label_bool = True if label == "pos" else False
    return df[df.label == label_bool]


def dump_filtered_files(task: str, label: str):
    df = get_sub_df(task, label)
    folder_target_path = new_target_path + f"/{task}_{label}"
    if not os.path.exists(folder_target_path):
        os.makedirs(folder_target_path)

    for idx, row in df_frame_true.iterrows():
        imdb_id = row['imdb_id']
        shot1_idx = row['shot1_idx']
        shot2_idx = row['shot2_idx']
        
        source_folder_path = os.path.join(target_path_untar, str(imdb_id))
  
            
        dest_folder_path = os.path.join(folder_target_path, str(imdb_id))
        if not os.path.exists(dest_folder_path):
            os.makedirs(dest_folder_path)
        
        shot1_pattern = f"shot_{str(shot1_idx).zfill(4)}_img_*.jpg"
        shot2_pattern = f"shot_{str(shot2_idx).zfill(4)}_img_*.jpg"
        
        for shot_pattern in [shot1_pattern, shot2_pattern]:
            for img_file in glob.glob(os.path.join(source_folder_path, shot_pattern)):
                shutil.copy(img_file, dest_folder_path)
                print(f"Copied {img_file} to {dest_folder_path}")


In [11]:
dump_filtered_files('frame', 'pos')

Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_0.jpg to data/matchcut/data/movienet_netflix_filtered/frame_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_1.jpg to data/matchcut/data/movienet_netflix_filtered/frame_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_2.jpg to data/matchcut/data/movienet_netflix_filtered/frame_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_0.jpg to data/matchcut/data/movienet_netflix_filtered/frame_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_1.jpg to data/matchcut/data/movienet_netflix_filtered/frame_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_2.jpg to data/matchcut/data/movienet_netflix_filtered/frame_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0101_img_1.jpg to data/matchcut/data/movienet_netflix_

In [12]:
dump_filtered_files('frame', 'neg')

Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_0.jpg to data/matchcut/data/movienet_netflix_filtered/frame_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_1.jpg to data/matchcut/data/movienet_netflix_filtered/frame_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_2.jpg to data/matchcut/data/movienet_netflix_filtered/frame_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_0.jpg to data/matchcut/data/movienet_netflix_filtered/frame_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_1.jpg to data/matchcut/data/movienet_netflix_filtered/frame_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_2.jpg to data/matchcut/data/movienet_netflix_filtered/frame_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0101_img_1.jpg to data/matchcut/data/movienet_netflix_

In [13]:
dump_filtered_files('motion', 'pos')

Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_0.jpg to data/matchcut/data/movienet_netflix_filtered/motion_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_1.jpg to data/matchcut/data/movienet_netflix_filtered/motion_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_2.jpg to data/matchcut/data/movienet_netflix_filtered/motion_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_0.jpg to data/matchcut/data/movienet_netflix_filtered/motion_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_1.jpg to data/matchcut/data/movienet_netflix_filtered/motion_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_2.jpg to data/matchcut/data/movienet_netflix_filtered/motion_pos/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0101_img_1.jpg to data/matchcut/data/movienet_ne

In [14]:
dump_filtered_files('motion', 'neg')

Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_0.jpg to data/matchcut/data/movienet_netflix_filtered/motion_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_1.jpg to data/matchcut/data/movienet_netflix_filtered/motion_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0048_img_2.jpg to data/matchcut/data/movienet_netflix_filtered/motion_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_0.jpg to data/matchcut/data/movienet_netflix_filtered/motion_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_1.jpg to data/matchcut/data/movienet_netflix_filtered/motion_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0539_img_2.jpg to data/matchcut/data/movienet_netflix_filtered/motion_neg/tt0118799
Copied data/matchcut/data/movienet_netflix_untar/tt0118799/shot_0101_img_1.jpg to data/matchcut/data/movienet_ne

In [15]:
    df = get_sub_df("frame", "pos")

In [16]:
df

Unnamed: 0,imdb_id,shot1_idx,shot2_idx,label,source_info
50,tt0118799,48,539,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
54,tt0118799,101,439,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
55,tt0118799,101,607,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
59,tt0118799,125,219,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
61,tt0118799,135,441,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
...,...,...,...,...,...
9949,tt6788942,366,1433,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
9959,tt6788942,622,1154,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
9963,tt6788942,658,1012,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
9969,tt6788942,815,1366,True,"SourceInfo(sources_and_scores=(('heuristic 2',..."
