# Libraries 

In [1]:
from collections import defaultdict
import re
import webvtt
import os
import extract_utils
import insert_utils
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from get_variables import get_kukleva_merged_interactions, get_kukleva_merged_relationships
import datetime
import math
import pickle

# Dataset import

In [2]:
dataset = pd.read_pickle("../../../MovieGraphs_Data/Annotations/mg/py3loader/2017-11-02-51-7637_py3.pkl")
movies_data = open("../../../MovieGraphs_Data/Annotations/mg/dvds", mode="r", encoding="utf-8").read().split("\n")

# Extract all the movies basic info 

In [3]:
dataset_movies = list(dataset.keys())
movies_info = extract_utils.extract_movies_info(dataset_movies,movies_data)

## Sample results

In [4]:
# Initialize the counter
i = 0
# Loop through the elements
for movie_id, info in movies_info.items():
    # Display the movie and its info 
    print(f"ID: {movie_id} - Info : {info}")
    # Increase the counter
    i+=1
    # Check if enough samples have been displayed 
    if i==500:
        break

ID: tt0147800 - Info : {'title': '10 Things I Hate About You', 'year': '1999'}
ID: tt0988595 - Info : {'title': '27 Dresses', 'year': '2008'}
ID: tt0119822 - Info : {'title': 'As Good as It Gets', 'year': '1997'}
ID: tt0455824 - Info : {'title': 'Australia', 'year': '2008'}
ID: tt0307987 - Info : {'title': 'Bad Santa', 'year': '2003'}
ID: tt0388795 - Info : {'title': 'Brokeback Mountain', 'year': '2005'}
ID: tt0118842 - Info : {'title': 'Chasing Amy', 'year': '1997'}
ID: tt0375679 - Info : {'title': 'Crash', 'year': '2004'}
ID: tt1570728 - Info : {'title': 'Crazy, Stupid, Love.', 'year': '2011'}
ID: tt1499658 - Info : {'title': 'Horrible Bosses', 'year': '2011'}
ID: tt0790636 - Info : {'title': 'Dallas Buyers Club', 'year': '2013'}
ID: tt1907668 - Info : {'title': 'Flight', 'year': '2012'}
ID: tt0109830 - Info : {'title': 'Forrest Gump', 'year': '1994'}
ID: tt0109831 - Info : {'title': 'Four Weddings and a Funeral', 'year': '1994'}
ID: tt1632708 - Info : {'title': 'Friends with Benefit

## Generate the movies insertions script

In [5]:
insert_utils.insert_movies(movies_info)

# Movies and clip graphs structure (sample)

In [6]:
# Initialize the counter
i = 0

# Loop through the movies
for movie in dataset:
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items():
        print(f"Movie : {movie} - Clip : {clip}")
        break
        # Increase the counter
    i+=1
    # Check if enough samples have been displayed 
    if i==5:
        break

Movie : tt0988595 - Clip : (1, <GraphClasses.ClipGraph object at 0x169c1fdf0>)
Movie : tt1285016 - Clip : (0, <GraphClasses.ClipGraph object at 0x16de5d3c0>)
Movie : tt0167404 - Clip : (0, <GraphClasses.ClipGraph object at 0x2a04d1ba0>)
Movie : tt0790636 - Clip : (3, <GraphClasses.ClipGraph object at 0x2a0c57160>)
Movie : tt0100405 - Clip : (1, <GraphClasses.ClipGraph object at 0x2a33effd0>)


# Get all the characters featured in each clip/scene
**Output format : {movie_id : {clip_id: [characters_list]}}**

Note : The characters list is a set since we should avoid repetition for a single clip

In [7]:
# Get the characters
clip_characters = extract_utils.extract_characters(dataset)

## Sample results (characters) for a movie

Movie tt0037884, Clip 7

In [8]:
# Print characters of a specific movie
print(clip_characters["tt0037884"][7])

{'Don Birnam', 'Wick Birnam', 'Helen St. James'}


### Extract all the chracters names
The total text variable will be useful during the embedding phase, when we'll use the tokenizer 


## Generate the clip and featured characters insertions script

## Characters insertion

Insert characters and their corresponding movie. 

**Waiting for the clips/scenes to be properly delimited to insert clip and scene features.**

In [9]:
insert_utils.insert_characters_and_features(clip_characters)

# Interactions processing for each clip

- **Output format : {movie_id : {clip_id: [interactions]}}**
- **Interactions format (as of now): {summary, start_time, end_time}**

## Clip structure sample

In [10]:
dataset["tt0037884"].clip_graphs[1].orig_graph_json

{'index_fname': '/static/indexed_data/video/tt0037884/elza/index.json',
 'last_saved_str': '2017-04-24-16-21-09',
 'image': 'offweb/raw_data/video_scenes/tt0037884/scene-002.ss-0007.es-0009.jpg',
 'sentence_description': 'While Wick searches for the typewriter, Don tries to open an alcohol bottle, but quickly hides it when Wick returns.',
 'scene': 'bedroom',
 'depth': 1,
 'edges': [{'source': 20, 'target': 22},
  {'source': 22, 'target': 21},
  {'source': 21, 'undirected': True, 'target': 23},
  {'source': 21, 'undirected': True, 'target': 24},
  {'source': 20, 'undirected': True, 'target': 25},
  {'source': 21, 'target': 26},
  {'source': 26, 'target': 20},
  {'source': 24, 'undirected': True, 'target': 27},
  {'source': 21, 'undirected': True, 'target': 28},
  {'source': 28, 'undirected': True, 'target': 29},
  {'source': 28, 'undirected': True, 'target': 30},
  {'source': 21, 'undirected': True, 'target': 31},
  {'source': 31, 'undirected': True, 'target': 32},
  {'source': 31, 'un

## Extract interactions data

In [11]:
%load_ext autoreload 

In [12]:
%autoreload now

In [13]:
# Get the equivalences 
equivalences = get_kukleva_merged_interactions()

In [14]:
# Search interaction 
def get_interaction_class(summary, equivalences:dict): 
    """_summary_

    Args:
        interaction_list (_type_): _description_
    """

    # Loop through the elements
    for i,key in enumerate(equivalences.keys()): 
        # Check if the interaction matches 
        if summary==key:
            return i
    
    raise f"Could not find the interaction class for {summary}"

In [29]:
%autoreload now

In [30]:
# Initialize the object 
clip_interactions, not_found, stats = extract_utils.extract_interactions(dataset)

FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/maxone/Clips/tt1045658/scenes.txt'

In [17]:
# Create a dataframe containing the interaction stats
stats_pd = pd.DataFrame()
stats_pd["Interaction"] = stats.keys()
stats_pd["Count"] = stats.values()
stats_pd["Class"] = stats_pd.Interaction.apply(lambda x: get_interaction_class(x, equivalences))
# Display a sample 
stats_pd.head()

Unnamed: 0,Interaction,Count,Class
0,gives (to),266,36
1,photographs,23,88
2,smiles (at),61,75
3,helps,108,44
4,asks,2474,60


In [18]:
# Top 29 interactions 
stats_pd.sort_values(by="Count", ascending=False).head(20)

Unnamed: 0,Interaction,Count,Class
4,asks,2474,60
41,informs,1237,61
25,explains (to),1129,37
8,watches (something/someone/with),925,62
12,suggests/offers (to/something)/gives opinion,796,42
33,orders,726,39
48,answers (to),606,38
13,talks (to/with),532,63
11,compliments/seduces,463,43
16,greets,462,40


In [19]:
2476/sum(stats_pd["Count"])

0.1428077056177183

In [20]:
# Interaction stats
stats_pd["Count"].describe()

count     100.000000
mean      173.380000
std       323.052448
min         1.000000
25%        24.500000
50%        69.000000
75%       155.750000
max      2474.000000
Name: Count, dtype: float64

In [21]:
# Save to CSV 
stats_pd.to_csv("interactions_stats.csv",index=False)

In [22]:
print(f"Interaction count : {sum([len(value) for _,value in clip_interactions.items()])}")
print(f"Rejected interaction count: {sum(not_found.values())}")

Interaction count : 6184
Rejected interaction count: 1000


In [23]:
for element,occurrences in not_found.items(): 
    if occurrences>=2:
        print(f"Element: {element} -- Occurrences : {occurrences}")

Element: gives directions to -- Occurrences : 3
Element: points out to -- Occurrences : 2
Element: shakes head at -- Occurrences : 2
Element: plays music for -- Occurrences : 2
Element: kneels in front of -- Occurrences : 3
Element: blocks -- Occurrences : 2
Element: lifts -- Occurrences : 2
Element: attends party with -- Occurrences : 3
Element: prepares -- Occurrences : 2
Element: manipulates -- Occurrences : 2
Element: goes to -- Occurrences : 2
Element: expresses doubts -- Occurrences : 2
Element: worries for -- Occurrences : 4
Element: examines -- Occurrences : 3
Element: moves away from -- Occurrences : 2
Element: carresses -- Occurrences : 3
Element: hangs up on -- Occurrences : 2
Element: forces -- Occurrences : 2
Element: throws out -- Occurrences : 3
Element: expresses regret -- Occurrences : 4
Element: forgives -- Occurrences : 3
Element: escorts out -- Occurrences : 2
Element: undresses -- Occurrences : 3
Element: wakes -- Occurrences : 2
Element: takes away from -- Occurre

In [24]:
with open("not_found.txt", "w+") as file: 
    # Save the void interactions 
    file.write(str(not_found))

## Sample results (interactions) for a movie

Movie tt0037884, Clip 3

In [25]:
clip_interactions["tt0037884"][3]

[{'characters': defaultdict(set,
              {'towards': {'Don Birnam'},
               'performed_by': {'Helen St. James'}}),
  'summary': 'gives (to)',
  'start_time': 0,
  'end_time': 6,
  'frame_start': -1,
  'frame_end': -1,
  'image_files': []},
 {'characters': defaultdict(set,
              {'towards': {'Don Birnam'},
               'performed_by': {'Helen St. James'}}),
  'summary': 'kisses',
  'start_time': 17,
  'end_time': 20.5,
  'frame_start': -1,
  'frame_end': -1,
  'image_files': []},
 {'characters': defaultdict(set,
              {'towards': {'Don Birnam'},
               'performed_by': {'Helen St. James'}}),
  'summary': 'suggests/offers (to/something)/gives opinion',
  'start_time': 8,
  'end_time': 10.5,
  'frame_start': -1,
  'frame_end': -1,
  'image_files': []}]

In [26]:
# Print interactions of a specific movie
for interaction in clip_interactions["tt0037884"][3]:
    if "reason" in interaction.keys():
        print(f"Characters : {interaction['characters']}\n- Summary : {interaction['summary']} \
          \n- Time stamps : [{interaction['start_time']};{interaction['end_time']}] Reason : {interaction['reason']} \n")
    else:
        print(f"Characters : {interaction['characters']}\n- Summary : {interaction['summary']} \
          \n- Time stamps : [{interaction['start_time']};{interaction['end_time']}] \n")

Characters : defaultdict(<class 'set'>, {'towards': {'Don Birnam'}, 'performed_by': {'Helen St. James'}})
- Summary : gives (to)           
- Time stamps : [0;6] 

Characters : defaultdict(<class 'set'>, {'towards': {'Don Birnam'}, 'performed_by': {'Helen St. James'}})
- Summary : kisses           
- Time stamps : [17;20.5] 

Characters : defaultdict(<class 'set'>, {'towards': {'Don Birnam'}, 'performed_by': {'Helen St. James'}})
- Summary : suggests/offers (to/something)/gives opinion           
- Time stamps : [8;10.5] 



## Generate the scenes (clips), places and contexts insertion script

In [27]:
insert_utils.insert_scenes_places_contexts(dataset)

## Generate interactions insertion script

In [28]:
insert_utils.insert_interactions(clip_interactions)

# Subtitles processing

## Load the subtitle paths

In [29]:
# Intialize the object 
subtitle_paths = defaultdict(dict)
# Loop through the movies IDs 
for movie in dataset_movies:
    # Set the current folder name
    current_folder = f"../../../MovieGraphs_Data/Subtitles/clip_srt/{movie}/"

    # Check if the directory exists 
    if os.path.exists(current_folder):
        # Get all the filenames in the current folder (subtitle files)
        current_subtitile_files = os.listdir(current_folder)
        # Set the values for the current movie
        subtitle_paths[movie] = current_subtitile_files

## Sample results (subtitle paths) for a movie

In [30]:
# Display sample results 
subtitle_paths["tt0988595"][0:5]

['scene-106.ss-0531.es-0536_utf8.webvtt',
 'scene-194.ss-0955.es-0961_utf8.webvtt',
 'scene-046.ss-0199.es-0206_utf8.webvtt',
 'scene-014.ss-0059.es-0065_utf8.webvtt',
 'scene-055.ss-0250.es-0251_utf8.webvtt']

## Process the files

In [31]:
# Intialize the object 
dataset_speech = extract_utils.extract_subtitles_V2(subtitle_paths)

## Sample results (speeches) for a movie

Movie tt0988595, Scene 20

In [32]:
dataset_speech["tt0988595"][20]

[{'transcript': ['you in ?', 'yeah .'],
  'start_time': '00:00:00.000',
  'end_time': '00:00:00.048'},
 {'transcript': ['[ grunt ##s ]', '[ horn hon ##ks ]'],
  'start_time': '00:00:01.690',
  'end_time': '00:00:04.921'},
 {'transcript': ['hey ! hey ! you are down to 260 .'],
  'start_time': '00:00:06.294',
  'end_time': '00:00:10.253'},
 {'transcript': ['are you sure you wanna keep this up ?'],
  'start_time': '00:00:10.365',
  'end_time': '00:00:12.856'},
 {'transcript': ['no !', 'okay , then .'],
  'start_time': '00:00:12.968',
  'end_time': '00:00:15.562'}]

## Match interactions and speeches 

In [33]:
def convert_interaction_time_to_datetime(time:float): 
    """_summary_

    Args:
        time (float): _description_

    Returns:
        _type_: _description_
    """

    # Set the reference date
    reference_date = datetime.datetime(2023, 1, 1, 0, 0, 0)

    # Compute the time delta
    time_delta = datetime.timedelta(seconds=time)
    
    # Add the time delta to the reference date
    result_datetime = reference_date + time_delta

    # Return as time object
    return result_datetime.time()

In [34]:
def convert_speech_time_to_datetime(time:str):
    """_summary_

    Args:
        time (str): _description_

    Returns:
        _type_: _description_
    """

    # Set the time format
    time_format = "%H:%M:%S.%f"

    # Cast to datetime
    time = datetime.datetime.strptime(time, time_format)

    return time.time()

In [35]:
def get_overlap_duration(interaction_start, interaction_end, speech_start, speech_end): 
    """_summary_

    Args:
        interaction_start (_type_): _description_
        interaction_end (_type_): _description_
        speech_start (_type_): _description_
        speech_end (_type_): _description_
    """
    
    # Get the overlap start 
    overlap_start = max(interaction_start, speech_start)
    # Get the overlap 
    overlap_end = min(interaction_end, speech_end)

    # Calculate the total overlapping time
    overlap_seconds = (overlap_end.hour - overlap_start.hour) * 3600 + \
                        (overlap_end.minute - overlap_start.minute) * 60 + \
                        (overlap_end.second - overlap_start.second)
        
    #print(f"Total overlap time : {overlap_seconds} seconds.")

    return overlap_seconds

In [36]:
def get_duration(start, end): 
    """_summary_

    Args:
        start (_type_): _description_
        end (_type_): _description_
    """

    start = datetime.datetime.combine(datetime.date.today(), start)
    end = datetime.datetime.combine(datetime.date.today(), end)

    return end-start

In [37]:
def time_ranges_overlap(interaction_start, interaction_end, speech_start, speech_end): 
    """_summary_

    Args:
        interaction_start (_type_): _description_
        interaction_end (_type_): _description_
        speech_start (_type_): _description_
        speech_end (_type_): _description_

    Returns:
        _type_: _description_
    """

    
    #       Case 1 : 
    #     Interaction
    #   -----------------
    #     -------------
    #        Speech
    #
    #          OR 
    #
    #       Case 2 : 
    #     Interaction
    #   ---------------------
    #  ------------------------
    #        Speech
    # 
    if (interaction_start <= speech_start and interaction_end > speech_start) or \
        (interaction_start >= speech_start and interaction_start <= speech_end)  :
        # Get the overlap duration
        overlap_seconds = get_overlap_duration(interaction_start, interaction_end, speech_start, speech_end)
        # Check if the timestamps overlap for more than 0 second 
        # or the speech lasts for less than 1 second
        condition = abs(overlap_seconds) > 0 or get_duration(speech_start, speech_end).seconds==0
        
        return condition
    
    return False

In [38]:
# Loop through the speeches
for movie_id, clip in dataset_speech.items():
    # Loop through the speeches of this clip 
    for clip_id, speeches in clip.items():
        # Loop through the speeches 
        for speech in speeches:
            speech_start = convert_speech_time_to_datetime(speech["start_time"])              
            speech_end = convert_speech_time_to_datetime(speech["end_time"])
            # Initialize the interaction list
            speech["interaction"] = list()
            # Check 
            if clip_id in clip_interactions[movie_id].keys():
                # Loop trough this clip interactions
                for i,interaction in enumerate(clip_interactions[movie_id][clip_id]): 
                    # Check the validity of the timestamps
                    if interaction["start_time"] not in[None,-1]  and interaction["end_time"] not in [None,-1]:
                        # Convert the time to datetime 
                        interaction_start = convert_interaction_time_to_datetime(interaction["start_time"])
                        interaction_end =  convert_interaction_time_to_datetime(interaction["end_time"])
                        # Check if the speech belongs to this interaction 
                        #if speech_interval["start"]<=interaction_start and speech_interval["end"]>=interaction_stop  \
                        #or (speech_interval["start"]>=interaction_start and speech_interval["end"]<=interaction_stop) \
                        #or (speech_interval["start"]>=interaction_start and speech_interval["end"]<=interaction_stop) :
                        if time_ranges_overlap(interaction_start, interaction_end, speech_start, speech_end):
                            # Append to the list
                            speech["interaction"].append(i)

In [39]:
clip_interactions["tt0988595"][20]

[{'characters': defaultdict(set,
              {'performed_by': {'groomsman #1',
                'groomsman #2',
                'groomsman #3'},
               'towards': set()}),
  'summary': 'talks (to/with)',
  'start_time': 0,
  'end_time': 13.096418,
  'frame_start': -1,
  'frame_end': -1,
  'image_files': []},
 {'characters': defaultdict(set,
              {'performed_by': {'Casey'},
               'towards': {'groomsman #1', 'groomsman #2', 'groomsman #3'}}),
  'summary': 'watches (something/someone/with)',
  'start_time': 0,
  'end_time': 4,
  'frame_start': -1,
  'frame_end': -1,
  'image_files': [],
  'reason': "she's interested in them"},
 {'characters': defaultdict(set,
              {'performed_by': {'Jane'}, 'towards': {'Casey'}}),
  'summary': 'yells (at)',
  'start_time': 4,
  'end_time': 6.5,
  'frame_start': -1,
  'frame_end': -1,
  'image_files': [],
  'reason': 'for thinking about sex all the time'}]

In [40]:
dataset_speech["tt0988595"][20]

[{'transcript': ['you in ?', 'yeah .'],
  'start_time': '00:00:00.000',
  'end_time': '00:00:00.048',
  'interaction': [0, 1]},
 {'transcript': ['[ grunt ##s ]', '[ horn hon ##ks ]'],
  'start_time': '00:00:01.690',
  'end_time': '00:00:04.921',
  'interaction': [0, 1]},
 {'transcript': ['hey ! hey ! you are down to 260 .'],
  'start_time': '00:00:06.294',
  'end_time': '00:00:10.253',
  'interaction': [0]},
 {'transcript': ['are you sure you wanna keep this up ?'],
  'start_time': '00:00:10.365',
  'end_time': '00:00:12.856',
  'interaction': [0]},
 {'transcript': ['no !', 'okay , then .'],
  'start_time': '00:00:12.968',
  'end_time': '00:00:15.562',
  'interaction': [0]}]

In [41]:
A = convert_interaction_time_to_datetime(0)
B = convert_interaction_time_to_datetime(13.096418)

C = convert_speech_time_to_datetime("00:00:00.000")
D = convert_speech_time_to_datetime("00:00:00.048")

time_ranges_overlap(A,B,C,D)

True

In [42]:
get_duration(C,D)

datetime.timedelta(microseconds=48000)

In [43]:
get_overlap_duration(A,B,C,D)

0

In [44]:
speech_without_interactions = defaultdict(list)
count = 0

for movie_id, clips in dataset_speech.items(): 
    speech_without_interactions[movie_id] = defaultdict(int)
    for clip_id, speeches in clips.items():
        for speech in speeches:
            if speech["interaction"]==[]: 
                speech_without_interactions[movie_id][clip_id]+=1
                count+=1

In [45]:
count

67823

In [46]:
clip_interactions["tt0988595"][13]

[{'characters': defaultdict(set,
              {'towards': {'Jane'}, 'performed_by': {'Taxi Driver Khaleel'}}),
  'summary': 'watches (something/someone/with)',
  'start_time': 3.5,
  'end_time': 5.5,
  'frame_start': -1,
  'frame_end': -1,
  'image_files': []},
 {'characters': defaultdict(set,
              {'towards': {'Jane'}, 'performed_by': {'Taxi Driver Khaleel'}}),
  'summary': 'reassures',
  'start_time': 10,
  'end_time': 12.5,
  'frame_start': -1,
  'frame_end': -1,
  'image_files': []}]

In [47]:
dataset_speech["tt0988595"][13]

[{'transcript': ['to be joined in holy mat ##rim ##ony .'],
  'start_time': '00:00:00.000',
  'end_time': '00:00:00.064',
  'interaction': []},
 {'transcript': ['oh , wow .', 'sorry .'],
  'start_time': '00:00:00.172',
  'end_time': '00:00:03.141',
  'interaction': []},
 {'transcript': ['taxi !'],
  'start_time': '00:00:03.242',
  'end_time': '00:00:06.678',
  'interaction': [0]},
 {'transcript': ['great .'],
  'start_time': '00:00:06.779',
  'end_time': '00:00:10.112',
  'interaction': []},
 {'transcript': ['thanks . 31 water street . brooklyn .'],
  'start_time': '00:00:10.215',
  'end_time': '00:00:13.673',
  'interaction': [1]},
 {'transcript': ['okay . i will give you $ 300 flat . . .'],
  'start_time': '00:00:13.786',
  'end_time': '00:00:16.653',
  'interaction': []},
 {'transcript': ['for the whole night on one condition', 'yeah .'],
  'start_time': '00:00:16.755',
  'end_time': '00:00:18.723',
  'interaction': []},
 {'transcript': ["you don ' t look in the rear view mirror or 

# Emotions processing

## Extraction

In [48]:
# Initialize the emotions set
dataset_emotions = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the emotions for the current clip 
        dataset_emotions[movie][clip[0]] = extract_utils.extract_characters_and_emotions(clip[1].orig_graph_json)

## Sample results 

Movie tt0988595, Clip 2

In [49]:
dataset_emotions["tt0988595"][2]

defaultdict(dict,
            {'Jane': ['happy', 'responsible', 'understanding'],
             'wedding guests': [],
             'Hal': ['sad'],
             'Tess': ['needy']})

## Generate the emotions insertion script

In [50]:
insert_utils.insert_characters_and_emotions(dataset_emotions)

# Characters attributes processing

## Extraction

In [51]:
# Initialize the emotions set
dataset_attributes = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the attributes for the current clip 
        dataset_attributes[movie][clip[0]] = extract_utils.extract_characters_and_attributes(clip[1].orig_graph_json)

## Attributes persistance 

Ethnicity and gender 

In [52]:
def persist_attributes(dataset_attributes, movie_id, persistence_list): 
    """_summary_

    Args:
        dataset_attributes (_type_): _description_
        movie_id (_type_): _description_
        persistence_list (_type_): _description_
    """

    # Loop through the elements 
    for clip_id, elements in dataset_attributes[movie_id].items(): 
        # Loop through the character and attributes 
        for character, attributes in elements.items(): 
            # Check if there's a persistent attribute 
            if persistence_list[movie_id][character]!=[]: 
                # Loop through the persistence list 
                for element in persistence_list[movie_id][character]: 
                    # Get the key, value 
                    key = list(element.keys())[0]
                    value = list(element.values())[0]
                    # Update the dataset attribute
                    dataset_attributes[movie_id][clip_id][character][key] = value
    
    return dataset_attributes

In [53]:
# Initialize the persistence list 
persistence_list = defaultdict(dict)

# Loop through the movies and clips
for movie_id, clips in dataset_attributes.items(): 
    # Loop through the clip_ids and characters
    for clip_id, elements in clips.items(): 
        # Loop through the character and attributes 
        for character, attributes in elements.items(): 
            # Initialize the persistence list
            persistence_list[movie_id][character] = list()

In [54]:
# Loop through the movies and clips
for movie_id, clips in dataset_attributes.items(): 
    # Loop through the clip_ids and characters
    for clip_id, elements in clips.items(): 
        # Loop through the character and attributes 
        for character, attributes in elements.items(): 
            # Loop through the attributes
            for key,value in attributes.items(): 
                # Check if the attribute is persistent
                if key in ["gender","ethnicity"] and {key:value} not in persistence_list[movie_id][character]: 
                    # Update the persistence list 
                    persistence_list[movie_id][character].append({key:value})

# Persist the attributes
for movie_id in persistence_list:
   dataset_attributes = persist_attributes(dataset_attributes,movie_id,persistence_list)

## Sample results

Movie tt0988595, Clip 2

In [55]:
dataset_attributes["tt0988595"][2]

defaultdict(dict,
            {'Jane': {'age': 'kid',
              'gender': 'female',
              'ethnicity': 'caucasian'},
             'wedding guests': {},
             'Hal': {'gender': 'male',
              'age': 'adult',
              'ethnicity': 'caucasian'},
             'Tess': {'gender': 'female',
              'age': 'kid',
              'ethnicity': 'caucasian'}})

## Generate the characters attributes insertion scripts 

In [56]:
insert_utils.insert_characters_and_attributes(dataset_attributes)

# Extract the relationships

In [57]:
# Initialize the emotions set
dataset_relationships = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the relationships for the current clip 
        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)
        # Check if it's not null
        if relationships!=[]:
            # Append to the relationships dataset
            dataset_relationships[movie][clip[0]] = relationships

### Sample results

In [58]:
# Loop through the movies and clips
for movie_id, clips in dataset_relationships.items():
    # Loop through the clips relationships
    for clip_id, relationships in clips.items():
        # Display the relationships 
        print(relationships)
        break
    break

[{'type': 'family', 'subject': 'Hal', 'subject_role': 'parent', 'object': 'Tess', 'object_role': 'child'}, {'type': 'family', 'subject': 'Hal', 'subject_role': 'parent', 'object': 'Jane', 'object_role': 'child'}, {'type': 'family', 'subject': 'Jane', 'subject_role': 'other family', 'object': 'Tess', 'object_role': 'other family'}]


### Generate the relationships insertion script

In [59]:
insert_utils.insert_relationships(dataset_relationships)

# Convert to PyTorch/Tensorflow format : Interaction prediction

In [60]:
# Initialize the object 
nx_dataset = defaultdict(dict)

## From dict data to Nx graph (TF-GNN)

## Graph generation function

In [61]:
def generate_graph(graph):
    """_summary_
    """

    # Initialize the scene graph 
    scene_graph = nx.DiGraph()
    
    # Get the original json
    clip_graph = graph.orig_graph_json
    # Get the place
    place = re.sub(r'["\n]', '',clip_graph["scene"]).strip() if "scene" in clip_graph else None
    # Get the context 
    context = re.sub(r'["\n]', '',clip_graph["situation"]).strip() if "situation" in clip_graph else None

    # Create the Scene node
    scene_graph.add_node("Scene",type="Scene",color="#16a5a5")
        
    if place is not None: 
        scene_graph.add_node(place,type="Place",color="#4C8EDA")
        scene_graph.add_edge("Scene",place,type="location")

    if context is not None:
        scene_graph.add_node(context,type="Context",color="#4C8EDA")
        scene_graph.add_edge("Scene",context,type="circumstance")
    
    # Check if there are some characters within this scene 
    if clip_id in clip_characters[movie_id].keys():
        # Loop through the characters
        for character in clip_characters[movie_id][clip_id]:
            # Insert characters 
            scene_graph.add_node(character,type="character",color="#4C8EDA")
            # Insert the edges 
            scene_graph.add_edge("Scene",character,type="features")

        # Insert the characters attributes 
        if clip_id in dataset_attributes[movie_id].keys():
            # Loop through the characters 
            for character, attributes in dataset_attributes[movie_id][clip_id].items():
                # Check if the attributes are not empty
                if attributes is not {}:
                    # Loop through the attributes 
                    for key,value in attributes.items():
                        # Check if the attribute doesn't have the name of an existing node 
                        if value not in list(scene_graph.nodes) or (value in list(scene_graph.nodes) and scene_graph.nodes[value]["type"]=="attribute"):
                            # Insert the attribute 
                            scene_graph.add_node(value,name=key,type="attribute",color="#fb9e00")
                            # Insert the edge between the character and its attribute 
                            scene_graph.add_edge(character,value,type="possesses")
                        else:
                            # Insert the attribute 
                            scene_graph.add_node(value+":attribute",name=key,type="attribute",color="#fb9e00")
                            # Insert the edge between the character and its attribute 
                            scene_graph.add_edge(character,value+":attribute",type="possesses")
                
        # Insert the relationships between characters 
        if clip_id in dataset_relationships[movie_id].keys():
            # Initialize the counter 
            counter=0
            # Loop through the relationships
            for relationship in dataset_relationships[movie_id][clip_id]: 
                # Insert the relationship node
                scene_graph.add_node(relationship["type"]+":"+str(counter),type="Relationship",color="#4C8EDA")
                # Insert the roles 
                # Subject
                scene_graph.add_edge(relationship["subject"],relationship["type"]+":"+str(counter), type="linked_to", role=f"{relationship['subject_role']}")
                # Object 
                scene_graph.add_edge(relationship["object"], relationship["type"]+":"+str(counter), type="linked_to", role=f"{relationship['object_role']}")
                # Increase the counter 
                counter+=1
            
        # Insert the characters emotions 
        if clip_id in dataset_emotions[movie_id].keys():
            # Loop through the characters and emotions list
            for character, emotions in dataset_emotions[movie_id][clip_id].items():
                # Loop through the emotions of a specific character
                for emotion in emotions: 
                    # Check if the emotion doesn't have the name of an attribute 
                    if emotion not in list(scene_graph.nodes) or (emotion in list(scene_graph.nodes) and scene_graph.nodes[emotion]["type"]=="Emotion"):
                        # Insert the emotion 
                        scene_graph.add_node(emotion, type="Emotion", color="#4C8EDA")
                        # Insert the edge between the character and the emotion 
                        scene_graph.add_edge(character, emotion, type="expresses")
                    else: 
                        # Insert the emotion 
                        scene_graph.add_node(emotion+":emotion", type="Emotion", color="#4C8EDA")
                        # Insert the edge between the character and the emotion 
                        scene_graph.add_edge(character, emotion+":emotion", type="expresses")



    return scene_graph

## Interaction class extraction

In [62]:
# Search interaction 
def get_interaction_class(summary, interaction_list:dict): 
    """_summary_

    Args:
        interaction_list (_type_): _description_
    """

    # Loop through the elements
    for i,key in enumerate(interaction_list.keys()): 
        # Check if the interaction matches 
        if summary==key:
            return i
    
    raise f"Could not find the interaction class for {summary}"

In [63]:
interaction_list = get_kukleva_merged_interactions()

## Conversion

In [64]:
# Initialize the object 
nx_dataset = defaultdict(dict)

# Loop through the movies and clips 
for movie_id, clips in dataset.items():
    # Loop through the scenes_id and graphs
    for clip_id, graph in clips.clip_graphs.items():                 
        # Check if there are some interactions within this scene 
        if clip_id in clip_interactions[movie_id].keys(): 
            # Loop through the interactions 
            for interaction_id,interaction in enumerate(clip_interactions[movie_id][clip_id]):
                # Generate the scene graph
                scene_graph = generate_graph(graph)
                # Check if the summary is not none
                if "summary" in interaction.keys():
                    # Get the summary class 
                    summary_class = get_interaction_class(interaction["summary"],interaction_list)
                    # Insert the interaction
                    scene_graph.add_node(summary_class,
                                         type="Interaction",
                                         frame_start=interaction["frame_start"],
                                         frame_end=interaction["frame_end"],
                                         color="#4C8EDA")
                    scene_graph.add_edge("Scene",summary_class,type="has")
                    # Insert the clip representation 
                    scene_graph.add_node("Frames", type="Frames", files=interaction["image_files"], color="#4C8EDA")
                    # Add the edge between the interaction and the frames 
                    scene_graph.add_edge("Frames", summary_class, type="images")
                    # Insert the roles 
                    # Towards
                    if "towards" in interaction["characters"].keys():
                        # Loop through the characters 
                        for character in interaction["characters"]["towards"]:
                            # Insert the roles
                            scene_graph.add_edge(summary_class,character, type="involves", role="towards")

                    # Performed by
                    if "performed_by" in interaction["characters"].keys():
                        # Loop through the characters
                        for character in interaction["characters"]["performed_by"]:
                            # Insert the roles
                            scene_graph.add_edge(summary_class,character, type="involves", role="performed_by")

                # Insert the subtitles
                if clip_id in dataset_speech[movie_id].keys(): 
                    # Loop through the speeches and tokens list 
                    for speech_number, speech in enumerate(dataset_speech[movie_id][clip_id]):
                        # Check 
                        if interaction_id in speech["interaction"]: 
                            for line_number, line in enumerate(speech["transcript"]): 
                                #print(f"yes {movie_id} - {clip_id} - {i}", end="\r")
                                # Add the node 
                                scene_graph.add_node(f"Speech_{movie_id}_{interaction_id}_{speech_number}_{line_number}", type="Speech", color="#4C8EDA", transcript=line)
                                # Add an edge 
                                scene_graph.add_edge(summary_class, f"Speech_{movie_id}_{interaction_id}_{speech_number}_{line_number}", type="has_subs")

                #if "image_files" in interaction.keys() and interaction["image_files"]!=[]:
                # Set the clip graph for this interaction
                nx_dataset[movie_id+"_"+str(clip_id)+"_"+str(interaction_id)] = scene_graph

In [65]:
nx_dataset["tt0068646_100_0"].nodes

NodeView(('Scene', 'dining room', 'talk about work', 'Tom Hagen', 'Sonny Corleone', 'Clemenza', 'Michael Corleone', 'Tessio', 'male', 'caucasian', 'worried', 'scheming', 'nervous', 'frustrated', 23, 'Frames', 'Speech_tt0068646_0_0_0', 'Speech_tt0068646_0_1_0', 'Speech_tt0068646_0_2_0', 'Speech_tt0068646_0_3_0', 'Speech_tt0068646_0_4_0', 'Speech_tt0068646_0_4_1', 'Speech_tt0068646_0_4_2', 'Speech_tt0068646_0_4_3', 'Speech_tt0068646_0_5_0', 'Speech_tt0068646_0_6_0', 'Speech_tt0068646_0_6_1', 'Speech_tt0068646_0_7_0', 'Speech_tt0068646_0_8_0', 'Speech_tt0068646_0_9_0', 'Speech_tt0068646_0_10_0', 'Speech_tt0068646_0_11_0', 'Speech_tt0068646_0_12_0', 'Speech_tt0068646_0_13_0', 'Speech_tt0068646_0_14_0', 'Speech_tt0068646_0_14_1', 'Speech_tt0068646_0_15_0', 'Speech_tt0068646_0_16_0', 'Speech_tt0068646_0_17_0', 'Speech_tt0068646_0_18_0', 'Speech_tt0068646_0_19_0', 'Speech_tt0068646_0_20_0'))

### Save the nx dataset

In [66]:
len(nx_dataset)

17338

In [67]:
# Create the file if it doesn't exist 
with open("nx_dataset_V3.pkl", "wb") as file: 
    # Save the dictionary 
    pickle.dump(nx_dataset,file)
    # Print a success message £
    print("Nx dataset saved successfully.")

Nx dataset saved successfully.


## Display sample results
Movie : tt0988595, Scene : 2, Interaction 1

In [68]:
A = nx_dataset["tt0988595_4_1"]
node_colors = [A.nodes[node]["color"] for node in A.nodes]

In [69]:
A.nodes

NodeView(('Scene', 'church', 'wedding', 'Jane', 'Father', 'UNLISTED CHARACTER', 'Tess', 'Cousin Lisa', 'kid', 'female', 'caucasian', 'male', 'adult', 'family:0', 'happy', 'helpful', 'proud', 'relieved', 44, 'Frames', 'Speech_tt0988595_1_3_0', 'Speech_tt0988595_1_3_1', 'Speech_tt0988595_1_4_0', 'Speech_tt0988595_1_5_0'))

In [70]:
#plt.figure(figsize=(30,14))

#nx.draw_networkx(A, node_size=800, node_color=node_colors, pos=nx.circular_layout(A), font_size=14)
#edge_labels = nx.draw_networkx_edge_labels(A, pos=nx.circular_layout(A), label_pos=0.4, font_size=10, clip_on=False)

#plt.show()

# Convert to PyTorch/Tensorflow format : Relaltionship prediction

In [71]:
# Initialize the object 
nx_dataset = defaultdict(dict)

## From dict data to Nx graph (TF-GNN)

## Graph generation function

In [154]:
def generate_graph_2(graph):
    """_summary_
    """

    # Initialize the scene graph 
    scene_graph = nx.DiGraph()
    
    # Get the original json
    clip_graph = graph.orig_graph_json
    # Get the place
    place = re.sub(r'["\n]', '',clip_graph["scene"]).strip() if "scene" in clip_graph else None
    # Get the context 
    context = re.sub(r'["\n]', '',clip_graph["situation"]).strip() if "situation" in clip_graph else None

    # Create the Scene node
    scene_graph.add_node("Scene",type="Scene",color="#16a5a5")
        
    if place is not None: 
        scene_graph.add_node(place,type="Place",color="#4C8EDA")
        scene_graph.add_edge("Scene",place,type="location")

    if context is not None:
        scene_graph.add_node(context,type="Context",color="#4C8EDA")
        scene_graph.add_edge("Scene",context,type="circumstance")
    
    # Check if there are some characters within this scene 
    if clip_id in clip_characters[movie_id].keys():
        # Loop through the characters
        for character in clip_characters[movie_id][clip_id]:
            # Insert characters 
            scene_graph.add_node(character,type="character",color="#4C8EDA")
            # Insert the edges 
            scene_graph.add_edge("Scene",character,type="features")

        # Insert the characters attributes 
        if clip_id in dataset_attributes[movie_id].keys():
            # Loop through the characters 
            for character, attributes in dataset_attributes[movie_id][clip_id].items():
                # Check if the attributes are not empty
                if attributes is not {}:
                    # Loop through the attributes 
                    for key,value in attributes.items():
                        # Check if the attribute doesn't have the name of an existing node 
                        if value not in list(scene_graph.nodes) or (value in list(scene_graph.nodes) and scene_graph.nodes[value]["type"]=="attribute"):
                            # Insert the attribute 
                            scene_graph.add_node(value,name=key,type="attribute",color="#fb9e00")
                            # Insert the edge between the character and its attribute 
                            scene_graph.add_edge(character,value,type="possesses")
                        else:
                            # Insert the attribute 
                            scene_graph.add_node(value+":attribute",name=key,type="attribute",color="#fb9e00")
                            # Insert the edge between the character and its attribute 
                            scene_graph.add_edge(character,value+":attribute",type="possesses")
            
        # Insert the characters emotions 
        if clip_id in dataset_emotions[movie_id].keys():
            # Loop through the characters and emotions list
            for character, emotions in dataset_emotions[movie_id][clip_id].items():
                # Loop through the emotions of a specific character
                for emotion in emotions: 
                    # Check if the emotion doesn't have the name of an attribute 
                    if emotion not in list(scene_graph.nodes) or (emotion in list(scene_graph.nodes) and scene_graph.nodes[emotion]["type"]=="Emotion"):
                        # Insert the emotion 
                        scene_graph.add_node(emotion, type="Emotion", color="#4C8EDA")
                        # Insert the edge between the character and the emotion 
                        scene_graph.add_edge(character, emotion, type="expresses")
                    else: 
                        # Insert the emotion 
                        scene_graph.add_node(emotion+":emotion", type="Emotion", color="#4C8EDA")
                        # Insert the edge between the character and the emotion 
                        scene_graph.add_edge(character, emotion+":emotion", type="expresses")
    
    # Check if there's an interaction within this scene
    if clip_id in clip_interactions[movie_id].keys(): 
    # Loop through the interactions 
        for i,interaction in enumerate(clip_interactions[movie_id][clip_id]):
            # Check if the summary is not none
            if "summary" in interaction.keys():
                summary_class = interaction["summary"]
                # Insert the interaction
                scene_graph.add_node(summary_class,
                                     type="Interaction",
                                     frame_start=interaction["frame_start"], 
                                     frame_end = interaction["frame_end"],
                                     color="#4C8EDA")
                scene_graph.add_edge("Scene",summary_class,type="has")
                # Insert the roles 
                # Towards
                if "towards" in interaction["characters"].keys():
                # Loop through the characters 
                    for character in interaction["characters"]["towards"]:
                        # Insert the roles
                        scene_graph.add_edge(summary_class,character, type="involves", role="towards")
                # Performed by
                if "performed_by" in interaction["characters"].keys():
                    # Loop through the characters
                    for character in interaction["characters"]["performed_by"]:
                        # Insert the roles
                        scene_graph.add_edge(summary_class,character, type="involves", role="performed_by")
            
            if clip_id in dataset_speech[movie_id].keys(): 
                    # Loop through the speeches and tokens list 
                    for speech_number, speech in enumerate(dataset_speech[movie_id][clip_id]):
                        # Check 
                        if i in speech["interaction"]: 
                            for line_number, line in enumerate(speech["transcript"]): 
                                #print(f"yes {movie_id} - {clip_id} - {i}", end="\r")
                                # Add the node 
                                scene_graph.add_node(f"Speech_{movie_id}_{i}_{speech_number}_{line_number}", type="Speech", color="#4C8EDA", transcript=line)
                                # Add an edge 
                                scene_graph.add_edge(summary_class, f"Speech_{movie_id}_{i}_{speech_number}_{line_number}", type="has_subs")
                
                

    return scene_graph

## Conversion

In [155]:
# Search relationship
def get_relationship_class(relationship_class, relationship_list:dict): 
    """_summary_

    Args:
        interaction_list (_type_): _description_
    """

    # Loop through the elements
    for i, (_, values) in enumerate(relationship_list.items()): 
        # Check if the interaction matches 
        if relationship_class in values:
            return i
    
    raise NameError(f"Could not find the relationship class for {relationship_class}")

In [156]:
%autoreload now
relationship_list = get_kukleva_merged_relationships()

In [168]:
# Initialize the emotions set
dataset_relationships = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the relationships for the current clip 
        relationships = extract_utils.extract_kukleva_relationships(clip[1].orig_graph_json)
        # Check if it's not null
        if relationships!=[]:
            # Append to the relationships dataset
            dataset_relationships[movie][clip[0]] = relationships

In [169]:
nx_dataset = defaultdict()
# Count the types of relationships
relationship_counts = defaultdict(int)

# Loop through the movies and clips 
for movie_id, clips in dataset.items():
    # Loop through the scenes_id and graphs
    for clip_id, graph in clips.clip_graphs.items():                 
        # Check if there are some interactions within this scene 
        if clip_id in dataset_relationships[movie_id].keys(): 
            # Generate the scene graph
            scene_graph = generate_graph_2(graph)
            # Loop through the relationships
            # for relationship in dataset_relationships[movie_id][clip_id]: 
            for i,relationship in enumerate(dataset_relationships[movie_id][clip_id]):
                # Get the relationship class 
                relationship_class = get_relationship_class(relationship["class"], relationship_list)
                # Update the count of relationships for this class
                relationship_counts[list(relationship_list.keys())[relationship_class]]+=1
                # Insert the relationship node
                scene_graph.add_node(relationship_class,type="Relationship", id=f"{movie_id}_{clip_id}_{i}", color="#4C8EDA")
                # Insert the roles 
                # Subject
                scene_graph.add_edge(relationship["subject"],relationship_class, type="linked_to")
                # Object 
                scene_graph.add_edge(relationship_class, relationship["object"], type="linked_to")
            if scene_graph is not None:
                # Set the clip graph for this relationship
                nx_dataset[movie_id+"_"+str(clip_id)] = scene_graph

In [174]:
#nx_dataset["tt0988595_2"].nodes(data=True)

In [175]:
list(dataset_relationships.items())[0]

('tt0988595',
 {2: [{'class': 'parent', 'subject': 'Hal', 'object': 'Tess'},
   {'class': 'parent', 'subject': 'Hal', 'object': 'Jane'},
   {'class': 'sibling', 'subject': 'Jane', 'object': 'Tess'}],
  3: [{'class': 'cousin', 'subject': 'Cousin Lisa', 'object': 'Jane'},
   {'class': 'cousin', 'subject': 'Cousin Lisa', 'object': 'Tess'}],
  4: [{'class': 'parent', 'subject': 'Father', 'object': 'Cousin Lisa'}],
  5: [{'class': 'customer',
    'subject': 'Jane',
    'object': 'Bridal Salesgirl #2'},
   {'class': 'customer', 'subject': 'Jane', 'object': 'Bridal Salesgirl #1'}],
  6: [{'class': 'friend', 'subject': 'Jane', 'object': 'Bride Suzanne'}],
  8: [{'class': 'friend', 'subject': 'Jane', 'object': 'Casey'}],
  10: [{'class': 'friend', 'subject': 'Bride Suzanne', 'object': 'Casey'},
   {'class': 'customer',
    'subject': 'Bride Suzanne',
    'object': 'photographer'}],
  11: [{'class': 'stranger', 'subject': 'Jane', 'object': 'Kevin'}],
  12: [{'class': 'customer',
    'subject': '

In [176]:
# Create the file if it doesn't exist 
with open("nx_dataset_relationships_multiple.pkl", "wb") as file: 
    # Save the dictionary 
    pickle.dump(nx_dataset,file)
    # Print a success message 
    print("Nx dataset saved successfully.")

Nx dataset saved successfully.


In [177]:
len(nx_dataset)

1537

In [78]:
import numpy as np

In [79]:
print(len(relationship_counts))
print(relationship_counts)
print(np.mean(list(relationship_counts.values())))

15
defaultdict(<class 'int'>, {'parent': 148, 'sibling': 58, 'customer': 89, 'friend': 203, 'stranger': 629, 'lover': 234, 'colleague': 250, 'boss/owner': 130, 'kbr': 121, 'acquaintance': 141, 'enemy': 85, 'worker': 64, 'manager': 76, 'child': 52, 'ex-lover': 18})
153.2


| Relationship | Count |
|---|---|
| Parent | 148 |
| Sibling | 58 |
| Customer | 89 |
| Friend | 203 |
| Stranger | 629 |
| Lover | 234 |
| Colleague | 250 |
| Boss/Owner | 130 |
| KBR | 121 |
| Acquaintance | 141 |
| Enemy | 85 |
| Worker | 64 |
| Manager | 76 |
| Child | 52 |
| Ex-Lover | 18 |


Distributions of relationships. 

Augmenting data for the classes : 
- Work * 2
- Romantic * 2 
- Dependecy/caretaking *2 
- Hostile * 5

# Generate the oversampled relationships dataset

In [80]:
[i for i in range(2)]

[0, 1]

In [81]:
# Initialize the emotions set
dataset_relationships = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the relationships for the current clip 
        relationships = extract_utils.extract_kukleva_relationships(clip[1].orig_graph_json)
        # Check if it's not null
        if relationships!=[]:
            # Append to the relationships dataset
            dataset_relationships[movie][clip[0]] = relationships

In [82]:
nx_dataset = defaultdict()
# Initialize the list of samples to duplicate
# If not enough samples (< mean) are provided and the class
# doesnt belong to the list of classes of interest,
#  we duplicate less samples
relationships_left = defaultdict(int)
relationships_left["lover"] = 3
relationships_left["friend"] = 2
relationships_left["boss/owner"] = 2, 
relationships_left["manager"] = 4
relationships_left["enemy"] = 4
relationships_left["worker"] = 4
relationships_left["customer"] = 4
relationships_left["colleague"] = 2

# Loop through the movies and clips 
for movie_id, clips in dataset.items():
    # Loop through the scenes_id and graphs
    for clip_id, graph in clips.clip_graphs.items():                 
        # Check if there are some interactions within this scene 
        if clip_id in dataset_relationships[movie_id].keys(): 
            # Loop through the relationships
            #for relationship in dataset_relationships[movie_id][clip_id]: 
            for i,relationship in enumerate(dataset_relationships[movie_id][clip_id]):
                # Get the relationship class 
                relationship_class = get_relationship_class(relationship["class"], relationship_list)
                # Check if it's in the list of classes to resample
                if list(relationship_list.keys())[relationship_class] in relationships_left:
                    # Add n times 
                    for j in range(relationships_left[relationship["class"]]+1):
                        # Generate the scene graph
                        scene_graph = generate_graph_2(graph)
                        # Insert the relationship node
                        scene_graph.add_node(relationship_class,type="Relationship", id=f"{movie_id}_{clip_id}_{i}", color="#4C8EDA")
                        # Insert the roles 
                        # Subject
                        scene_graph.add_edge(relationship["subject"],relationship_class, type="linked_to")
                        # Object 
                        scene_graph.add_edge(relationship_class, relationship["object"], type="linked_to")

                        # Set the clip graph for this relationship
                        nx_dataset[movie_id+"_"+str(clip_id)+"_"+str(i)+str(j)] = scene_graph
                else:
                    # Loop through the list of relationships to append
                    # Generate the scene graph
                    scene_graph = generate_graph_2(graph)
                    # Insert the relationship node
                    scene_graph.add_node(relationship_class,type="Relationship", id=f"{movie_id}_{clip_id}_{i}", color="#4C8EDA")
                    # Insert the roles 
                    # Subject
                    scene_graph.add_edge(relationship["subject"],relationship_class, type="linked_to")
                    # Object 
                    scene_graph.add_edge(relationship_class, relationship["object"], type="linked_to")

                    # Set the clip graph for this relationship
                    nx_dataset[movie_id+"_"+str(clip_id)+"_"+str(i)] = scene_graph

In [83]:
len(nx_dataset)

3821

In [84]:
# Create the file if it doesn't exist 
with open("nx_dataset_relationships_oversampled.pkl", "wb") as file: 
    # Save the dictionary 
    pickle.dump(nx_dataset,file)
    # Print a success message 
    print("Nx dataset saved successfully.")

Nx dataset saved successfully.


# Objectification graphs

## Load the files

In [85]:
annotations = pd.read_csv("ObyGaze12_thresh_02.csv", delimiter=";")

In [86]:
# Remove NaN rows 
annotations.dropna(how="all", inplace=True)
# Remove the nan graph IDs 
annotations.dropna(subset=["id"], inplace=True)
# Add the splits 
annotations["split"] = ["val" for i in range(len(annotations))]
annotations.head(5)
# Display results

Unnamed: 0,idx,util,clip,label,overlap_ratio,concepts,id,movie,srt_name,video_name,graph_number,split
1,0.0,1.0,tt0108160scene-001.ss-0001.es-0001,Easy Neg,1.0,[''],tt0108160-001,tt0108160,scene-001.ss-0001.es-0001.srt,tt0108160_scene_1.avi,0.0,val
2,1.0,0.0,tt0108160scene-002.ss-0002.es-0002,Easy Neg,1.0,[''],tt0108160-002,tt0108160,scene-002.ss-0002.es-0002.srt,,-1.0,val
3,2.0,1.0,tt0108160scene-003.ss-0003.es-0006,Not Sure,1.0,['Activities'],tt0108160-003,tt0108160,scene-003.ss-0003.es-0006.srt,tt0108160_scene_3.avi,2.0,val
4,3.0,1.0,tt0108160scene-004.ss-0007.es-0017,Easy Neg,0.85,[''],tt0108160-004,tt0108160,scene-004.ss-0007.es-0017.srt,tt0108160_scene_4.avi,3.0,val
5,4.0,1.0,tt0108160scene-005.ss-0018.es-0018,Easy Neg,0.85,[''],tt0108160-005,tt0108160,scene-005.ss-0018.es-0018.srt,tt0108160_scene_5.avi,4.0,val


In [87]:
# Display the categories
annotations.label.value_counts()

Hard Neg    711
Easy Neg    453
Not Sure    397
Sure        353
Name: label, dtype: int64

In [88]:
# Initialize the training ratio
train_ratio = 0.6
# Get the training ratio for each label
hard_neg = round(annotations.label.value_counts()["Hard Neg"] * train_ratio)
not_sure = round(annotations.label.value_counts()["Not Sure"] * train_ratio)
easy_neg = round(annotations.label.value_counts()["Easy Neg"] * train_ratio)
sure = round(annotations.label.value_counts()["Sure"] * train_ratio)

# Assign the values
samples_left = {"Hard Neg": hard_neg, "Not Sure": not_sure, "Easy Neg": easy_neg, "Sure": sure}

samples_left

{'Hard Neg': 427, 'Not Sure': 238, 'Easy Neg': 272, 'Sure': 212}

In [89]:
# shuffle the dataset 
annotations = annotations.sample(random_state=123, frac=1)
annotations.head()

Unnamed: 0,idx,util,clip,label,overlap_ratio,concepts,id,movie,srt_name,video_name,graph_number,split
817,816.0,1.0,tt0822832scene-066.ss-0983.es-0987,Sure,0.88,['Speech'],tt0822832-066,tt0822832,scene-066.ss-0983.es-0987.srt,tt0822832_scene_66.mp4,65.0,val
1519,1518.0,1.0,tt1454029scene-121.ss-1022.es-1025,Easy Neg,0.84,[''],tt1454029-121,tt1454029,scene-121.ss-1022.es-1025.srt,tt1454029_scene_121.mp4,120.0,val
143,142.0,1.0,tt0110912scene-033.ss-0268.es-0273,Sure,0.22,"['Voice', ' Appearance', ' Clothes', ' Look']",tt0110912-033,tt0110912,scene-033.ss-0268.es-0273.srt,tt0110912_scene_33.mkv,32.0,val
1130,1129.0,1.0,tt1142988scene-090.ss-0763.es-0770,Sure,1.0,"['Body', ' Clothes', ' Look']",tt1142988-090,tt1142988,scene-090.ss-0763.es-0770.srt,tt1142988_scene_90.mkv,89.0,val
835,834.0,0.0,tt0822832scene-084.ss-1213.es-1215,Hard Neg,1.0,['Activities'],tt0822832-084,tt0822832,scene-084.ss-1213.es-1215.srt,,-1.0,val


In [90]:
annotations.loc[2,"label"]

'Easy Neg'

In [91]:
# Loop through the annotations
for i in range(1,len(annotations)): 
    # Get the current annotation 
    annotation = annotations.loc[i]
    # Get the label 
    label = annotation["label"]
    # Check if there's still some train data left
    if label in samples_left and samples_left[label]>0:
        # Assign to the training samples
        annotations.at[i,"split"] = "train"
        # Decrement the samples left for this label
        samples_left[label] = samples_left[label] - 1

In [92]:
samples_left

{'Hard Neg': 0, 'Not Sure': 0, 'Easy Neg': 0, 'Sure': 0}

In [93]:
annotations["split"].value_counts()

train    1149
val       765
Name: split, dtype: int64

## Perform data processing on the annotation files

In [94]:
def process_label(label: str):
    if label=="Hard Neg":
        return 0
    elif label=="Easy Neg":
        return 1
    elif label=="Not Sure":
        return 2
    else: 
        return 3

In [95]:
annotations["label"] = annotations.label.apply(lambda label: process_label(label))
annotations.head(3)

Unnamed: 0,idx,util,clip,label,overlap_ratio,concepts,id,movie,srt_name,video_name,graph_number,split
817,816.0,1.0,tt0822832scene-066.ss-0983.es-0987,3,0.88,['Speech'],tt0822832-066,tt0822832,scene-066.ss-0983.es-0987.srt,tt0822832_scene_66.mp4,65.0,train
1519,1518.0,1.0,tt1454029scene-121.ss-1022.es-1025,1,0.84,[''],tt1454029-121,tt1454029,scene-121.ss-1022.es-1025.srt,tt1454029_scene_121.mp4,120.0,val
143,142.0,1.0,tt0110912scene-033.ss-0268.es-0273,3,0.22,"['Voice', ' Appearance', ' Clothes', ' Look']",tt0110912-033,tt0110912,scene-033.ss-0268.es-0273.srt,tt0110912_scene_33.mkv,32.0,train


In [96]:
#annotations["graph_number"] = annotations.graph_number.apply(lambda x: str(x))
annotations["graph_id"] = annotations["id"].apply(lambda x: x.split("-")[0]+"_"+str(int(x.split("-")[1])))
annotations.head(100)

Unnamed: 0,idx,util,clip,label,overlap_ratio,concepts,id,movie,srt_name,video_name,graph_number,split,graph_id
817,816.0,1.0,tt0822832scene-066.ss-0983.es-0987,3,0.88,['Speech'],tt0822832-066,tt0822832,scene-066.ss-0983.es-0987.srt,tt0822832_scene_66.mp4,65.0,train,tt0822832_66
1519,1518.0,1.0,tt1454029scene-121.ss-1022.es-1025,1,0.84,[''],tt1454029-121,tt1454029,scene-121.ss-1022.es-1025.srt,tt1454029_scene_121.mp4,120.0,val,tt1454029_121
143,142.0,1.0,tt0110912scene-033.ss-0268.es-0273,3,0.22,"['Voice', ' Appearance', ' Clothes', ' Look']",tt0110912-033,tt0110912,scene-033.ss-0268.es-0273.srt,tt0110912_scene_33.mkv,32.0,train,tt0110912_33
1130,1129.0,1.0,tt1142988scene-090.ss-0763.es-0770,3,1.00,"['Body', ' Clothes', ' Look']",tt1142988-090,tt1142988,scene-090.ss-0763.es-0770.srt,tt1142988_scene_90.mkv,89.0,train,tt1142988_90
835,834.0,0.0,tt0822832scene-084.ss-1213.es-1215,0,1.00,['Activities'],tt0822832-084,tt0822832,scene-084.ss-1213.es-1215.srt,,-1.0,train,tt0822832_84
...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,304.0,1.0,tt0119822scene-063.ss-0375.es-0394,1,1.00,[''],tt0119822-063,tt0119822,scene-063.ss-0375.es-0394.srt,tt0119822_scene_63.avi,62.0,train,tt0119822_63
1845,1844.0,1.0,tt2267998scene-153.ss-1852.es-1853,1,1.00,[''],tt2267998-153,tt2267998,scene-153.ss-1852.es-1853.srt,tt2267998_scene_153.mp4,152.0,val,tt2267998_153
1001,1000.0,1.0,tt1045658scene-130.ss-1272.es-1291,0,1.00,"['Speech', ' Look']",tt1045658-130,tt1045658,scene-130.ss-1272.es-1291.srt,tt1045658_scene_130.mkv,129.0,train,tt1045658_130
1252,1251.0,1.0,tt1193138scene-077.ss-0602.es-0607,0,0.88,['Speech'],tt1193138-077,tt1193138,scene-077.ss-0602.es-0607.srt,tt1193138_scene_77.mkv,76.0,val,tt1193138_77


In [97]:
annotations.label.value_counts()

0    711
1    453
2    397
3    353
Name: label, dtype: int64

In [98]:
def generate_graph_3(graph, label: str):
    """_summary_
    """

    # Initialize the scene graph 
    scene_graph = nx.DiGraph()
    # Set the objectification tag for this graph 
    #scene_graph.__setattr__
    
    # Get the original json
    clip_graph = graph.orig_graph_json
    # Get the place
    place = re.sub(r'["\n]', '',clip_graph["scene"]).strip() if "scene" in clip_graph else None
    # Get the context 
    context = re.sub(r'["\n]', '',clip_graph["situation"]).strip() if "situation" in clip_graph else None

    # Create the Scene node
    scene_graph.add_node("Scene",type="Scene",color="#16a5a5", objectification=label)
        
    if place is not None: 
        scene_graph.add_node(place, type="Place",color="#4C8EDA")
        scene_graph.add_edge("Scene", place,type="location")

    if context is not None:
        scene_graph.add_node(context,type="Context",color="#4C8EDA")
        scene_graph.add_edge("Scene",context,type="circumstance")
    
    # Check if there are some characters within this scene 
    if clip_id in clip_characters[movie_id].keys():
        # Loop through the characters
        for character in clip_characters[movie_id][clip_id]:
            # Insert characters 
            scene_graph.add_node(character,type="character",color="#4C8EDA")
            # Insert the edges 
            scene_graph.add_edge("Scene",character,type="features")

        # Insert the characters attributes 
        if clip_id in dataset_attributes[movie_id].keys():
            # Loop through the characters 
            for character, attributes in dataset_attributes[movie_id][clip_id].items():
                # Check if the attributes are not empty
                if attributes is not {}:
                    # Loop through the attributes 
                    for key,value in attributes.items():
                        # Check if the attribute doesn't have the name of an existing node 
                        if value not in list(scene_graph.nodes) or (value in list(scene_graph.nodes) and scene_graph.nodes[value]["type"]=="attribute"):
                            # Insert the attribute 
                            scene_graph.add_node(value,name=key,type="attribute",color="#fb9e00")
                            # Insert the edge between the character and its attribute 
                            scene_graph.add_edge(character,value,type="possesses")
                        else:
                            # Insert the attribute 
                            scene_graph.add_node(value+":attribute",name=key,type="attribute",color="#fb9e00")
                            # Insert the edge between the character and its attribute 
                            scene_graph.add_edge(character,value+":attribute",type="possesses")
            
        # Insert the characters emotions 
        if clip_id in dataset_emotions[movie_id].keys():
            # Loop through the characters and emotions list
            for character, emotions in dataset_emotions[movie_id][clip_id].items():
                # Loop through the emotions of a specific character
                for emotion in emotions: 
                    # Check if the emotion doesn't have the name of an attribute 
                    if emotion not in list(scene_graph.nodes) or (emotion in list(scene_graph.nodes) and scene_graph.nodes[emotion]["type"]=="Emotion"):
                        # Insert the emotion 
                        scene_graph.add_node(emotion, type="Emotion", color="#4C8EDA")
                        # Insert the edge between the character and the emotion 
                        scene_graph.add_edge(character, emotion, type="expresses")
                    else: 
                        # Insert the emotion 
                        scene_graph.add_node(emotion+":emotion", type="Emotion", color="#4C8EDA")
                        # Insert the edge between the character and the emotion 
                        scene_graph.add_edge(character, emotion+":emotion", type="expresses")
                        
        # Insert the relationships between characters 
        if clip_id in dataset_relationships[movie_id].keys():
            # Initialize the counter 
            counter=0
            # Loop through the relationships
            for relationship in dataset_relationships[movie_id][clip_id]: 
                # Insert the relationship node
                scene_graph.add_node(relationship["type"]+":"+str(counter),type="Relationship",color="#4C8EDA")
                # Insert the roles 
                # Subject
                scene_graph.add_edge(relationship["subject"],relationship["type"]+":"+str(counter), type="linked_to", role=f"{relationship['subject_role']}")
                # Object 
                scene_graph.add_edge(relationship["object"], relationship["type"]+":"+str(counter), type="linked_to", role=f"{relationship['object_role']}")
                # Increase the counter 
                counter+=1
    
    # Check if there's an interaction within this scene
    if clip_id in clip_interactions[movie_id].keys(): 
    # Loop through the interactions 
        for i,interaction in enumerate(clip_interactions[movie_id][clip_id]):
            # Check if the summary is not none
            if "summary" in interaction.keys():
                summary_class = interaction["summary"]
                # Insert the interaction
                scene_graph.add_node(summary_class,
                                     type="Interaction",
                                     frame_start=interaction["frame_start"], 
                                     frame_end = interaction["frame_end"],
                                     color="#4C8EDA")
                scene_graph.add_edge("Scene",summary_class,type="has")
                # Insert the roles 
                # Towards
                if "towards" in interaction["characters"].keys():
                # Loop through the characters 
                    for character in interaction["characters"]["towards"]:
                        # Insert the roles
                        scene_graph.add_edge(summary_class,character, type="involves", role="towards")
                # Performed by
                if "performed_by" in interaction["characters"].keys():
                    # Loop through the characters
                    for character in interaction["characters"]["performed_by"]:
                        # Insert the roles
                        scene_graph.add_edge(summary_class,character, type="involves", role="performed_by")
            
            if clip_id in dataset_speech[movie_id].keys(): 
                    # Loop through the speeches and tokens list 
                    for speech_number, speech in enumerate(dataset_speech[movie_id][clip_id]):
                        # Check 
                        if i in speech["interaction"]: 
                            for line_number, line in enumerate(speech["transcript"]): 
                                #print(f"yes {movie_id} - {clip_id} - {i}", end="\r")
                                # Add the node 
                                scene_graph.add_node(f"Speech_{movie_id}_{i}_{speech_number}_{line_number}", type="Speech", color="#4C8EDA", transcript=line)
                                # Add an edge 
                                scene_graph.add_edge(summary_class, f"Speech_{movie_id}_{i}_{speech_number}_{line_number}", type="has_subs")
                          
    return scene_graph

In [99]:
def create_dataset(set, name):
    """_summary_
    """

    # Initialize the object 
    nx_dataset = defaultdict(dict)

    
    
        # Loop through the movies and clips 
    for movie_id, clips in dataset.items():
        # Loop through the scenes_id and graphs
        for clip_id, graph in clips.clip_graphs.items():
            if f"{movie_id}_{clip_id}" in set["graph_id"].values:
                # Get the objectification label for this scene
                label = set[set["graph_id"]==f"{movie_id}_{clip_id}"]["label"].values[0]
                # Generate the scene graph
                scene_graph = generate_graph_3(graph, label)
                # Set the clip graph for this relationship
                nx_dataset[movie_id+"_"+str(clip_id)] = scene_graph

    # Create the file if it doesn't exist 
    with open(f"nx_dataset_objectification_{name}_V2.pkl", "wb") as file: 
        # Save the dictionary 
        pickle.dump(nx_dataset,file)
        # Print a success message 
        print("Nx dataset saved successfully.")

In [100]:
train_set = annotations[annotations["split"]=="train"]
val_set = annotations[annotations["split"]=="val"]

In [101]:
annotations.to_csv("all_objectification.csv", index=False)

In [102]:
val_set.to_csv("raw_objectification.csv", index=False)

In [103]:
train_set["graph_id"].values

array(['tt0822832_66', 'tt0110912_33', 'tt1142988_90', ..., 'tt1142988_2',
       'tt1142988_83', 'tt1193138_215'], dtype=object)

In [104]:
"tt0822832_0000" in train_set["graph_id"].values

False

In [105]:
# Initialize the object 
nx_dataset = defaultdict(dict)
# Get the original relationships dataset
# Initialize the emotions set
dataset_relationships = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the relationships for the current clip 
        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)
        # Check if it's not null
        if relationships!=[]:
            # Append to the relationships dataset
            dataset_relationships[movie][clip[0]] = relationships

# Generate the training set
create_dataset(train_set, "train")
# Generate the validation set
create_dataset(val_set, "val")

Nx dataset saved successfully.
Nx dataset saved successfully.


In [106]:
len(val_set)

765

### Generate the pos/neg dataset

In [107]:
annotations2 = pd.read_csv("ObyGaze12_thresh_02.csv", delimiter=";")

In [108]:
# Remove NaN rows 
annotations2.dropna(how="all", inplace=True)
# Remove the nan graph IDs 
annotations2.dropna(subset=["id"], inplace=True)
# Add the splits 
annotations2["split"] = ["val" for i in range(len(annotations2))]
annotations2.head(5)
# Display results

Unnamed: 0,idx,util,clip,label,overlap_ratio,concepts,id,movie,srt_name,video_name,graph_number,split
1,0.0,1.0,tt0108160scene-001.ss-0001.es-0001,Easy Neg,1.0,[''],tt0108160-001,tt0108160,scene-001.ss-0001.es-0001.srt,tt0108160_scene_1.avi,0.0,val
2,1.0,0.0,tt0108160scene-002.ss-0002.es-0002,Easy Neg,1.0,[''],tt0108160-002,tt0108160,scene-002.ss-0002.es-0002.srt,,-1.0,val
3,2.0,1.0,tt0108160scene-003.ss-0003.es-0006,Not Sure,1.0,['Activities'],tt0108160-003,tt0108160,scene-003.ss-0003.es-0006.srt,tt0108160_scene_3.avi,2.0,val
4,3.0,1.0,tt0108160scene-004.ss-0007.es-0017,Easy Neg,0.85,[''],tt0108160-004,tt0108160,scene-004.ss-0007.es-0017.srt,tt0108160_scene_4.avi,3.0,val
5,4.0,1.0,tt0108160scene-005.ss-0018.es-0018,Easy Neg,0.85,[''],tt0108160-005,tt0108160,scene-005.ss-0018.es-0018.srt,tt0108160_scene_5.avi,4.0,val


In [109]:
dict(annotations["concepts"].value_counts())

{"['Speech']": 519,
 "['']": 453,
 "['Activities']": 86,
 "['Speech', ' Activities']": 69,
 "['Speech', ' Exp of  emotion']": 42,
 "['Look']": 27,
 "['Clothes']": 24,
 "['Clothes', ' Speech', ' Activities']": 22,
 "['Posture', ' Speech']": 22,
 "['Clothes', ' Speech']": 21,
 "['Speech', ' Look']": 21,
 "['Speech', ' Activities', ' Exp of  emotion']": 20,
 "['Voice', ' Speech']": 20,
 "['Exp of  emotion']": 18,
 "['Type of plan']": 16,
 "['Posture', ' Speech', ' Activities']": 16,
 "['Body', ' Clothes']": 15,
 "['Voice', ' Exp of  emotion']": 13,
 "['Body', ' Clothes', ' Speech']": 13,
 "['Clothes', ' Activities']": 12,
 "['Type of plan', ' Look']": 12,
 "['Posture']": 11,
 "['Type of plan', ' Body', ' Clothes']": 11,
 "['Voice', ' Appearance', ' Clothes', ' Speech', ' Activities']": 10,
 "['Type of plan', ' Body']": 10,
 "['Type of plan', ' Body', ' Look']": 10,
 "['Body', ' Clothes', ' Activities']": 10,
 "['Type of plan', ' Body', ' Clothes', ' Speech']": 9,
 "['Body', ' Speech']": 9

In [110]:
M = annotations2[annotations2["label"]=="Easy Neg"]
N = annotations2[annotations2["label"]=="Sure"]
annotations3 = pd.concat([M,N], ignore_index=True)

annotations3["label"].value_counts()

Easy Neg    453
Sure        353
Name: label, dtype: int64

In [111]:
annotations["label"].value_counts()

0    711
1    453
2    397
3    353
Name: label, dtype: int64

In [112]:
weight_for_0 = (1/711) * ((453+353+711+397)/4.0)
weight_for_1 = (1 / 453) * ((453+353+711+397)/ 4.0)
weight_for_2 = (1 / 397) * ((453+353+711+397)/ 4.0)
weight_for_3 = (1 / 353) * ((453+353+711+397)/ 4.0)

class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2, 3:weight_for_3}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
print('Weight for class 0: {:.2f}'.format(weight_for_2))
print('Weight for class 1: {:.2f}'.format(weight_for_3))

Weight for class 0: 0.67
Weight for class 1: 1.06
Weight for class 0: 1.21
Weight for class 1: 1.36


In [113]:
weight_for_0 = (1/453) * ((453+353)/2.0)
weight_for_1 = (1 / 353) * ((453+353)/ 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.89
Weight for class 1: 1.14


In [114]:
# Initialize the training ratio
train_ratio = 0.6
# Get the training ratio for each label
easy_neg = round(annotations3.label.value_counts()["Easy Neg"] * train_ratio)
sure = round(annotations3.label.value_counts()["Sure"] * train_ratio)

# Assign the values
samples_left = {"Easy Neg": easy_neg, "Sure": sure}

easy_neg

272

In [115]:
# Loop through the annotations
for i in range(len(annotations3)): 
    # Get the current annotation 
    annotation = annotations3.loc[i]
    # Get the label 
    label = annotation["label"]
    # Check if there's still some train data left
    if samples_left[label]>0:
        # Assign to the training samples
        annotations3.at[i,"split"] = "train"
        # Decrement the samples left for this label
        samples_left[label] = samples_left[label] - 1

In [116]:
def process_label_2(label: str):
    if label=="Easy Neg":
        return 0
    elif label=="Sure":
        return 1

In [117]:
annotations3["label"] = annotations3.label.apply(lambda label: process_label_2(label))
annotations3["graph_id"] = annotations3["id"].apply(lambda x: x.split("-")[0]+"_"+str(int(x.split("-")[1])))
annotations3.head()

Unnamed: 0,idx,util,clip,label,overlap_ratio,concepts,id,movie,srt_name,video_name,graph_number,split,graph_id
0,0.0,1.0,tt0108160scene-001.ss-0001.es-0001,0,1.0,[''],tt0108160-001,tt0108160,scene-001.ss-0001.es-0001.srt,tt0108160_scene_1.avi,0.0,train,tt0108160_1
1,1.0,0.0,tt0108160scene-002.ss-0002.es-0002,0,1.0,[''],tt0108160-002,tt0108160,scene-002.ss-0002.es-0002.srt,,-1.0,train,tt0108160_2
2,3.0,1.0,tt0108160scene-004.ss-0007.es-0017,0,0.85,[''],tt0108160-004,tt0108160,scene-004.ss-0007.es-0017.srt,tt0108160_scene_4.avi,3.0,train,tt0108160_4
3,4.0,1.0,tt0108160scene-005.ss-0018.es-0018,0,0.85,[''],tt0108160-005,tt0108160,scene-005.ss-0018.es-0018.srt,tt0108160_scene_5.avi,4.0,train,tt0108160_5
4,12.0,0.0,tt0108160scene-013.ss-0058.es-0061,0,1.0,[''],tt0108160-013,tt0108160,scene-013.ss-0058.es-0061.srt,,-1.0,train,tt0108160_13


In [118]:
def create_dataset_2(set, name):
    """_summary_
    """

    # Initialize the object 
    nx_dataset = defaultdict(dict)
    
        # Loop through the movies and clips 
    for movie_id, clips in dataset.items():
        # Loop through the scenes_id and graphs
        for clip_id, graph in clips.clip_graphs.items():
            if f"{movie_id}_{clip_id}" in set["graph_id"].values:
                # Get the objectification label for this scene
                label = set[set["graph_id"]==f"{movie_id}_{clip_id}"]["label"].values[0]
                # Generate the scene graph
                scene_graph = generate_graph_3(graph, label)
                # Set the clip graph for this relationship
                nx_dataset[movie_id+"_"+str(clip_id)] = scene_graph

    # Create the file if it doesn't exist 
    with open(f"nx_dataset_objectification_{name}_2_V2.pkl", "wb") as file: 
        # Save the dictionary 
        pickle.dump(nx_dataset,file)
        # Print a success message 
        print(f"Nx dataset nx_dataset_objectification_{name}_2_V2.pkl saved successfully.")

In [119]:
train_set_2 = annotations3[annotations3["split"]=="train"]
val_set_2 = annotations3[annotations3["split"]=="val"]

In [120]:
len(train_set_2)

484

In [121]:
# Initialize the object 
nx_dataset = defaultdict(dict)
# Get the original relationships dataset
# Initialize the emotions set
dataset_relationships = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the relationships for the current clip 
        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)
        # Check if it's not null
        if relationships!=[]:
            # Append to the relationships dataset
            dataset_relationships[movie][clip[0]] = relationships

# Generate the training set
create_dataset_2(train_set_2, "train")
# Generate the validation set
create_dataset_2(val_set_2, "val")

Nx dataset nx_dataset_objectification_train_2_V2.pkl saved successfully.
Nx dataset nx_dataset_objectification_val_2_V2.pkl saved successfully.


# Generate the oversampled dataset 

In [122]:
annotations3["label"].value_counts()

0    453
1    353
Name: label, dtype: int64

## Duplicate samples of the class '1'

In [123]:
# Filter the dataset
annotations_binary_1 = annotations3[annotations3["label"]==1]
# Shuffle the dataset 
annotations_binary_1 = annotations_binary_1.sample(random_state=123, frac=1).reset_index(drop=True)
# Take the 100 first elements 
annotations_binary_1 = annotations_binary_1[0:100]
# Concatenate the datasets
annotations2_oversampled = pd.concat([annotations3,annotations_binary_1])

## Check the distribution of the labels 

In [124]:
annotations2_oversampled["label"].value_counts(normalize=True)

0    0.5
1    0.5
Name: label, dtype: float64

## Check the distribution of the splits

In [125]:
annotations2_oversampled["split"].value_counts(normalize=True)

train    0.598234
val      0.401766
Name: split, dtype: float64

## Save the oversampled dataset

In [126]:
train_binary_oversampled = annotations2_oversampled[annotations2_oversampled["split"]=="train"]
validation_binary_oversampled = annotations2_oversampled[annotations2_oversampled["split"]=="val"]

In [127]:
print(len(train_set_2))
print(len(val_set_2))

484
322


In [128]:
print(len(train_binary_oversampled))
print(len(validation_binary_oversampled))

542
364


## Generate the oversampled dataset

In [129]:
def create_dataset_2_oversampled(set, name):
    """_summary_
    """

    # Initialize the object 
    nx_dataset = defaultdict(dict)
    
        # Loop through the movies and clips 
    for movie_id, clips in dataset.items():
        # Loop through the scenes_id and graphs
        for clip_id, graph in clips.clip_graphs.items():
            if f"{movie_id}_{clip_id}" in set["graph_id"].values:
                # Get the objectification label for this scene
                labels = set[set["graph_id"]==f"{movie_id}_{clip_id}"]["label"].values
                for i,label in enumerate(labels):
                    # Generate the scene graph
                    scene_graph = generate_graph_3(graph, label)
                    # Set the clip graph for this relationship
                    nx_dataset[movie_id+"_"+str(clip_id)+f"_{i}"] = scene_graph

    # Create the file if it doesn't exist 
    with open(f"nx_dataset_objectification_{name}_2_V2.pkl", "wb") as file: 
        # Save the dictionary 
        pickle.dump(nx_dataset,file)
        # Print a success message 
        print(f"Nx dataset nx_dataset_objectification_{name}_2_V2.pkl saved successfully.")

In [130]:
train_binary_oversampled[train_binary_oversampled["graph_id"]=="tt1142988_14"]["label"].values

array([1, 1])

In [131]:
# Initialize the object 
nx_dataset = defaultdict(dict)
# Get the original relationships dataset
# Initialize the emotions set
dataset_relationships = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the relationships for the current clip 
        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)
        # Check if it's not null
        if relationships!=[]:
            # Append to the relationships dataset
            dataset_relationships[movie][clip[0]] = relationships

# Generate the training set
create_dataset_2_oversampled(train_binary_oversampled, "oversampled_train")
# Generate the validation set
create_dataset_2_oversampled(validation_binary_oversampled, "oversampled_val")

Nx dataset nx_dataset_objectification_oversampled_train_2_V2.pkl saved successfully.
Nx dataset nx_dataset_objectification_oversampled_val_2_V2.pkl saved successfully.


## Dataset without 'Not Sure'

In [132]:
annotations3 = pd.read_csv("ObyGaze12_thresh_02.csv", delimiter=";")

In [133]:
# Remove NaN rows 
annotations3.dropna(how="all", inplace=True)
# Remove the nan graph IDs 
annotations3.dropna(subset=["id"], inplace=True)
# Add the splits 
annotations3["split"] = ["val" for i in range(len(annotations3))]
annotations3.head(5)
# Display results

Unnamed: 0,idx,util,clip,label,overlap_ratio,concepts,id,movie,srt_name,video_name,graph_number,split
1,0.0,1.0,tt0108160scene-001.ss-0001.es-0001,Easy Neg,1.0,[''],tt0108160-001,tt0108160,scene-001.ss-0001.es-0001.srt,tt0108160_scene_1.avi,0.0,val
2,1.0,0.0,tt0108160scene-002.ss-0002.es-0002,Easy Neg,1.0,[''],tt0108160-002,tt0108160,scene-002.ss-0002.es-0002.srt,,-1.0,val
3,2.0,1.0,tt0108160scene-003.ss-0003.es-0006,Not Sure,1.0,['Activities'],tt0108160-003,tt0108160,scene-003.ss-0003.es-0006.srt,tt0108160_scene_3.avi,2.0,val
4,3.0,1.0,tt0108160scene-004.ss-0007.es-0017,Easy Neg,0.85,[''],tt0108160-004,tt0108160,scene-004.ss-0007.es-0017.srt,tt0108160_scene_4.avi,3.0,val
5,4.0,1.0,tt0108160scene-005.ss-0018.es-0018,Easy Neg,0.85,[''],tt0108160-005,tt0108160,scene-005.ss-0018.es-0018.srt,tt0108160_scene_5.avi,4.0,val


In [134]:
M = annotations3[annotations3["label"]=="Easy Neg"]
N = annotations3[annotations3["label"]=="Sure"]
O = annotations3[annotations3["label"]=="Hard Neg"]
annotations3 = pd.concat([M,N,O], ignore_index=True)

annotations3["label"].value_counts()

Hard Neg    711
Easy Neg    453
Sure        353
Name: label, dtype: int64

In [135]:
weight_for_0 = (1/711) * ((453+353+711)/3.0)
weight_for_1 = (1 / 453) * ((453+353+711)/ 3.0)
weight_for_2 = (1 / 353) * ((453+353+711)/ 3.0)

class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
print('Weight for class 0: {:.2f}'.format(weight_for_2))

Weight for class 0: 0.71
Weight for class 1: 1.12
Weight for class 0: 1.43


In [136]:
# Initialize the training ratio
train_ratio = 0.6
# Get the training ratio for each label
easy_neg = round(annotations3.label.value_counts()["Easy Neg"] * train_ratio)
hard_neg = round(annotations3.label.value_counts()["Hard Neg"] * train_ratio)
sure = round(annotations3.label.value_counts()["Sure"] * train_ratio)

# Assign the values
samples_left = {"Easy Neg": easy_neg, "Hard Neg": hard_neg, "Sure": sure}

easy_neg

272

In [137]:
# Loop through the annotations
for i in range(len(annotations3)): 
    # Get the current annotation 
    annotation = annotations3.loc[i]
    # Get the label 
    label = annotation["label"]
    # Check if there's still some train data left
    if samples_left[label]>0:
        # Assign to the training samples
        annotations3.at[i,"split"] = "train"
        # Decrement the samples left for this label
        samples_left[label] = samples_left[label] - 1

In [138]:
def process_label_3(label: str):
    if label=="Easy Neg":
        return 0
    elif label=="Hard Neg":
        return 1
    elif label=="Sure":
        return 2

In [139]:
annotations3["label"] = annotations3.label.apply(lambda label: process_label_3(label))
annotations3["graph_id"] = annotations3["id"].apply(lambda x: x.split("-")[0]+"_"+str(int(x.split("-")[1])))
annotations3.head()

Unnamed: 0,idx,util,clip,label,overlap_ratio,concepts,id,movie,srt_name,video_name,graph_number,split,graph_id
0,0.0,1.0,tt0108160scene-001.ss-0001.es-0001,0,1.0,[''],tt0108160-001,tt0108160,scene-001.ss-0001.es-0001.srt,tt0108160_scene_1.avi,0.0,train,tt0108160_1
1,1.0,0.0,tt0108160scene-002.ss-0002.es-0002,0,1.0,[''],tt0108160-002,tt0108160,scene-002.ss-0002.es-0002.srt,,-1.0,train,tt0108160_2
2,3.0,1.0,tt0108160scene-004.ss-0007.es-0017,0,0.85,[''],tt0108160-004,tt0108160,scene-004.ss-0007.es-0017.srt,tt0108160_scene_4.avi,3.0,train,tt0108160_4
3,4.0,1.0,tt0108160scene-005.ss-0018.es-0018,0,0.85,[''],tt0108160-005,tt0108160,scene-005.ss-0018.es-0018.srt,tt0108160_scene_5.avi,4.0,train,tt0108160_5
4,12.0,0.0,tt0108160scene-013.ss-0058.es-0061,0,1.0,[''],tt0108160-013,tt0108160,scene-013.ss-0058.es-0061.srt,,-1.0,train,tt0108160_13


In [140]:
annotations3["label"].value_counts()

1    711
0    453
2    353
Name: label, dtype: int64

In [141]:
def create_dataset_3(set, name):
    """_summary_
    """

    # Initialize the object 
    nx_dataset = defaultdict(dict)
    
        # Loop through the movies and clips 
    for movie_id, clips in dataset.items():
        # Loop through the scenes_id and graphs
        for clip_id, graph in clips.clip_graphs.items():
            if f"{movie_id}_{clip_id}" in set["graph_id"].values:
                # Get the objectification label for this scene
                label = set[set["graph_id"]==f"{movie_id}_{clip_id}"]["label"].values[0]
                # Generate the scene graph
                scene_graph = generate_graph_3(graph, label)
                # Set the clip graph for this relationship
                nx_dataset[movie_id+"_"+str(clip_id)] = scene_graph

    # Create the file if it doesn't exist 
    with open(f"nx_dataset_objectification_{name}_2_V3.pkl", "wb") as file: 
        # Save the dictionary 
        pickle.dump(nx_dataset,file)
        # Print a success message 
        print("Nx dataset saved successfully.")

In [142]:
train_set_2 = annotations3[annotations3["split"]=="train"]
val_set_2 = annotations3[annotations3["split"]=="val"]

In [143]:
len(train_set_2)

911

In [144]:
# Initialize the object 
nx_dataset = defaultdict(dict)
# Get the original relationships dataset
# Initialize the emotions set
dataset_relationships = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the relationships for the current clip 
        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)
        # Check if it's not null
        if relationships!=[]:
            # Append to the relationships dataset
            dataset_relationships[movie][clip[0]] = relationships

# Generate the training set
create_dataset_3(train_set_2, "train")
# Generate the validation set
create_dataset_3(val_set_2, "val")

Nx dataset saved successfully.
Nx dataset saved successfully.


In [145]:
class_names_3 = [0,1,2]

with open("class_names_3.pkl", "wb") as file: 
    pickle.dump(class_names_3, file)

## Generate the oversampled Without Not Sure dataset

## Check the distribution of data

In [146]:
annotations3["label"].value_counts()

1    711
0    453
2    353
Name: label, dtype: int64

## Duplicate samples of class 0 and 2

In [147]:
# Filter the dataset
annotations_wns_0 = annotations3[annotations3["label"]==0]
annotations_wns_2 = annotations3[annotations3["label"]==2]
# Shuffle the dataset 
annotations_wns_0 = annotations_wns_0.sample(random_state=123, frac=1).reset_index(drop=True)
annotations_wns_2 = annotations_wns_2.sample(random_state=123, frac=1).reset_index(drop=True)
# Take the elements 
annotations_wns_0 = annotations_wns_0[0:711-453]
annotations_wns_2 = annotations_wns_2[0:711-353]
# Concatenate the datasets
annotations3_oversampled = pd.concat([annotations3,annotations_wns_0,annotations_wns_2])

## Check the distributions of values 

In [148]:
annotations3_oversampled["label"].value_counts(normalize=True)

0    0.334117
1    0.334117
2    0.331767
Name: label, dtype: float64

In [149]:
annotations3_oversampled["split"].value_counts(normalize=True)

train    0.599154
val      0.400846
Name: split, dtype: float64

In [150]:
def create_dataset_3_oversampled(set, name):
    """_summary_
    """

    # Initialize the object 
    nx_dataset = defaultdict(dict)
    
        # Loop through the movies and clips 
    for movie_id, clips in dataset.items():
        # Loop through the scenes_id and graphs
        for clip_id, graph in clips.clip_graphs.items():
            if f"{movie_id}_{clip_id}" in set["graph_id"].values:
                # Get the objectification label for this scene
                labels = set[set["graph_id"]==f"{movie_id}_{clip_id}"]["label"].values
                for i,label in enumerate(labels):
                    # Generate the scene graph
                    scene_graph = generate_graph_3(graph, label)
                    # Set the clip graph for this relationship
                    nx_dataset[movie_id+"_"+str(clip_id)+f"_{i}"] = scene_graph

    # Create the file if it doesn't exist 
    with open(f"nx_dataset_objectification_{name}_2_V4.pkl", "wb") as file: 
        # Save the dictionary 
        pickle.dump(nx_dataset,file)
        # Print a success message 
        print("Nx dataset saved successfully.")

In [151]:
train_wns_oversampled = annotations3_oversampled[annotations3_oversampled["split"]=="train"]
val_wns_oversampled = annotations3_oversampled[annotations3_oversampled["split"]=="val"]

print(len(train_wns_oversampled))
print(len(val_wns_oversampled))

1275
853


In [152]:
# Initialize the object 
nx_dataset = defaultdict(dict)
# Get the original relationships dataset
# Initialize the emotions set
dataset_relationships = defaultdict(dict)

# Loop through the movies
for movie in dataset.keys():
    # Loop through the clips
    for clip in dataset[movie].clip_graphs.items(): 
        # Extract the relationships for the current clip 
        relationships = extract_utils.extract_relationships(clip[1].orig_graph_json)
        # Check if it's not null
        if relationships!=[]:
            # Append to the relationships dataset
            dataset_relationships[movie][clip[0]] = relationships

# Generate the training set
create_dataset_3_oversampled(train_wns_oversampled, "train")
# Generate the validation set
create_dataset_3_oversampled(val_wns_oversampled, "val")

Nx dataset saved successfully.
Nx dataset saved successfully.
