# scene_clustering
This notebook contains initial code for clustering frames into shots, identifying the A/B/A/B shot pattern, and using the image classifier model to check whether those shots are medium close-ups (MCUs).

In [1]:
import sys
import os
sys.path.append('site-packages') # manually put all packages/libraries into this folder
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras import models
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering

Using TensorFlow backend.


## Clustering
### For POC, designating a specific scene's worth of frames

In [2]:
# input film and frame 
# film = 'booksmart'
# frame_choice = list(range(1001, 1163)) # good example for Booksmart!!! 6 clusters, 2500 distance_threshold

# film = 'hobbs_shaw'
# frame_choice = list(range(701, 858)) # Hobbs and Shaw, Jason Statham vs. Helen Mirren, 2,500

# film = 'parasite'
# frame_choice = list(range(1666, 1836)) # Parasite, tough because Mrs. Park and Jessica are both right-oc

# film = 'parasite'
# frame_choice = list(range(6687, 6777)) # Parasite, hiding in bushes, eh, 2900

#film = 'hustle'
#frame_choice = list(range(761, 969)) # The Hustle, train 2900

# Active selection: The Hustle, train scene, padded by about 100 frames on either side.
# (The "3000" in the original note is presumably the distance_threshold used for clustering.)
film = "hustle"
frame_choice = [*range(600, 1000)]


In [3]:
# Folder holding the extracted frames for the chosen film: dialogue_frames/<film>
dialogue_folder = os.path.join('dialogue_frames', film)

# Every directory entry is counted, so non-image files would be included in this total.
folder_entries = os.listdir(dialogue_folder)
print('There are', len(folder_entries), 'images in the folder')
print('Selected', len(frame_choice), 'of those frames')

There are 5877 images in the folder
Selected 400 of those frames


In [4]:
# VGG16 with ImageNet weights and include_top=False, used here purely as a
# feature extractor: each frame is turned into one flat feature vector.
model = VGG16(weights='imagenet', include_top=False)
model.summary()

vgg16_feature_list = []

for frame_number in frame_choice:
    # Frame files are named <film>_frame<number>.jpg inside the film's folder.
    img_path = os.path.join(dialogue_folder, film + '_frame' + str(frame_number) + '.jpg')
    img = image.load_img(img_path, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)  # add a leading batch axis of size 1
    img_data = preprocess_input(img_data)

    # predict() already returns a NumPy array; flatten it to a 1-D vector per frame.
    vgg16_feature_list.append(model.predict(img_data).flatten())

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [5]:
# Stack the per-frame feature vectors into a single array (one row per frame) and show its shape.
vgg16_feature_list_np = np.asarray(vgg16_feature_list)
vgg16_feature_list_np.shape

(400, 25088)

In [6]:
# Agglomerative clustering of the frame feature vectors. The cluster count is not
# fixed up front (n_clusters=None); it follows from distance_threshold instead.
clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=3000)
hac = clusterer.fit(vgg16_feature_list_np)  # fit() returns the estimator itself
print('Number of clusters:', hac.n_clusters_)
print(hac.labels_)

Number of clusters: 33
[ 8  8 13 13 13 24 24 24 24 24 24 24 24 24 24  8  8  8  8  8  8  8  0  0
  0  0 21 21 21 21  0  0  0  0  0  0  0  0  0 32 32 32  2  2  2 32 32 32
 10 10 10 10 10 28 28 15 15 16 16 16 16 16 16  4  4  4  6  6  6  6  4  4
  4  4  4 20 16 16 16 16  6  6 10 10 10 15 15 15 15  9  9  9  2  2 30 30
 30 30  9  9  2  2 15 15 15 28 28 28 28 10 10 10 20 15 15 15 15 15 15 15
 20 20 20 20 30 30  2  2  2  2 30 30 30 30  2  2  4  4  4  4  4  6  6  6
  6  6 30 30 30 20 20 20 20 30 30 30 20 30  9  9  9 26 26 26 27 27 27 27
 27 25 25 25 25 25 25 25 25  0  0  0  0  0 27 27 27 27 27 27 27 27  0  0
  0  0 27  0  0 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19  1  1  5  5
  5  5  5  5 29 29 29  5  5 29  5  5  5  5  5  5  5  5 29 29 29 29 29  5
  5  5  5  5  5  5  5  5 22 22 22 22 22 22 22 22 22 22 22 29 29 29 29 29
  1  1  1 29 29  5  5 29 29 29  5  5  5  5  5  1  1  1  1  1  0  0  0 29
 29 29  1  1  1  1  1  1  5  5  5  5 29 29  5  5  5  5 29 29 29 29  5  5
  5  5  5  5  5  5 31 31 31 

## Load Saved Model and Identify MCUs

In [7]:
# Load the previously saved classifier from saved_models/tuned_model; it is used
# below to flag each selected frame.
tuned_model = models.load_model('saved_models/tuned_model')

In [8]:
# Re-load every selected frame as a 128x128 grayscale array for the classifier.
image_list = [
    img_to_array(
        load_img(
            dialogue_folder + '/' + film + '_frame'+ str(frame_number) + '.jpg',
            target_size = (128, 128),
            color_mode = 'grayscale',
        )
    )
    for frame_number in frame_choice
]

In [9]:
# Batch all frame arrays together and ask the classifier for a class label per frame.
# NOTE(review): predict_classes is only available on older Keras Sequential models;
# confirm the installed Keras version still provides it.
image_array = np.asarray(image_list)
y_pred = tuned_model.predict_classes(image_array)

In [10]:
# Keep only the first element of each prediction row, giving one flat value per frame.
y_pred_values = [prediction[0] for prediction in y_pred]

# Scene Pattern Algorithm

In [None]:
#establish ABA first, and then allow for C

# establish A and B in memory
# when interrupted, alt_break = 1, establish C in memory, if next is A or B, alt_break back to 0

# store C in memory for each A/B pattern
# scene is first A through last B, and then any Cs on the end

# cluster, prev_cluster_1, prev_cluster_2
# if ABA, check for MCU
# look for first A/B, last A/B, and get all Cs in between
# scene is first A/B, last A/B, and any connecting Cs

In [11]:
# Give every frame a running shot id: the id goes up by one whenever a frame's
# cluster label differs from the label of the frame immediately before it.
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'shotid\t')
shot_id = 0
shot_id_list = []
# Cluster label of the previous frame. None means "no previous frame yet"; using
# None rather than a numeric placeholder means it can never collide with a real
# cluster label.
prev_frame = None

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    if prev_frame is not None and cluster != prev_frame:
        shot_id += 1
    shot_id_list.append(shot_id)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', shot_id,'\tend')
    prev_frame = cluster

# Only shot_id_list is needed by the pattern-detection step that follows.

frame	 mcu	 clust	 prvfrm	 shotid	
600 	 0 	 8 	 1000 	 0 	end
601 	 0 	 8 	 8 	 0 	end
602 	 0 	 13 	 8 	 1 	end
603 	 0 	 13 	 13 	 1 	end
604 	 0 	 13 	 13 	 1 	end
605 	 0 	 24 	 13 	 2 	end
606 	 0 	 24 	 24 	 2 	end
607 	 0 	 24 	 24 	 2 	end
608 	 0 	 24 	 24 	 2 	end
609 	 0 	 24 	 24 	 2 	end
610 	 0 	 24 	 24 	 2 	end
611 	 0 	 24 	 24 	 2 	end
612 	 0 	 24 	 24 	 2 	end
613 	 0 	 24 	 24 	 2 	end
614 	 0 	 24 	 24 	 2 	end
615 	 0 	 8 	 24 	 3 	end
616 	 0 	 8 	 8 	 3 	end
617 	 0 	 8 	 8 	 3 	end
618 	 0 	 8 	 8 	 3 	end
619 	 1 	 8 	 8 	 3 	end
620 	 0 	 8 	 8 	 3 	end
621 	 0 	 8 	 8 	 3 	end
622 	 1 	 0 	 8 	 4 	end
623 	 1 	 0 	 0 	 4 	end
624 	 1 	 0 	 0 	 4 	end
625 	 1 	 0 	 0 	 4 	end
626 	 0 	 21 	 0 	 5 	end
627 	 0 	 21 	 21 	 5 	end
628 	 0 	 21 	 21 	 5 	end
629 	 0 	 21 	 21 	 5 	end
630 	 0 	 0 	 21 	 6 	end
631 	 0 	 0 	 0 	 6 	end
632 	 1 	 0 	 0 	 6 	end
633 	 1 	 0 	 0 	 6 	end
634 	 0 	 0 	 0 	 6 	end
635 	 0 	 0 	 0 	 6 	end
636 	 0 	 0 	 0 	 6 	end
637

955 	 1 	 0 	 0 	 84 	end
956 	 0 	 0 	 0 	 84 	end
957 	 0 	 0 	 0 	 84 	end
958 	 1 	 4 	 0 	 85 	end
959 	 1 	 4 	 4 	 85 	end
960 	 1 	 4 	 4 	 85 	end
961 	 1 	 4 	 4 	 85 	end
962 	 1 	 4 	 4 	 85 	end
963 	 1 	 4 	 4 	 85 	end
964 	 1 	 4 	 4 	 85 	end
965 	 0 	 23 	 4 	 86 	end
966 	 0 	 23 	 23 	 86 	end
967 	 1 	 23 	 23 	 86 	end
968 	 0 	 23 	 23 	 86 	end
969 	 0 	 23 	 23 	 86 	end
970 	 1 	 3 	 23 	 87 	end
971 	 1 	 3 	 3 	 87 	end
972 	 1 	 3 	 3 	 87 	end
973 	 1 	 3 	 3 	 87 	end
974 	 1 	 3 	 3 	 87 	end
975 	 1 	 3 	 3 	 87 	end
976 	 1 	 3 	 3 	 87 	end
977 	 1 	 11 	 3 	 88 	end
978 	 1 	 11 	 11 	 88 	end
979 	 1 	 11 	 11 	 88 	end
980 	 1 	 11 	 11 	 88 	end
981 	 1 	 11 	 11 	 88 	end
982 	 0 	 7 	 11 	 89 	end
983 	 0 	 7 	 7 	 89 	end
984 	 0 	 7 	 7 	 89 	end
985 	 0 	 7 	 7 	 89 	end
986 	 0 	 7 	 7 	 89 	end
987 	 0 	 7 	 7 	 89 	end
988 	 0 	 12 	 7 	 90 	end
989 	 1 	 12 	 12 	 90 	end
990 	 1 	 12 	 12 	 90 	end
991 	 1 	 12 	 12 	 90 	end
992 	 0 	 1

### This is the only one needed (and shot_id above)

In [12]:
# Detect alternating A/B/A/B shot patterns.
#
# prev_clust_1, prev_clust_2 and prev_clust_3 form a three-deep history of cluster
# labels that is shifted once per shot (not once per frame). They start as three
# distinct placeholder numbers so the pattern test below cannot be satisfied by the
# placeholders alone.
# NOTE(review): the placeholders assume no real cluster label ever equals 1001-1003.
prev_clust_1 = 1001
prev_clust_1_list = []
prev_clust_2 = 1002
prev_clust_2_list = []
prev_clust_3 = 1003
prev_clust_3_list = []
# -1 never matches a real shot id, so the first frame always counts as a new shot.
prev_shot_id = -1
# For each detected pattern: the smaller cluster label goes into speaker_a_list and
# the larger one into speaker_b_list (same position in both lists).
speaker_a_list = []
speaker_b_list = []

print('frame\t', 'mcu\t', 'clust\t', 'shotid\t', 'prvshid', 'prev1\t', 'prev2\t', 'prev3\t')

# Note: the loop variable shot_id rebinds the shot_id name set by the shot-id cell.
for frame_file, cluster, mcu_flag, shot_id in zip(frame_choice, hac.labels_, y_pred_values, shot_id_list):
    # 'beg' row: state before this frame is processed.
    print(frame_file, '\t', mcu_flag, '\t', cluster,'\t', shot_id, '\t', prev_shot_id, '\t', prev_clust_1, '\t', prev_clust_2, '\t', prev_clust_3, '\tbeg')
    # The test runs BEFORE the history is shifted. At that point the sequence
    # prev_clust_3, prev_clust_2, prev_clust_1, cluster is X, Y, X, Y exactly when
    # this condition holds, i.e. two clusters alternating over four shots.
    if cluster == prev_clust_2 and prev_clust_1 == prev_clust_3:
        print('found one')
        speaker_a_list.append(min(cluster, prev_clust_1))
        speaker_b_list.append(max(cluster, prev_clust_1))
        
    
    # Shift the history only when a new shot starts.
    if shot_id != prev_shot_id:
        prev_shot_id = shot_id
        prev_clust_3 = prev_clust_2
        prev_clust_2 = prev_clust_1
        prev_clust_1 = cluster
    # Per-frame snapshots of the history (one entry per frame in each list).
    prev_clust_1_list.append(prev_clust_1)
    prev_clust_2_list.append(prev_clust_2)
    prev_clust_3_list.append(prev_clust_3)
    # 'end' row: state after this frame is processed.
    print(frame_file, '\t', mcu_flag, '\t', cluster,'\t', shot_id, '\t', prev_shot_id, '\t', prev_clust_1, '\t', prev_clust_2, '\t', prev_clust_3, '\tend')
    
# return speaker_pairs

frame	 mcu	 clust	 shotid	 prvshid prev1	 prev2	 prev3	
600 	 0 	 8 	 0 	 -1 	 1001 	 1002 	 1003 	beg
600 	 0 	 8 	 0 	 0 	 8 	 1001 	 1002 	end
601 	 0 	 8 	 0 	 0 	 8 	 1001 	 1002 	beg
601 	 0 	 8 	 0 	 0 	 8 	 1001 	 1002 	end
602 	 0 	 13 	 1 	 0 	 8 	 1001 	 1002 	beg
602 	 0 	 13 	 1 	 1 	 13 	 8 	 1001 	end
603 	 0 	 13 	 1 	 1 	 13 	 8 	 1001 	beg
603 	 0 	 13 	 1 	 1 	 13 	 8 	 1001 	end
604 	 0 	 13 	 1 	 1 	 13 	 8 	 1001 	beg
604 	 0 	 13 	 1 	 1 	 13 	 8 	 1001 	end
605 	 0 	 24 	 2 	 1 	 13 	 8 	 1001 	beg
605 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	end
606 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	beg
606 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	end
607 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	beg
607 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	end
608 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	beg
608 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	end
609 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	beg
609 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	end
610 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	beg
610 	 0 	 24 	 2 	 2 	 24 	 13 	 8 	end
611 	 0 	 24 	 2 	 2 	 24 	 13 	

706 	 1 	 28 	 28 	 28 	 28 	 15 	 2 	end
707 	 1 	 28 	 28 	 28 	 28 	 15 	 2 	beg
707 	 1 	 28 	 28 	 28 	 28 	 15 	 2 	end
708 	 1 	 28 	 28 	 28 	 28 	 15 	 2 	beg
708 	 1 	 28 	 28 	 28 	 28 	 15 	 2 	end
709 	 0 	 10 	 29 	 28 	 28 	 15 	 2 	beg
709 	 0 	 10 	 29 	 29 	 10 	 28 	 15 	end
710 	 0 	 10 	 29 	 29 	 10 	 28 	 15 	beg
710 	 0 	 10 	 29 	 29 	 10 	 28 	 15 	end
711 	 0 	 10 	 29 	 29 	 10 	 28 	 15 	beg
711 	 0 	 10 	 29 	 29 	 10 	 28 	 15 	end
712 	 1 	 20 	 30 	 29 	 10 	 28 	 15 	beg
712 	 1 	 20 	 30 	 30 	 20 	 10 	 28 	end
713 	 1 	 15 	 31 	 30 	 20 	 10 	 28 	beg
713 	 1 	 15 	 31 	 31 	 15 	 20 	 10 	end
714 	 1 	 15 	 31 	 31 	 15 	 20 	 10 	beg
714 	 1 	 15 	 31 	 31 	 15 	 20 	 10 	end
715 	 1 	 15 	 31 	 31 	 15 	 20 	 10 	beg
715 	 1 	 15 	 31 	 31 	 15 	 20 	 10 	end
716 	 1 	 15 	 31 	 31 	 15 	 20 	 10 	beg
716 	 1 	 15 	 31 	 31 	 15 	 20 	 10 	end
717 	 1 	 15 	 31 	 31 	 15 	 20 	 10 	beg
717 	 1 	 15 	 31 	 31 	 15 	 20 	 10 	end
718 	 1 	 15 	 31

814 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	end
815 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	beg
815 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	end
816 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	beg
816 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	end
817 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	beg
817 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	end
818 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	beg
818 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	end
819 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	beg
819 	 1 	 5 	 55 	 55 	 5 	 1 	 19 	end
820 	 1 	 29 	 56 	 55 	 5 	 1 	 19 	beg
820 	 1 	 29 	 56 	 56 	 29 	 5 	 1 	end
821 	 1 	 29 	 56 	 56 	 29 	 5 	 1 	beg
821 	 1 	 29 	 56 	 56 	 29 	 5 	 1 	end
822 	 1 	 29 	 56 	 56 	 29 	 5 	 1 	beg
822 	 1 	 29 	 56 	 56 	 29 	 5 	 1 	end
823 	 1 	 5 	 57 	 56 	 29 	 5 	 1 	beg
823 	 1 	 5 	 57 	 57 	 5 	 29 	 5 	end
824 	 1 	 5 	 57 	 57 	 5 	 29 	 5 	beg
824 	 1 	 5 	 57 	 57 	 5 	 29 	 5 	end
825 	 1 	 29 	 58 	 57 	 5 	 29 	 5 	beg
found one
825 	 1 	 29 	 58 	 58 	 29 	 5 	 29 	end
826 	 1 	 5 	 59 	 58 	 29 	 5 	 29 	beg
found one
826 	 1 	 

937 	 0 	 18 	 81 	 81 	 18 	 26 	 0 	end
938 	 0 	 0 	 82 	 81 	 18 	 26 	 0 	beg
938 	 0 	 0 	 82 	 82 	 0 	 18 	 26 	end
939 	 0 	 0 	 82 	 82 	 0 	 18 	 26 	beg
939 	 0 	 0 	 82 	 82 	 0 	 18 	 26 	end
940 	 0 	 0 	 82 	 82 	 0 	 18 	 26 	beg
940 	 0 	 0 	 82 	 82 	 0 	 18 	 26 	end
941 	 0 	 0 	 82 	 82 	 0 	 18 	 26 	beg
941 	 0 	 0 	 82 	 82 	 0 	 18 	 26 	end
942 	 0 	 14 	 83 	 82 	 0 	 18 	 26 	beg
942 	 0 	 14 	 83 	 83 	 14 	 0 	 18 	end
943 	 0 	 14 	 83 	 83 	 14 	 0 	 18 	beg
943 	 0 	 14 	 83 	 83 	 14 	 0 	 18 	end
944 	 0 	 14 	 83 	 83 	 14 	 0 	 18 	beg
944 	 0 	 14 	 83 	 83 	 14 	 0 	 18 	end
945 	 0 	 14 	 83 	 83 	 14 	 0 	 18 	beg
945 	 0 	 14 	 83 	 83 	 14 	 0 	 18 	end
946 	 0 	 0 	 84 	 83 	 14 	 0 	 18 	beg
946 	 0 	 0 	 84 	 84 	 0 	 14 	 0 	end
947 	 0 	 0 	 84 	 84 	 0 	 14 	 0 	beg
947 	 0 	 0 	 84 	 84 	 0 	 14 	 0 	end
948 	 0 	 0 	 84 	 84 	 0 	 14 	 0 	beg
948 	 0 	 0 	 84 	 84 	 0 	 14 	 0 	end
949 	 0 	 0 	 84 	 84 	 0 	 14 	 0 	beg
949 	 0 	 0 	

In [13]:
# Collapse the detected (A, B) cluster pairs into a list of unique [a, b] pairs,
# keeping the order in which each pair was first seen.
speaker_pairs = []
for label_a, label_b in zip(speaker_a_list, speaker_b_list):
    candidate = [int(label_a), int(label_b)]
    if candidate in speaker_pairs:
        continue
    speaker_pairs.append(candidate)

In [14]:
# One row per selected frame: frame number, cluster label and classifier flag.
scene_rows = list(zip(frame_choice, hac.labels_, y_pred_values))
scene_df = pd.DataFrame(scene_rows, columns=['frame_file', 'cluster', 'mcu'])

# Raise the display limit so up to 400 rows are shown in full.
pd.options.display.max_rows = 400
scene_df.head(400)

Unnamed: 0,frame_file,cluster,mcu
0,600,8,0
1,601,8,0
2,602,13,0
3,603,13,0
4,604,13,0
5,605,24,0
6,606,24,0
7,607,24,0
8,608,24,0
9,609,24,0


In [31]:
def _extend_through_cutaways(frame, step, cluster_by_frame, cutaway_clusters):
    """Move `frame` by `step` for as long as the adjacent frame is a cutaway.

    Stops at the first neighbouring frame that is either missing from
    `cluster_by_frame` or whose cluster is not in `cutaway_clusters`, and
    returns the last frame reached.
    """
    while cluster_by_frame.get(frame + step) in cutaway_clusters:
        frame += step
    return frame


scenes = []

# frame number -> cluster label. Neighbouring frames are looked up here instead
# of converting a one-row Series with int() and relying on a TypeError to detect
# the ends of the selection.
# Assumes frame_file values are unique — confirm if frame_choice ever has repeats.
cluster_by_frame = dict(zip(scene_df['frame_file'], scene_df['cluster']))

for pair in speaker_pairs:
    is_speaker_a = scene_df['cluster'] == pair[0]
    is_speaker_b = scene_df['cluster'] == pair[1]
    mean_a = scene_df.loc[is_speaker_a, 'mcu'].mean()
    mean_b = scene_df.loc[is_speaker_b, 'mcu'].mean()
    # Keep the pair only when more than half of the frames in BOTH clusters were flagged.
    if not (mean_a > .5 and mean_b > .5):
        continue

    # First and last frame that belongs to either speaker cluster.
    speaker_frames = scene_df.loc[is_speaker_a | is_speaker_b, 'frame_file']
    dialogue_start = speaker_frames.min()
    dialogue_end = speaker_frames.max()

    # Every other cluster that appears strictly between those two frames is a cutaway.
    is_between = (scene_df['frame_file'] > dialogue_start) & (scene_df['frame_file'] < dialogue_end)
    cutaways = scene_df.loc[is_between, 'cluster'].unique()
    cutaways = cutaways[(cutaways != pair[0]) & (cutaways != pair[1])]
    print('Speaker A and B clusters:', pair)
    print('First, last frames of speakers A and B:', dialogue_start, dialogue_end)
    print('Cutaway clusters:', cutaways)

    # Grow the scene outwards while the frames just outside it are cutaway clusters.
    cutaway_set = set(cutaways.tolist())
    dialogue_start = _extend_through_cutaways(dialogue_start, -1, cluster_by_frame, cutaway_set)
    dialogue_end = _extend_through_cutaways(dialogue_end, 1, cluster_by_frame, cutaway_set)

    print('First, last frames of entire scene:', dialogue_start, dialogue_end)

    print('\n')
    scenes.append((dialogue_start, dialogue_end))

Speaker A and B clusters: [20, 30]
First, last frames of speakers A and B: 675 757
Cutaway clusters: [16  6 10 15  9  2 28  4]
First, last frames of entire scene: 648 760


Speaker A and B clusters: [5, 29]
First, last frames of speakers A and B: 814 917
Cutaway clusters: [22  1  0]
First, last frames of entire scene: 812 917




In [16]:
# Display the collected (dialogue_start, dialogue_end) tuples, one per detected scene.
scenes

[(648, 760), (812, 917)]

In [40]:
# Spot check: for a hand-picked set of clusters, print how many frames each has
# and the mean of their 'mcu' flags.
print('cluster\t', 'count\t', 'mean\t')
for cluster_id in [0, 32, 2, 20, 7, 19, 16, 32]:
    mcu_flags = scene_df.loc[scene_df['cluster'] == cluster_id, 'mcu']
    print(cluster_id, '\t', mcu_flags.count(), '\t', mcu_flags.mean())

cluster	 count	 mean	
0 	 15 	 0.8666666666666667
32 	 17 	 0.8823529411764706
2 	 33 	 0.24242424242424243
20 	 14 	 1.0
7 	 48 	 0.9791666666666666
19 	 28 	 1.0
16 	 10 	 0.0
32 	 17 	 0.8823529411764706
