# scene_clustering
This notebook contains initial code for clustering frames into shots, identifying the A/B/A/B alternation pattern between shots, and using the image classifier model to check whether those shots are MCUs.

In [1]:
import sys
import os
sys.path.append('site-packages') # manually put all packages/libraries into this folder
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras import models
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering

Using TensorFlow backend.


## Clustering
### For POC, designating a specific scene's worth of frames

In [2]:
# Input selection: `film` names the sub-folder / file prefix of the extracted frames and
# `frame_choice` lists the frame numbers of the single scene to analyse.
# Exactly one film/frame_choice pair is active; the commented-out pairs are other scenes
# that were tried. The trailing numbers in their notes appear to be the
# distance_threshold values used for clustering -- TODO confirm.
# film = 'booksmart'
# frame_choice = list(range(1001, 1163)) # good example for Booksmart!!! 6 clusters, 2500 distance_threshold

# film = 'hobbs_shaw'
# frame_choice = list(range(701, 858)) # Hobbs and Shaw, Jason Statham vs. Helen Mirren, 2,500

# film = 'parasite'
# frame_choice = list(range(1666, 1836)) # Parasite, tough because Mrs. Park and Jessica are both right-oc

# film = 'parasite'
# frame_choice = list(range(6687, 6777)) # Parasite, hiding in bushes, eh, 2900

film = 'hustle'
# range() excludes its end value, so this selects frames 761..932.
# NOTE(review): the trailing note on the next line is identical to the Parasite
# "hiding in bushes" entry above and looks copy-pasted -- confirm the scene description
# and threshold for Hustle.
frame_choice = list(range(761, 933)) # Hustle, hiding in bushes, eh, 2900

In [3]:
# Frames for each film live in dialogue_frames/<film>.
dialogue_folder = os.path.join('dialogue_frames', film)

# Report how many frame files exist versus how many were picked for this scene.
n_images_in_folder = len(os.listdir(dialogue_folder))
n_frames_selected = len(frame_choice)
print('There are', n_images_in_folder, 'images in the folder')
print('Selected', n_frames_selected, 'of those frames')

There are 5877 images in the folder
Selected 172 of those frames


In [4]:
# VGG16 with ImageNet weights and no classifier head (include_top=False): used purely
# as a fixed feature extractor for the selected frames.
model = VGG16(weights='imagenet', include_top=False)
model.summary()

# One flattened feature vector per selected frame, in frame_choice order.
vgg16_feature_list = []

for x in frame_choice:
    # Frame files are named <film>_frame<number>.jpg; build the path with os.path.join,
    # the same way dialogue_folder itself is built.
    img_path = os.path.join(dialogue_folder, film + '_frame' + str(x) + '.jpg')
    img = image.load_img(img_path, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)  # add the leading batch axis predict() expects
    img_data = preprocess_input(img_data)

    vgg16_feature = model.predict(img_data)
    vgg16_feature_np = np.array(vgg16_feature)
    # Flatten the convolutional output to a 1-D vector so frames can be clustered.
    vgg16_feature_list.append(vgg16_feature_np.flatten())
    # (The former trailing `x += 1` was removed: the for loop rebinds x on every
    # iteration, so the increment never had any effect.)

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [5]:
# Stack the per-frame feature vectors into one 2-D array (one row per frame) and
# show its shape as the cell output.
vgg16_feature_list_np = np.asarray(vgg16_feature_list)
vgg16_feature_list_np.shape

(172, 25088)

In [6]:
# Hierarchical clustering of the frame features. n_clusters is left as None so the
# number of clusters (shots) is determined by distance_threshold rather than fixed
# up front.
hac = AgglomerativeClustering(n_clusters=None, distance_threshold=3000)
hac.fit(vgg16_feature_list_np)

# labels_ holds one cluster id per frame, in frame_choice order.
print('Number of clusters:', hac.n_clusters_)
print(hac.labels_)

Number of clusters: 10
[9 9 9 8 8 8 8 8 5 5 5 5 5 5 5 5 1 1 1 1 1 8 8 8 8 8 8 8 8 1 1 1 1 8 1 1 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 4 4 4 4 4 4 6 6 6 4 4 6 4 4 4 4 4 4 4 4 6
 6 6 6 6 4 4 4 4 4 4 4 4 4 7 7 7 7 7 7 7 7 7 7 7 6 6 6 6 6 0 0 0 6 6 4 4 6
 6 6 4 4 4 4 4 0 0 0 0 0 1 1 1 6 6 6 0 0 0 0 0 0 4 4 4 4 6 6 4 4 4 4 6 6 6
 6 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 1 1 1 9 9 9]


## Load Saved Model and Identify MCUs

In [7]:
# Load the previously saved classifier from saved_models/tuned_model; it is used
# below to predict an MCU / not-MCU class for each selected frame.
tuned_model = models.load_model('saved_models/tuned_model')

In [8]:
# Reload every selected frame as a 128x128 grayscale array -- the input format fed
# to tuned_model (different from the 224x224 input used for VGG16 above).
image_list = [
    img_to_array(
        load_img(
            dialogue_folder + '/' + film + '_frame' + str(x) + '.jpg',
            target_size=(128, 128),
            color_mode='grayscale',
        )
    )
    for x in frame_choice
]

In [9]:
# Stack the per-frame image arrays into a single batch and predict a class per frame.
image_array = np.array(image_list)
# NOTE(review): predict_classes is not available in every Keras / tf.keras release
# (it appears to have been dropped from later versions) -- confirm against the
# installed version before upgrading the environment.
y_pred = tuned_model.predict_classes(image_array)

In [10]:
# Each entry of y_pred is itself indexable; keep only its first element so that
# y_pred_values is a flat list with one MCU flag per frame.
y_pred_values = [prediction[0] for prediction in y_pred]

# Scene Pattern Algorithm

### Initial try

In [16]:
# First attempt at spotting A/B/A/B alternation between two shots.
#   prev_cluster      - cluster of the previous frame; refreshed on every frame
#   stored_cluster    - cluster that was on screen before the current one;
#                       refreshed only when the cluster changes
#   alternate_counter - how many cuts in a row went back to stored_cluster;
#                       drops back to 1 when a cut goes to any other cluster
# 1000 is a "nothing seen yet" sentinel for both cluster variables.
prev_cluster = stored_cluster = 1000
alternate_counter = 1
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t', 'alternate')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    still_seeding = stored_cluster == 1000
    cluster_changed = cluster != prev_cluster

    # A real cut: score it against the old stored_cluster before that gets replaced.
    if cluster_changed and not still_seeding:
        alternate_counter = alternate_counter + 1 if cluster == stored_cluster else 1

    # While seeding, or on any cut, the previous frame's cluster becomes the stored one.
    if still_seeding or cluster_changed:
        stored_cluster = prev_cluster

    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter)
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored	 alternate
761 	 0 	 9 	 1000 	 1000 	 1
762 	 0 	 9 	 9 	 9 	 1
763 	 0 	 9 	 9 	 9 	 1
764 	 1 	 8 	 9 	 9 	 1
765 	 1 	 8 	 8 	 9 	 1
766 	 1 	 8 	 8 	 9 	 1
767 	 1 	 8 	 8 	 9 	 1
768 	 1 	 8 	 8 	 9 	 1
769 	 0 	 5 	 8 	 8 	 1
770 	 0 	 5 	 5 	 8 	 1
771 	 0 	 5 	 5 	 8 	 1
772 	 0 	 5 	 5 	 8 	 1
773 	 0 	 5 	 5 	 8 	 1
774 	 0 	 5 	 5 	 8 	 1
775 	 0 	 5 	 5 	 8 	 1
776 	 0 	 5 	 5 	 8 	 1
777 	 0 	 1 	 5 	 5 	 1
778 	 0 	 1 	 1 	 5 	 1
779 	 0 	 1 	 1 	 5 	 1
780 	 1 	 1 	 1 	 5 	 1
781 	 1 	 1 	 1 	 5 	 1
782 	 1 	 8 	 1 	 1 	 1
783 	 1 	 8 	 8 	 1 	 1
784 	 1 	 8 	 8 	 1 	 1
785 	 1 	 8 	 8 	 1 	 1
786 	 1 	 8 	 8 	 1 	 1
787 	 1 	 8 	 8 	 1 	 1
788 	 1 	 8 	 8 	 1 	 1
789 	 1 	 8 	 8 	 1 	 1
790 	 0 	 1 	 8 	 8 	 2
791 	 1 	 1 	 1 	 8 	 2
792 	 1 	 1 	 1 	 8 	 2
793 	 1 	 1 	 1 	 8 	 2
794 	 1 	 8 	 1 	 1 	 3
795 	 1 	 1 	 8 	 8 	 4
796 	 1 	 1 	 1 	 8 	 4
797 	 1 	 2 	 1 	 1 	 1
798 	 0 	 2 	 2 	 1 	 1
799 	 0 	 2 	 2 	 1 	 1
800 	 0 	 2 	 2

### Allow for One Extra Storage (for inserts)

In [18]:
# Same alternation tracker, but stored_cluster now remembers the two most recent
# earlier clusters, so a single inserted shot between A and B does not reset the count.
# 1000 / 1001 are placeholder values; as long as 1000 is still in the list the loop
# only seeds the list and does not score cuts.
prev_cluster = 1000
stored_cluster = [1000, 1001]
alternate_counter = 1
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t', 'alternate')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    still_seeding = 1000 in stored_cluster

    if still_seeding or cluster != prev_cluster:
        if not still_seeding:
            # Cut back to a remembered cluster -> alternation continues; otherwise restart.
            alternate_counter = alternate_counter + 1 if cluster in stored_cluster else 1
        # Slide the two-slot window: drop the oldest entry, remember the previous cluster.
        stored_cluster = [stored_cluster[-1], prev_cluster]

    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter)
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored	 alternate
761 	 0 	 9 	 1000 	 [1001, 1000] 	 1
762 	 0 	 9 	 9 	 [1000, 9] 	 1
763 	 0 	 9 	 9 	 [9, 9] 	 1
764 	 1 	 8 	 9 	 [9, 9] 	 1
765 	 1 	 8 	 8 	 [9, 9] 	 1
766 	 1 	 8 	 8 	 [9, 9] 	 1
767 	 1 	 8 	 8 	 [9, 9] 	 1
768 	 1 	 8 	 8 	 [9, 9] 	 1
769 	 0 	 5 	 8 	 [9, 8] 	 1
770 	 0 	 5 	 5 	 [9, 8] 	 1
771 	 0 	 5 	 5 	 [9, 8] 	 1
772 	 0 	 5 	 5 	 [9, 8] 	 1
773 	 0 	 5 	 5 	 [9, 8] 	 1
774 	 0 	 5 	 5 	 [9, 8] 	 1
775 	 0 	 5 	 5 	 [9, 8] 	 1
776 	 0 	 5 	 5 	 [9, 8] 	 1
777 	 0 	 1 	 5 	 [8, 5] 	 1
778 	 0 	 1 	 1 	 [8, 5] 	 1
779 	 0 	 1 	 1 	 [8, 5] 	 1
780 	 1 	 1 	 1 	 [8, 5] 	 1
781 	 1 	 1 	 1 	 [8, 5] 	 1
782 	 1 	 8 	 1 	 [5, 1] 	 2
783 	 1 	 8 	 8 	 [5, 1] 	 2
784 	 1 	 8 	 8 	 [5, 1] 	 2
785 	 1 	 8 	 8 	 [5, 1] 	 2
786 	 1 	 8 	 8 	 [5, 1] 	 2
787 	 1 	 8 	 8 	 [5, 1] 	 2
788 	 1 	 8 	 8 	 [5, 1] 	 2
789 	 1 	 8 	 8 	 [5, 1] 	 2
790 	 0 	 1 	 8 	 [1, 8] 	 3
791 	 1 	 1 	 1 	 [1, 8] 	 3
792 	 1 	 1 	 1 	 [1, 8] 	 3
793 	 1 	 1 	 1 	

### Assign number for every alternation pattern, save lists for dataframe

In [30]:
# Two-slot alternation tracker that also numbers each alternation pattern and records
# the per-frame counters for the dataframe built afterwards.
#   alt_list[i]     - alternate_counter as of frame i
#   pattern_list[i] - id of the pattern frame i belongs to; the id increases each time
#                     a cut goes to a cluster that is not in the two-slot memory
alt_list, pattern_list = [], []
prev_cluster, alternate_counter, pattern = 1000, 1, 0
stored_cluster = [1000, 1001]  # placeholders; scoring starts once 1000 has been pushed out
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t\t', 'alt\t', 'pattern')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    seeding = 1000 in stored_cluster
    changed = cluster != prev_cluster

    # Score a cut against the memory as it was before this frame.
    if changed and not seeding:
        if cluster in stored_cluster:
            alternate_counter += 1
        else:
            alternate_counter = 1
            pattern += 1

    # Seeding frames and cuts both push the previous cluster into the two-slot memory.
    if seeding or changed:
        stored_cluster.append(prev_cluster)
        stored_cluster.pop(0)

    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter, '\t', pattern)
    alt_list.append(alternate_counter)
    pattern_list.append(pattern)
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored		 alt	 pattern
761 	 0 	 9 	 1000 	 [1001, 1000] 	 1 	 0
762 	 0 	 9 	 9 	 [1000, 9] 	 1 	 0
763 	 0 	 9 	 9 	 [9, 9] 	 1 	 0
764 	 1 	 8 	 9 	 [9, 9] 	 1 	 1
765 	 1 	 8 	 8 	 [9, 9] 	 1 	 1
766 	 1 	 8 	 8 	 [9, 9] 	 1 	 1
767 	 1 	 8 	 8 	 [9, 9] 	 1 	 1
768 	 1 	 8 	 8 	 [9, 9] 	 1 	 1
769 	 0 	 5 	 8 	 [9, 8] 	 1 	 2
770 	 0 	 5 	 5 	 [9, 8] 	 1 	 2
771 	 0 	 5 	 5 	 [9, 8] 	 1 	 2
772 	 0 	 5 	 5 	 [9, 8] 	 1 	 2
773 	 0 	 5 	 5 	 [9, 8] 	 1 	 2
774 	 0 	 5 	 5 	 [9, 8] 	 1 	 2
775 	 0 	 5 	 5 	 [9, 8] 	 1 	 2
776 	 0 	 5 	 5 	 [9, 8] 	 1 	 2
777 	 0 	 1 	 5 	 [8, 5] 	 1 	 3
778 	 0 	 1 	 1 	 [8, 5] 	 1 	 3
779 	 0 	 1 	 1 	 [8, 5] 	 1 	 3
780 	 1 	 1 	 1 	 [8, 5] 	 1 	 3
781 	 1 	 1 	 1 	 [8, 5] 	 1 	 3
782 	 1 	 8 	 1 	 [5, 1] 	 2 	 3
783 	 1 	 8 	 8 	 [5, 1] 	 2 	 3
784 	 1 	 8 	 8 	 [5, 1] 	 2 	 3
785 	 1 	 8 	 8 	 [5, 1] 	 2 	 3
786 	 1 	 8 	 8 	 [5, 1] 	 2 	 3
787 	 1 	 8 	 8 	 [5, 1] 	 2 	 3
788 	 1 	 8 	 8 	 [5, 1] 	 2 	 3
789 	 1 	 8 	 8 	

## Populate dataframe

In [42]:
# Raise the display limit so the frame table is shown without truncation.
pd.options.display.max_rows = 200

# One row per selected frame: frame number, shot cluster, MCU flag, and the
# alternation / pattern counters collected by the pattern pass.
scene_columns = ['frame_file', 'cluster', 'mcu', 'alternation', 'pattern']
scene_rows = zip(frame_choice, hac.labels_, y_pred_values, alt_list, pattern_list)
scene_df = pd.DataFrame(scene_rows, columns=scene_columns)
scene_df.head(200)

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
0,761,9,0,1,0
1,762,9,0,1,0
2,763,9,0,1,0
3,764,8,1,1,1
4,765,8,1,1,1
5,766,8,1,1,1
6,767,8,1,1,1
7,768,8,1,1,1
8,769,5,0,1,2
9,770,5,0,1,2


In [None]:
'''
The first pattern with more than 3 alternations is pattern 3; it starts with cluster 1, which is NOT an MCU.
The first MCU cluster in pattern 3 is cluster 8.
The final pattern with more than 3 alternations is pattern 14; it ends with cluster 4, which is an MCU.
'''

In [38]:
# Per-cluster summary: cluster id, number of frames in the cluster, and the share of
# those frames flagged as MCU (mean of the 0/1 mcu column).
# Iterates over the clusters actually present in scene_df instead of a hard-coded
# range(0, 10), so the summary stays correct when a different film or
# distance_threshold yields a different number of clusters.
for cluster_id, mcu_flags in scene_df.groupby('cluster')['mcu']:
    print(cluster_id, '\t', mcu_flags.count(), '\t', mcu_flags.mean())

0 	 16 	 0.0
1 	 17 	 0.4117647058823529
2 	 15 	 0.06666666666666667
3 	 9 	 0.0
4 	 48 	 0.9791666666666666
5 	 8 	 0.0
6 	 28 	 1.0
7 	 11 	 0.0
8 	 14 	 1.0
9 	 6 	 0.0


In [None]:
# various pandas lookups for debugging
# pd.options.display.max_rows=200
# scene_df.loc[scene_df['cluster'] == 2]
# scene_df.loc[scene_df['mcu'] == 1]
# scene_df.loc[scene_df['frame_file'] == 750]