# scene_clustering
This notebook contains initial code for clustering frames into shots, identifying the A/B/A/B pattern, and using the image classifier model to see if they're MCUs

In [1]:
import sys
import os
sys.path.append('site-packages') # manually put all packages/libraries into this folder
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras import models
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering

Using TensorFlow backend.


## Clustering
### For POC, designating a specific scene's worth of frames

In [19]:
# Scene selection for the proof of concept: pick one film and the frame
# numbers of a single scene. Earlier experiments are kept, commented out,
# together with the clustering settings noted for them.

# film = 'booksmart'
# frame_choice = list(range(1001, 1163)) # good example for Booksmart!!! 6 clusters, 2500 distance_threshold

# film = 'hobbs_shaw'
# frame_choice = list(range(701, 858)) # Hobbs and Shaw, Jason Statham vs. Helen Mirren, 2,500

# film = 'parasite'
# frame_choice = list(range(1666, 1836)) # Parasite, tough because Mrs. Park and Jessica are both right-oc

# film = 'parasite'
# frame_choice = list(range(6687, 6777)) # Parasite, hiding in bushes, eh, 2900

# Active selection.
# NOTE(review): the original trailing note here read "hiding in bushes, eh, 2900",
# identical to the Parasite entry above -- possibly a copy-paste leftover; confirm.
film = 'hustle'
frame_choice = [*range(761, 969)]  # frames 761..968 inclusive

In [20]:
# Each film's extracted frames live in their own sub-folder of dialogue_frames/
dialogue_folder = os.path.join('dialogue_frames', film)

# Report how many frames exist on disk versus how many were picked above
image_count = len(os.listdir(dialogue_folder))
selected_count = len(frame_choice)
print('There are', image_count, 'images in the folder')
print('Selected', selected_count, 'of those frames')

There are 5877 images in the folder
Selected 208 of those frames


In [21]:
# VGG16 with ImageNet weights and without the classifier head
# (include_top=False): used here purely as a fixed feature extractor.
model = VGG16(weights='imagenet', include_top=False)
model.summary()

# One flattened feature vector per selected frame, in frame_choice order.
vgg16_feature_list = []

for x in frame_choice:
    # Frame files are named <film>_frame<number>.jpg inside the film's folder.
    img_path = os.path.join(dialogue_folder, film + '_frame' + str(x) + '.jpg')
    img = image.load_img(img_path, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)  # add the batch axis predict() expects
    img_data = preprocess_input(img_data)

    vgg16_feature = model.predict(img_data)
    # Flatten the convolutional output into a single 1-D vector per frame.
    vgg16_feature_list.append(np.asarray(vgg16_feature).flatten())
    # (The former trailing `x += 1` was removed: rebinding the loop variable
    # has no effect on which frame the for-loop visits next.)

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [22]:
# Stack the per-frame feature vectors into one 2-D array (one row per frame)
# and display its shape as a sanity check.
vgg16_feature_list_np = np.asarray(vgg16_feature_list)
vgg16_feature_list_np.shape

(208, 25088)

In [23]:
# Hierarchical clustering of the frame features. With n_clusters=None the
# number of clusters is determined by the distance threshold instead.
hac = AgglomerativeClustering(distance_threshold=3000, n_clusters=None)
hac.fit(vgg16_feature_list_np)
print('Number of clusters:', hac.n_clusters_)
print(hac.labels_)

Number of clusters: 12
[ 9  9  9 11 11 11 11 11  7  7  7  7  7  7  7  7  0  0  0  0  0 11 11 11
 11 11 11 11 11  0  0  0  0 11  0  0  3  3  3  3  3  3  3  3  3  3  3  3
  3  3  3  1  1  2  2  2  2  2  2 10 10 10  2  2 10  2  2  2  2  2  2  2
  2 10 10 10 10 10  2  2  2  2  2  2  2  2  2  8  8  8  8  8  8  8  8  8
  8  8 10 10 10 10 10  1  1  1 10 10  2  2 10 10 10  2  2  2  2  2  1  1
  1  1  1  0  0  0 10 10 10  1  1  1  1  1  1  2  2  2  2 10 10  2  2  2
  2 10 10 10 10  2  2  2  2  2  2  2  2  5  5  5  5  5  5  5  5  5  0  0
  0  9  9  9  9  6  6  6  6  0  0  0  0  4  4  4  4  0  0  0  0  0  0  0
  0  0  0  0  0  2  2  2  2  2  2  2  0  0  0  0]


## Load Saved Model and Identify MCUs

In [24]:
# Load the previously saved classifier from saved_models/tuned_model; it is
# used below to flag each frame as MCU / not MCU.
tuned_model = models.load_model('saved_models/tuned_model')

In [25]:
# Load every selected frame as a 128x128 grayscale array for the classifier.
image_list = [
    img_to_array(
        load_img(
            dialogue_folder + '/' + film + '_frame'+ str(x) + '.jpg',
            target_size=(128, 128),
            color_mode='grayscale',
        )
    )
    for x in frame_choice
]

In [26]:
# Batch all frames and classify them in one call.
image_array = np.array(image_list)

# `Sequential.predict_classes` was deprecated and later removed from Keras /
# TensorFlow, so reproduce its documented behaviour with `predict`:
#   * more than one output unit -> index of the highest score
#   * single output unit        -> 1 when the score exceeds 0.5, else 0
class_scores = tuned_model.predict(image_array)
if class_scores.shape[-1] > 1:
    y_pred = class_scores.argmax(axis=-1)
else:
    y_pred = (class_scores > 0.5).astype('int32')

In [27]:
# Each prediction is a one-element row; keep just the class value per frame.
y_pred_values = [prediction[0] for prediction in y_pred]

# Scene Pattern Algorithm

### Initial try

In [28]:
# First attempt at detecting A/B/A/B alternation between clusters.
#
# prev_cluster changes every single frame
# stored_cluster changes only on cluster change
# alternate_counter changes only on cluster change

# 1000 is the placeholder start value for both trackers.
prev_cluster = 1000
stored_cluster = 1000
alternate_counter = 1
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t', 'alternate')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    #print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter)
    if stored_cluster == 1000:
        # Start-up: while the placeholder is still stored, just copy the
        # previous frame's cluster (itself still 1000 on the very first frame).
        stored_cluster = prev_cluster
    elif cluster != prev_cluster:
        # Cluster changed: it counts as an alternation only when the new
        # cluster equals the one remembered from before; otherwise restart at 1.
        if cluster == stored_cluster:
            alternate_counter += 1
        else:
            alternate_counter = 1
        # Remember the cluster we just left.
        stored_cluster = prev_cluster
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter)
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored	 alternate
761 	 0 	 9 	 1000 	 1000 	 1
762 	 0 	 9 	 9 	 9 	 1
763 	 0 	 9 	 9 	 9 	 1
764 	 1 	 11 	 9 	 9 	 1
765 	 1 	 11 	 11 	 9 	 1
766 	 1 	 11 	 11 	 9 	 1
767 	 1 	 11 	 11 	 9 	 1
768 	 1 	 11 	 11 	 9 	 1
769 	 0 	 7 	 11 	 11 	 1
770 	 0 	 7 	 7 	 11 	 1
771 	 0 	 7 	 7 	 11 	 1
772 	 0 	 7 	 7 	 11 	 1
773 	 0 	 7 	 7 	 11 	 1
774 	 0 	 7 	 7 	 11 	 1
775 	 0 	 7 	 7 	 11 	 1
776 	 0 	 7 	 7 	 11 	 1
777 	 0 	 0 	 7 	 7 	 1
778 	 0 	 0 	 0 	 7 	 1
779 	 0 	 0 	 0 	 7 	 1
780 	 1 	 0 	 0 	 7 	 1
781 	 1 	 0 	 0 	 7 	 1
782 	 1 	 11 	 0 	 0 	 1
783 	 1 	 11 	 11 	 0 	 1
784 	 1 	 11 	 11 	 0 	 1
785 	 1 	 11 	 11 	 0 	 1
786 	 1 	 11 	 11 	 0 	 1
787 	 1 	 11 	 11 	 0 	 1
788 	 1 	 11 	 11 	 0 	 1
789 	 1 	 11 	 11 	 0 	 1
790 	 0 	 0 	 11 	 11 	 2
791 	 1 	 0 	 0 	 11 	 2
792 	 1 	 0 	 0 	 11 	 2
793 	 1 	 0 	 0 	 11 	 2
794 	 1 	 11 	 0 	 0 	 3
795 	 1 	 0 	 11 	 11 	 4
796 	 1 	 0 	 0 	 11 	 4
797 	 1 	 3 	 0 	 0 	 1
798 	 0 	 3 	 3 	 0 	

### Allow for One Extra Storage (for inserts)

In [29]:
# Same idea as the first attempt, but remembering the last TWO clusters that
# were left, so a single inserted shot does not reset the alternation count.
# 1000 / 1001 are placeholder start values.
prev_cluster = 1000
stored_cluster = [1000, 1001]
alternate_counter = 1
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t', 'alternate')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    # The counter is only touched once the 1000 placeholder has been pushed
    # out of the memory AND the cluster actually changed.
    if 1000 not in stored_cluster and cluster != prev_cluster:
        alternate_counter = alternate_counter + 1 if cluster in stored_cluster else 1
    # The memory slides (oldest out, previous frame's cluster in) during
    # start-up and on every cluster change.
    if 1000 in stored_cluster or cluster != prev_cluster:
        stored_cluster = stored_cluster[1:] + [prev_cluster]
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter)
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored	 alternate
761 	 0 	 9 	 1000 	 [1001, 1000] 	 1
762 	 0 	 9 	 9 	 [1000, 9] 	 1
763 	 0 	 9 	 9 	 [9, 9] 	 1
764 	 1 	 11 	 9 	 [9, 9] 	 1
765 	 1 	 11 	 11 	 [9, 9] 	 1
766 	 1 	 11 	 11 	 [9, 9] 	 1
767 	 1 	 11 	 11 	 [9, 9] 	 1
768 	 1 	 11 	 11 	 [9, 9] 	 1
769 	 0 	 7 	 11 	 [9, 11] 	 1
770 	 0 	 7 	 7 	 [9, 11] 	 1
771 	 0 	 7 	 7 	 [9, 11] 	 1
772 	 0 	 7 	 7 	 [9, 11] 	 1
773 	 0 	 7 	 7 	 [9, 11] 	 1
774 	 0 	 7 	 7 	 [9, 11] 	 1
775 	 0 	 7 	 7 	 [9, 11] 	 1
776 	 0 	 7 	 7 	 [9, 11] 	 1
777 	 0 	 0 	 7 	 [11, 7] 	 1
778 	 0 	 0 	 0 	 [11, 7] 	 1
779 	 0 	 0 	 0 	 [11, 7] 	 1
780 	 1 	 0 	 0 	 [11, 7] 	 1
781 	 1 	 0 	 0 	 [11, 7] 	 1
782 	 1 	 11 	 0 	 [7, 0] 	 2
783 	 1 	 11 	 11 	 [7, 0] 	 2
784 	 1 	 11 	 11 	 [7, 0] 	 2
785 	 1 	 11 	 11 	 [7, 0] 	 2
786 	 1 	 11 	 11 	 [7, 0] 	 2
787 	 1 	 11 	 11 	 [7, 0] 	 2
788 	 1 	 11 	 11 	 [7, 0] 	 2
789 	 1 	 11 	 11 	 [7, 0] 	 2
790 	 0 	 0 	 11 	 [0, 11] 	 3
791 	 1 	 0 	 0 	 [0, 11] 	 3
792 	

### Assign number for every alternation pattern, save lists for dataframe

In [30]:
# Two-slot alternation detection, additionally numbering each alternation
# "pattern" and recording the per-frame results in alt_list / pattern_list.
# 1000 / 1001 are placeholder start values.
prev_cluster = 1000
stored_cluster = [1000, 1001]
alternate_counter = 1
alt_list = []       # alternate_counter value for every frame
pattern_list = []   # pattern id for every frame
pattern = 0
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t\t', 'alt\t', 'pattern')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    if 1000 in stored_cluster:
        # Start-up: keep sliding the two-slot memory until the 1000
        # placeholder has been pushed out; counters are not touched.
        stored_cluster.append(prev_cluster)
        del stored_cluster[0]
    elif cluster != prev_cluster:
        if cluster in stored_cluster:
            # Returned to one of the two remembered clusters: alternation continues.
            alternate_counter += 1
        else:
            # Unseen cluster: restart the count and begin a new pattern id.
            alternate_counter = 1
            pattern += 1
        # Slide the memory: drop the oldest entry, add the cluster just left.
        stored_cluster.append(prev_cluster)
        del stored_cluster[0]
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter, '\t', pattern)
    prev_cluster = cluster
    alt_list.append(alternate_counter)
    pattern_list.append(pattern)
    

frame	 mcu	 clust	 prev	 stored		 alt	 pattern
761 	 0 	 9 	 1000 	 [1001, 1000] 	 1 	 0
762 	 0 	 9 	 9 	 [1000, 9] 	 1 	 0
763 	 0 	 9 	 9 	 [9, 9] 	 1 	 0
764 	 1 	 11 	 9 	 [9, 9] 	 1 	 1
765 	 1 	 11 	 11 	 [9, 9] 	 1 	 1
766 	 1 	 11 	 11 	 [9, 9] 	 1 	 1
767 	 1 	 11 	 11 	 [9, 9] 	 1 	 1
768 	 1 	 11 	 11 	 [9, 9] 	 1 	 1
769 	 0 	 7 	 11 	 [9, 11] 	 1 	 2
770 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
771 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
772 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
773 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
774 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
775 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
776 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
777 	 0 	 0 	 7 	 [11, 7] 	 1 	 3
778 	 0 	 0 	 0 	 [11, 7] 	 1 	 3
779 	 0 	 0 	 0 	 [11, 7] 	 1 	 3
780 	 1 	 0 	 0 	 [11, 7] 	 1 	 3
781 	 1 	 0 	 0 	 [11, 7] 	 1 	 3
782 	 1 	 11 	 0 	 [7, 0] 	 2 	 3
783 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
784 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
785 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
786 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
787 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
788 	 1 	 11 	 1

## Populate dataframe for continued analysis

In [32]:
# One row per frame: frame number, cluster label, MCU flag, alternation
# count and pattern id.
scene_df = pd.DataFrame(
    list(zip(frame_choice, hac.labels_, y_pred_values, alt_list, pattern_list)),
    columns=['frame_file', 'cluster', 'mcu', 'alternation', 'pattern'],
)
# Raise the display limit so up to 210 rows are rendered in full.
pd.set_option('display.max_rows', 210)
scene_df.head(210)

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
0,761,9,0,1,0
1,762,9,0,1,0
2,763,9,0,1,0
3,764,11,1,1,1
4,765,11,1,1,1
5,766,11,1,1,1
6,767,11,1,1,1
7,768,11,1,1,1
8,769,7,0,1,2
9,770,7,0,1,2


In [None]:
'''
first pattern with more than 3 alternations is pattern 3, starting with cluster 1 which is NOT an MCU
first MCU in pattern 3 is cluster 8, which is MCU
final pattern with more than 3 alternations is pattern 14, ending with cluster 4, which is an MCU
'''

In [34]:
# Per-cluster frame count and mean MCU flag (i.e. the share of frames in the
# cluster that the classifier marked as MCU).
# The clusters are taken from the data itself rather than a hard-coded
# range(0, 12), so the cell stays correct when a different film or distance
# threshold yields a different number of clusters. groupby iterates the
# cluster ids in ascending order.
print('cluster\t', 'count\t', 'mean\t')
for cluster_id, mcu_flags in scene_df.groupby('cluster')['mcu']:
    print(cluster_id, '\t', mcu_flags.count(), '\t', mcu_flags.mean())

cluster	 count	 mean	
0 	 37 	 0.24324324324324326
1 	 16 	 0.0
2 	 55 	 0.9818181818181818
3 	 15 	 0.06666666666666667
4 	 4 	 0.0
5 	 9 	 0.0
6 	 4 	 0.0
7 	 8 	 0.0
8 	 11 	 0.0
9 	 7 	 0.0
10 	 28 	 1.0
11 	 14 	 1.0


In [None]:
# various pandas lookups for debugging
# pd.options.display.max_rows=200
# scene_df.loc[scene_df['cluster'] == 2]
# scene_df.loc[scene_df['mcu'] == 1]
# scene_df.loc[scene_df['frame_file'] == 750]

In [38]:
# Ids of the patterns that contain at least one frame with an alternation count above 2
scene_df.loc[scene_df['alternation'] > 2].pattern.unique()


array([ 3,  7, 10, 14])

In [39]:
# Keep the ids of patterns whose alternation count exceeds 2 at some frame
patterns = list(scene_df.loc[scene_df['alternation'] > 2, 'pattern'].unique())

In [36]:
# Spot check: show every frame assigned to one specific pattern (id 4 is hard-coded)
scene_df.loc[scene_df['pattern'] == 4]


Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
36,797,3,1,1,4
37,798,3,0,1,4
38,799,3,0,1,4
39,800,3,0,1,4
40,801,3,0,1,4
41,802,3,0,1,4
42,803,3,0,1,4
43,804,3,0,1,4
44,805,3,0,1,4
45,806,3,0,1,4


In [40]:
# All frames that belong to any of the pattern ids collected in `patterns`
scene_df[scene_df.pattern.isin(patterns)]

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
16,777,0,0,1,3
17,778,0,0,1,3
18,779,0,0,1,3
19,780,0,1,1,3
20,781,0,1,1,3
21,782,11,1,2,3
22,783,11,1,2,3
23,784,11,1,2,3
24,785,11,1,2,3
25,786,11,1,2,3


In [41]:
# Re-print the per-frame cluster labels for reference while reworking the algorithm
print(hac.labels_)

[ 9  9  9 11 11 11 11 11  7  7  7  7  7  7  7  7  0  0  0  0  0 11 11 11
 11 11 11 11 11  0  0  0  0 11  0  0  3  3  3  3  3  3  3  3  3  3  3  3
  3  3  3  1  1  2  2  2  2  2  2 10 10 10  2  2 10  2  2  2  2  2  2  2
  2 10 10 10 10 10  2  2  2  2  2  2  2  2  2  8  8  8  8  8  8  8  8  8
  8  8 10 10 10 10 10  1  1  1 10 10  2  2 10 10 10  2  2  2  2  2  1  1
  1  1  1  0  0  0 10 10 10  1  1  1  1  1  1  2  2  2  2 10 10  2  2  2
  2 10 10 10 10  2  2  2  2  2  2  2  2  5  5  5  5  5  5  5  5  5  0  0
  0  9  9  9  9  6  6  6  6  0  0  0  0  4  4  4  4  0  0  0  0  0  0  0
  0  0  0  0  0  2  2  2  2  2  2  2  0  0  0  0]


In [None]:
# currently alternation canceled if 2->10->2->8
# currently alternation applied during loop, but should be applied after the fact
# currently pattern applied during loop, but should be applied after the fact

# create dataframe for file, cluster, mcu, storage, then add alternation, then add pattern

In [67]:
# Scratch list.
# NOTE(review): not referenced by any other cell visible here -- candidate for removal.
storage = []

In [69]:
# Display the scratch list (empty in the recorded output)
storage

[]

In [118]:
# Track, frame by frame, the cluster of the previous frame and the clusters
# of the two most recently left shots. 1000 / 1001 / 2000 are placeholder
# start values.
prev_shot_1, prev_shot_2 = 1000, 1001
prev_frame = 2000
same_shot = 0
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'prev_1\t', 'prev2\t')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    if prev_frame == 2000:
        # Very first frame: seed the most recent shot with its cluster.
        prev_shot_1 = cluster
    elif cluster != prev_frame:
        # Cluster changed: shift the history by one shot.
        prev_shot_2, prev_shot_1 = prev_shot_1, prev_frame
    # Nothing to do while the cluster stays the same.
    prev_frame = cluster
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', prev_shot_1, '\t', prev_shot_2,'\tend')


frame	 mcu	 clust	 prvfrm	 prev_1	 prev2	
761 	 0 	 9 	 9 	 9 	 1001 	end
762 	 0 	 9 	 9 	 9 	 1001 	end
763 	 0 	 9 	 9 	 9 	 1001 	end
764 	 1 	 11 	 11 	 9 	 9 	end
765 	 1 	 11 	 11 	 9 	 9 	end
766 	 1 	 11 	 11 	 9 	 9 	end
767 	 1 	 11 	 11 	 9 	 9 	end
768 	 1 	 11 	 11 	 9 	 9 	end
769 	 0 	 7 	 7 	 11 	 9 	end
770 	 0 	 7 	 7 	 11 	 9 	end
771 	 0 	 7 	 7 	 11 	 9 	end
772 	 0 	 7 	 7 	 11 	 9 	end
773 	 0 	 7 	 7 	 11 	 9 	end
774 	 0 	 7 	 7 	 11 	 9 	end
775 	 0 	 7 	 7 	 11 	 9 	end
776 	 0 	 7 	 7 	 11 	 9 	end
777 	 0 	 0 	 0 	 7 	 11 	end
778 	 0 	 0 	 0 	 7 	 11 	end
779 	 0 	 0 	 0 	 7 	 11 	end
780 	 1 	 0 	 0 	 7 	 11 	end
781 	 1 	 0 	 0 	 7 	 11 	end
782 	 1 	 11 	 11 	 0 	 7 	end
783 	 1 	 11 	 11 	 0 	 7 	end
784 	 1 	 11 	 11 	 0 	 7 	end
785 	 1 	 11 	 11 	 0 	 7 	end
786 	 1 	 11 	 11 	 0 	 7 	end
787 	 1 	 11 	 11 	 0 	 7 	end
788 	 1 	 11 	 11 	 0 	 7 	end
789 	 1 	 11 	 11 	 0 	 7 	end
790 	 0 	 0 	 0 	 11 	 0 	end
791 	 1 	 0 	 0 	 11 	 0 	end
792 	 1 	

In [125]:
# Same shot-history tracking as the previous cell, but also recording the
# state after every frame so later cells can replay it.
#
# This cell had been corrupted by fragments of the neighbouring cells pasted
# into the middle of its loop (leaving an orphan `elif`, so it could not run).
# It is restored here to a single, runnable loop.
#
# Each list starts with its placeholder value, so entry i holds the state
# *before* frame i was processed, and every list ends up one element longer
# than frame_choice.
prev_shot_1 = 1000
prev_shot_1_list = [1000]
prev_shot_2 = 1001
prev_shot_2_list = [1001]
prev_frame = 2000
prev_frame_list = [2000]
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'prev_1\t', 'prev2\t')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    if prev_frame == 2000:
        # Very first frame: seed the most recent shot with its cluster.
        prev_shot_1 = cluster
    elif cluster != prev_frame:
        # Cluster changed: shift the shot history by one.
        prev_shot_2 = prev_shot_1
        prev_shot_1 = prev_frame
    prev_frame = cluster
    prev_shot_1_list.append(prev_shot_1)
    prev_shot_2_list.append(prev_shot_2)
    prev_frame_list.append(prev_frame)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', prev_shot_1, '\t', prev_shot_2,'\tend')


frame	 mcu	 clust	 prvfrm	 prev_1	 prev2	
761 	 0 	 9 	 9 	 9 	 1001 	end
762 	 0 	 9 	 9 	 9 	 1001 	end
763 	 0 	 9 	 9 	 9 	 1001 	end
764 	 1 	 11 	 11 	 9 	 9 	end
765 	 1 	 11 	 11 	 9 	 9 	end
766 	 1 	 11 	 11 	 9 	 9 	end
767 	 1 	 11 	 11 	 9 	 9 	end
768 	 1 	 11 	 11 	 9 	 9 	end
769 	 0 	 7 	 7 	 11 	 9 	end
770 	 0 	 7 	 7 	 11 	 9 	end
771 	 0 	 7 	 7 	 11 	 9 	end
772 	 0 	 7 	 7 	 11 	 9 	end
773 	 0 	 7 	 7 	 11 	 9 	end
774 	 0 	 7 	 7 	 11 	 9 	end
775 	 0 	 7 	 7 	 11 	 9 	end
776 	 0 	 7 	 7 	 11 	 9 	end
777 	 0 	 0 	 0 	 7 	 11 	end
778 	 0 	 0 	 0 	 7 	 11 	end
779 	 0 	 0 	 0 	 7 	 11 	end
780 	 1 	 0 	 0 	 7 	 11 	end
781 	 1 	 0 	 0 	 7 	 11 	end
782 	 1 	 11 	 11 	 0 	 7 	end
783 	 1 	 11 	 11 	 0 	 7 	end
784 	 1 	 11 	 11 	 0 	 7 	end
785 	 1 	 11 	 11 	 0 	 7 	end
786 	 1 	 11 	 11 	 0 	 7 	end
787 	 1 	 11 	 11 	 0 	 7 	end
788 	 1 	 11 	 11 	 0 	 7 	end
789 	 1 	 11 	 11 	 0 	 7 	end
790 	 0 	 0 	 0 	 11 	 0 	end
791 	 1 	 0 	 0 	 11 	 0 	end
792 	 1 	

In [126]:
# Print the recorded history side by side, one line per frame
# (zip stops at the shortest input, so the extra trailing list entry is ignored).
for row in zip(frame_choice, y_pred_values, hac.labels_, prev_frame_list, prev_shot_1_list, prev_shot_2_list):
    print(*row)

761 0 9 2000 1000 1001
762 0 9 9 9 1001
763 0 9 9 9 1001
764 1 11 9 9 1001
765 1 11 11 9 9
766 1 11 11 9 9
767 1 11 11 9 9
768 1 11 11 9 9
769 0 7 11 9 9
770 0 7 7 11 9
771 0 7 7 11 9
772 0 7 7 11 9
773 0 7 7 11 9
774 0 7 7 11 9
775 0 7 7 11 9
776 0 7 7 11 9
777 0 0 7 11 9
778 0 0 0 7 11
779 0 0 0 7 11
780 1 0 0 7 11
781 1 0 0 7 11
782 1 11 0 7 11
783 1 11 11 0 7
784 1 11 11 0 7
785 1 11 11 0 7
786 1 11 11 0 7
787 1 11 11 0 7
788 1 11 11 0 7
789 1 11 11 0 7
790 0 0 11 0 7
791 1 0 0 11 0
792 1 0 0 11 0
793 1 0 0 11 0
794 1 11 0 11 0
795 1 0 11 0 11
796 1 0 0 11 0
797 1 3 0 11 0
798 0 3 3 0 11
799 0 3 3 0 11
800 0 3 3 0 11
801 0 3 3 0 11
802 0 3 3 0 11
803 0 3 3 0 11
804 0 3 3 0 11
805 0 3 3 0 11
806 0 3 3 0 11
807 0 3 3 0 11
808 0 3 3 0 11
809 0 3 3 0 11
810 0 3 3 0 11
811 0 3 3 0 11
812 0 1 3 0 11
813 0 1 1 3 0
814 1 2 1 3 0
815 1 2 2 1 3
816 1 2 2 1 3
817 1 2 2 1 3
818 1 2 2 1 3
819 1 2 2 1 3
820 1 10 2 1 3
821 1 10 10 2 1
822 1 10 10 2 1
823 1 2 10 2 1
824 1 2 2 10 2
825 1 10 2 10 2


In [152]:
# Flag, per frame, whether the most recent cluster change broke the
# alternation (new cluster is neither of the two remembered shots).
alt_break = 0
# Seeded with one entry, so after the loop this list holds one more element
# than the number of iterations.
alt_break_list = [0]
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'prev_1\t', 'prev2\t', 'altbreak\t')

# NOTE: the loop targets prev_frame / prev_shot_1 / prev_shot_2 rebind the
# notebook-level variables of the same names set by the earlier tracking cells.
for frame_file, mcu_flag, cluster, prev_frame, prev_shot_1, prev_shot_2 in zip(frame_choice, y_pred_values, hac.labels_, prev_frame_list, prev_shot_1_list, prev_shot_2_list):
    # The flag is only re-evaluated when the cluster changes; between changes
    # it keeps its last value.
    if cluster != prev_frame:
        if cluster not in [prev_shot_1, prev_shot_2]:
            alt_break = 1
        else:
            alt_break = 0
    alt_break_list.append(alt_break)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', prev_shot_1, '\t', prev_shot_2, '\t', alt_break)


frame	 mcu	 clust	 prvfrm	 prev_1	 prev2	 altbreak	
761 	 0 	 9 	 2000 	 1000 	 1001 	 1
762 	 0 	 9 	 9 	 9 	 1001 	 1
763 	 0 	 9 	 9 	 9 	 1001 	 1
764 	 1 	 11 	 9 	 9 	 1001 	 1
765 	 1 	 11 	 11 	 9 	 9 	 1
766 	 1 	 11 	 11 	 9 	 9 	 1
767 	 1 	 11 	 11 	 9 	 9 	 1
768 	 1 	 11 	 11 	 9 	 9 	 1
769 	 0 	 7 	 11 	 9 	 9 	 1
770 	 0 	 7 	 7 	 11 	 9 	 1
771 	 0 	 7 	 7 	 11 	 9 	 1
772 	 0 	 7 	 7 	 11 	 9 	 1
773 	 0 	 7 	 7 	 11 	 9 	 1
774 	 0 	 7 	 7 	 11 	 9 	 1
775 	 0 	 7 	 7 	 11 	 9 	 1
776 	 0 	 7 	 7 	 11 	 9 	 1
777 	 0 	 0 	 7 	 11 	 9 	 1
778 	 0 	 0 	 0 	 7 	 11 	 1
779 	 0 	 0 	 0 	 7 	 11 	 1
780 	 1 	 0 	 0 	 7 	 11 	 1
781 	 1 	 0 	 0 	 7 	 11 	 1
782 	 1 	 11 	 0 	 7 	 11 	 0
783 	 1 	 11 	 11 	 0 	 7 	 0
784 	 1 	 11 	 11 	 0 	 7 	 0
785 	 1 	 11 	 11 	 0 	 7 	 0
786 	 1 	 11 	 11 	 0 	 7 	 0
787 	 1 	 11 	 11 	 0 	 7 	 0
788 	 1 	 11 	 11 	 0 	 7 	 0
789 	 1 	 11 	 11 	 0 	 7 	 0
790 	 0 	 0 	 11 	 0 	 7 	 0
791 	 1 	 0 	 0 	 11 	 0 	 0
792 	 1 	 0 	 0 	 11 	

In [135]:
# Length check on the alt_break history
len(alt_break_list)

209

In [154]:
# Count alternations using the recorded history: every cluster change adds
# one, unless the change breaks the pattern while the incoming alt_break
# flag is already set, in which case the count restarts at 0.
alt_counter = 0
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'prev_1\t', 'prev2\t', 'altbrk\t', 'altcount\t')

for frame_file, mcu_flag, cluster, prev_frame, prev_shot_1, prev_shot_2, alt_break in zip(frame_choice, y_pred_values, hac.labels_, prev_frame_list, prev_shot_1_list, prev_shot_2_list, alt_break_list):
    if cluster != prev_frame:
        if alt_break == 1 and cluster not in (prev_shot_1, prev_shot_2):
            alt_counter = 0
        else:
            alt_counter += 1
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', prev_shot_1, '\t', prev_shot_2, '\t', alt_break, '\t', alt_counter)


frame	 mcu	 clust	 prvfrm	 prev_1	 prev2	 altbrk	 altcount	
761 	 0 	 9 	 2000 	 1000 	 1001 	 0 	 1
762 	 0 	 9 	 9 	 9 	 1001 	 1 	 1
763 	 0 	 9 	 9 	 9 	 1001 	 1 	 1
764 	 1 	 11 	 9 	 9 	 1001 	 1 	 0
765 	 1 	 11 	 11 	 9 	 9 	 1 	 0
766 	 1 	 11 	 11 	 9 	 9 	 1 	 0
767 	 1 	 11 	 11 	 9 	 9 	 1 	 0
768 	 1 	 11 	 11 	 9 	 9 	 1 	 0
769 	 0 	 7 	 11 	 9 	 9 	 1 	 0
770 	 0 	 7 	 7 	 11 	 9 	 1 	 0
771 	 0 	 7 	 7 	 11 	 9 	 1 	 0
772 	 0 	 7 	 7 	 11 	 9 	 1 	 0
773 	 0 	 7 	 7 	 11 	 9 	 1 	 0
774 	 0 	 7 	 7 	 11 	 9 	 1 	 0
775 	 0 	 7 	 7 	 11 	 9 	 1 	 0
776 	 0 	 7 	 7 	 11 	 9 	 1 	 0
777 	 0 	 0 	 7 	 11 	 9 	 1 	 0
778 	 0 	 0 	 0 	 7 	 11 	 1 	 0
779 	 0 	 0 	 0 	 7 	 11 	 1 	 0
780 	 1 	 0 	 0 	 7 	 11 	 1 	 0
781 	 1 	 0 	 0 	 7 	 11 	 1 	 0
782 	 1 	 11 	 0 	 7 	 11 	 1 	 1
783 	 1 	 11 	 11 	 0 	 7 	 0 	 1
784 	 1 	 11 	 11 	 0 	 7 	 0 	 1
785 	 1 	 11 	 11 	 0 	 7 	 0 	 1
786 	 1 	 11 	 11 	 0 	 7 	 0 	 1
787 	 1 	 11 	 11 	 0 	 7 	 0 	 1
788 	 1 	 11 	 11 	 0 	 

# Transition to Shot ID

In [186]:
# Give every run of consecutive frames with the same cluster its own shot id.
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'shotid\t')
shot_id, shot_id_list = 0, []
prev_frame = 1000  # placeholder until the first frame has been seen

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    # A new shot starts whenever the cluster differs from the previous
    # frame's cluster -- except on the very first frame.
    if not (prev_frame == 1000 or cluster == prev_frame):
        shot_id += 1
    shot_id_list.append(shot_id)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', shot_id,'\tend')
    prev_frame = cluster

# return shot_id_list only

frame	 mcu	 clust	 prvfrm	 shotid	
761 	 0 	 9 	 1000 	 0 	end
762 	 0 	 9 	 9 	 0 	end
763 	 0 	 9 	 9 	 0 	end
764 	 1 	 11 	 9 	 1 	end
765 	 1 	 11 	 11 	 1 	end
766 	 1 	 11 	 11 	 1 	end
767 	 1 	 11 	 11 	 1 	end
768 	 1 	 11 	 11 	 1 	end
769 	 0 	 7 	 11 	 2 	end
770 	 0 	 7 	 7 	 2 	end
771 	 0 	 7 	 7 	 2 	end
772 	 0 	 7 	 7 	 2 	end
773 	 0 	 7 	 7 	 2 	end
774 	 0 	 7 	 7 	 2 	end
775 	 0 	 7 	 7 	 2 	end
776 	 0 	 7 	 7 	 2 	end
777 	 0 	 0 	 7 	 3 	end
778 	 0 	 0 	 0 	 3 	end
779 	 0 	 0 	 0 	 3 	end
780 	 1 	 0 	 0 	 3 	end
781 	 1 	 0 	 0 	 3 	end
782 	 1 	 11 	 0 	 4 	end
783 	 1 	 11 	 11 	 4 	end
784 	 1 	 11 	 11 	 4 	end
785 	 1 	 11 	 11 	 4 	end
786 	 1 	 11 	 11 	 4 	end
787 	 1 	 11 	 11 	 4 	end
788 	 1 	 11 	 11 	 4 	end
789 	 1 	 11 	 11 	 4 	end
790 	 0 	 0 	 11 	 5 	end
791 	 1 	 0 	 0 	 5 	end
792 	 1 	 0 	 0 	 5 	end
793 	 1 	 0 	 0 	 5 	end
794 	 1 	 11 	 0 	 6 	end
795 	 1 	 0 	 11 	 7 	end
796 	 1 	 0 	 0 	 7 	end
797 	 1 	 3 	 0 	 8 	end
798 	 0 	

In [187]:
# Display the per-frame shot ids
shot_id_list

[0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 6,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 12,
 12,
 13,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 15,
 15,
 15,
 15,
 15,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 18,
 18,
 18,
 18,
 18,
 19,
 19,
 19,
 20,
 20,
 21,
 21,
 22,
 22,
 22,
 23,
 23,
 23,
 23,
 23,
 24,
 24,
 24,
 24,
 24,
 25,
 25,
 25,
 26,
 26,
 26,
 27,
 27,
 27,
 27,
 27,
 27,
 28,
 28,
 28,
 28,
 29,
 29,
 30,
 30,
 30,
 30,
 31,
 31,
 31,
 31,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 34,
 34,
 34,
 35,
 35,
 35,
 35,
 36,
 36,
 36,
 36,
 37,
 37,
 37,
 37,
 38,
 38,
 38,
 38,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 41,
 41,
 41,
 41]

In [246]:
# For every frame, record the cluster of the current shot (prev_clust_0) and
# of the three shots before it (prev_clust_1..3). 1000..1003 are placeholder
# start values.
prev_clust_0, prev_clust_1, prev_clust_2, prev_clust_3 = 1000, 1001, 1002, 1003
prev_clust_0_list, prev_clust_1_list = [], []
prev_clust_2_list, prev_clust_3_list = [], []
prev_shot_id = -1
print('frame\t', 'mcu\t', 'clust\t', 'shotid\t', 'prvshid','prev0\t', 'prev1\t', 'prev2\t', 'prev3\t')

for frame_file, cluster, mcu_flag, shot_id in zip(frame_choice, hac.labels_, y_pred_values, shot_id_list):
    if shot_id != prev_shot_id:
        # First frame of a new shot: shift the whole history back by one shot.
        prev_clust_3, prev_clust_2, prev_clust_1, prev_clust_0 = (
            prev_clust_2, prev_clust_1, prev_clust_0, cluster
        )
    prev_clust_0_list.append(prev_clust_0)
    prev_clust_1_list.append(prev_clust_1)
    prev_clust_2_list.append(prev_clust_2)
    prev_clust_3_list.append(prev_clust_3)
    print(frame_file, '\t', mcu_flag, '\t', cluster,'\t', shot_id, '\t', prev_shot_id, '\t', prev_clust_0, '\t', prev_clust_1, '\t', prev_clust_2, '\t', prev_clust_3, '\tend')
    prev_shot_id = shot_id
    
# return prev_clust_1, prev_clust_2, and prev_clust_3 only


frame	 mcu	 clust	 shotid	 prvshid prev0	 prev1	 prev2	 prev3	
761 	 0 	 9 	 0 	 -1 	 9 	 1000 	 1001 	 1002 	end
762 	 0 	 9 	 0 	 0 	 9 	 1000 	 1001 	 1002 	end
763 	 0 	 9 	 0 	 0 	 9 	 1000 	 1001 	 1002 	end
764 	 1 	 11 	 1 	 0 	 11 	 9 	 1000 	 1001 	end
765 	 1 	 11 	 1 	 1 	 11 	 9 	 1000 	 1001 	end
766 	 1 	 11 	 1 	 1 	 11 	 9 	 1000 	 1001 	end
767 	 1 	 11 	 1 	 1 	 11 	 9 	 1000 	 1001 	end
768 	 1 	 11 	 1 	 1 	 11 	 9 	 1000 	 1001 	end
769 	 0 	 7 	 2 	 1 	 7 	 11 	 9 	 1000 	end
770 	 0 	 7 	 2 	 2 	 7 	 11 	 9 	 1000 	end
771 	 0 	 7 	 2 	 2 	 7 	 11 	 9 	 1000 	end
772 	 0 	 7 	 2 	 2 	 7 	 11 	 9 	 1000 	end
773 	 0 	 7 	 2 	 2 	 7 	 11 	 9 	 1000 	end
774 	 0 	 7 	 2 	 2 	 7 	 11 	 9 	 1000 	end
775 	 0 	 7 	 2 	 2 	 7 	 11 	 9 	 1000 	end
776 	 0 	 7 	 2 	 2 	 7 	 11 	 9 	 1000 	end
777 	 0 	 0 	 3 	 2 	 0 	 7 	 11 	 9 	end
778 	 0 	 0 	 3 	 3 	 0 	 7 	 11 	 9 	end
779 	 0 	 0 	 3 	 3 	 0 	 7 	 11 	 9 	end
780 	 1 	 0 	 3 	 3 	 0 	 7 	 11 	 9 	end
781 	 1 	 0 	

947 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
948 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
949 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
950 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
951 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
952 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
953 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
954 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
955 	 1 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
956 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
957 	 0 	 0 	 39 	 39 	 0 	 4 	 0 	 6 	end
958 	 1 	 2 	 40 	 39 	 2 	 0 	 4 	 0 	end
959 	 1 	 2 	 40 	 40 	 2 	 0 	 4 	 0 	end
960 	 1 	 2 	 40 	 40 	 2 	 0 	 4 	 0 	end
961 	 1 	 2 	 40 	 40 	 2 	 0 	 4 	 0 	end
962 	 1 	 2 	 40 	 40 	 2 	 0 	 4 	 0 	end
963 	 1 	 2 	 40 	 40 	 2 	 0 	 4 	 0 	end
964 	 1 	 2 	 40 	 40 	 2 	 0 	 4 	 0 	end
965 	 0 	 0 	 41 	 40 	 0 	 2 	 0 	 4 	end
966 	 0 	 0 	 41 	 41 	 0 	 2 	 0 	 4 	end
967 	 1 	 0 	 41 	 41 	 0 	 2 	 0 	 4 	end
968 	 0 	 0 	 41 	 41 	 0 	 2 	 0 	 4 	end


In [270]:
# Shot-based alternation counting. State is only updated on the first frame
# of each shot (when shot_id changes); every frame of the shot then records
# the same alt_break / alt_count values.
alt_break = 0          # 1 = the latest shot's cluster was not one of the previous two
alt_break_list = []
prev_shot_id = -1
alt_count = 0
alt_count_list = []
print('frame\t', 'mcu\t', 'clust\t', 'shotid', 'previd\t', 'prev_1\t', 'prev2\t', 'prev3\t', 'altbrk\t', 'altcnt')

# NOTE: the loop targets prev_clust_1 / prev_clust_2 / prev_clust_3 rebind the
# notebook-level variables of the same names.
for frame_file, cluster, mcu_flag, shot_id, prev_clust_1, prev_clust_2, prev_clust_3 in zip(frame_choice, hac.labels_, y_pred_values, shot_id_list, prev_clust_1_list, prev_clust_2_list, prev_clust_3_list):
    # print(frame_file, '\t', mcu_flag, '\t', cluster,'\t', shot_id, '\t', prev_shot_id, '\t', prev_clust_1, '\t', prev_clust_2,'\t', prev_clust_3,'\t', alt_break, '\t', alt_count, '\tbeg')
    if shot_id != prev_shot_id:
        

        # A pending break is confirmed when this shot's cluster is none of the
        # last three recorded clusters: restart the count. Otherwise (e.g. the
        # ABCA / ABCB cases) the pending break is forgiven.
        if alt_break == 1 and cluster not in [prev_clust_1, prev_clust_2, prev_clust_3]: #ABCA or ABCB
            alt_count = 0
        else:
            alt_break = 0
        
        # Every new shot counts once (so a restarted count becomes 1).
        alt_count += 1
        
        if cluster not in [prev_clust_1, prev_clust_2]: #ABC, in danger of breaking alternation unless A or B next
            alt_break = 1
            

    alt_break_list.append(alt_break)
    alt_count_list.append(alt_count)
    print(frame_file, '\t', mcu_flag, '\t', cluster,'\t', shot_id, '\t', prev_shot_id, '\t', prev_clust_1, '\t', prev_clust_2,'\t', prev_clust_3,'\t', alt_break, '\t', alt_count, '\tend')
    prev_shot_id = shot_id

# return alt_count only

frame	 mcu	 clust	 shotid previd	 prev_1	 prev2	 prev3	 altbrk	 altcnt
761 	 0 	 9 	 0 	 -1 	 1000 	 1001 	 1002 	 1 	 1 	end
762 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 1 	end
763 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 1 	end
764 	 1 	 11 	 1 	 0 	 9 	 1000 	 1001 	 1 	 1 	end
765 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	end
766 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	end
767 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	end
768 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	end
769 	 0 	 7 	 2 	 1 	 11 	 9 	 1000 	 1 	 1 	end
770 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
771 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
772 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
773 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
774 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
775 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
776 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
777 	 0 	 0 	 3 	 2 	 7 	 11 	 9 	 1 	 1 	end
778 	 0 	 0 	 3 	 3 	 7 	 11 	 9 	 1 	 1 	end
779 	 0 	 0 	 3 	 3 	 

In [245]:
# Length check on the prev_clust_3 history.
# NOTE(review): the recorded output is 0 and this cell's execution count (245)
# precedes the cell that fills the list (246) -- the output looks stale; re-run in order.
len(prev_clust_3_list)

0

In [None]:
#ABCA or ABCB

In [289]:

# draft
# Draft of the shot-alternation pass that also numbers the patterns.
# For every frame it records:
#   alt_break - 1 if the current shot's cluster is not one of prev_clust_1 / prev_clust_2
#   alt_count - number of shots counted in the current pattern
#   pattern   - running id, bumped each time the alternation is judged broken
# (A later cell marked "this is it" restructures these branches.)
alt_break_list, alt_count_list, pattern_list = [], [], []
alt_break = alt_count = pattern = 0
prev_shot_id = -1
print('frame\t', 'mcu\t', 'clust\t', 'shotid', 'previd\t', 'prev_1\t', 'prev2\t', 'prev3\t', 'altbrk\t', 'altcnt\t', 'pattern')

for frame_file, cluster, mcu_flag, shot_id, prev_clust_1, prev_clust_2, prev_clust_3 in zip(
    frame_choice,
    hac.labels_,
    y_pred_values,
    shot_id_list,
    prev_clust_1_list,
    prev_clust_2_list,
    prev_clust_3_list,
):
    # State only moves on the first frame of a new shot.
    if shot_id != prev_shot_id:
        if alt_break == 1 and cluster not in (prev_clust_1, prev_clust_2, prev_clust_3):
            # Flag was already up and this shot matches none of the last three
            # clusters: open a new pattern whose count starts at 1.
            pattern += 1
            alt_count = 1
        else:
            alt_count += 1

        # In both branches the flag ends up reflecting only whether this shot's
        # cluster is one of the two most recent ones.
        alt_break = 0 if cluster in (prev_clust_1, prev_clust_2) else 1

    alt_break_list.append(alt_break)
    alt_count_list.append(alt_count)
    pattern_list.append(pattern)
    print(
        frame_file, '\t', mcu_flag, '\t', cluster, '\t', shot_id, '\t', prev_shot_id, '\t',
        prev_clust_1, '\t', prev_clust_2, '\t', prev_clust_3, '\t',
        alt_break, '\t', alt_count, '\t', pattern, '\tend',
    )
    prev_shot_id = shot_id

# Original note: return alt_count_list, pattern_list only

frame	 mcu	 clust	 shotid previd	 prev_1	 prev2	 prev3	 altbrk	 altcnt	 pattern
761 	 0 	 9 	 0 	 -1 	 1000 	 1001 	 1002 	 1 	 1 	 0 	end
762 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 1 	 0 	end
763 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 1 	 0 	end
764 	 1 	 11 	 1 	 0 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
765 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
766 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
767 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
768 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
769 	 0 	 7 	 2 	 1 	 11 	 9 	 1000 	 1 	 1 	 2 	end
770 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
771 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
772 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
773 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
774 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
775 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
776 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
777 	 0 	 0 	 3 	 2 	 7 	 11 	 9 	 1 	 1 

917 	 1 	 2 	 32 	 32 	 10 	 2 	 10 	 0 	 5 	 10 	end
918 	 0 	 5 	 33 	 32 	 2 	 10 	 2 	 1 	 6 	 10 	end
919 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
920 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
921 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
922 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
923 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
924 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
925 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
926 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
927 	 0 	 0 	 34 	 33 	 5 	 2 	 10 	 1 	 1 	 11 	end
928 	 0 	 0 	 34 	 34 	 5 	 2 	 10 	 1 	 1 	 11 	end
929 	 0 	 0 	 34 	 34 	 5 	 2 	 10 	 1 	 1 	 11 	end
930 	 0 	 9 	 35 	 34 	 0 	 5 	 2 	 1 	 1 	 12 	end
931 	 0 	 9 	 35 	 35 	 0 	 5 	 2 	 1 	 1 	 12 	end
932 	 0 	 9 	 35 	 35 	 0 	 5 	 2 	 1 	 1 	 12 	end
933 	 0 	 9 	 35 	 35 	 0 	 5 	 2 	 1 	 1 	 12 	end
934 	 0 	 6 	 36 	 35 	 9 	 0 	 5 	 1 	 1 	 13 	end
935 	 0 	 6 	 36 	 36 	 9 	 0 	 5 	 1 	 1 	 13 	en

In [295]:
# this is it
# Final version of the shot-alternation pass.
# Frames are visited in order; state changes only on the first frame of a new
# shot (shot_id differs from the previous frame's shot id). Per frame we store:
#   alt_break - 1 if the current shot's cluster is not prev_clust_1 or prev_clust_2
#   alt_count - shots counted in the current pattern (starts at -1 so the first
#               shot is recorded as 0)
#   pattern   - running pattern id
alt_break_list, alt_count_list, pattern_list = [], [], []
alt_break, alt_count, pattern = 0, -1, 0
prev_shot_id = -1
print('frame\t', 'mcu\t', 'clust\t', 'shotid', 'previd\t', 'prev_1\t', 'prev2\t', 'prev3\t', 'altbrk\t', 'altcnt\t', 'pattern')

for frame_file, cluster, mcu_flag, shot_id, prev_clust_1, prev_clust_2, prev_clust_3 in zip(
    frame_choice,
    hac.labels_,
    y_pred_values,
    shot_id_list,
    prev_clust_1_list,
    prev_clust_2_list,
    prev_clust_3_list,
):
    if shot_id != prev_shot_id:
        if alt_break == 1 and cluster not in (prev_clust_1, prev_clust_2, prev_clust_3):
            # The flag was raised by the previous shot and this one matches none
            # of the last three clusters (so it is not the ABCA / ABCB return):
            # close the pattern and start the next one from a clean state.
            pattern += 1
            alt_count = 0
            alt_break = 0
        else:
            # Still inside the current pattern: count the shot, and raise the
            # flag if its cluster is not one of the two most recent ones
            # (ABC - in danger of breaking the alternation unless A or B is next).
            alt_count += 1
            alt_break = 0 if cluster in (prev_clust_1, prev_clust_2) else 1

    alt_break_list.append(alt_break)
    alt_count_list.append(alt_count)
    pattern_list.append(pattern)
    print(
        frame_file, '\t', mcu_flag, '\t', cluster, '\t', shot_id, '\t', prev_shot_id, '\t',
        prev_clust_1, '\t', prev_clust_2, '\t', prev_clust_3, '\t',
        alt_break, '\t', alt_count, '\t', pattern, '\tend',
    )
    prev_shot_id = shot_id

# Original note: return alt_count_list, pattern_list only

frame	 mcu	 clust	 shotid previd	 prev_1	 prev2	 prev3	 altbrk	 altcnt	 pattern
761 	 0 	 9 	 0 	 -1 	 1000 	 1001 	 1002 	 1 	 0 	 0 	end
762 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 0 	 0 	end
763 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 0 	 0 	end
764 	 1 	 11 	 1 	 0 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
765 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
766 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
767 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
768 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
769 	 0 	 7 	 2 	 1 	 11 	 9 	 1000 	 1 	 1 	 1 	end
770 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
771 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
772 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
773 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
774 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
775 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
776 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
777 	 0 	 0 	 3 	 2 	 7 	 11 	 9 	 0 	 0 

In [276]:
# One row per selected frame: its cluster label, the MCU prediction, and the
# alternation count / pattern id produced by the alternation pass.
scene_df = pd.DataFrame(
    list(zip(frame_choice, hac.labels_, y_pred_values, alt_count_list, pattern_list)),
    columns=['frame_file', 'cluster', 'mcu', 'alternation', 'pattern'],
)

# Raise the display limit so up to 210 rows are shown without truncation.
pd.set_option('display.max_rows', 210)
scene_df.head(210)

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
0,761,9,0,1,0
1,762,9,0,1,0
2,763,9,0,1,0
3,764,11,1,1,1
4,765,11,1,1,1
5,766,11,1,1,1
6,767,11,1,1,1
7,768,11,1,1,1
8,769,7,0,1,2
9,770,7,0,1,2


In [278]:
# Pattern ids that contain at least one frame whose alternation count exceeds 2.
patterns = list(scene_df.loc[scene_df['alternation'] > 2, 'pattern'].unique())

# Shape of the subset of frames belonging to those patterns.
scene_df.loc[scene_df['pattern'].isin(patterns)].shape

(157, 5)

In [280]:
# Keep only the frames whose pattern id is in `patterns`.
pattern_df = scene_df.loc[scene_df['pattern'].isin(patterns)]

In [281]:
# Display the frames that belong to the selected patterns.
pattern_df

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
16,777,0,0,1,3
17,778,0,0,1,3
18,779,0,0,1,3
19,780,0,1,1,3
20,781,0,1,1,3
21,782,11,1,2,3
22,783,11,1,2,3
23,784,11,1,2,3
24,785,11,1,2,3
25,786,11,1,2,3


In [286]:
# For every pattern id, report how many frames of that pattern are in
# pattern_df and the mean of their MCU flags. Pattern ids with no frames in
# pattern_df print a count of 0 and a mean of nan.
print('pattern\t', 'count\t', 'mean\t')

# Take the id range from the data instead of a fixed 0-14 window, so a scene
# with more patterns is not silently cut off. An empty scene_df yields no rows.
highest_pattern = scene_df['pattern'].max() if len(scene_df) else -1

for x in range(0, int(highest_pattern) + 1):
    # Build the mask from pattern_df itself. A mask taken from scene_df only
    # works through implicit index alignment between the two frames.
    # Selecting once also avoids evaluating the same filter twice per row.
    mcu_flags = pattern_df.loc[pattern_df['pattern'] == x, 'mcu']
    print(x, '\t', mcu_flags.count(), '\t', mcu_flags.mean())

pattern	 count	 mean	
0 	 0 	 nan
1 	 0 	 nan
2 	 0 	 nan
3 	 35 	 0.4857142857142857
4 	 0 	 nan
5 	 0 	 nan
6 	 44 	 0.75
7 	 20 	 0.6
8 	 0 	 nan
9 	 0 	 nan
10 	 31 	 0.6774193548387096
11 	 0 	 nan
12 	 0 	 nan
13 	 0 	 nan
14 	 27 	 0.3333333333333333


In [288]:
# Inspect the frames of pattern 6 (44 frames, MCU mean 0.75 in the summary table).
pattern_df[pattern_df['pattern'].eq(6)]

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
59,820,10,1,1,6
60,821,10,1,1,6
61,822,10,1,1,6
62,823,2,1,2,6
63,824,2,1,2,6
64,825,10,1,3,6
65,826,2,1,4,6
66,827,2,1,4,6
67,828,2,1,4,6
68,829,2,1,4,6
