# scene_clustering
This notebook contains initial code for clustering frames into shots, identifying the A/B/A/B pattern, and using the image classifier model to see if they're MCUs

In [1]:
import sys
import os
sys.path.append('site-packages') # manually put all packages/libraries into this folder
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input
from keras import models
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering

Using TensorFlow backend.


## Clustering
### For POC, designating a specific scene's worth of frames

In [2]:
# input film and frame 
# film = 'booksmart'
# frame_choice = list(range(1001, 1163)) # good example for Booksmart!!! 6 clusters, 2500 distance_threshold

# film = 'hobbs_shaw'
# frame_choice = list(range(701, 858)) # Hobbs and Shaw, Jason Statham vs. Helen Mirren, 2,500

# film = 'parasite'
# frame_choice = list(range(1666, 1836)) # Parasite, tough because Mrs. Park and Jessica are both right-oc

# film = 'parasite'
# frame_choice = list(range(6687, 6777)) # Parasite, hiding in bushes, eh, 2900

#film = 'hustle'
#frame_choice = list(range(761, 969)) # The Hustle, train 2900

# Active selection. `film` is used by other cells both as the sub-folder name and as
# the file-name prefix of each frame; `frame_choice` lists the frame numbers to analyse.
film = 'hustle'
frame_choice = list(range(661, 1050)) # The Hustle, train 2900, +/- 100 frames either side


In [3]:
# Each film keeps its frames in its own sub-folder of 'dialogue_frames'.
dialogue_folder = os.path.join('dialogue_frames', film)

# Sanity check: files present in the folder vs. frames selected for this run.
folder_image_count = len(os.listdir(dialogue_folder))
print('There are', folder_image_count, 'images in the folder')
selected_frame_count = len(frame_choice)
print('Selected', selected_frame_count, 'of those frames')

There are 5877 images in the folder
Selected 389 of those frames


In [4]:
# ImageNet-pretrained VGG16 without its top (classifier) layers, used here as a fixed
# feature extractor for the selected frames.
model = VGG16(weights='imagenet', include_top=False)
model.summary()

# One flattened feature vector per selected frame, in frame_choice order.
vgg16_feature_list = []


for x in frame_choice:
    # Frame files are named '<film>_frame<number>.jpg' inside the film's folder.
    img_path = os.path.join(dialogue_folder, film + '_frame' + str(x) + '.jpg')
    img = image.load_img(img_path, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)  # add a leading axis of size 1
    img_data = preprocess_input(img_data)

    vgg16_feature = model.predict(img_data)
    vgg16_feature_np = np.array(vgg16_feature)
    vgg16_feature_list.append(vgg16_feature_np.flatten())
    # No manual `x += 1` here: the for loop rebinds x on every pass, so the
    # increment had no effect.

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [5]:
# Combine the per-frame feature vectors into a single array and display its shape
# as a sanity check.
vgg16_feature_list_np = np.array(vgg16_feature_list)
vgg16_feature_list_np.shape

(389, 25088)

In [6]:
# Agglomerative clustering of the frame features. The cluster count is left open
# (n_clusters=None) and is decided by the distance threshold instead.
hac = AgglomerativeClustering(n_clusters=None, distance_threshold=3000)
hac.fit(vgg16_feature_list_np)

print('Number of clusters:', hac.n_clusters_)
print(hac.labels_)

Number of clusters: 34
[14 14  3  3  3 15 15 15 15  3  3  3  3  3  0 14 14 14 14 15 15  8  8  8
  4  4  4  4 24 24 24 16 16 32 32 32 32 24 24 16 16  4  4  4  0  0  0  0
  8  8  8  0  4  4  4  4  4  4  4  0  0  0  0 32 32 16 16 16 16 32 32 32
 32 16 16  3  3  3  3  3 15 15 15 15 15 32 32 32  0  0  0  0 32 32 32  0
 32 24 24 24 30 30 30 20 20 20 20 20 29 29 29 29 29 29 29 29  2  2  2  2
  2 20 20 20 20 20 20 20 20  2  2  2  2 20  2  2 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23  6  6  7  7  7  7  7  7 19 19 19  7  7 19  7  7  7
  7  7  7  7  7 19 19 19 19 19  7  7  7  7  7  7  7  7  7 25 25 25 25 25
 25 25 25 25 25 25 19 19 19 19 19  6  6  6 19 19  7  7 19 19 19  7  7  7
  7  7  6  6  6  6  6  2  2  2 19 19 19  6  6  6  6  6  6  7  7  7  7 19
 19  7  7  7  7 19 19 19 19  7  7  7  7  7  7  7  7 33 33 33 33 33 33 33
 33 33  2  2  2 30 30 30 30 22 22 22 22  2  2  2  2 31 31 31 31  2  2  2
  2  2  2  2  2  2  2  2  2  3  3  3  3  3  3  3 27 27 27 27 27 11 11 11
 11 11 11 11 12 12 12 12 12 

In [11]:
# Per-frame listing: frame number, cluster label and MCU flag side by side.
# NOTE(review): y_pred_values is built by a cell that appears further down in this file
# (under "Load Saved Model and Identify MCUs"); that cell must have been run first.
for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    print (frame_file, cluster, mcu_flag)

661 14 0
662 14 0
663 3 1
664 3 1
665 3 1
666 15 1
667 15 1
668 15 1
669 15 1
670 3 1
671 3 1
672 3 1
673 3 1
674 3 1
675 0 0
676 14 0
677 14 0
678 14 0
679 14 0
680 15 0
681 15 0
682 8 0
683 8 0
684 8 0
685 4 1
686 4 1
687 4 1
688 4 1
689 24 1
690 24 1
691 24 1
692 16 0
693 16 0
694 32 0
695 32 1
696 32 1
697 32 1
698 24 1
699 24 1
700 16 0
701 16 0
702 4 1
703 4 1
704 4 1
705 0 1
706 0 1
707 0 1
708 0 1
709 8 0
710 8 0
711 8 0
712 0 1
713 4 1
714 4 1
715 4 1
716 4 1
717 4 1
718 4 1
719 4 1
720 0 1
721 0 1
722 0 1
723 0 1
724 32 1
725 32 1
726 16 0
727 16 0
728 16 0
729 16 0
730 32 0
731 32 1
732 32 1
733 32 1
734 16 0
735 16 0
736 3 1
737 3 1
738 3 1
739 3 1
740 3 1
741 15 0
742 15 0
743 15 0
744 15 0
745 15 0
746 32 1
747 32 1
748 32 1
749 0 1
750 0 0
751 0 1
752 0 1
753 32 1
754 32 1
755 32 1
756 0 1
757 32 1
758 24 1
759 24 1
760 24 1
761 30 0
762 30 0
763 30 0
764 20 1
765 20 1
766 20 1
767 20 1
768 20 1
769 29 0
770 29 0
771 29 0
772 29 0
773 29 0
774 29 0
775 29 0
776 29 0
777 

## Load Saved Model and Identify MCUs

In [7]:
# Load the previously saved image-classifier model from 'saved_models/tuned_model'.
tuned_model = models.load_model('saved_models/tuned_model')

In [8]:
# Load every selected frame as a 128x128 grayscale array for the classifier.
image_list = []
for frame_number in frame_choice:
    frame_path = dialogue_folder + '/' + film + '_frame' + str(frame_number) + '.jpg'
    frame_image = load_img(frame_path, target_size=(128, 128), color_mode='grayscale')
    image_list.append(img_to_array(frame_image))

In [9]:
# Batch the loaded frames into one array and ask the classifier for a class per frame.
# NOTE(review): predict_classes is, as far as I know, only provided by Sequential models
# in older Keras releases — confirm it exists in the installed version.
image_array = np.array(image_list)
y_pred = tuned_model.predict_classes(image_array)

In [10]:
# Keep only the first element of each prediction, giving one flat value per frame.
y_pred_values = [prediction[0] for prediction in y_pred]

# Scene Pattern Algorithm

### Initial try

In [28]:
# State carried from frame to frame:
#   prev_cluster      - cluster of the previous frame (updated on every frame)
#   stored_cluster    - cluster remembered from before the last change (updated on a change)
#   alternate_counter - running alternation count (updated on a change)
# 1000 is a sentinel meaning "not set yet".
prev_cluster = 1000
stored_cluster = 1000
alternate_counter = 1
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t', 'alternate')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    uninitialised = stored_cluster == 1000
    if uninitialised or cluster != prev_cluster:
        if not uninitialised:
            # Going back to the stored cluster extends the alternation; anything else restarts it.
            alternate_counter = (alternate_counter + 1) if cluster == stored_cluster else 1
        stored_cluster = prev_cluster
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter)
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored	 alternate
761 	 0 	 9 	 1000 	 1000 	 1
762 	 0 	 9 	 9 	 9 	 1
763 	 0 	 9 	 9 	 9 	 1
764 	 1 	 11 	 9 	 9 	 1
765 	 1 	 11 	 11 	 9 	 1
766 	 1 	 11 	 11 	 9 	 1
767 	 1 	 11 	 11 	 9 	 1
768 	 1 	 11 	 11 	 9 	 1
769 	 0 	 7 	 11 	 11 	 1
770 	 0 	 7 	 7 	 11 	 1
771 	 0 	 7 	 7 	 11 	 1
772 	 0 	 7 	 7 	 11 	 1
773 	 0 	 7 	 7 	 11 	 1
774 	 0 	 7 	 7 	 11 	 1
775 	 0 	 7 	 7 	 11 	 1
776 	 0 	 7 	 7 	 11 	 1
777 	 0 	 0 	 7 	 7 	 1
778 	 0 	 0 	 0 	 7 	 1
779 	 0 	 0 	 0 	 7 	 1
780 	 1 	 0 	 0 	 7 	 1
781 	 1 	 0 	 0 	 7 	 1
782 	 1 	 11 	 0 	 0 	 1
783 	 1 	 11 	 11 	 0 	 1
784 	 1 	 11 	 11 	 0 	 1
785 	 1 	 11 	 11 	 0 	 1
786 	 1 	 11 	 11 	 0 	 1
787 	 1 	 11 	 11 	 0 	 1
788 	 1 	 11 	 11 	 0 	 1
789 	 1 	 11 	 11 	 0 	 1
790 	 0 	 0 	 11 	 11 	 2
791 	 1 	 0 	 0 	 11 	 2
792 	 1 	 0 	 0 	 11 	 2
793 	 1 	 0 	 0 	 11 	 2
794 	 1 	 11 	 0 	 0 	 3
795 	 1 	 0 	 11 	 11 	 4
796 	 1 	 0 	 0 	 11 	 4
797 	 1 	 3 	 0 	 0 	 1
798 	 0 	 3 	 3 	 0 	

### Allow for One Extra Storage (for inserts)

In [29]:
# Variant that remembers the two most recently stored clusters instead of one.
# 1000 / 1001 are sentinels meaning "slot not filled yet".
prev_cluster = 1000
stored_cluster = [1000, 1001]
alternate_counter = 1
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t', 'alternate')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    warming_up = 1000 in stored_cluster
    if warming_up or cluster != prev_cluster:
        if not warming_up:
            # A cluster already in the two-slot memory continues the alternation.
            alternate_counter = (alternate_counter + 1) if cluster in stored_cluster else 1
        # Slide the window: drop the oldest slot, remember the previous frame's cluster.
        stored_cluster = stored_cluster[1:] + [prev_cluster]
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter)
    prev_cluster = cluster
    

frame	 mcu	 clust	 prev	 stored	 alternate
761 	 0 	 9 	 1000 	 [1001, 1000] 	 1
762 	 0 	 9 	 9 	 [1000, 9] 	 1
763 	 0 	 9 	 9 	 [9, 9] 	 1
764 	 1 	 11 	 9 	 [9, 9] 	 1
765 	 1 	 11 	 11 	 [9, 9] 	 1
766 	 1 	 11 	 11 	 [9, 9] 	 1
767 	 1 	 11 	 11 	 [9, 9] 	 1
768 	 1 	 11 	 11 	 [9, 9] 	 1
769 	 0 	 7 	 11 	 [9, 11] 	 1
770 	 0 	 7 	 7 	 [9, 11] 	 1
771 	 0 	 7 	 7 	 [9, 11] 	 1
772 	 0 	 7 	 7 	 [9, 11] 	 1
773 	 0 	 7 	 7 	 [9, 11] 	 1
774 	 0 	 7 	 7 	 [9, 11] 	 1
775 	 0 	 7 	 7 	 [9, 11] 	 1
776 	 0 	 7 	 7 	 [9, 11] 	 1
777 	 0 	 0 	 7 	 [11, 7] 	 1
778 	 0 	 0 	 0 	 [11, 7] 	 1
779 	 0 	 0 	 0 	 [11, 7] 	 1
780 	 1 	 0 	 0 	 [11, 7] 	 1
781 	 1 	 0 	 0 	 [11, 7] 	 1
782 	 1 	 11 	 0 	 [7, 0] 	 2
783 	 1 	 11 	 11 	 [7, 0] 	 2
784 	 1 	 11 	 11 	 [7, 0] 	 2
785 	 1 	 11 	 11 	 [7, 0] 	 2
786 	 1 	 11 	 11 	 [7, 0] 	 2
787 	 1 	 11 	 11 	 [7, 0] 	 2
788 	 1 	 11 	 11 	 [7, 0] 	 2
789 	 1 	 11 	 11 	 [7, 0] 	 2
790 	 0 	 0 	 11 	 [0, 11] 	 3
791 	 1 	 0 	 0 	 [0, 11] 	 3
792 	

### Assign number for every alternation pattern, save lists for dataframe

In [30]:
# Two-slot alternation tracking that also numbers each alternation pattern and keeps
# the per-frame alternation count and pattern id in lists for the dataframe.
# 1000 / 1001 are sentinels meaning "slot not filled yet".
prev_cluster = 1000
stored_cluster = [1000, 1001]
alternate_counter = 1
alt_list = []
pattern_list = []
pattern = 0
print('frame\t', 'mcu\t', 'clust\t', 'prev\t', 'stored\t\t', 'alt\t', 'pattern')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    warming_up = 1000 in stored_cluster
    if warming_up or cluster != prev_cluster:
        if not warming_up:
            if cluster in stored_cluster:
                alternate_counter += 1
            else:
                # Cluster is not in the two-slot memory: restart the count, open a new pattern.
                alternate_counter = 1
                pattern += 1
        # Slide the window: drop the oldest slot, remember the previous frame's cluster.
        stored_cluster = stored_cluster[1:] + [prev_cluster]
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_cluster, '\t', stored_cluster, '\t', alternate_counter, '\t', pattern)
    prev_cluster = cluster
    alt_list.append(alternate_counter)
    pattern_list.append(pattern)
    

frame	 mcu	 clust	 prev	 stored		 alt	 pattern
761 	 0 	 9 	 1000 	 [1001, 1000] 	 1 	 0
762 	 0 	 9 	 9 	 [1000, 9] 	 1 	 0
763 	 0 	 9 	 9 	 [9, 9] 	 1 	 0
764 	 1 	 11 	 9 	 [9, 9] 	 1 	 1
765 	 1 	 11 	 11 	 [9, 9] 	 1 	 1
766 	 1 	 11 	 11 	 [9, 9] 	 1 	 1
767 	 1 	 11 	 11 	 [9, 9] 	 1 	 1
768 	 1 	 11 	 11 	 [9, 9] 	 1 	 1
769 	 0 	 7 	 11 	 [9, 11] 	 1 	 2
770 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
771 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
772 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
773 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
774 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
775 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
776 	 0 	 7 	 7 	 [9, 11] 	 1 	 2
777 	 0 	 0 	 7 	 [11, 7] 	 1 	 3
778 	 0 	 0 	 0 	 [11, 7] 	 1 	 3
779 	 0 	 0 	 0 	 [11, 7] 	 1 	 3
780 	 1 	 0 	 0 	 [11, 7] 	 1 	 3
781 	 1 	 0 	 0 	 [11, 7] 	 1 	 3
782 	 1 	 11 	 0 	 [7, 0] 	 2 	 3
783 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
784 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
785 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
786 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
787 	 1 	 11 	 11 	 [7, 0] 	 2 	 3
788 	 1 	 11 	 1

## Populate dataframe for continued analysis

In [32]:
# One row per frame, combining the frame number, cluster label, MCU flag,
# alternation count and pattern id.
scene_columns = ['frame_file', 'cluster', 'mcu', 'alternation', 'pattern']
scene_rows = zip(frame_choice, hac.labels_, y_pred_values, alt_list, pattern_list)
scene_df = pd.DataFrame(scene_rows, columns=scene_columns)

# Raise the display limit so that up to 210 rows are shown without truncation.
pd.set_option('display.max_rows', 210)
scene_df.head(210)

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
0,761,9,0,1,0
1,762,9,0,1,0
2,763,9,0,1,0
3,764,11,1,1,1
4,765,11,1,1,1
5,766,11,1,1,1
6,767,11,1,1,1
7,768,11,1,1,1
8,769,7,0,1,2
9,770,7,0,1,2


In [None]:
'''
first pattern with more than 3 alternations is pattern 3, starting with cluster 1 which is NOT an MCU
first MCU in pattern 3 is cluster 8, which is MCU
final pattern with more than 3 alternations is pattern 14, ending with cluster 4, which is an MCU
'''

In [34]:
# Per-cluster summary: number of frames in the cluster and the mean of its 'mcu' column.
# Iterate over the cluster labels actually present in scene_df rather than a fixed
# range(0, 12): the clustering is threshold-based, so the number of clusters varies
# between runs and a fixed range silently drops every cluster numbered 12 or higher.
print('cluster\t', 'count\t', 'mean\t')
for x in sorted(scene_df['cluster'].unique()):
    cluster_mcu = scene_df.loc[scene_df['cluster'] == x, 'mcu']
    print(x, '\t', cluster_mcu.count(), '\t', cluster_mcu.mean())

cluster	 count	 mean	
0 	 37 	 0.24324324324324326
1 	 16 	 0.0
2 	 55 	 0.9818181818181818
3 	 15 	 0.06666666666666667
4 	 4 	 0.0
5 	 9 	 0.0
6 	 4 	 0.0
7 	 8 	 0.0
8 	 11 	 0.0
9 	 7 	 0.0
10 	 28 	 1.0
11 	 14 	 1.0


In [None]:
# various pandas lookups for debugging
# pd.options.display.max_rows=200
# scene_df.loc[scene_df['cluster'] == 2]
# scene_df.loc[scene_df['mcu'] == 1]
# scene_df.loc[scene_df['frame_file'] == 750]

In [38]:
# Pattern ids that contain at least one frame whose alternation count is greater than 2.
scene_df.loc[scene_df['alternation'] > 2].pattern.unique()


array([ 3,  7, 10, 14])

In [39]:
# Same selection as a list: ids of patterns whose alternation count exceeds 2.
patterns = list(scene_df.loc[scene_df['alternation'] > 2].pattern.unique())

In [36]:
# Inspect the frames assigned to pattern 4.
scene_df.loc[scene_df['pattern'] == 4]


Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
36,797,3,1,1,4
37,798,3,0,1,4
38,799,3,0,1,4
39,800,3,0,1,4
40,801,3,0,1,4
41,802,3,0,1,4
42,803,3,0,1,4
43,804,3,0,1,4
44,805,3,0,1,4
45,806,3,0,1,4


In [40]:
# All frames whose pattern id is one of those collected in `patterns`.
scene_df[scene_df.pattern.isin(patterns)]

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
16,777,0,0,1,3
17,778,0,0,1,3
18,779,0,0,1,3
19,780,0,1,1,3
20,781,0,1,1,3
21,782,11,1,2,3
22,783,11,1,2,3
23,784,11,1,2,3
24,785,11,1,2,3
25,786,11,1,2,3


In [41]:
# Raw cluster label of every clustered frame, for eyeballing the shot sequence.
print(hac.labels_)

[ 9  9  9 11 11 11 11 11  7  7  7  7  7  7  7  7  0  0  0  0  0 11 11 11
 11 11 11 11 11  0  0  0  0 11  0  0  3  3  3  3  3  3  3  3  3  3  3  3
  3  3  3  1  1  2  2  2  2  2  2 10 10 10  2  2 10  2  2  2  2  2  2  2
  2 10 10 10 10 10  2  2  2  2  2  2  2  2  2  8  8  8  8  8  8  8  8  8
  8  8 10 10 10 10 10  1  1  1 10 10  2  2 10 10 10  2  2  2  2  2  1  1
  1  1  1  0  0  0 10 10 10  1  1  1  1  1  1  2  2  2  2 10 10  2  2  2
  2 10 10 10 10  2  2  2  2  2  2  2  2  5  5  5  5  5  5  5  5  5  0  0
  0  9  9  9  9  6  6  6  6  0  0  0  0  4  4  4  4  0  0  0  0  0  0  0
  0  0  0  0  0  2  2  2  2  2  2  2  0  0  0  0]


In [None]:
# currently alternation canceled if 2->10->2->8
# currently alternation applied during loop, but should be applied after the fact
# currently pattern applied during loop, but should be applied after the fact

# create dataframe for file, cluster, mcu, storage, then add alternation, then add pattern

In [67]:
# NOTE(review): `storage` is not read by any other cell visible here — looks like a
# leftover placeholder; confirm before removing.
storage = []

In [69]:
# Display the current contents of `storage`.
storage

[]

In [118]:
# Walk the frames and keep track of the clusters of the two most recent earlier shots.
# Sentinels: 1000 / 1001 = no earlier shot yet, 2000 = no previous frame yet.
prev_shot_1 = 1000
prev_shot_2 = 1001
prev_frame = 2000
same_shot = 0  # NOTE(review): never read in this cell
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'prev_1\t', 'prev2\t')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    if prev_frame == 2000:
        # Very first frame: seed prev_shot_1 with the frame's own cluster.
        prev_shot_1 = cluster
    elif cluster != prev_frame:
        # Cluster changed: shift the history along by one.
        prev_shot_2, prev_shot_1 = prev_shot_1, prev_frame
    prev_frame = cluster
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', prev_shot_1, '\t', prev_shot_2,'\tend')


frame	 mcu	 clust	 prvfrm	 prev_1	 prev2	
761 	 0 	 9 	 9 	 9 	 1001 	end
762 	 0 	 9 	 9 	 9 	 1001 	end
763 	 0 	 9 	 9 	 9 	 1001 	end
764 	 1 	 11 	 11 	 9 	 9 	end
765 	 1 	 11 	 11 	 9 	 9 	end
766 	 1 	 11 	 11 	 9 	 9 	end
767 	 1 	 11 	 11 	 9 	 9 	end
768 	 1 	 11 	 11 	 9 	 9 	end
769 	 0 	 7 	 7 	 11 	 9 	end
770 	 0 	 7 	 7 	 11 	 9 	end
771 	 0 	 7 	 7 	 11 	 9 	end
772 	 0 	 7 	 7 	 11 	 9 	end
773 	 0 	 7 	 7 	 11 	 9 	end
774 	 0 	 7 	 7 	 11 	 9 	end
775 	 0 	 7 	 7 	 11 	 9 	end
776 	 0 	 7 	 7 	 11 	 9 	end
777 	 0 	 0 	 0 	 7 	 11 	end
778 	 0 	 0 	 0 	 7 	 11 	end
779 	 0 	 0 	 0 	 7 	 11 	end
780 	 1 	 0 	 0 	 7 	 11 	end
781 	 1 	 0 	 0 	 7 	 11 	end
782 	 1 	 11 	 11 	 0 	 7 	end
783 	 1 	 11 	 11 	 0 	 7 	end
784 	 1 	 11 	 11 	 0 	 7 	end
785 	 1 	 11 	 11 	 0 	 7 	end
786 	 1 	 11 	 11 	 0 	 7 	end
787 	 1 	 11 	 11 	 0 	 7 	end
788 	 1 	 11 	 11 	 0 	 7 	end
789 	 1 	 11 	 11 	 0 	 7 	end
790 	 0 	 0 	 0 	 11 	 0 	end
791 	 1 	 0 	 0 	 11 	 0 	end
792 	 1 	

In [125]:
# Track the clusters of the two most recent earlier shots and record the state after
# every frame. Each history list is seeded with its sentinel, so entry i holds the state
# as it was *before* frame i was processed, and each list ends up one entry longer than
# the frame list.
# Sentinels: 1000 / 1001 = no earlier shot yet, 2000 = no previous frame yet.
# (A stray interleaved copy of the non-recording loop, which left a dangling `elif`
# and made the cell unparseable, has been removed.)
prev_shot_1 = 1000
prev_shot_1_list = [1000]
prev_shot_2 = 1001
prev_shot_2_list = [1001]
prev_frame = 2000
prev_frame_list = [2000]
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'prev_1\t', 'prev2\t')

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    if prev_frame == 2000:
        # Very first frame: seed prev_shot_1 with the frame's own cluster.
        prev_shot_1 = cluster
    elif cluster != prev_frame:
        # Cluster changed: shift the history along by one.
        prev_shot_2 = prev_shot_1
        prev_shot_1 = prev_frame
    prev_frame = cluster
    prev_shot_1_list.append(prev_shot_1)
    prev_shot_2_list.append(prev_shot_2)
    prev_frame_list.append(prev_frame)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', prev_shot_1, '\t', prev_shot_2,'\tend')


frame	 mcu	 clust	 prvfrm	 prev_1	 prev2	
761 	 0 	 9 	 9 	 9 	 1001 	end
762 	 0 	 9 	 9 	 9 	 1001 	end
763 	 0 	 9 	 9 	 9 	 1001 	end
764 	 1 	 11 	 11 	 9 	 9 	end
765 	 1 	 11 	 11 	 9 	 9 	end
766 	 1 	 11 	 11 	 9 	 9 	end
767 	 1 	 11 	 11 	 9 	 9 	end
768 	 1 	 11 	 11 	 9 	 9 	end
769 	 0 	 7 	 7 	 11 	 9 	end
770 	 0 	 7 	 7 	 11 	 9 	end
771 	 0 	 7 	 7 	 11 	 9 	end
772 	 0 	 7 	 7 	 11 	 9 	end
773 	 0 	 7 	 7 	 11 	 9 	end
774 	 0 	 7 	 7 	 11 	 9 	end
775 	 0 	 7 	 7 	 11 	 9 	end
776 	 0 	 7 	 7 	 11 	 9 	end
777 	 0 	 0 	 0 	 7 	 11 	end
778 	 0 	 0 	 0 	 7 	 11 	end
779 	 0 	 0 	 0 	 7 	 11 	end
780 	 1 	 0 	 0 	 7 	 11 	end
781 	 1 	 0 	 0 	 7 	 11 	end
782 	 1 	 11 	 11 	 0 	 7 	end
783 	 1 	 11 	 11 	 0 	 7 	end
784 	 1 	 11 	 11 	 0 	 7 	end
785 	 1 	 11 	 11 	 0 	 7 	end
786 	 1 	 11 	 11 	 0 	 7 	end
787 	 1 	 11 	 11 	 0 	 7 	end
788 	 1 	 11 	 11 	 0 	 7 	end
789 	 1 	 11 	 11 	 0 	 7 	end
790 	 0 	 0 	 0 	 11 	 0 	end
791 	 1 	 0 	 0 	 11 	 0 	end
792 	 1 	

In [126]:
# Print each frame (a), its MCU flag (b) and cluster (c) next to the recorded history
# lists (d, e, f). zip stops at the shortest input, so any extra trailing entries in
# the longer lists are ignored.
for a, b, c, d, e, f in zip(frame_choice, y_pred_values, hac.labels_, prev_frame_list, prev_shot_1_list, prev_shot_2_list):
    print(a, b, c, d, e, f)

761 0 9 2000 1000 1001
762 0 9 9 9 1001
763 0 9 9 9 1001
764 1 11 9 9 1001
765 1 11 11 9 9
766 1 11 11 9 9
767 1 11 11 9 9
768 1 11 11 9 9
769 0 7 11 9 9
770 0 7 7 11 9
771 0 7 7 11 9
772 0 7 7 11 9
773 0 7 7 11 9
774 0 7 7 11 9
775 0 7 7 11 9
776 0 7 7 11 9
777 0 0 7 11 9
778 0 0 0 7 11
779 0 0 0 7 11
780 1 0 0 7 11
781 1 0 0 7 11
782 1 11 0 7 11
783 1 11 11 0 7
784 1 11 11 0 7
785 1 11 11 0 7
786 1 11 11 0 7
787 1 11 11 0 7
788 1 11 11 0 7
789 1 11 11 0 7
790 0 0 11 0 7
791 1 0 0 11 0
792 1 0 0 11 0
793 1 0 0 11 0
794 1 11 0 11 0
795 1 0 11 0 11
796 1 0 0 11 0
797 1 3 0 11 0
798 0 3 3 0 11
799 0 3 3 0 11
800 0 3 3 0 11
801 0 3 3 0 11
802 0 3 3 0 11
803 0 3 3 0 11
804 0 3 3 0 11
805 0 3 3 0 11
806 0 3 3 0 11
807 0 3 3 0 11
808 0 3 3 0 11
809 0 3 3 0 11
810 0 3 3 0 11
811 0 3 3 0 11
812 0 1 3 0 11
813 0 1 1 3 0
814 1 2 1 3 0
815 1 2 2 1 3
816 1 2 2 1 3
817 1 2 2 1 3
818 1 2 2 1 3
819 1 2 2 1 3
820 1 10 2 1 3
821 1 10 10 2 1
822 1 10 10 2 1
823 1 2 10 2 1
824 1 2 2 10 2
825 1 10 2 10 2


In [152]:
# Flag, per frame, whether the most recent cluster change went to a cluster that matches
# neither of the two recorded previous shots. The list is seeded with one 0 entry.
alt_break = 0
alt_break_list = [0]
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'prev_1\t', 'prev2\t', 'altbreak\t')

for frame_file, mcu_flag, cluster, prev_frame, prev_shot_1, prev_shot_2 in zip(frame_choice, y_pred_values, hac.labels_, prev_frame_list, prev_shot_1_list, prev_shot_2_list):
    if cluster != prev_frame:
        # Only re-evaluated on a cluster change; otherwise the previous flag carries over.
        alt_break = 0 if cluster in (prev_shot_1, prev_shot_2) else 1
    alt_break_list.append(alt_break)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', prev_shot_1, '\t', prev_shot_2, '\t', alt_break)


frame	 mcu	 clust	 prvfrm	 prev_1	 prev2	 altbreak	
761 	 0 	 9 	 2000 	 1000 	 1001 	 1
762 	 0 	 9 	 9 	 9 	 1001 	 1
763 	 0 	 9 	 9 	 9 	 1001 	 1
764 	 1 	 11 	 9 	 9 	 1001 	 1
765 	 1 	 11 	 11 	 9 	 9 	 1
766 	 1 	 11 	 11 	 9 	 9 	 1
767 	 1 	 11 	 11 	 9 	 9 	 1
768 	 1 	 11 	 11 	 9 	 9 	 1
769 	 0 	 7 	 11 	 9 	 9 	 1
770 	 0 	 7 	 7 	 11 	 9 	 1
771 	 0 	 7 	 7 	 11 	 9 	 1
772 	 0 	 7 	 7 	 11 	 9 	 1
773 	 0 	 7 	 7 	 11 	 9 	 1
774 	 0 	 7 	 7 	 11 	 9 	 1
775 	 0 	 7 	 7 	 11 	 9 	 1
776 	 0 	 7 	 7 	 11 	 9 	 1
777 	 0 	 0 	 7 	 11 	 9 	 1
778 	 0 	 0 	 0 	 7 	 11 	 1
779 	 0 	 0 	 0 	 7 	 11 	 1
780 	 1 	 0 	 0 	 7 	 11 	 1
781 	 1 	 0 	 0 	 7 	 11 	 1
782 	 1 	 11 	 0 	 7 	 11 	 0
783 	 1 	 11 	 11 	 0 	 7 	 0
784 	 1 	 11 	 11 	 0 	 7 	 0
785 	 1 	 11 	 11 	 0 	 7 	 0
786 	 1 	 11 	 11 	 0 	 7 	 0
787 	 1 	 11 	 11 	 0 	 7 	 0
788 	 1 	 11 	 11 	 0 	 7 	 0
789 	 1 	 11 	 11 	 0 	 7 	 0
790 	 0 	 0 	 11 	 0 	 7 	 0
791 	 1 	 0 	 0 	 11 	 0 	 0
792 	 1 	 0 	 0 	 11 	

In [135]:
# Length check on the break-flag list.
len(alt_break_list)

209

In [154]:
# Count cluster changes, resetting the count when a change lands on a cluster that
# matches neither recorded previous shot while the recorded break flag is set.
alt_counter = 0
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'prev_1\t', 'prev2\t', 'altbrk\t', 'altcount\t')

for frame_file, mcu_flag, cluster, prev_frame, prev_shot_1, prev_shot_2, alt_break in zip(frame_choice, y_pred_values, hac.labels_, prev_frame_list, prev_shot_1_list, prev_shot_2_list, alt_break_list):
    if cluster != prev_frame:
        alt_counter += 1
        unseen_cluster = cluster not in (prev_shot_1, prev_shot_2)
        if unseen_cluster and alt_break == 1:
            alt_counter = 0
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', prev_shot_1, '\t', prev_shot_2, '\t', alt_break, '\t', alt_counter)


frame	 mcu	 clust	 prvfrm	 prev_1	 prev2	 altbrk	 altcount	
761 	 0 	 9 	 2000 	 1000 	 1001 	 0 	 1
762 	 0 	 9 	 9 	 9 	 1001 	 1 	 1
763 	 0 	 9 	 9 	 9 	 1001 	 1 	 1
764 	 1 	 11 	 9 	 9 	 1001 	 1 	 0
765 	 1 	 11 	 11 	 9 	 9 	 1 	 0
766 	 1 	 11 	 11 	 9 	 9 	 1 	 0
767 	 1 	 11 	 11 	 9 	 9 	 1 	 0
768 	 1 	 11 	 11 	 9 	 9 	 1 	 0
769 	 0 	 7 	 11 	 9 	 9 	 1 	 0
770 	 0 	 7 	 7 	 11 	 9 	 1 	 0
771 	 0 	 7 	 7 	 11 	 9 	 1 	 0
772 	 0 	 7 	 7 	 11 	 9 	 1 	 0
773 	 0 	 7 	 7 	 11 	 9 	 1 	 0
774 	 0 	 7 	 7 	 11 	 9 	 1 	 0
775 	 0 	 7 	 7 	 11 	 9 	 1 	 0
776 	 0 	 7 	 7 	 11 	 9 	 1 	 0
777 	 0 	 0 	 7 	 11 	 9 	 1 	 0
778 	 0 	 0 	 0 	 7 	 11 	 1 	 0
779 	 0 	 0 	 0 	 7 	 11 	 1 	 0
780 	 1 	 0 	 0 	 7 	 11 	 1 	 0
781 	 1 	 0 	 0 	 7 	 11 	 1 	 0
782 	 1 	 11 	 0 	 7 	 11 	 1 	 1
783 	 1 	 11 	 11 	 0 	 7 	 0 	 1
784 	 1 	 11 	 11 	 0 	 7 	 0 	 1
785 	 1 	 11 	 11 	 0 	 7 	 0 	 1
786 	 1 	 11 	 11 	 0 	 7 	 0 	 1
787 	 1 	 11 	 11 	 0 	 7 	 0 	 1
788 	 1 	 11 	 11 	 0 	 

# Transition to Shot ID

In [12]:
# Give every run of consecutive frames that share a cluster label its own shot id,
# starting from 0.
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'shotid\t')
shot_id = 0
shot_id_list = []
prev_frame = 1000  # sentinel: no previous frame yet

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    first_frame = prev_frame == 1000
    if not first_frame and cluster != prev_frame:
        # Cluster changed relative to the previous frame: a new shot begins.
        shot_id = shot_id + 1
    shot_id_list.append(shot_id)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', shot_id,'\tend')
    prev_frame = cluster

# return shot_id_list only

frame	 mcu	 clust	 prvfrm	 shotid	
661 	 0 	 14 	 1000 	 0 	end
662 	 0 	 14 	 14 	 0 	end
663 	 1 	 3 	 14 	 1 	end
664 	 1 	 3 	 3 	 1 	end
665 	 1 	 3 	 3 	 1 	end
666 	 1 	 15 	 3 	 2 	end
667 	 1 	 15 	 15 	 2 	end
668 	 1 	 15 	 15 	 2 	end
669 	 1 	 15 	 15 	 2 	end
670 	 1 	 3 	 15 	 3 	end
671 	 1 	 3 	 3 	 3 	end
672 	 1 	 3 	 3 	 3 	end
673 	 1 	 3 	 3 	 3 	end
674 	 1 	 3 	 3 	 3 	end
675 	 0 	 0 	 3 	 4 	end
676 	 0 	 14 	 0 	 5 	end
677 	 0 	 14 	 14 	 5 	end
678 	 0 	 14 	 14 	 5 	end
679 	 0 	 14 	 14 	 5 	end
680 	 0 	 15 	 14 	 6 	end
681 	 0 	 15 	 15 	 6 	end
682 	 0 	 8 	 15 	 7 	end
683 	 0 	 8 	 8 	 7 	end
684 	 0 	 8 	 8 	 7 	end
685 	 1 	 4 	 8 	 8 	end
686 	 1 	 4 	 4 	 8 	end
687 	 1 	 4 	 4 	 8 	end
688 	 1 	 4 	 4 	 8 	end
689 	 1 	 24 	 4 	 9 	end
690 	 1 	 24 	 24 	 9 	end
691 	 1 	 24 	 24 	 9 	end
692 	 0 	 16 	 24 	 10 	end
693 	 0 	 16 	 16 	 10 	end
694 	 0 	 32 	 16 	 11 	end
695 	 1 	 32 	 32 	 11 	end
696 	 1 	 32 	 32 	 11 	end
697 	 1 	 32 	 32 

1018 	 0 	 5 	 5 	 83 	end
1019 	 1 	 5 	 5 	 83 	end
1020 	 1 	 5 	 5 	 83 	end
1021 	 1 	 18 	 5 	 84 	end
1022 	 1 	 18 	 18 	 84 	end
1023 	 1 	 18 	 18 	 84 	end
1024 	 0 	 1 	 18 	 85 	end
1025 	 0 	 1 	 1 	 85 	end
1026 	 0 	 1 	 1 	 85 	end
1027 	 0 	 17 	 1 	 86 	end
1028 	 0 	 17 	 17 	 86 	end
1029 	 0 	 17 	 17 	 86 	end
1030 	 0 	 17 	 17 	 86 	end
1031 	 0 	 9 	 17 	 87 	end
1032 	 0 	 9 	 9 	 87 	end
1033 	 0 	 9 	 9 	 87 	end
1034 	 0 	 1 	 9 	 88 	end
1035 	 0 	 1 	 1 	 88 	end
1036 	 0 	 1 	 1 	 88 	end
1037 	 0 	 26 	 1 	 89 	end
1038 	 1 	 26 	 26 	 89 	end
1039 	 1 	 26 	 26 	 89 	end
1040 	 1 	 26 	 26 	 89 	end
1041 	 0 	 26 	 26 	 89 	end
1042 	 1 	 26 	 26 	 89 	end
1043 	 1 	 26 	 26 	 89 	end
1044 	 1 	 26 	 26 	 89 	end
1045 	 0 	 1 	 26 	 90 	end
1046 	 0 	 1 	 1 	 90 	end
1047 	 0 	 1 	 1 	 90 	end
1048 	 0 	 21 	 1 	 91 	end
1049 	 0 	 21 	 21 	 91 	end


In [187]:
# Display the shot id recorded for each frame.
shot_id_list

[0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 6,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 12,
 12,
 13,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 14,
 15,
 15,
 15,
 15,
 15,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 16,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 17,
 18,
 18,
 18,
 18,
 18,
 19,
 19,
 19,
 20,
 20,
 21,
 21,
 22,
 22,
 22,
 23,
 23,
 23,
 23,
 23,
 24,
 24,
 24,
 24,
 24,
 25,
 25,
 25,
 26,
 26,
 26,
 27,
 27,
 27,
 27,
 27,
 27,
 28,
 28,
 28,
 28,
 29,
 29,
 30,
 30,
 30,
 30,
 31,
 31,
 31,
 31,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 32,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 34,
 34,
 34,
 35,
 35,
 35,
 35,
 36,
 36,
 36,
 36,
 37,
 37,
 37,
 37,
 38,
 38,
 38,
 38,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 39,
 40,
 40,
 40,
 40,
 40,
 40,
 40,
 41,
 41,
 41,
 41]

In [13]:
# For every frame, record the cluster of the current shot (prev_clust_0) and of the
# three shots before it (prev_clust_1 .. prev_clust_3).
# 1000-1003 are sentinels meaning "no such shot yet".
prev_clust_0, prev_clust_1, prev_clust_2, prev_clust_3 = 1000, 1001, 1002, 1003
prev_clust_0_list, prev_clust_1_list, prev_clust_2_list, prev_clust_3_list = [], [], [], []
prev_shot_id = -1
print('frame\t', 'mcu\t', 'clust\t', 'shotid\t', 'prvshid','prev0\t', 'prev1\t', 'prev2\t', 'prev3\t')

for frame_file, cluster, mcu_flag, shot_id in zip(frame_choice, hac.labels_, y_pred_values, shot_id_list):
    if shot_id != prev_shot_id:
        # New shot: shift the history one slot back and put the new cluster at the front.
        prev_clust_3, prev_clust_2, prev_clust_1, prev_clust_0 = prev_clust_2, prev_clust_1, prev_clust_0, cluster
    for history, value in (
        (prev_clust_0_list, prev_clust_0),
        (prev_clust_1_list, prev_clust_1),
        (prev_clust_2_list, prev_clust_2),
        (prev_clust_3_list, prev_clust_3),
    ):
        history.append(value)
    print(frame_file, '\t', mcu_flag, '\t', cluster,'\t', shot_id, '\t', prev_shot_id, '\t', prev_clust_0, '\t', prev_clust_1, '\t', prev_clust_2, '\t', prev_clust_3, '\tend')
    prev_shot_id = shot_id
    
# return prev_clust_1, prev_clust_2, and prev_clust_3 only


frame	 mcu	 clust	 shotid	 prvshid prev0	 prev1	 prev2	 prev3	
661 	 0 	 14 	 0 	 -1 	 14 	 1000 	 1001 	 1002 	end
662 	 0 	 14 	 0 	 0 	 14 	 1000 	 1001 	 1002 	end
663 	 1 	 3 	 1 	 0 	 3 	 14 	 1000 	 1001 	end
664 	 1 	 3 	 1 	 1 	 3 	 14 	 1000 	 1001 	end
665 	 1 	 3 	 1 	 1 	 3 	 14 	 1000 	 1001 	end
666 	 1 	 15 	 2 	 1 	 15 	 3 	 14 	 1000 	end
667 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	 1000 	end
668 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	 1000 	end
669 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	 1000 	end
670 	 1 	 3 	 3 	 2 	 3 	 15 	 3 	 14 	end
671 	 1 	 3 	 3 	 3 	 3 	 15 	 3 	 14 	end
672 	 1 	 3 	 3 	 3 	 3 	 15 	 3 	 14 	end
673 	 1 	 3 	 3 	 3 	 3 	 15 	 3 	 14 	end
674 	 1 	 3 	 3 	 3 	 3 	 15 	 3 	 14 	end
675 	 0 	 0 	 4 	 3 	 0 	 3 	 15 	 3 	end
676 	 0 	 14 	 5 	 4 	 14 	 0 	 3 	 15 	end
677 	 0 	 14 	 5 	 5 	 14 	 0 	 3 	 15 	end
678 	 0 	 14 	 5 	 5 	 14 	 0 	 3 	 15 	end
679 	 0 	 14 	 5 	 5 	 14 	 0 	 3 	 15 	end
680 	 0 	 15 	 6 	 5 	 15 	 14 	 0 	 3 	end
681 	 0 	 15 	 6 	 6 	 

854 	 0 	 25 	 49 	 49 	 25 	 7 	 19 	 7 	end
855 	 0 	 25 	 49 	 49 	 25 	 7 	 19 	 7 	end
856 	 0 	 25 	 49 	 49 	 25 	 7 	 19 	 7 	end
857 	 0 	 25 	 49 	 49 	 25 	 7 	 19 	 7 	end
858 	 0 	 25 	 49 	 49 	 25 	 7 	 19 	 7 	end
859 	 1 	 19 	 50 	 49 	 19 	 25 	 7 	 19 	end
860 	 1 	 19 	 50 	 50 	 19 	 25 	 7 	 19 	end
861 	 1 	 19 	 50 	 50 	 19 	 25 	 7 	 19 	end
862 	 1 	 19 	 50 	 50 	 19 	 25 	 7 	 19 	end
863 	 1 	 19 	 50 	 50 	 19 	 25 	 7 	 19 	end
864 	 0 	 6 	 51 	 50 	 6 	 19 	 25 	 7 	end
865 	 0 	 6 	 51 	 51 	 6 	 19 	 25 	 7 	end
866 	 0 	 6 	 51 	 51 	 6 	 19 	 25 	 7 	end
867 	 1 	 19 	 52 	 51 	 19 	 6 	 19 	 25 	end
868 	 1 	 19 	 52 	 52 	 19 	 6 	 19 	 25 	end
869 	 1 	 7 	 53 	 52 	 7 	 19 	 6 	 19 	end
870 	 1 	 7 	 53 	 53 	 7 	 19 	 6 	 19 	end
871 	 1 	 19 	 54 	 53 	 19 	 7 	 19 	 6 	end
872 	 1 	 19 	 54 	 54 	 19 	 7 	 19 	 6 	end
873 	 1 	 19 	 54 	 54 	 19 	 7 	 19 	 6 	end
874 	 1 	 7 	 55 	 54 	 7 	 19 	 7 	 19 	end
875 	 1 	 7 	 55 	 55 	 7 	 19 	 

1040 	 1 	 26 	 89 	 89 	 26 	 1 	 9 	 17 	end
1041 	 0 	 26 	 89 	 89 	 26 	 1 	 9 	 17 	end
1042 	 1 	 26 	 89 	 89 	 26 	 1 	 9 	 17 	end
1043 	 1 	 26 	 89 	 89 	 26 	 1 	 9 	 17 	end
1044 	 1 	 26 	 89 	 89 	 26 	 1 	 9 	 17 	end
1045 	 0 	 1 	 90 	 89 	 1 	 26 	 1 	 9 	end
1046 	 0 	 1 	 90 	 90 	 1 	 26 	 1 	 9 	end
1047 	 0 	 1 	 90 	 90 	 1 	 26 	 1 	 9 	end
1048 	 0 	 21 	 91 	 90 	 21 	 1 	 26 	 1 	end
1049 	 0 	 21 	 91 	 91 	 21 	 1 	 26 	 1 	end


In [270]:
# Count alternations shot by shot. A shot whose cluster matches neither of the two
# shots before it only raises a "break pending" flag; the count restarts when the next
# new shot also fails to match any of the three shots before it.
alt_break = 0
alt_break_list = []
prev_shot_id = -1
alt_count = 0
alt_count_list = []
print('frame\t', 'mcu\t', 'clust\t', 'shotid', 'previd\t', 'prev_1\t', 'prev2\t', 'prev3\t', 'altbrk\t', 'altcnt')

for frame_file, cluster, mcu_flag, shot_id, prev_clust_1, prev_clust_2, prev_clust_3 in zip(frame_choice, hac.labels_, y_pred_values, shot_id_list, prev_clust_1_list, prev_clust_2_list, prev_clust_3_list):
    if shot_id != prev_shot_id:
        recent_three = (prev_clust_1, prev_clust_2, prev_clust_3)
        if alt_break == 1 and cluster not in recent_three:
            # Break was pending and this shot is not a return (no ABCA / ABCB): restart at 1.
            alt_count = 1
        else:
            alt_break = 0
            alt_count += 1
        if cluster not in (prev_clust_1, prev_clust_2):
            # ABC: alternation is in danger of breaking unless A or B comes next.
            alt_break = 1
    alt_break_list.append(alt_break)
    alt_count_list.append(alt_count)
    print(frame_file, '\t', mcu_flag, '\t', cluster,'\t', shot_id, '\t', prev_shot_id, '\t', prev_clust_1, '\t', prev_clust_2,'\t', prev_clust_3,'\t', alt_break, '\t', alt_count, '\tend')
    prev_shot_id = shot_id

# return alt_count only

frame	 mcu	 clust	 shotid previd	 prev_1	 prev2	 prev3	 altbrk	 altcnt
761 	 0 	 9 	 0 	 -1 	 1000 	 1001 	 1002 	 1 	 1 	end
762 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 1 	end
763 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 1 	end
764 	 1 	 11 	 1 	 0 	 9 	 1000 	 1001 	 1 	 1 	end
765 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	end
766 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	end
767 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	end
768 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	end
769 	 0 	 7 	 2 	 1 	 11 	 9 	 1000 	 1 	 1 	end
770 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
771 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
772 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
773 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
774 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
775 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
776 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	end
777 	 0 	 0 	 3 	 2 	 7 	 11 	 9 	 1 	 1 	end
778 	 0 	 0 	 3 	 3 	 7 	 11 	 9 	 1 	 1 	end
779 	 0 	 0 	 3 	 3 	 

In [245]:
# Length check on the third-previous-cluster history list.
len(prev_clust_3_list)

0

In [None]:
#ABCA or ABCB

In [289]:

# draft
# Earlier version of the alternation tracker; the later cell marked
# "# this is it" supersedes it (that version starts alt_count at -1 and also
# clears alt_break when a new pattern starts).
#
# Walks the frames once and, on the first frame of every new shot, updates
# three running values which are then recorded for every frame:
#   alt_break - 1 when the shot's cluster is not one of the two previous clusters
#   alt_count - number of shots seen since the last pattern reset
#   pattern   - id of the current pattern, bumped on every reset
# Reads frame_choice, hac.labels_, y_pred_values, shot_id_list and the three
# prev_clust_*_list lists, all of which are produced by other cells.
# NOTE(review): the printed output shows prev_clust_1 differing from cluster on
# the first frame of a shot, so the prev_clust_* lists are presumably the
# clusters of the shots *before* the current one - confirm which cell built them.
alt_break = 0
alt_break_list = []
prev_shot_id = -1  # no real shot id is negative in the printed output, so the first frame opens a shot
alt_count = 0
alt_count_list = []
pattern = 0
pattern_list = []
print('frame\t', 'mcu\t', 'clust\t', 'shotid', 'previd\t', 'prev_1\t', 'prev2\t', 'prev3\t', 'altbrk\t', 'altcnt\t', 'pattern')

# mcu_flag is unpacked only so that it can be printed; it does not affect the counters.
for frame_file, cluster, mcu_flag, shot_id, prev_clust_1, prev_clust_2, prev_clust_3 in zip(frame_choice, hac.labels_, y_pred_values, shot_id_list, prev_clust_1_list, prev_clust_2_list, prev_clust_3_list):
    # print(frame_file, '\t', mcu_flag, '\t', cluster,'\t', shot_id, '\t', prev_shot_id, '\t', prev_clust_1, '\t', prev_clust_2,'\t', prev_clust_3,'\t', alt_break, '\t', alt_count, '\tbeg')
    # State only changes on the first frame of a shot; later frames of the
    # same shot just repeat the current values.
    if shot_id != prev_shot_id:


        # Break flag already raised and this shot's cluster matches none of the
        # three previous clusters: restart the counter and open a new pattern.
        # A return to one of the recent clusters (the ABCA / ABCB case) falls
        # through to the else branch, which clears the flag instead.
        if alt_break == 1 and cluster not in [prev_clust_1, prev_clust_2, prev_clust_3]:
            alt_count = 0
            pattern += 1
        else:
            alt_break = 0

        # Every new shot counts, so a freshly reset counter reads 1, not 0.
        alt_count += 1

        # ABC: the cluster is neither of the two previous ones, so the
        # alternation is in danger of breaking unless A or B comes next.
        # After a reset above this is always true, so the flag stays at 1.
        if cluster not in [prev_clust_1, prev_clust_2]:
            alt_break = 1


    alt_break_list.append(alt_break)
    alt_count_list.append(alt_count)
    pattern_list.append(pattern)
    print(frame_file, '\t', mcu_flag, '\t', cluster,'\t', shot_id, '\t', prev_shot_id, '\t', prev_clust_1, '\t', prev_clust_2,'\t', prev_clust_3,'\t', alt_break, '\t', alt_count, '\t', pattern, '\tend')
    prev_shot_id = shot_id

# Only alt_count_list and pattern_list are needed by later cells.

frame	 mcu	 clust	 shotid previd	 prev_1	 prev2	 prev3	 altbrk	 altcnt	 pattern
761 	 0 	 9 	 0 	 -1 	 1000 	 1001 	 1002 	 1 	 1 	 0 	end
762 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 1 	 0 	end
763 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 1 	 0 	end
764 	 1 	 11 	 1 	 0 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
765 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
766 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
767 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
768 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 1 	 1 	 1 	end
769 	 0 	 7 	 2 	 1 	 11 	 9 	 1000 	 1 	 1 	 2 	end
770 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
771 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
772 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
773 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
774 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
775 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
776 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 2 	end
777 	 0 	 0 	 3 	 2 	 7 	 11 	 9 	 1 	 1 

917 	 1 	 2 	 32 	 32 	 10 	 2 	 10 	 0 	 5 	 10 	end
918 	 0 	 5 	 33 	 32 	 2 	 10 	 2 	 1 	 6 	 10 	end
919 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
920 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
921 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
922 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
923 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
924 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
925 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
926 	 0 	 5 	 33 	 33 	 2 	 10 	 2 	 1 	 6 	 10 	end
927 	 0 	 0 	 34 	 33 	 5 	 2 	 10 	 1 	 1 	 11 	end
928 	 0 	 0 	 34 	 34 	 5 	 2 	 10 	 1 	 1 	 11 	end
929 	 0 	 0 	 34 	 34 	 5 	 2 	 10 	 1 	 1 	 11 	end
930 	 0 	 9 	 35 	 34 	 0 	 5 	 2 	 1 	 1 	 12 	end
931 	 0 	 9 	 35 	 35 	 0 	 5 	 2 	 1 	 1 	 12 	end
932 	 0 	 9 	 35 	 35 	 0 	 5 	 2 	 1 	 1 	 12 	end
933 	 0 	 9 	 35 	 35 	 0 	 5 	 2 	 1 	 1 	 12 	end
934 	 0 	 6 	 36 	 35 	 9 	 0 	 5 	 1 	 1 	 13 	end
935 	 0 	 6 	 36 	 36 	 9 	 0 	 5 	 1 	 1 	 13 	en

In [295]:
# this is it
# Final alternation tracker. For every frame it records three running values
# that only change on the first frame of a new shot:
#   alt_break - 1 when the shot's cluster is not one of the two previous clusters
#   alt_count - shots counted since the last reset (starts at -1 so that the
#               very first shot is recorded as 0)
#   pattern   - id of the current pattern, incremented on every reset
# Inputs (frame_choice, hac.labels_, y_pred_values, shot_id_list and the
# prev_clust_*_list lists) come from other cells.
# NOTE(review): the printed output shows prev_clust_1 differing from cluster on
# the first frame of a shot, so the prev_clust_* lists are presumably the
# clusters of the shots *before* the current one - confirm which cell built them.
prev_shot_id = -1
alt_break, alt_count, pattern = 0, -1, 0
alt_break_list, alt_count_list, pattern_list = [], [], []
print('frame\t', 'mcu\t', 'clust\t', 'shotid', 'previd\t', 'prev_1\t', 'prev2\t', 'prev3\t', 'altbrk\t', 'altcnt\t', 'pattern')

for frame_file, cluster, mcu_flag, shot_id, prev_clust_1, prev_clust_2, prev_clust_3 in zip(
        frame_choice, hac.labels_, y_pred_values, shot_id_list,
        prev_clust_1_list, prev_clust_2_list, prev_clust_3_list):
    if shot_id != prev_shot_id:
        if alt_break == 1 and cluster not in (prev_clust_1, prev_clust_2, prev_clust_3):
            # The flag was already up and this shot matches none of the last
            # three clusters: the alternation is over, so start a new pattern.
            alt_break = 0
            alt_count = 0
            pattern += 1
        else:
            # Still inside the current pattern. Raise the flag when the cluster
            # is neither of the two previous ones (ABC), clear it otherwise.
            alt_break = 0 if cluster in (prev_clust_1, prev_clust_2) else 1
            alt_count += 1

    # Recorded for every frame, including frames that did not open a shot.
    alt_break_list.append(alt_break)
    alt_count_list.append(alt_count)
    pattern_list.append(pattern)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', shot_id, '\t', prev_shot_id, '\t',
          prev_clust_1, '\t', prev_clust_2, '\t', prev_clust_3, '\t',
          alt_break, '\t', alt_count, '\t', pattern, '\tend')
    prev_shot_id = shot_id

# Only alt_count_list and pattern_list are needed by later cells.

frame	 mcu	 clust	 shotid previd	 prev_1	 prev2	 prev3	 altbrk	 altcnt	 pattern
761 	 0 	 9 	 0 	 -1 	 1000 	 1001 	 1002 	 1 	 0 	 0 	end
762 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 0 	 0 	end
763 	 0 	 9 	 0 	 0 	 1000 	 1001 	 1002 	 1 	 0 	 0 	end
764 	 1 	 11 	 1 	 0 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
765 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
766 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
767 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
768 	 1 	 11 	 1 	 1 	 9 	 1000 	 1001 	 0 	 0 	 1 	end
769 	 0 	 7 	 2 	 1 	 11 	 9 	 1000 	 1 	 1 	 1 	end
770 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
771 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
772 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
773 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
774 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
775 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
776 	 0 	 7 	 2 	 2 	 11 	 9 	 1000 	 1 	 1 	 1 	end
777 	 0 	 0 	 3 	 2 	 7 	 11 	 9 	 0 	 0 

In [276]:
# One row per frame: cluster label, MCU prediction, alternation counter and
# pattern id. The display limit is raised so the head() call below is not
# truncated.
pd.set_option('display.max_rows', 210)
scene_df = pd.DataFrame(
    zip(frame_choice, hac.labels_, y_pred_values, alt_count_list, pattern_list),
    columns=['frame_file', 'cluster', 'mcu', 'alternation', 'pattern'],
)
scene_df.head(210)

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
0,761,9,0,1,0
1,762,9,0,1,0
2,763,9,0,1,0
3,764,11,1,1,1
4,765,11,1,1,1
5,766,11,1,1,1
6,767,11,1,1,1
7,768,11,1,1,1
8,769,7,0,1,2
9,770,7,0,1,2


In [296]:
okay = [2, 10]
scene_df[scene_df.cluster.isin(okay)]

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
53,814,2,1,1,5
54,815,2,1,1,5
55,816,2,1,1,5
56,817,2,1,1,5
57,818,2,1,1,5
58,819,2,1,1,5
59,820,10,1,1,6
60,821,10,1,1,6
61,822,10,1,1,6
62,823,2,1,2,6


In [278]:
# Pattern ids in which the alternation counter exceeded 2 at least once,
# followed by the shape of the frame table restricted to those patterns.
patterns = list(scene_df.loc[scene_df['alternation'] > 2, 'pattern'].unique())
scene_df.loc[scene_df['pattern'].isin(patterns)].shape

(157, 5)

In [280]:
pattern_df = scene_df[scene_df.pattern.isin(patterns)]

In [281]:
pattern_df

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
16,777,0,0,1,3
17,778,0,0,1,3
18,779,0,0,1,3
19,780,0,1,1,3
20,781,0,1,1,3
21,782,11,1,2,3
22,783,11,1,2,3
23,784,11,1,2,3
24,785,11,1,2,3
25,786,11,1,2,3


In [286]:
# Per-pattern MCU summary: number of frames and share of frames flagged as MCU
# for each pattern id kept in pattern_df. Ids with no rows print a count of 0
# and a mean of nan.
# NOTE(review): the upper bound 15 is hard-coded, so pattern ids above 14
# would not be listed.
print('pattern\t', 'count\t', 'mean\t')
for x in range(0, 15):
    # The mask is built from pattern_df itself so that the indexer and the
    # frame being indexed share the same index, and the selection is done once
    # per pattern instead of once per statistic.
    pattern_mcu = pattern_df.loc[pattern_df['pattern'] == x, 'mcu']
    print(x, '\t', pattern_mcu.count(), '\t', pattern_mcu.mean())

pattern	 count	 mean	
0 	 0 	 nan
1 	 0 	 nan
2 	 0 	 nan
3 	 35 	 0.4857142857142857
4 	 0 	 nan
5 	 0 	 nan
6 	 44 	 0.75
7 	 20 	 0.6
8 	 0 	 nan
9 	 0 	 nan
10 	 31 	 0.6774193548387096
11 	 0 	 nan
12 	 0 	 nan
13 	 0 	 nan
14 	 27 	 0.3333333333333333


In [288]:
pattern_df.loc[pattern_df['pattern'] == 6]

Unnamed: 0,frame_file,cluster,mcu,alternation,pattern
59,820,10,1,1,6
60,821,10,1,1,6
61,822,10,1,1,6
62,823,2,1,2,6
63,824,2,1,2,6
64,825,10,1,3,6
65,826,2,1,4,6
66,827,2,1,4,6
67,828,2,1,4,6
68,829,2,1,4,6


In [None]:
#establish ABA first, and then allow for C

# establish A and B in memory
# when interrupted, alt_break = 1, establish C in memory, if next is A or B, alt_break back to 0

# store C in memory for each A/B pattern
# scene is first A through last B, and then any Cs on the end

# cluster, prev_cluster_1, prev_cluster_2
# if ABA, check for MCU
# look for first A/B, last A/B, and get all Cs in between
# scene is first A/B, last A/B, and any connecting Cs

In [14]:
# Give every frame a running shot id: the id goes up by one each time the
# cluster label differs from the label of the frame before it.
# Reads frame_choice, hac.labels_ and y_pred_values from other cells;
# mcu_flag is only printed.
print('frame\t', 'mcu\t', 'clust\t', 'prvfrm\t', 'shotid\t')
shot_id_list = []
shot_id = 0
# Despite its name this holds the previous frame's *cluster* label.
# 1000 is a sentinel meaning "no previous frame yet".
prev_frame = 1000

for frame_file, cluster, mcu_flag in zip(frame_choice, hac.labels_, y_pred_values):
    # The very first frame never opens a new shot; it belongs to shot 0.
    if prev_frame != 1000 and prev_frame != cluster:
        shot_id += 1
    shot_id_list.append(shot_id)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', prev_frame, '\t', shot_id,'\tend')
    prev_frame = cluster

# Only shot_id_list is needed by later cells.

frame	 mcu	 clust	 prvfrm	 shotid	
661 	 0 	 14 	 1000 	 0 	end
662 	 0 	 14 	 14 	 0 	end
663 	 1 	 3 	 14 	 1 	end
664 	 1 	 3 	 3 	 1 	end
665 	 1 	 3 	 3 	 1 	end
666 	 1 	 15 	 3 	 2 	end
667 	 1 	 15 	 15 	 2 	end
668 	 1 	 15 	 15 	 2 	end
669 	 1 	 15 	 15 	 2 	end
670 	 1 	 3 	 15 	 3 	end
671 	 1 	 3 	 3 	 3 	end
672 	 1 	 3 	 3 	 3 	end
673 	 1 	 3 	 3 	 3 	end
674 	 1 	 3 	 3 	 3 	end
675 	 0 	 0 	 3 	 4 	end
676 	 0 	 14 	 0 	 5 	end
677 	 0 	 14 	 14 	 5 	end
678 	 0 	 14 	 14 	 5 	end
679 	 0 	 14 	 14 	 5 	end
680 	 0 	 15 	 14 	 6 	end
681 	 0 	 15 	 15 	 6 	end
682 	 0 	 8 	 15 	 7 	end
683 	 0 	 8 	 8 	 7 	end
684 	 0 	 8 	 8 	 7 	end
685 	 1 	 4 	 8 	 8 	end
686 	 1 	 4 	 4 	 8 	end
687 	 1 	 4 	 4 	 8 	end
688 	 1 	 4 	 4 	 8 	end
689 	 1 	 24 	 4 	 9 	end
690 	 1 	 24 	 24 	 9 	end
691 	 1 	 24 	 24 	 9 	end
692 	 0 	 16 	 24 	 10 	end
693 	 0 	 16 	 16 	 10 	end
694 	 0 	 32 	 16 	 11 	end
695 	 1 	 32 	 32 	 11 	end
696 	 1 	 32 	 32 	 11 	end
697 	 1 	 32 	 32 

1006 	 1 	 18 	 18 	 80 	end
1007 	 1 	 5 	 18 	 81 	end
1008 	 1 	 5 	 5 	 81 	end
1009 	 1 	 5 	 5 	 81 	end
1010 	 1 	 5 	 5 	 81 	end
1011 	 0 	 1 	 5 	 82 	end
1012 	 0 	 1 	 1 	 82 	end
1013 	 0 	 1 	 1 	 82 	end
1014 	 0 	 1 	 1 	 82 	end
1015 	 0 	 1 	 1 	 82 	end
1016 	 0 	 1 	 1 	 82 	end
1017 	 1 	 5 	 1 	 83 	end
1018 	 0 	 5 	 5 	 83 	end
1019 	 1 	 5 	 5 	 83 	end
1020 	 1 	 5 	 5 	 83 	end
1021 	 1 	 18 	 5 	 84 	end
1022 	 1 	 18 	 18 	 84 	end
1023 	 1 	 18 	 18 	 84 	end
1024 	 0 	 1 	 18 	 85 	end
1025 	 0 	 1 	 1 	 85 	end
1026 	 0 	 1 	 1 	 85 	end
1027 	 0 	 17 	 1 	 86 	end
1028 	 0 	 17 	 17 	 86 	end
1029 	 0 	 17 	 17 	 86 	end
1030 	 0 	 17 	 17 	 86 	end
1031 	 0 	 9 	 17 	 87 	end
1032 	 0 	 9 	 9 	 87 	end
1033 	 0 	 9 	 9 	 87 	end
1034 	 0 	 1 	 9 	 88 	end
1035 	 0 	 1 	 1 	 88 	end
1036 	 0 	 1 	 1 	 88 	end
1037 	 0 	 26 	 1 	 89 	end
1038 	 1 	 26 	 26 	 89 	end
1039 	 1 	 26 	 26 	 89 	end
1040 	 1 	 26 	 26 	 89 	end
1041 	 0 	 26 	 26 	 89 	end
10

### This is the only one needed

In [64]:
# Keeps a rolling history of the three most recent shot clusters and records
# it for every frame. Along the way it collects candidate speaker pairs
# whenever the history shows a two-cluster alternation.
# Reads frame_choice, hac.labels_, y_pred_values and shot_id_list from other
# cells; mcu_flag is only printed.
# 1001 / 1002 / 1003 are distinct placeholder values for "no shot yet".
prev_clust_1, prev_clust_2, prev_clust_3 = 1001, 1002, 1003
prev_clust_1_list, prev_clust_2_list, prev_clust_3_list = [], [], []
speaker_a_list, speaker_b_list = [], []
prev_shot_id = -1

print('frame\t', 'mcu\t', 'clust\t', 'shotid\t', 'prvshid', 'prev1\t', 'prev2\t', 'prev3\t')

for frame_file, cluster, mcu_flag, shot_id in zip(frame_choice, hac.labels_, y_pred_values, shot_id_list):
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', shot_id, '\t', prev_shot_id, '\t',
          prev_clust_1, '\t', prev_clust_2, '\t', prev_clust_3, '\tbeg')

    # Checked before the history is shifted: the incoming cluster equals the
    # second-most-recent entry and the most recent entry equals the third.
    # The pair is stored as (lower label, higher label).
    if prev_clust_1 == prev_clust_3 and cluster == prev_clust_2:
        print('found one')
        speaker_a_list.append(min(cluster, prev_clust_1))
        speaker_b_list.append(max(cluster, prev_clust_1))

    # On the first frame of a shot, push the new cluster onto the history.
    # From here on prev_clust_1 is the cluster of the current shot.
    if shot_id != prev_shot_id:
        prev_clust_3, prev_clust_2, prev_clust_1 = prev_clust_2, prev_clust_1, cluster
        prev_shot_id = shot_id

    prev_clust_1_list.append(prev_clust_1)
    prev_clust_2_list.append(prev_clust_2)
    prev_clust_3_list.append(prev_clust_3)
    print(frame_file, '\t', mcu_flag, '\t', cluster, '\t', shot_id, '\t', prev_shot_id, '\t',
          prev_clust_1, '\t', prev_clust_2, '\t', prev_clust_3, '\tend')

# Only the three prev_clust_*_list lists are needed by later cells.

frame	 mcu	 clust	 shotid	 prvshid prev1	 prev2	 prev3	
661 	 0 	 14 	 0 	 -1 	 1001 	 1002 	 1003 	beg
661 	 0 	 14 	 0 	 0 	 14 	 1001 	 1002 	end
662 	 0 	 14 	 0 	 0 	 14 	 1001 	 1002 	beg
662 	 0 	 14 	 0 	 0 	 14 	 1001 	 1002 	end
663 	 1 	 3 	 1 	 0 	 14 	 1001 	 1002 	beg
663 	 1 	 3 	 1 	 1 	 3 	 14 	 1001 	end
664 	 1 	 3 	 1 	 1 	 3 	 14 	 1001 	beg
664 	 1 	 3 	 1 	 1 	 3 	 14 	 1001 	end
665 	 1 	 3 	 1 	 1 	 3 	 14 	 1001 	beg
665 	 1 	 3 	 1 	 1 	 3 	 14 	 1001 	end
666 	 1 	 15 	 2 	 1 	 3 	 14 	 1001 	beg
666 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	end
667 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	beg
667 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	end
668 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	beg
668 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	end
669 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	beg
669 	 1 	 15 	 2 	 2 	 15 	 3 	 14 	end
670 	 1 	 3 	 3 	 2 	 15 	 3 	 14 	beg
670 	 1 	 3 	 3 	 3 	 3 	 15 	 3 	end
671 	 1 	 3 	 3 	 3 	 3 	 15 	 3 	beg
671 	 1 	 3 	 3 	 3 	 3 	 15 	 3 	end
672 	 1 	 3 	 3 	 3 	 3 	 15 	 3 	beg

769 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	end
770 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	beg
770 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	end
771 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	beg
771 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	end
772 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	beg
772 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	end
773 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	beg
773 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	end
774 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	beg
774 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	end
775 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	beg
775 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	end
776 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	beg
776 	 0 	 29 	 34 	 34 	 29 	 20 	 30 	end
777 	 0 	 2 	 35 	 34 	 29 	 20 	 30 	beg
777 	 0 	 2 	 35 	 35 	 2 	 29 	 20 	end
778 	 0 	 2 	 35 	 35 	 2 	 29 	 20 	beg
778 	 0 	 2 	 35 	 35 	 2 	 29 	 20 	end
779 	 0 	 2 	 35 	 35 	 2 	 29 	 20 	beg
779 	 0 	 2 	 35 	 35 	 2 	 29 	 20 	end
780 	 1 	 2 	 35 	 35 	 2 	 29 	 20 	beg
780 	 1 	 2 	 35 	 35 	 2 	 29 	 20 	end
781 	 1 	 2 	 35 	 35 	 2 

876 	 1 	 7 	 55 	 55 	 7 	 19 	 7 	beg
876 	 1 	 7 	 55 	 55 	 7 	 19 	 7 	end
877 	 1 	 7 	 55 	 55 	 7 	 19 	 7 	beg
877 	 1 	 7 	 55 	 55 	 7 	 19 	 7 	end
878 	 1 	 7 	 55 	 55 	 7 	 19 	 7 	beg
878 	 1 	 7 	 55 	 55 	 7 	 19 	 7 	end
879 	 0 	 6 	 56 	 55 	 7 	 19 	 7 	beg
879 	 0 	 6 	 56 	 56 	 6 	 7 	 19 	end
880 	 0 	 6 	 56 	 56 	 6 	 7 	 19 	beg
880 	 0 	 6 	 56 	 56 	 6 	 7 	 19 	end
881 	 0 	 6 	 56 	 56 	 6 	 7 	 19 	beg
881 	 0 	 6 	 56 	 56 	 6 	 7 	 19 	end
882 	 0 	 6 	 56 	 56 	 6 	 7 	 19 	beg
882 	 0 	 6 	 56 	 56 	 6 	 7 	 19 	end
883 	 0 	 6 	 56 	 56 	 6 	 7 	 19 	beg
883 	 0 	 6 	 56 	 56 	 6 	 7 	 19 	end
884 	 0 	 2 	 57 	 56 	 6 	 7 	 19 	beg
884 	 0 	 2 	 57 	 57 	 2 	 6 	 7 	end
885 	 0 	 2 	 57 	 57 	 2 	 6 	 7 	beg
885 	 0 	 2 	 57 	 57 	 2 	 6 	 7 	end
886 	 0 	 2 	 57 	 57 	 2 	 6 	 7 	beg
886 	 0 	 2 	 57 	 57 	 2 	 6 	 7 	end
887 	 1 	 19 	 58 	 57 	 2 	 6 	 7 	beg
887 	 1 	 19 	 58 	 58 	 19 	 2 	 6 	end
888 	 1 	 19 	 58 	 58 	 19 	 2 	 6 	beg
888

In [65]:
speaker_a_list

[16, 0, 0, 2, 2, 7, 7, 7, 7, 7, 7, 7]

In [127]:
type(speaker_a_list[0])

numpy.int64

In [135]:
# Reduce the detected (a, b) speaker labels to a list of unique pairs, keeping
# first-seen order. Labels are cast to plain int before being stored.
speaker_pairs = []
for a, b in zip(speaker_a_list, speaker_b_list):
    if [int(a), int(b)] in speaker_pairs:
        continue
    speaker_pairs.append([int(a), int(b)])

In [143]:
speaker_pairs

[[16, 32], [0, 32], [2, 20], [7, 19]]

In [138]:
speaker_pairs[0][0]

16

In [179]:
# Turn each candidate speaker pair into a (first_frame, last_frame) scene.
#
# A pair is kept only when more than half of the frames of *both* clusters are
# flagged as MCU. The scene initially spans from the first to the last frame
# of either cluster; every cluster seen strictly inside that span counts as a
# "cutaway". The span is then grown frame by frame in both directions for as
# long as the neighbouring frame belongs to one of those cutaway clusters.
# Reads scene_df and speaker_pairs from other cells; results go to `scenes`.
scenes = []

# frame number -> cluster label. Looking neighbours up here (instead of calling
# int() on a filtered Series and catching TypeError) makes "frame not present"
# an explicit None rather than a swallowed exception.
# Assumes frame_file values are unique - TODO confirm.
cluster_by_frame = dict(zip(scene_df['frame_file'].tolist(), scene_df['cluster'].tolist()))

for pair in speaker_pairs:
    mean_a = scene_df.loc[scene_df['cluster'] == pair[0], 'mcu'].mean()
    mean_b = scene_df.loc[scene_df['cluster'] == pair[1], 'mcu'].mean()
    # A cluster with no rows has a nan mean, which fails the comparison and
    # skips the pair.
    if mean_a > .5 and mean_b > .5:
        speaker_frames = scene_df.loc[scene_df['cluster'].isin(pair), 'frame_file']
        dialogue_start = speaker_frames.min()
        dialogue_end = speaker_frames.max()
        cutaways = scene_df.loc[(scene_df['frame_file'] > dialogue_start) & (scene_df['frame_file'] < dialogue_end)].cluster.unique()
        print(pair)
        print(dialogue_start, dialogue_end)
        print(cutaways)

        cutaway_labels = set(cutaways.tolist())

        # Grow backwards while the frame just before the scene is a cutaway
        # cluster; stops at the first other cluster or at a missing frame.
        while cluster_by_frame.get(dialogue_start - 1) in cutaway_labels:
            print(dialogue_start)
            dialogue_start -= 1

        # Same in the forward direction.
        while cluster_by_frame.get(dialogue_end + 1) in cutaway_labels:
            print(dialogue_end)
            dialogue_end += 1

        scenes.append((dialogue_start, dialogue_end))

[0, 32]
675 757
[14 15  8  4 24 16 32  0  3]
675
674
673
672
671
670
669
668
667
666
665
664
663
662
757
758
759
[7, 19]
814 917
[ 7 19 25  6  2]
814
813


In [180]:
scenes

[(661, 760), (812, 917)]

In [139]:
# For each candidate speaker pair, print the first and the last frame in which
# either of its two clusters appears.
for pair in speaker_pairs:
    first_speaker = scene_df.loc[scene_df['cluster'].isin(pair), 'frame_file'].min()
    last_speaker = scene_df.loc[scene_df['cluster'].isin(pair), 'frame_file'].max()
    print(first_speaker, last_speaker)

692 757
675 757
764 957
814 917


In [131]:
for pair in speaker_pairs:
    print(pair)

[16, 32]
[0, 32]
[2, 20]
[7, 19]


In [71]:
# First frame whose cluster is either `a` or `b`.
# NOTE(review): `a` and `b` are not assigned in this cell; they are whatever an
# earlier-run cell left behind (presumably the last speaker pair of a loop) -
# confirm, since this cell fails on a fresh kernel.
scene_df.loc[(scene_df['cluster'] == a) | (scene_df['cluster'] == b)].frame_file.min()

814

In [72]:
# Last frame whose cluster is either `a` or `b`.
# NOTE(review): `a` and `b` are not assigned in this cell; they are whatever an
# earlier-run cell left behind (presumably the last speaker pair of a loop) -
# confirm, since this cell fails on a fresh kernel.
scene_df.loc[(scene_df['cluster'] == a) | (scene_df['cluster'] == b)].frame_file.max()

917

In [38]:
# One row per selected frame: frame number, cluster label and MCU prediction.
# The display limit is raised so the head() call below is not truncated.
pd.set_option('display.max_rows', 400)
scene_df = pd.DataFrame(
    zip(frame_choice, hac.labels_, y_pred_values),
    columns=['frame_file', 'cluster', 'mcu'],
)
scene_df.head(400)

Unnamed: 0,frame_file,cluster,mcu
0,661,14,0
1,662,14,0
2,663,3,1
3,664,3,1
4,665,3,1
5,666,15,1
6,667,15,1
7,668,15,1
8,669,15,1
9,670,3,1


In [40]:
# Frame count and share of MCU-flagged frames for the clusters that make up
# the candidate speaker pairs (32 is listed twice because it is in two pairs).
print('cluster\t', 'count\t', 'mean\t')
for x in [0, 32, 2, 20, 7, 19, 16, 32]:
    print(
        x, '\t',
        scene_df.loc[scene_df['cluster'] == x, 'mcu'].count(), '\t',
        scene_df.loc[scene_df['cluster'] == x, 'mcu'].mean(),
    )

cluster	 count	 mean	
0 	 15 	 0.8666666666666667
32 	 17 	 0.8823529411764706
2 	 33 	 0.24242424242424243
20 	 14 	 1.0
7 	 48 	 0.9791666666666666
19 	 28 	 1.0
16 	 10 	 0.0
32 	 17 	 0.8823529411764706


In [41]:
scene_df.loc[scene_df['cluster'] == 7]

Unnamed: 0,frame_file,cluster,mcu
153,814,7,1
154,815,7,1
155,816,7,1
156,817,7,1
157,818,7,1
158,819,7,1
162,823,7,1
163,824,7,1
165,826,7,1
166,827,7,1


In [54]:
scene_df.loc[(scene_df['frame_file'] > 813) & (scene_df['frame_file'] < 917)].cluster.unique()

array([ 7, 19, 25,  6,  2])

In [55]:
scene_df.loc[(scene_df['frame_file'] > 813) & (scene_df['frame_file'] < 917)]

Unnamed: 0,frame_file,cluster,mcu
153,814,7,1
154,815,7,1
155,816,7,1
156,817,7,1
157,818,7,1
158,819,7,1
159,820,19,1
160,821,19,1
161,822,19,1
162,823,7,1


In [56]:
clusters = [ 7, 19, 25,  6,  2]
scene_df[scene_df.cluster.isin(clusters)]

Unnamed: 0,frame_file,cluster,mcu
116,777,2,0
117,778,2,0
118,779,2,0
119,780,2,1
120,781,2,1
129,790,2,0
130,791,2,1
131,792,2,1
132,793,2,1
134,795,2,1


In [58]:
scene_df.loc[(scene_df['cluster'] == 0) | (scene_df['cluster'] == 32)]

Unnamed: 0,frame_file,cluster,mcu
14,675,0,0
33,694,32,0
34,695,32,1
35,696,32,1
36,697,32,1
44,705,0,1
45,706,0,1
46,707,0,1
47,708,0,1
51,712,0,1


In [59]:
scene_df.loc[(scene_df['frame_file'] > 675) & (scene_df['frame_file'] < 757)].cluster.unique()

array([14, 15,  8,  4, 24, 16, 32,  0,  3])

In [60]:
clusters = [14, 15,  8,  4, 24, 16, 32,  0,  3]
scene_df[scene_df.cluster.isin(clusters)]

Unnamed: 0,frame_file,cluster,mcu
0,661,14,0
1,662,14,0
2,663,3,1
3,664,3,1
4,665,3,1
5,666,15,1
6,667,15,1
7,668,15,1
8,669,15,1
9,670,3,1


In [154]:
# Cluster label of frame 674. The scalar is taken out of the one-row selection
# explicitly with .iloc[0]; calling int() directly on a Series is deprecated
# in recent pandas.
int(scene_df.loc[scene_df['frame_file'] == 674, 'cluster'].iloc[0])

3

In [159]:
clusters = [16, 2, 20]
scene_df[scene_df.cluster.isin(clusters)]

Unnamed: 0,frame_file,cluster,mcu
31,692,16,0
32,693,16,0
39,700,16,0
40,701,16,0
65,726,16,0
66,727,16,0
67,728,16,0
68,729,16,0
73,734,16,0
74,735,16,0
