# __[:+:]__ Transcript Processing

In [1]:
from sentence_transformers import SentenceTransformer, util, InputExample, losses
# from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import plotly.express as px
from umap import UMAP
import pandas as pd
import numpy as np
import umap.plot
import hdbscan
import re


  from tqdm.autonotebook import tqdm, trange
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
#Extract each line from the transcript, with no wordwrap, each line is a new speaker
def readFile(filePath):
    with open(filePath, 'r') as file:
        lines= file.read().split('\n')
        print('[+] #lines read from file: ',len(lines))
        return lines
    
# Create a dictionary where each speaker is a key and their spoken sentences are the values
def transMapper(lines):
    trans_map= {}
    regex= r'(.+): (.+)'
    for i in range(len(lines)):
        if re.match(regex, lines[i]):
            speaker= lines[i][0:9]
            sentence= lines[i][10:]

            if speaker not in trans_map:
                trans_map[speaker]= []
            trans_map[speaker].append(sentence)
        else:
            print('--> Non-match: ', lines[i])
    print(' --> Total Dictionary entries: ', sum([len(trans_map[k]) for k in trans_map.keys()])  )
    return trans_map


def trans_dataFramer(trans_map):
    ts_df= pd.DataFrame(columns= ['speaker', 'sentence'])
    for speaker in trans_map.keys():
        for sentence in trans_map[speaker]:
            newEntry= pd.DataFrame([{'speaker': speaker, 'sentence':sentence}])
            ts_df= pd.concat([ts_df, newEntry], ignore_index= True)
    print(' --> Total Dataframe entries: ', len(ts_df))
    return ts_df

#------------------------------------------------------------------------------------------------
filePath= r'transcript_noTimeStamp.txt'
ts_df= trans_dataFramer(transMapper(readFile(filePath)))
ts_df



[+] #lines read from file:  238
 --> Total Dictionary entries:  238
 --> Total Dataframe entries:  238


Unnamed: 0,speaker,sentence
0,Speaker 1,You're saying that you think it's the two nar...
1,Speaker 1,So are you surprised by that? Because the poi...
2,Speaker 1,"Wait, you think the Jews left voluntarily? Yo..."
3,Speaker 1,"Wait, wait, wait. I have some thoughts about ..."
4,Speaker 1,"First archaeological would probably be 4,000 ..."
...,...,...
233,Speaker 2,Yeah.
234,Speaker 2,"Yeah, me too."
235,Speaker 2,"No, you're fine. Yeah. I never really like sh..."
236,Speaker 2,Want to project that onto another group and l...


## __[+]__ Embed + UMAP

In [16]:
emb_model= SentenceTransformer('all-mpnet-base-v2')
sentences = ts_df.loc[:,'sentence']
type(sentences)


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



pandas.core.series.Series

In [4]:
print('\nSentences sans Labels: \n', sentences.head(), '\n', len(sentences))

#[+:] Embed the extracted sentences
snt_emb_full = emb_model.encode(sentences, show_progress_bar=True)
print(' --> Embeded Sentences <=400 chars: ', type(snt_emb_full), len(snt_emb_full),'x', len(snt_emb_full[0]))


Sentences sans Labels: 
 0     You're saying that you think it's the two nar...
1     So are you surprised by that? Because the poi...
2     Wait, you think the Jews left voluntarily? Yo...
3     Wait, wait, wait. I have some thoughts about ...
4     First archaeological would probably be 4,000 ...
Name: sentence, dtype: object 
 238


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

 --> Embeded Sentences <=400 chars:  238 x 768


In [5]:
#[+:] Embed the extracted sentences, but only up to the first 400 chars
snt_emb_400=[]
for s in sentences:
    maxL= min(len(s), 400)
    trunc= s[:maxL]
    # print((trunc))
    snt_emb_400.append(emb_model.encode(trunc))
print(' --> Full Embeded Sentences: ', len(snt_emb_400),'x', len(snt_emb_400[0]))

 --> Full Embeded Sentences:  238 x 768


In [6]:
vectors= snt_emb_400
proj_3d = UMAP(n_components=3, init='random', random_state=0, metric='cosine').fit_transform(vectors)
print('\n -->3D Projection: ', len(proj_3d))

#[+] Create a new dataframe where each sentence is not the sentence embedding projection
proj_3d_df= pd.DataFrame(proj_3d, columns= ['x', 'y', 'z'])
proj_3d_df['speaker'] = ts_df.speaker
proj_3d_df['index'] = ts_df.index
print(proj_3d_df.head(), '\n', len(proj_3d_df))

#[+] Map the dataframe of projected embeddings in 3d
fig_3d = px.scatter_3d(
    proj_3d_df,
    x='x', y='y', z='z',
    color='speaker', 
    #labels={'color': 'stance'}
    hover_data= 'index',
    title= 'Full Sentences'
)
fig_3d.update_traces(marker_size=3.5)
fig_3d.show()

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")



 -->3D Projection:  238
           x         y         z    speaker  index
0  10.514611  3.002629  7.387030  Speaker 1      0
1  11.189832  1.490113  7.388382  Speaker 1      1
2  11.837924  1.637674  7.936531  Speaker 1      2
3   8.653387  4.525954  6.872148  Speaker 1      3
4  11.296014  2.922088  7.787458  Speaker 1      4 
 238


In [7]:
vectors= snt_emb_full
proj_3d = UMAP(n_components=3, init='random', random_state=0, metric='cosine').fit_transform(vectors)
print('\n -->3D Projection: ', len(proj_3d))

#[+] Create a new dataframe where each sentence is not the sentence embedding projection
proj_3d_df= pd.DataFrame(proj_3d, columns= ['x', 'y', 'z'])
proj_3d_df['speaker'] = ts_df.speaker
proj_3d_df['index'] = ts_df.index
print(proj_3d_df.head(), '\n', len(proj_3d_df))

#[+] Map the dataframe of projected embeddings in 3d
fig_3d = px.scatter_3d(
    proj_3d_df,
    x='x', y='y', z='z',
    color='speaker', 
    #labels={'color': 'stance'}
    hover_data= 'index',
    title= 'First 400 Chars'
)
fig_3d.update_traces(marker_size=3.5)
fig_3d.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.




 -->3D Projection:  238
          x         y          z    speaker  index
0  0.026387  5.377555  11.741396  Speaker 1      0
1 -0.497412  5.928132  13.300177  Speaker 1      1
2 -1.412828  5.625690  13.035725  Speaker 1      2
3  2.430344  5.711737  11.170720  Speaker 1      3
4 -1.202994  5.026009  13.192148  Speaker 1      4 
 238


## [_] Topic Modelling
https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6

In [8]:
snt_emb= snt_emb_400

nhood= 15
proj_5d = umap.UMAP(n_neighbors= nhood, 
                    n_components=5, 
                    metric='cosine').fit_transform(snt_emb)

cluster = hdbscan.HDBSCAN(min_cluster_size=5,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(proj_5d)
print(len(set(cluster.labels_)), 'cluster labels: ', set(cluster.labels_), '\n' )


# Prepare data
# proj_2d_2 = umap.UMAP(n_neighbors=nhood, 
#                       n_components=2, 
#                       min_dist=0.0, 
#                       metric='cosine').fit_transform(snt_emb)

# res_df = pd.DataFrame(proj_2d_2, columns=['x', 'y'])
# res_df['labels'] = cluster.labels_
# print(res_df.head(), '\n',res_df.shape)

# # Visualize clusters
# fig, ax = plt.subplots(figsize=(20, 10))
# outliers = res_df.loc[res_df.labels == -1, :]
# clustered = res_df.loc[res_df.labels != -1, :]
# plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=20)
# plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=20, cmap='hsv_r')
# plt.colorbar()

proj_3d = UMAP(n_neighbors=nhood,
               n_components=3,
                init='random',
                 min_dist=0.0,
                 metric='cosine').fit_transform(snt_emb)
print('\n -->3D Projection: ', len(proj_3d))

proj_3d_df= pd.DataFrame(proj_3d, columns= ['x', 'y', 'z'])
proj_3d_df['speaker'] = ts_df.speaker
proj_3d_df['index'] = ts_df.index
proj_3d_df['labels'] = cluster.labels_
print(proj_3d_df.head(), '\n lenght:', len(proj_3d_df))

#[+] Map the dataframe of projected embeddings in 3d
fig_3d = px.scatter_3d(
    proj_3d_df,
    x='x', y='y', z='z',
    # color= 'speaker',
    color='labels', 
    #labels={'color': 'stance'}
    hover_data= 'index',
    title= 'DBScan Clustering'
)
fig_3d.update_traces(marker_size=3.5)
fig_3d.show()

13 cluster labels:  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1} 


 -->3D Projection:  238
          x         y         z    speaker  index  labels
0  6.108739  1.223565 -2.393724  Speaker 1      0      -1
1  6.401168  2.050539 -4.049073  Speaker 1      1      -1
2  7.062026  1.900195 -4.054942  Speaker 1      2      -1
3  5.029031  0.173908 -0.277231  Speaker 1      3      -1
4  7.143356  1.377014 -3.883331  Speaker 1      4      -1 
 lenght: 238


In [9]:
for i,e in enumerate(cluster.labels_):
    if e== -1: print(sentences[i])

 You're saying that you think it's the two narratives. Yeah, but we can't hear you very loudly. Maybe it's much better, much better.
 So are you surprised by that? Because the point is that there are Native Americans that then seem to support Israel because of the claim for indigeneity. They understand what it's like to be a people, sometimes tribes that have the spiritual dimension to these tribes, they're kicked out of their land and now they want to come back to their land. That's also exactly the story of the Jews, no?
 Wait, you think the Jews left voluntarily? You think the Jews left Judea voluntarily?
 Wait, wait, wait. I have some thoughts about this, but Beata, maybe you jump in.
 First archaeological would probably be 4,000 years ago. Wow.
 Hmm. Ah. Sounds a little bit like a free pass, man. I got. They they're they're.
 Can you explain that last sentence again?
 Beata, what's your take on either option? I mean, it's like I have a video. Oh yeah. And it feels like, did they h

## Summary Processing

In [20]:
topics ={
    'topic1': ["I think it's fascinating how some Native Americans relate to the Jewish experience of displacement and return, especially through their claim of indigeneity. They see a parallel between their spiritual connection to their land and the Jewish return to Israel, which makes them sympathize with Israel’s cause, even though it might seem counterintuitive given their own history of being displaced.",
                "There’s a lot of similarity between Native Americans and Palestinians. Both were indigenous peoples who remained on their land for centuries, while others—like Jews—left, resettled elsewhere, and then later tried to come back. This shared experience of colonialism is why Native Americans naturally side with Palestinians.",
               "It makes sense that Native Americans would support Palestinians. Both groups have suffered under colonialist rule. What surprised me, though, was finding out that there are Native Americans who support Israel. It’s hard for me to see that connection, especially given the colonial dynamic."
                ],
    'topic2': ["The Jewish claim to Israel is deeply rooted in their historical indigeneity. They were displaced from their homeland, just like Native Americans, and many other groups in history. This longing to return to their ancestral land is not just political—it’s spiritual. They’ve maintained a connection to this land for thousands of years, and that’s why their claim resonates with the concept of indigeneity.",
                "I see the argument for Jewish indigeneity, but there’s a difference. Jews left their homeland and were able to adapt elsewhere, even if they faced challenges. Palestinians, on the other hand, stayed on their land until they were forcibly removed in 1948. It’s not the same kind of displacement.",
                "I don’t buy the argument that Jews never faced colonialism. They were kicked out of Israel multiple times and had to endure persecution wherever they settled—be it in Europe or the Middle East. The claim of indigeneity works for both Palestinian Arabs and Jews, especially the Jews from the Middle East who never left the region."
                ],
    'topic3': ["Colonialism has shaped the history of so many groups, including Jews. Jews were not just victims of European colonialism, but also of Ottoman and other powers. It’s important not to view colonialism only through the European lens—it’s been a universal phenomenon, and Jews have had to survive under various forms of colonial rule.",
                "Colonialism is the shared trauma that connects Native Americans and Palestinians. Both experienced being ruled by outsiders, and Palestinians are still under what feels like a form of colonial rule today. That’s why the Palestinian struggle resonates so much with people who have experienced colonialism firsthand.",
                "Colonialism is a constant in human history. Different empires—be they European, Ottoman, or others—have always taken land from people. To single out one form of colonialism while ignoring others is to miss the larger picture. Jews, too, were affected by colonization, and we need to see these histories in a more interconnected way."
                ],
}

topic_emb_df= pd.DataFrame(columns= ['topic','sentence', 'embedding'])
for k in topics.keys():
    for s in topics[k]:
        newEntry= pd.DataFrame([{'topic': k, 'sentence':s, 'embedding': emb_model.encode(s)}])
        topic_emb_df= pd.concat([topic_emb_df, newEntry], ignore_index= True)
print(topic_emb_df.head(),topic_emb_df.shape, '\n')

topic_emb_avg= {}
for k in topics.keys():
    emb= emb_model.encode(topics[k])
    emb_avg= np.mean(emb, axis=0)
    topic_emb_avg[k]= emb_avg
for k,v in topic_emb_avg.items():print(k,'_avg: 1x',len(v))

    topic                                           sentence  \
0  topic1  I think it's fascinating how some Native Ameri...   
1  topic1  There’s a lot of similarity between Native Ame...   
2  topic1  It makes sense that Native Americans would sup...   
3  topic2  The Jewish claim to Israel is deeply rooted in...   
4  topic2  I see the argument for Jewish indigeneity, but...   

                                           embedding  
0  [-0.07896777, 0.11289133, -0.0103871375, 0.030...  
1  [-0.09552511, 0.09052141, 0.0006565145, 0.0350...  
2  [-0.092669226, 0.09681143, -0.0112264315, 0.05...  
3  [-0.017652608, 0.05714198, 0.018672587, 0.0042...  
4  [-0.033343498, 0.05749682, 0.000992715, 0.0276...   (9, 3) 

topic1 _avg 1x 768
topic2 _avg 1x 768
topic3 _avg 1x 768


In [19]:
vectors= topic_emb_df.loc[:,'embedding'].tolist()
print('vectors:', type(vectors),'\n',vectors[0][:5])
proj_3d = UMAP(n_components=3, init='random', random_state=0, metric='cosine').fit_transform(vectors)
print('\n -->3D Projection: ', len(proj_3d))

proj_3d_df= pd.DataFrame(proj_3d, columns= ['x', 'y', 'z'])
proj_3d_df['topic']= topic_emb_df['topic']
proj_3d_df['index'] = topic_emb_df.index
print(proj_3d_df.head(), '\n', len(proj_3d_df))

#[+] Map the dataframe of projected embeddings in 3d
fig_3d = px.scatter_3d(
    proj_3d_df,
    x='x', y='y', z='z',
    color='topic', 
    #labels={'color': 'stance'}
    hover_data= 'index',
    title= 'Topic Summary Embeddings'
)
fig_3d.update_traces(marker_size=9)
fig_3d.show()

vectors: <class 'list'> 
 [-0.07896777  0.11289133 -0.01038714  0.03086104 -0.03055891]



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1




 -->3D Projection:  9
          x         y         z   topic  index
0  5.621573  3.577007 -1.926784  topic1      0
1  6.356977  3.265422 -1.059515  topic1      1
2  6.226356  3.026010 -1.879378  topic1      2
3  4.865344  3.184132 -1.202710  topic2      3
4  5.225440  3.059881 -0.477132  topic2      4 
 9


### Agreement Spectrum

First 3 Topics: 
1. Native American and Palestinian Solidarity

2. Indigeneity and Jewish Return

3. Colonialism and Its Impact

According to GPT4o:3,1,2

In [66]:
# filtered_df = df.loc[df['Label'] == specific_label, 'Value']
cos_sigma= []

topic1_emb= topic_emb_df.loc[topic_emb_df['topic']=='topic1', 'embedding'].tolist()
print(len(topic1_emb), topic1_emb[0][:5])
d1= np.mean([util.cos_sim(i, topic_emb_avg['topic1']) for i in topic1_emb ])
cos_sigma.append(d1)

topic2_emb= topic_emb_df.loc[topic_emb_df['topic']=='topic2', 'embedding'].tolist()
print(len(topic2_emb), topic2_emb[0][:5])
d1= np.mean([util.cos_sim(i, topic_emb_avg['topic1']) for i in topic2_emb ])
cos_sigma.append(d1)

topic3_emb= topic_emb_df.loc[topic_emb_df['topic']=='topic3', 'embedding'].tolist()
print(len(topic3_emb), topic3_emb[0][:5])
d1= np.mean([util.cos_sim(i, topic_emb_avg['topic1']) for i in topic3_emb ])
cos_sigma.append(d1)


cos_sort= np.sort(cos_sigma)
print(cos_sigma)

print("\nMost to least Agreement: ")
for i in cos_sort:
    print('Topic', cos_sigma.index(i)+1)









#cos_sim = util.cos_sim(emb1, emb2)

# topic2_avg= topic_emb_df.loc[topic_emb_df['topic']=='topic2', 'embedding'].tolist()
# print(len(topic1_avg),topic2_avg[0][:5])


3 [-0.07896777  0.11289133 -0.01038714  0.03086104 -0.03055891]
3 [-0.01765261  0.05714198  0.01867259  0.0042201  -0.08135743]
3 [ 0.00727112  0.0651238   0.01484105 -0.04710075 -0.02037156]
[0.90188354, 0.60046387, 0.6289834]

Most to least Agreement: 
Topic 2
Topic 3
Topic 1
