## Adapted from `Clustering_with_sentence_embeddings.ipynb` in the INFO 256 repo

This notebook explores the use of SentenceBERT to generate representations of sequences (sentences, documents) and to cluster those representations using K-means.

In [1]:
# !pip install sentence-transformers

In [2]:
# Get movie summaries and book titles to cluster (sample data from the original notebook; the cells below load their own CSVs from scraping/)
# !wget https://raw.githubusercontent.com/dbamman/anlp23/main/data/plot_summaries.txt
# !wget https://raw.githubusercontent.com/dbamman/anlp23/main/data/loc/dev.tsv -O book_titles.txt

In [3]:
from sklearn.cluster import KMeans
from math import sqrt
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

In [4]:
# Load the scraped posts table (presumably Instagram, given the later "Instagram Post Captions" section) and preview the first rows.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a saved row index;
# index_col=0 would drop it — confirm nothing downstream relies on that column.
stacked_ins_df = pd.read_csv('scraping/stacked_ins_df.csv')
stacked_ins_df.head()

Unnamed: 0,owner_username,url_code,time_utc,type,caption,likes,comments,is_about_gaza_1,is_about_gaza_2
0,NBCNews,CzE3AMurPNg,2023-10-31 20:04:18,GraphVideo,Boston University students dropped pumpkins fi...,11396,415,False,False
1,NBCNews,CzEUNv9LvG5,2023-10-31 15:00:14,GraphImage,Tampa is finding that more people equals more ...,3367,252,False,False
2,NBCNews,CzCVMsVr8QO,2023-10-30 20:30:21,GraphSidecar,"Becky Pepper-Jackson, a transgender teen, stan...",7145,1572,False,False
3,NBCNews,CzCOrYtsJJy,2023-10-30 19:33:22,GraphSidecar,A small but growing band of Arab and Jewish Is...,34640,1076,True,True
4,NBCNews,CzCHkNFxaZS,2023-10-30 18:31:19,GraphVideo,Thousands of people broke into several U.N war...,8218,1127,True,True


In [5]:
# Preview the last rows to check the end of the stacked table.
stacked_ins_df.tail()

Unnamed: 0,owner_username,url_code,time_utc,type,caption,likes,comments,is_about_gaza_1,is_about_gaza_2
4536,Tabletmag,CzL6tdaszja,2023-11-03 13:51:17,GraphSidecar,Hezbollah leader Hassan Nasrallah is currently...,1552,54,True,True
4537,Tabletmag,CzJ9F7pPanZ,2023-11-02 19:33:34,GraphSidecar,"Since the Houthis, Yemen’s Islamist political ...",8233,268,True,True
4538,Tabletmag,CzJXi38xVl3,2023-11-02 14:07:30,GraphSidecar,"Be safe, and pray for the safety of others. Al...",7074,239,False,False
4539,Tabletmag,CzHR2zsRRvb,2023-11-01 18:37:17,GraphImage,"How did everyone get Hamas wrong, including th...",10808,391,True,True
4540,Tabletmag,CzG0DdDLu64,2023-11-01 14:18:22,GraphVideo,If you’re curious why Israel has refused calls...,2383,240,True,True


In [6]:
# List the distinct values of owner_username (the accounts present in the table).
stacked_ins_df['owner_username'].unique()

array(['NBCNews', 'FoxNews', 'CNN', 'NYTimes', 'WashTimes', 'WSJ',
       'BBCNews', 'Reuters', 'APNews', 'Time', 'Aljazeeraenglish',
       'Tabletmag'], dtype=object)

In [7]:
# Keep only the rows whose is_about_gaza_1 flag is True, then preview the last few.
stacked_ins_df_gaza = stacked_ins_df.loc[stacked_ins_df['is_about_gaza_1'].eq(True)]
stacked_ins_df_gaza.tail()

Unnamed: 0,owner_username,url_code,time_utc,type,caption,likes,comments,is_about_gaza_1,is_about_gaza_2
4534,Tabletmag,CzTxFuSMm95,2023-11-06 15:01:08,GraphSidecar,"From the archives: American universities, thei...",2402,37,True,True
4536,Tabletmag,CzL6tdaszja,2023-11-03 13:51:17,GraphSidecar,Hezbollah leader Hassan Nasrallah is currently...,1552,54,True,True
4537,Tabletmag,CzJ9F7pPanZ,2023-11-02 19:33:34,GraphSidecar,"Since the Houthis, Yemen’s Islamist political ...",8233,268,True,True
4539,Tabletmag,CzHR2zsRRvb,2023-11-01 18:37:17,GraphImage,"How did everyone get Hamas wrong, including th...",10808,391,True,True
4540,Tabletmag,CzG0DdDLu64,2023-11-01 14:18:22,GraphVideo,If you’re curious why Israel has refused calls...,2383,240,True,True


In [8]:
# Number of rows that passed the is_about_gaza_1 filter.
len(stacked_ins_df_gaza)

1982

In [9]:
# Load the scraped news table (its 'title' column is clustered below) and preview the last rows.
stacked_news_df = pd.read_csv('scraping/stacked_news_df.csv')
stacked_news_df.tail()

Unnamed: 0,datetime,url,title,owner_username
3394,2023-10-04 08:34:00,/article/idUSKBN31410C,Gaza unrest sends message about economic miser...,Reuters
3395,2023-09-27 11:01:00,/article/idUSKBN30X1FE,"UN peace envoy, Egypt working to restore calm ...",Reuters
3396,2023-09-24 06:14:00,/article/idUSL1N3B001W,UPDATE 2-Israeli forces kill two Palestinians ...,Reuters
3397,2023-09-22 14:54:00,/article/idUSKBN30S1KD,Israeli drone hits Gaza as violent protests rage,Reuters
3398,2023-09-22 14:53:00,/article/idUSKBN30S0GV,Israeli drone hits Gaza as violent protests rage,Reuters


In [10]:
# Load the all-distilroberta-v1 sentence-embedding checkpoint; this is the encoder passed to run_all below.
# NOTE(review): presumably downloads the weights on first use — confirm network access in the target environment.
sentence_model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

In [11]:
def cosine(one, two):
    """Return the cosine similarity of vectors `one` and `two`.

    Computed as dot(one, two) divided by the product of the two Euclidean
    norms, where each norm is the square root of the vector's dot product
    with itself.
    """
    dot_product = np.dot(one, two)
    norm_one = sqrt(np.dot(one, one))
    norm_two = sqrt(np.dot(two, two))
    return dot_product / (norm_one * norm_two)

In [12]:
def run_all(data, model, num_clusters=10):
    """Embed documents, cluster them with K-means, and print each cluster's most central documents.

    Parameters
    ----------
    data : list of str
        Documents to cluster. Accessed both by iteration and by positional
        index (``data[i]``), so pass a plain list rather than a pandas Series.
    model : object
        Encoder exposing ``encode(doc)`` that returns a fixed-length vector
        for a single document (e.g. a SentenceTransformer).
    num_clusters : int, default 10
        Number of K-means clusters.

    Returns
    -------
    None
        Results are printed. For every cluster, up to five lines of the form
        ``<position in data> <cosine similarity to centroid> <text>`` are
        written, most similar first, followed by a blank line. Documents with
        fewer than 20 whitespace-separated tokens are printed whole; longer
        ones are cut to their first 200 characters plus ``'...'``.
    """
    # Get sentence embeddings for each doc (one encode call per document).
    X = np.array([model.encode(doc) for doc in data])

    # Run K-means. n_init is pinned to 10 so results do not depend on the
    # installed scikit-learn's default and the n_init FutureWarning is not raised.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(X)

    # Group document positions by cluster label, in order of first appearance.
    # Only the position is stored: the previous version also stored data[idx][1]
    # (the second character of the text), which was never used and raised
    # IndexError for documents shorter than two characters.
    clusters = {}
    for idx, label in enumerate(kmeans.labels_):
        clusters.setdefault(label, []).append(idx)

    # For each cluster, print out the 5 documents closest to the cluster center
    for label, members in clusters.items():
        cluster_center = kmeans.cluster_centers_[label]
        sims = {idx: cosine(cluster_center, X[idx]) for idx in members}
        for k, v in sorted(sims.items(), key=lambda item: item[1], reverse=True)[:5]:
            if len(data[k].split()) < 20:
                print(k,"%.3f" % v, data[k])
            else:
                print(k,"%.3f" % v, data[k][:200]+'...')

        print()


# News Article Titles

In [13]:
# Peek at the headline column that is clustered in the next cell.
stacked_news_df['title']

0       IDF continues Gaza ground offensive that Netan...
1       Israel says more IDF soldiers were killed insi...
2       US, partners discussing foreign troops serving...
3       Veterans help Americans leaving Gaza: ‘Easier ...
4       300 miles of Hamas subterranean terror tunnels...
                              ...                        
3394    Gaza unrest sends message about economic miser...
3395    UN peace envoy, Egypt working to restore calm ...
3396    UPDATE 2-Israeli forces kill two Palestinians ...
3397     Israeli drone hits Gaza as violent protests rage
3398     Israeli drone hits Gaza as violent protests rage
Name: title, Length: 3399, dtype: object

In [14]:
# Cluster the news headlines into 10 groups and print the headlines nearest each centroid.
run_all(stacked_news_df['title'].tolist(), sentence_model, num_clusters=10)

  super()._check_params_vs_input(X, default_n_init=10)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
444 0.871 Israel strikes Gaza as its military describes ‘a significant operation.’
1365 0.855 Israel Prepares for a Potential Invasion of Gaza
1301 0.854 Israel Sticks to Call for Gaza Evacuation and Readies a Possible Invasion
1263 0.847 Poised for Attack, Israel Steps Up Calls for Gaza Residents to Leave ‘Battle Zone’
207 0.845 Israel Steps Up Gaza Bombing; U.S. Officials Urge Caution Over Ground Offensive

3099 0.833 UPDATE 1-UN aid chief: situation is Gaza 'fast becoming untenable'
33 0.833 Foreign officials raise alarm over lack of humanitarian aid in Gaza as Israel prepares for ground operation
945 0.827 First Humanitarian Aid Reaches a Hard-Pressed Gaza
2504 0.826 UN agencies call for ceasefire and hu

# Instagram Post Captions

In [15]:
# Display the Gaza-flagged captions as a plain Python list (the same input run_all receives below).
stacked_ins_df_gaza['caption'].tolist()

['A small but growing band of Arab and Jewish Israelis have formed an unofficial civil guard in Jaffa in an attempt to prevent the type of urban unrest that has accompanied previous rounds of conflict between Israel and militants in Gaza.\n\nThey know their voice represents a tiny minority. \n\nThey have taken on a range of tasks: accompanying both Arab and Jewish Israelis who are scared to journey across other neighborhoods; sending supplies to Israeli communities displaced by the recent conflict; and, in the event of significant unrest, planning to dispatch volunteers to document and even mediate.\n\nRead more at the link in bio.\n\n📷 @mayalevinphotography for @nbcnews',
 'Thousands of people broke into several U.N warehouses in the Gaza Strip, where the arrival of aid has dramatically slowed since the beginning of the war.',
 'In the city of Khan Younis, a group of men digging through the rubble found first one hand, then another. When they lifted the body of 11-year-old Sila Hamdan

In [16]:
# Cluster the Gaza-flagged captions into 10 groups and print the captions nearest each centroid.
run_all(stacked_ins_df_gaza['caption'].tolist(), sentence_model, num_clusters=10)

  super()._check_params_vs_input(X, default_n_init=10)


1242 0.797 Tens of thousands of people have poured onto the streets around the world in support of the Palestinians as Israel prepares for a possible ground #invasion of the #Gaza Strip.⁠
.⁠
Large crowds gathere...
363 0.794 Demonstrations, rallies and vigils have been staged around the world after Hamas militants launched a series of brutal attacks on Israel over the weekend, moving the long-running conflict into unchart...
433 0.790 Thousands of protesters across the Middle East marched on Tuesday night and Wednesday in grief, fury and solidarity with Palestinians after hundreds of Palestinian civilians were feared dead in an exp...
989 0.786 Thousands of people in Kuala Lumpur rallied in support of Palestinians during the Israeli bombardment of #Gaza.⁠
.⁠
The rally, organized by local NGOs, called for peace in Palestine and an end to #US ...
1286 0.765 "We will continue to organise protests until Israeli occupation is ended.” Police briefly detained #protesters in #Berlin on Wednesd