## Adapted from Clustering_with_sentence_embeddings.ipynb from info 256 repo

This notebook explores the use of SentenceBERT to generate representations of sequences (sentences, documents) and clustering those representations using K-means.

In [1]:
# !pip install sentence-transformers

In [2]:
# Get movies summaries and book titles to cluster
# !wget https://raw.githubusercontent.com/dbamman/anlp23/main/data/plot_summaries.txt
# !wget https://raw.githubusercontent.com/dbamman/anlp23/main/data/loc/dev.tsv -O book_titles.txt

In [3]:
from sklearn.cluster import KMeans
from math import sqrt
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

In [15]:
stacked_ins_df = pd.read_csv('scraping/stacked_ins_df.csv')
stacked_ins_df.head()

Unnamed: 0,owner_username,url_code,time_utc,type,caption,likes,comments,is_about_gaza_1,is_about_gaza_2
0,NBCNews,CzHiqkVOtlV,2023-11-01 21:04:19,GraphVideo,The 2023 Rockefeller Center Christmas tree is ...,5140,144,False,False
1,NBCNews,CzHbdY8rQIf,2023-11-01 20:01:24,GraphVideo,Foreign passport-holders and critically injure...,4595,349,True,True
2,NBCNews,CzHDmj0rXos,2023-11-01 16:32:48,GraphSidecar,"Clutching suitcases and foreign passports, on ...",1561,340,True,True
3,NBCNews,CzE3AMurPNg,2023-10-31 20:04:18,GraphVideo,Boston University students dropped pumpkins fi...,11396,415,False,False
4,NBCNews,CzEUNv9LvG5,2023-10-31 15:00:14,GraphImage,Tampa is finding that more people equals more ...,3367,252,False,False


In [16]:
stacked_news_df = pd.read_csv('scraping/stacked_news_df.csv')
stacked_news_df.head()

Unnamed: 0.1,Unnamed: 0,datetime,url,title,owner_username
0,0,2023-10-28 21:22:28.503212,https://www.nbcnews.com/news/world/israel-hama...,Israel-Gaza conflict could spread into Middle ...,NBCNews
1,1,2023-10-30 17:22:28.503295,https://www.nbcnews.com/news/world/live-blog/i...,Israel-Hamas war live updates: Fears rise over...,NBCNews
2,2,2023-10-29 21:22:28.503362,https://www.nbcnews.com/nightly-news/video/isr...,Israel-Hamas war enters new phase as Israel ex...,NBCNews
3,3,2023-10-30 00:22:28.503487,https://www.nbcnews.com/news/world/live-blog/i...,Thousands break into Gaza aid warehouse,NBCNews
4,4,2023-10-28 21:22:28.503709,https://www.nbcnews.com/news/world/live-blog/i...,Airstrikes pound Gaza as Israel expands ground...,NBCNews


In [5]:
sentence_model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

(…)68ef928103d92f95afc487e68/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

(…)103d92f95afc487e68/1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(…)28ba968ef928103d92f95afc487e68/README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

(…)ba968ef928103d92f95afc487e68/config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

(…)487e68/config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

(…)ef928103d92f95afc487e68/data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

(…)8ba968ef928103d92f95afc487e68/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

(…)92f95afc487e68/sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

(…)3d92f95afc487e68/special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

(…)68ef928103d92f95afc487e68/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

(…)103d92f95afc487e68/tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

(…)8ef928103d92f95afc487e68/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

(…)8ba968ef928103d92f95afc487e68/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

(…)a968ef928103d92f95afc487e68/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [6]:
def cosine(one, two):
    return np.dot(one,two)/(sqrt(np.dot(one,one)) * sqrt(np.dot(two,two)))

In [18]:
def run_all(data, model, num_clusters=10):

    X=[]

    # Get sentence embeddings for each doc
    
    for doc in data:
        embedding=model.encode(doc)
        X.append(embedding)

    X=np.array(X)

    # Run K-means
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X)

    # For each cluster, print out the n documents closest to the cluster center
    clusters={}
    for idx, label in enumerate(kmeans.labels_):
        if label not in clusters:
            clusters[label]=[]
        clusters[label].append((idx, data[idx][1]))

    for label in clusters:
        sims={}
        cluster_center=kmeans.cluster_centers_[label]
        for idx, doc in clusters[label]:
            sim=cosine(cluster_center, X[idx])
            sims[idx]=sim
        for k, v in sorted(sims.items(), key=lambda item: item[1], reverse=True)[:5]:
            # print(k,"%.3f" % v, data[k][1])
            if len(data[k].split()) < 20:
                print(k,"%.3f" % v, data[k])
            else:
                print(k,"%.3f" % v, data[k][:200]+'...')

        print()


# News titles

In [19]:
run_all(list(stacked_news_df['title']), sentence_model, num_clusters=10)

  super()._check_params_vs_input(X, default_n_init=10)


29 0.845 Israel intensifies war in Gaza with troops on the ground
57 0.840 Israel conducts 'targeted raid' inside Gaza in 'next step of combat'
38 0.835 Dramatic increase in Israeli bombardment of Gaza; escalation may ...
52 0.826 Israel conducts new raids inside Gaza as humanitarian crisis grows
216 0.822 Israeli forces attack Gaza's main city from two directions

940 0.844 Israel and Hamas at war: Latest news
1217 0.807 Analysis-In striking Israel, Hamas also took aim at Middle East security realignment
1054 0.806 Israel, Hamas at war: Live updates
121 0.803 Israel, Hamas terrorists and the reoccurring battles around the Gaza Strip
73 0.801 Israel-Hamas war: images of Gaza devastation, newly released ...

930 0.821 UPDATE 1-UN aid chief: situation is Gaza 'fast becoming untenable'
206 0.802 UN refugee agency: Gaza becoming a 'hellhole' on the 'brink of collapse'
269 0.796 Britain, France stress need to get aid into Gaza
93 0.795 First round of humanitarian aid reaches Gaza

533 0.758

# Instagram Captions

In [20]:
run_all(list(stacked_ins_df['caption']), sentence_model, num_clusters=10)

  super()._check_params_vs_input(X, default_n_init=10)


1967 0.580 🐩✨Gavino Garay takes us inside the Humane Rescue Alliance’s 36th annual ‘Bark Ball’ in Washington, D.C., where the cause goes beyond the glitz and glam.
 
#Reuters #video #news #BarkBall #HumaneRescue...
1689 0.546 People have splurged on their cats and dogs for ages, but now the pet high life is trickling down to tinier creatures.⁠
⁠
Small animal lovers say it’s about time.⁠
⁠
At Norm’s Piggy Pen in Rockford, I...
1791 0.534 The steely gaze of this endangered tigress has won the 2023 Mangrove Photography Awards. 🐯

Soham Bhattacharyya captured the tigress peering at him in the Sundarbans Biosphere Reserve, India. 

"The i...
152 0.533 NEW AND IMPROVED: Customers are lovin' the latest rebrand. How the beloved item became a billion-dollar brand at the link in bio....
2054 0.527 What made our list of the Best Inventions of 2023?

Moonwalkers, battery-­powered wheeled shoes that allow you to walk normally (not skate), just faster and more easily. ChefDoodler's pen for easy cak.