In [45]:
import pandas as pd
import numpy as np 
from sklearn.cluster import DBSCAN
import json
from transformers import BertModel, BertTokenizerFast
import torch
from collections import defaultdict

In [18]:
# load the sentences and correct labels
all_sentences = []

# Explicit UTF-8: the sentences contain typographic quotes (‘ ’) and the
# platform default encoding is not guaranteed to decode them.
with open("../data/annotations.json", "r", encoding="utf-8") as f:
    data = json.load(f)

for sentence in data:
    text = sentence["data"]["sentence"]
    # Only the first annotation of each sentence is used.
    results = sentence["annotations"][0]["result"]

    # Apply the same type filter to both lists so that labels[i] always
    # describes spans[i]; results of any other type are ignored.
    mention_results = [r for r in results if r["type"] == "labels"]

    labels = [r["value"]["text"] for r in mention_results]
    spans = [
        {
            "start": r["value"]["start"],
            "end": r["value"]["end"]
        }
        for r in mention_results
    ]

    all_sentences.append({
        "text": text,
        "labels": labels,
        "spans": spans
    })

In [19]:
# keep only the sentences that carry at least one group mention
group_mentions_present = list(
    filter(lambda entry: entry["labels"], all_sentences)
)
len(group_mentions_present)

64

In [5]:
# single source of truth for the checkpoint, so the tokenizer and the model
# can never be loaded from different vocabularies by accident
MODEL_NAME = "bert-base-uncased"

# define tokenizer
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# download model with pretrained weights
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

# set model to evaluation mode (turns dropout off for inference)
model.eval()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [40]:
# re-check how many sentences contain at least one group mention
len(group_mentions_present)

64

In [36]:
# define function that returns the word embeddings for all group mentions within sentence
def extract_embedding(tokenizer, model, sentence, labels):
    """Return one contextual embedding per group mention in ``sentence``.

    Args:
        tokenizer: Callable tokenizer that accepts
            ``return_offsets_mapping=True`` and returns a mapping containing
            an ``"offset_mapping"`` entry.
        model: Callable encoder whose output exposes ``last_hidden_state``.
        sentence: Raw sentence text.
        labels: Iterable of mention spans. Despite the name these are not
            label strings: each item is a mapping with character offsets
            ``"start"`` and ``"end"`` into ``sentence``.

    Returns:
        A list of tensors, one per mention that matched at least one token.
        Each tensor is the mean of the last-layer hidden states of the tokens
        lying entirely inside the mention span. Mentions that match no token
        are skipped (with a warning), so the list may be shorter than
        ``labels``.
    """
    import warnings

    encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    # The offsets are only needed for alignment; remove them before the
    # remaining entries are passed to the model as keyword arguments.
    offset_mapping = encoding.pop("offset_mapping")[0].tolist()

    with torch.no_grad():
        outputs = model(**encoding)
        hidden_states = outputs.last_hidden_state[0]

    mention_embeddings = []

    for mention in labels:
        mention_start = int(mention["start"])
        mention_end = int(mention["end"])
        token_indices = [
            i for i, (start, end) in enumerate(offset_mapping)
            # `end > start` drops zero-width offsets (special tokens such as
            # [CLS]/[SEP] are reported as (0, 0)); without it they would be
            # averaged into every mention that starts at character 0.
            if end > start and start >= mention_start and end <= mention_end
        ]

        if not token_indices:
            # Skipping silently would shift every later mention/cluster
            # pairing built from this output, so make the drop visible.
            warnings.warn(
                f"No tokens found for mention span {mention_start}-{mention_end} "
                f"in sentence {sentence!r}; mention skipped."
            )
            continue

        mention_embedding = hidden_states[torch.tensor(token_indices)].mean(dim=0)
        mention_embeddings.append(mention_embedding)

    return mention_embeddings
    

In [37]:
# get embeddings for all mentions
all_embeddings = []
for sentence in group_mentions_present:
    text = sentence["text"]
    mention_spans = sentence["spans"]
    # one list of mention embeddings per sentence
    all_embeddings.append(extract_embedding(tokenizer, model, text, mention_spans))

# flatten the embeddings
flat_embeddings = [embedding for sentence_mentions in all_embeddings for embedding in sentence_mentions]

In [56]:
# stack the per-mention vectors into a single array for scikit-learn
X = torch.stack(flat_embeddings).cpu().numpy()

# density-based clustering on cosine distance
dbscan = DBSCAN(metric='cosine', eps=0.35, min_samples=2)
labels = dbscan.fit(X).labels_

In [57]:
# cluster id assigned to each mention embedding; -1 is DBSCAN's label for noise (unclustered) points
labels

array([ 0,  0, -1,  1,  2, -1, -1, -1, -1, -1, -1,  3,  3,  3, -1, -1, -1,
        0,  4,  5,  6,  0,  7,  1,  1,  3,  8,  4,  5,  3,  1,  1, -1,  3,
        3,  4,  7,  7,  7,  3,  3, -1, -1,  9, -1, -1, -1,  0,  3,  8,  1,
        1,  7, -1, -1,  3, -1,  3, 10,  3, -1,  9,  3,  6,  3, 10,  0, 10,
       -1, 10,  1,  1,  2, -1,  0,  3,  9,  1,  0])

In [58]:
# preview the first few entries instead of dumping the whole list into the output
group_mentions_present[:5]

[{'text': 'Since Labour took office, 10,000 more children have been plunged into poverty by the refusal to scrap the two-child benefit cap.',
  'labels': ['children'],
  'spans': [{'start': 38, 'end': 46}]},
 {'text': 'This , I hope every child has the chance to get to ‘know themselves to grow themselves’.',
  'labels': ['every child'],
  'spans': [{'start': 14, 'end': 25}]},
 {'text': "We'll hear from planners, the council and police leaders as well as women themselves.",
  'labels': ['planners', 'women', 'police leaders'],
  'spans': [{'start': 16, 'end': 24},
   {'start': 68, 'end': 73},
   {'start': 42, 'end': 56}]},
 {'text': 'I was reassured today to hear the government reiterate its commitment to legislation that will protect leaseholders and ensure they get fair treatment.',
  'labels': ['leaseholders'],
  'spans': [{'start': 103, 'end': 115}]},
 {'text': 'All different communities and religions were present today - all proud Rochdalians united in making our town such a great p

In [61]:
# get all mentions
all_mentions = [mention for sentence in group_mentions_present for mention in sentence["labels"]]

# Every mention must have exactly one cluster label. zip() stops at the
# shorter input, so a mismatch would otherwise pair mentions with the wrong
# clusters without any error.
assert len(all_mentions) == len(labels), (
    f"{len(all_mentions)} mentions but {len(labels)} cluster labels"
)

# pair them with their labels (a list, so the pairs can be iterated more than once)
mentions_labels = list(zip(all_mentions, labels))

In [62]:
# group the mention texts by the cluster id they were assigned
cluster_dict = defaultdict(list)
for cluster_label, mention_text in zip(labels, all_mentions):
    cluster_dict[cluster_label].append(mention_text)

# report each cluster (in first-seen order) followed by a blank line
for cluster_id in cluster_dict:
    members = cluster_dict[cluster_id]
    print(f"Cluster {cluster_id} ({len(members)} mentions):")
    print("".join(f"  - {member}\n" for member in members))

Cluster 0 (8 mentions):
  - children
  - every child
  - vulnerable children
  - parents
  - children
  - homeless young people
  - disabled people
  - children

Cluster -1 (22 mentions):
  - planners
  - leaseholders
  - Rochdalians
  - bankers
  - Female fisherwomen
  - migrants
  - families
  - Oxbridge grads
  - Monmouthshire’s Ukrainian community
  - constituents
  - pupil
  - older people
  - senior citizens
  - divisive, xenophobic, racist pack
  - waste producers
  - family
  - pupils
  - constituents
  - taxpayers
  - soldiers
  - taxpayer
  - buskers

Cluster 1 (10 mentions):
  - women
  - women
  - girls
  - women
  - babies
  - women
  - girls
  - women’s
  - women
  - WOMEN

Cluster 2 (2 mentions):
  - police leaders
  - civil servants

Cluster 3 (16 mentions):
  - those who are able to return to work
  - WASPI women
  - 1950s-born women
  - workers
  - constituents of Bournemouth East
  - Residents in Westerhope
  - people in Slatyford and Newbiggin
  - skilled workers
  