In [1]:
!pip install datasets
!pip install transformers
!pip install rouge
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install sentence-transformers
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 5.1 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 50.6 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 27.5 MB/s 
[?25hCollecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 534 kB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 10.7 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1

In [2]:
import tqdm
import os
import pandas as pd
import torch
import numpy as np
import datasets
import torch.nn as nn
import transformers
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
import spacy
import string
from sklearn.cluster import KMeans
import sklearn.metrics as metrics

from sentence_transformers import SentenceTransformer

In [6]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

In [3]:
from datasets import load_dataset
dataset = load_dataset("multi_news",split = 'train')

Downloading builder script:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading and preparing dataset multi_news/default to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2f1f69a2bedc8ad1c5d8ae5148e4755ee7095f465c1c01ae8f85454342065a72...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/66.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.30M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/69.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.31M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Dataset multi_news downloaded and prepared to /root/.cache/huggingface/datasets/multi_news/default/1.0.0/2f1f69a2bedc8ad1c5d8ae5148e4755ee7095f465c1c01ae8f85454342065a72. Subsequent calls will reuse this data.


#Score the sentences

In [4]:
nlp = spacy.load("en_core_web_sm")

In [7]:
def score_sentence(document):
    text = document['document']
    doc = nlp(text)
    #print('1')
    rouge_scores = []
    entity_counts = []
    
    
    for s in doc.sents:
        target = text[s.start_char:s.end_char]
        rest_doc = text[:s.start_char] + text[s.end_char:]
#         print(target)
        try:
            score = scorer.score(target,
                      rest_doc)['rouge1'][2]
            rouge_scores.append(score)
        except:
            rouge_scores.append(0)
        

        entity_counts.append(len(nlp(str(s)).ents))   
            
    document['rouge_scores'] = rouge_scores
    document['entity_counts'] = entity_counts
    return document

In [8]:
small_dataset = dataset.select(list(range(1)))

small_dataset = small_dataset.add_column("rouge_scores", [[0]] * len(small_dataset))
small_dataset = small_dataset.add_column("entity_counts", [[0]] * len(small_dataset))
small_dataset = small_dataset.add_column("filter_sentences", [[0]] * len(small_dataset))

In [9]:
d = small_dataset.map(score_sentence, num_proc = 5)



  0%|          | 0/1 [00:00<?, ?ex/s]

#sentence selection

In [10]:
def filter_sentence(document):
  percentage = 0.8 #percentage to keep
  top = 5 # keep first t sentences 
  #row = d['document'][0]
  sentences = list(nlp(document['document']).sents)
  pointers = [i for i in range(len(sentences)) if '|||||' in sentences[i].text] #seperate articles
  scores = document['rouge_scores']
  filter = []
  score_splits = [sl.tolist()for sl in np.split(scores, pointers)]
  for a in range(len(score_splits)):
    rest = score_splits[a][5:]
    n_leave = int(len(rest)*percentage) #leave 0.8 percent of the rest of the sentences (except top sentences)
    index_leave = sorted(range(len(rest)), key = lambda sub: rest[sub])[-n_leave:] #index of top n_leave scores
    if len(score_splits[a])>=top:
      filter.append([1]*top+[0]*len(rest))
    else:
      filter.append([1]*(len(score_splits[a])))
    for i in index_leave:
      filter[a][i+top] = 1
  filter = sum(filter,[])
  document['filter_sentences'] = filter
  return document

In [11]:
m = d.map(filter_sentence, num_proc = 5)
m



  0%|          | 0/1 [00:00<?, ?ex/s]

Dataset({
    features: ['document', 'summary', 'rouge_scores', 'entity_counts', 'filter_sentences'],
    num_rows: 1
})

# Embedding and clusering

In [12]:
# doc = nlp(train_doc)
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [30]:
def get_clusters(document):
    doc = nlp(document["document"]) #use spacy to get the document sentences

    #bert embedding
    embedding = []
    for sentence in doc.sents:
        embedding.append(model.encode(sentence.text, show_progress_bar=True))
    document["embedding"] = embedding
    
    
    cur = -1
    best_label = np.ndarray(len(embedding))
    cluster = np.arange(2,6)
    
    for i in range(2,6): # get best num_clusters from silhouette score
        labels = KMeans(n_clusters=i).fit(embedding).labels_ #,init="k-means++",random_state=200
        score = metrics.silhouette_score(embedding,labels,metric="euclidean")
        if score > cur:
            cur = score
            best_label = labels

    document["labels"] = best_label 
    return document

In [32]:
result = m.map(filter_sentence, num_proc = 5)
result

Dataset({
    features: ['document', 'summary', 'rouge_scores', 'entity_counts', 'filter_sentences', 'embedding', 'labels'],
    num_rows: 5
})

In [None]:
def get_clusters(document):
    doc = nlp(document["document"]) #use spacy to get the document sentences

    #bert embedding
    embedding = []
    for sentence in doc.sents:
        embedding.append(model.encode(sentence.text, show_progress_bar=True))
    document["embedding"] = embedding

    
    cur = -1
    best_label = np.ndarray(len(embedding))
    cluster = np.arange(2,6)
    
    for i in range(2,6): # get best num_clusters from silhouette score
        labels = KMeans(n_clusters=i).fit(embedding).labels_ #,init="k-means++",random_state=200
        score = metrics.silhouette_score(embedding,labels,metric="euclidean")
        if score > cur:
            cur = score
            best_label = labels

    document["labels"] = best_label 
    return document

In [13]:
m

Dataset({
    features: ['document', 'summary', 'rouge_scores', 'entity_counts', 'filter_sentences'],
    num_rows: 1
})

In [26]:
idx = m["filter_sentences"].index(m["filter_sentences"] == 1)

ValueError: ignored