# Nested Summarization With Heading Hierarchy: Syntax Analysis

In [None]:
from os import listdir
from string import punctuation
punctuation+='\n'
import re
from bs4 import BeautifulSoup
import pandas as pd
import math
from tqdm import tqdm

Loading Data

In [None]:
def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents.

    Args:
        filename: Path of the file to read.

    Returns:
        The file contents as a single string.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error); the manual close() did not.
    with open(filename, encoding='utf-8') as file:
        return file.read()

In [None]:
def split_story(doc):
    """Split a raw story document into its body and its highlights.

    Everything before the first '@highlight' marker is the story body; each
    segment after a marker is one highlight.

    Args:
        doc: Full text of a story file.

    Returns:
        A ``(story, highlights)`` tuple. ``story`` is the text before the
        first marker (the whole document if there is no marker) and
        ``highlights`` is a list of stripped, non-empty highlight strings.
    """
    # partition() leaves the document intact when the marker is missing,
    # whereas find() returned -1 and silently chopped the last character
    # off the story and reported it as a highlight.
    story, _, tail = doc.partition('@highlight')
    # Strip first, then filter, so whitespace-only segments are dropped
    # instead of surviving as empty strings.
    highlights = [h.strip() for h in tail.split('@highlight')]
    return story, [h for h in highlights if h]

In [None]:
def load_stories(directory):
    """Load and split every story file found in a directory.

    Args:
        directory: Path of the folder whose entries are read as stories.

    Returns:
        A list with one dict per file, holding the story body under
        'story' and its list of highlights under 'highlights'.
    """
    def _parse(name):
        # Read one file and separate the body from its highlights.
        story, highlights = split_story(load_doc(directory + '/' + name))
        return {'story': story, 'highlights': highlights}

    # tqdm wraps the directory listing to show a progress bar.
    return [_parse(name) for name in tqdm(listdir(directory))]

In [None]:
# Folder containing the story files to load (one story per file).
directory = '/content/Input/cnn-dailymail/stories'
data = load_stories(directory)
print(f'Loaded Stories {len(data)}')

100%|██████████| 82/82 [00:00<00:00, 10199.97it/s]

Loaded Stories 82





In [None]:
stories = [story['story'] for story in data]

In [None]:
len(stories)

82

In [None]:
# One row per story: the body text and its list of highlights.
data_df = [[d['story'], d['highlights']] for d in data]
# Name the columns in the constructor rather than renaming 0/1 afterwards:
# with an empty `data` the rename left a frame without 'text'/'summary'
# columns, so the next cell failed with a KeyError.
df = pd.DataFrame(data_df, columns=["text", "summary"])
df.head()

Unnamed: 0,text,summary
0,Washington (CNN) -- Facing low approval rating...,[NEW: Sen. Reid says the plan is a litmus test...
1,(CNN) -- From Morocco to the foothills of the ...,[Better intelligence has made an attack on U.S...
2,"April 23 is, according to some reports, Willia...",[Tokyo's National Noh Theatre has a subtitling...
3,Gaza City (CNN) -- Explosions rumbled through ...,[Israel says 422 rockets fired from Gaza into ...
4,"(Travel + Leisure)Quick, imagine a castle: it ...","[Each day, tens of thousands of visitors pour ..."


In [None]:
# Replace line breaks with a space, not the empty string, so the last word
# of one paragraph is not fused with the first word of the next. Repeated
# spaces are collapsed later, when expand_contractions() re-joins the words.
df['text'] = df['text'].str.replace('\n', ' ', regex=False)
# Drop double quotes from the story text.
df['text'] = df['text'].str.replace('"', '', regex=False)
# Merge each story's list of highlights into a single summary string.
df['summary'] = df['summary'].apply(". ".join)
df.head()

Unnamed: 0,text,summary
0,Washington (CNN) -- Facing low approval rating...,NEW: Sen. Reid says the plan is a litmus test ...
1,(CNN) -- From Morocco to the foothills of the ...,Better intelligence has made an attack on U.S....
2,"April 23 is, according to some reports, Willia...",Tokyo's National Noh Theatre has a subtitling ...
3,Gaza City (CNN) -- Explosions rumbled through ...,Israel says 422 rockets fired from Gaza into I...
4,"(Travel + Leisure)Quick, imagine a castle: it ...","Each day, tens of thousands of visitors pour t..."


# Preprocessing

Expanding Contractions:

In [None]:
!pip install contractions
import contractions



In [None]:
def expand_contractions(sentence):
    """Expand contractions in a string, one whitespace-separated word at a time.

    Args:
        sentence: Text to process.

    Returns:
        The words of ``sentence`` after ``contractions.fix`` has been applied
        to each one, joined back together with single spaces.
    """
    return ' '.join(map(contractions.fix, sentence.split()))

In [None]:
# Expand contractions in both the story text and the summary.
df['text'] = df['text'].apply(expand_contractions)
df['summary'] = df['summary'].apply(expand_contractions)

# Syntax Analysis

In [None]:
text = df['summary'][25]
text

'NEW: Disaster management agency says 2,487 people have been injured. 1,774 people are confirmed dead from Haiyan. Another 14 dead in Vietnam and five in China, those governments say. International relief heads for stricken islands, but roads a problem'

POS Tagging

In [None]:
text = "Disaster management agency says 2,487 people have been injured. 1,774 people are confirmed dead from Haiyan. Another 14 dead in Vietnam and five in China, those governments say. International relief heads for stricken islands, but roads a problem"

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, word_tokenize
!pip install graphviz

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!




In [None]:
tagged = pos_tag(word_tokenize(text))
tagged

[('NEW', 'NN'),
 (':', ':'),
 ('Disaster', 'NNP'),
 ('management', 'NN'),
 ('agency', 'NN'),
 ('says', 'VBZ'),
 ('2,487', 'CD'),
 ('people', 'NNS'),
 ('have', 'VBP'),
 ('been', 'VBN'),
 ('injured', 'VBN'),
 ('.', '.'),
 ('1,774', 'CD'),
 ('people', 'NNS'),
 ('are', 'VBP'),
 ('confirmed', 'VBN'),
 ('dead', 'JJ'),
 ('from', 'IN'),
 ('Haiyan', 'NNP'),
 ('.', '.'),
 ('Another', 'DT'),
 ('14', 'CD'),
 ('dead', 'NN'),
 ('in', 'IN'),
 ('Vietnam', 'NNP'),
 ('and', 'CC'),
 ('five', 'CD'),
 ('in', 'IN'),
 ('China', 'NNP'),
 (',', ','),
 ('those', 'DT'),
 ('governments', 'NNS'),
 ('say', 'VBP'),
 ('.', '.'),
 ('International', 'NNP'),
 ('relief', 'NN'),
 ('heads', 'NNS'),
 ('for', 'IN'),
 ('stricken', 'JJ'),
 ('islands', 'NNS'),
 (',', ','),
 ('but', 'CC'),
 ('roads', 'VBZ'),
 ('a', 'DT'),
 ('problem', 'NN')]

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Dependency Parsing with the Stanza Library

Dependency parsing is a crucial aspect of natural language processing (NLP) that involves analyzing the grammatical structure of a sentence to determine the relationships between words. The stanza library is a powerful Python NLP library that provides pre-trained models for various tasks, including dependency parsing.

In [None]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.8.1-py3-none-any.whl (970 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m970.4/970.4 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji (from stanza)
  Downloading emoji-2.11.0-py2.py3-none-any.whl (433 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.3.0->stanza)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.3.0->stanza)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.3.0->stanza)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.3.0->stanza)
  Using cached nvidia_cudnn_cu12-8.9.2.

♦ We first download the English model for dependency parsing using stanza.download('en')





In [None]:
import stanza
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


♦ Then, we initialize the English pipeline with stanza.Pipeline (English, 'en', is the default language).

In [None]:
# Build the English pipeline with only the processors that dependency parsing
# needs. The default pipeline also loads the constituency, sentiment and NER
# models, which the dependency-parsing cell below never uses.
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


♦ Next, we display the sample text (defined earlier) that will be parsed.

In [None]:
print(text)

Disaster management agency says 2,487 people have been injured. 1,774 people are confirmed dead from Haiyan. Another 14 dead in Vietnam and five in China, those governments say. International relief heads for stricken islands, but roads a problem


♦ We process the text using the pipeline and obtain a doc object.

♦ Finally, we iterate through the sentences in the document, printing each sentence's dependency relations.

In [None]:
# Run the pipeline on the sample text and print the dependency triples
# of each sentence in turn.
doc = nlp(text)
for sentence in doc.sentences:
    print(sentence.dependencies)

[({
  "id": 2,
  "text": "management",
  "lemma": "management",
  "upos": "NOUN",
  "xpos": "NN",
  "feats": "Number=Sing",
  "head": 3,
  "deprel": "compound",
  "start_char": 9,
  "end_char": 19
}, 'compound', {
  "id": 1,
  "text": "Disaster",
  "lemma": "disaster",
  "upos": "NOUN",
  "xpos": "NN",
  "feats": "Number=Sing",
  "head": 2,
  "deprel": "compound",
  "start_char": 0,
  "end_char": 8
}), ({
  "id": 3,
  "text": "agency",
  "lemma": "agency",
  "upos": "NOUN",
  "xpos": "NN",
  "feats": "Number=Sing",
  "head": 4,
  "deprel": "nsubj",
  "start_char": 20,
  "end_char": 26
}, 'compound', {
  "id": 2,
  "text": "management",
  "lemma": "management",
  "upos": "NOUN",
  "xpos": "NN",
  "feats": "Number=Sing",
  "head": 3,
  "deprel": "compound",
  "start_char": 9,
  "end_char": 19
}), ({
  "id": 4,
  "text": "says",
  "lemma": "say",
  "upos": "VERB",
  "xpos": "VBZ",
  "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
  "head": 0,
  "deprel": "root",
  "start

# Parse Tree

We used the Stanza library for syntactic analysis, parsing the text to understand its grammatical structure. In addition, we use spaCy to visualize the parse tree, which gives a graphical representation of how the words in a sentence relate to one another syntactically.



In [None]:
!pip install spacy

In [None]:
import spacy
# NOTE(review): this rebinds `nlp`, which held the stanza pipeline until now;
# from this cell on `nlp` (and `doc`) refer to spaCy objects.
# Assumes the "en_core_web_sm" model is already installed; the pip cell above
# installs only the spaCy package itself. TODO confirm for your environment.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
# For every token print its POS tag, its text, its dependency label and the
# text of its head token.
for token in doc:
    print(". pos:", token.pos_,"text:", token.text, " dep:", token.dep_, " headtext:", token.head.text)

In [None]:
doc = nlp(text)

In [None]:
from spacy import displacy

In [None]:
displacy.render(doc,style="dep",jupyter=True)

In [None]:
options = {
    'compact': True,   # Display a compact tree
    'bg': '#ffffff',   # Background color
    'color': '#000000', # Text color
    'font': 'Arial',   # Font family
    'arrow_stroke': 2, # Width of arrow stroke
    'arrow_width': 8   # Width of arrow head
}

In [None]:
displacy.render(doc, style="dep", jupyter=True, options=options)

Parse Tree using NLTK library

In [None]:
from nltk.tree import Tree

In [None]:
tree ={}

In [None]:
# Register one node per token, keyed by the token's index in the document.
# Children are filled in by a later pass.
tree.update({
    token.i: {
        'text': token.text,
        'pos': token.pos_,
        'dep': token.dep_,
        'children': [],
    }
    for token in doc
})

In [None]:
# Attach every token to its head's list of children.
for token in doc:
    if token.head.i == token.i:
        # A token that is its own head (spaCy's convention for a sentence
        # root) has no parent to attach to.
        continue
    tree[token.head.i]['children'].append(token.i)

In [None]:
# Index of the first node whose dependency label is 'ROOT'.
# NOTE(review): only the first match is kept ([0]), so when the text contains
# several sentences, each with its own ROOT, the tree built from `root_index`
# covers the first sentence only. Raises IndexError if no node is labelled ROOT.
root_index = [index for index, node in tree.items() if node['dep'] == 'ROOT'][0]

In [None]:
def convert_to_nltk_tree(index, tree):
    """Recursively turn the node at ``index`` into an ``nltk.tree.Tree``.

    Args:
        index: Key of the node in ``tree`` to convert.
        tree: Mapping from node key to a dict with 'text', 'pos' and
            'children' (a list of child keys) entries.

    Returns:
        A ``Tree`` labelled "<text> (<pos>)" whose children are the converted
        child nodes, in the order they are listed.
    """
    entry = tree[index]
    subtrees = []
    for child in entry['children']:
        subtrees.append(convert_to_nltk_tree(child, tree))
    return Tree("{} ({})".format(entry['text'], entry['pos']), subtrees)

In [None]:
nltk_tree = convert_to_nltk_tree(root_index, tree)

In [None]:
print(nltk_tree)

(says (VERB)
  (NEW (ADJ) )
  (: (PUNCT) )
  (agency (NOUN) (management (NOUN) (Disaster (NOUN) )))
  (injured (VERB)
    (people (NOUN) (2,487 (NUM) ))
    (have (AUX) )
    (been (AUX) ))
  (. (PUNCT) ))


In [None]:
nltk_tree.pretty_print()

                                       says (VERB)                                     
     _______________________________________|_____________________________________      
    |         |       agency (NOUN)                 injured (VERB)                |    
    |         |             |                _____________|____________           |     
    |         |     management (NOUN) people (NOUN)       |            |          |    
    |         |             |               |             |            |          |     
NEW (ADJ) : (PUNCT)  Disaster (NOUN)   2,487 (NUM)    have (AUX)   been (AUX) . (PUNCT)
    |         |             |               |             |            |          |     
   ...       ...           ...             ...           ...          ...        ...   



# Coreference Resolution

In [None]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed transformers-4.39.3


In [None]:
!pip install 'lightning-flash[text]' --upgrade

Collecting lightning-flash[text]
  Downloading lightning_flash-0.8.2-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics<0.11.0,>0.7.0 (from lightning-flash[text])
  Downloading torchmetrics-0.10.3-py3-none-any.whl (529 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m529.7/529.7 kB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning<2.0.0,>1.8.0 (from lightning-flash[text])
  Downloading pytorch_lightning-1.9.5-py3-none-any.whl (829 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m829.5/829.5 kB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyDeprecate>0.2.0 (from lightning-flash[text])
  Downloading pyDeprecate-0.3.2-py3-none-any.whl (10 kB)
Collecting jsonargparse[signatures]>=4.22.0 (from lightning-flash[text])
  Downloading jsonargparse-4.27.7-py3-none-any.whl (192 kB)
[2K     [90m━━━━

In [None]:
import transformers

In [None]:
!pip show transformers

Name: transformers
Version: 4.39.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: sentence-transformers


In [None]:
import tensorflow as tf
from transformers import TFBertModel, BertModel

In [None]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
import numpy as np


# Load the pretrained tokenizer and the TensorFlow BERT encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Sample summary to encode (row 25 of the frame).
text = df['summary'][25]


# Tokenize to TensorFlow tensors, truncating to at most 128 tokens.
inputs = tokenizer(text, return_tensors="tf", max_length=128, truncation=True)


outputs = bert_model(**inputs)


# NOTE(review): `embeddings` is computed here but not used anywhere below in
# this cell.
embeddings = outputs.last_hidden_state


# One row/column per input token id.
num_mentions = inputs['input_ids'].shape[1]
# NOTE(review): this matrix is drawn from np.random.randint and does not depend
# on the BERT output, so it is a placeholder, not a coreference prediction, and
# it changes on every run.
coreference_matrix = np.random.randint(2, size=(num_mentions, num_mentions))  # Random binary matrix
print(coreference_matrix)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

[[1 0 0 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 0 0 ... 1 0 1]
 ...
 [1 0 1 ... 1 0 1]
 [0 0 0 ... 1 1 0]
 [1 1 1 ... 1 1 1]]


In [None]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
import numpy as np


# Load the pretrained tokenizer and the TensorFlow BERT encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')


# Sample summary to encode (row 25 of the frame).
text = df['summary'][25]


inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True)


outputs = bert_model(inputs)


last_hidden_states = outputs.last_hidden_state.numpy()


# One row/column per token position of the encoder output.
num_tokens = last_hidden_states.shape[1]
# NOTE(review): the scores are drawn from np.random.rand and do not depend on
# `last_hidden_states`; they are placeholders, not coreference predictions,
# and they change on every run.
coref_scores = np.random.rand(num_tokens, num_tokens)


# Each cell is printed as "0.49  " (4 characters plus 2 spaces), so every
# column is 6 characters wide.
col_width = 6
row_labels = [f"Token {i}: " for i in range(num_tokens)]
# Pad all row labels to the widest one so the header, the separator and the
# rows share the same left margin. Previously the header used a fixed
# 7-space margin while the labels were 9-10 characters wide, which shifted
# every column.
label_width = max((len(label) for label in row_labels), default=0)

print("Coreference Scores Matrix:")
print("Columns: Tokens")
print("Rows: Tokens")
print()
print(" " * label_width + "".join(f"{j:<{col_width}}" for j in range(num_tokens)))
print("-" * (label_width + col_width * num_tokens))
for label, row in zip(row_labels, coref_scores):
    print(label.ljust(label_width) + "".join(f"{score:.2f}  " for score in row))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Coreference Scores Matrix:
Columns: Tokens
Rows: Tokens

       0     1     2     3     4     5     6     7     8     9     10    11    12    13    14    15    16    17    18    19    20    21    22    23    24    25    26    27    28    29    30    31    32    33    34    35    36    37    38    39    40    41    42    43    44    45    46    47    48    49    50    51    52    53    
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Token 0: 0.49  0.46  0.60  0.84  0.24  0.00  0.04  0.49  0.41  0.50  0.08  0.77  0.85  0.94  0.46  0.36  0.47  0.89  0.38  0.58  0.95  0.64  0.57  0.72  0.30  0.16  0.08  0.92  0.87  0.61  0.14  0.19  0.07  0.18  1.00  0.32  0.25  0.29  0.46  0.33  0.78  0.41  0.05  0.64  0.75 