In [28]:
import nltk

from nltk.tokenize import word_tokenize
import spacy
import pandas as pd
import re, glob
import pandas as pd
import spacy
import nltk 
nltk.download('punkt')
import requests
import wikipedia
import wptools
import os
from nltk.metrics import ConfusionMatrix
from nltk.parse.stanford import StanfordDependencyParser

nlp = spacy.load("en_core_web_sm")
DATA_DIR = "./"
OUTPUT_PATH = './data'

[nltk_data] Downloading package punkt to /home/manil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Data Collection

In [109]:
def scraper(src, n): #function to get text from web
    count = 0 # counter to count that the loop runs n times only
    try:
        page = wikipedia.page(src) #wikipedia page for the item
        l = page.links #get all links from the page
        
        for link in l: # loop to extract text and make n text files from these wiki links in the said wikipedia page
            if count < n+1:
                p = wikipedia.page(title = link)
                file_name = "{}.txt".format(p.title)
                complete_path = os.path.join(OUTPUT_PATH, file_name)
                with open(complete_path, "w", encoding= 'utf-8' ) as text_file: # writing text into text files
                    text_file.write(p.summary)
                count = count +1    
            else:
                break     
    except Exception as e:
        pass

In [116]:
topic_list = ["human rights", "Pyramids", "Classical conditioning", "Great Barrier Reef"]
for topic in topic_list:
    scraper(topic, 30)



  lis = BeautifulSoup(html).find_all('li')


### Sentence segmentation

In [117]:
def segment_data(directory):
  """ This function takes a directory (str) as an input 
  merges all files and performs sentence segmentation with spacy and nltk.
  Returns: DataFrame with columns :
  [original_text,shared_sentences,unique_to_spacy,unique_to_nltk,nltk_segmentation,spacy_segmentation]
  """
  content = []
  for entry in glob.glob(os.path.join(directory, "*.txt")): # reading each file in the object
      with open(entry,'r') as f: # opening each file
        summaries = f.read()
        content.append(summaries) # append text data into content list
        string_text = ' '.join(content) # converting all list elements of content object into one long string to be accepted by spacy and nltk

  nlp.max_length = 20000000
  spacy_sents1 = [sent.text for sent in nlp(string_text).sents] # creating list of segmented sentences of spacy
  sentences_nltk = nltk.sent_tokenize(string_text) # segmentation nltk

  df = pd.DataFrame()
  one_spacy = pd.Series(spacy_sents1)
  one_nltk = pd.Series(sentences_nltk)

  shared = one_spacy[one_spacy.isin(one_nltk)] # filtering only shared sentences that are segmented the same by the two modules
  unique_to_spacy = one_spacy[~one_spacy.isin(one_nltk)] # filtering unique sentences segmented by spacy
  unique_to_nltk = one_nltk[~one_nltk.isin(one_spacy)] # filtering unique sentences segmented by spacy
  df['shared_sentences'] = shared
  df['unique_to_spacy'] = unique_to_spacy.reset_index(drop=True)
  df['unique_to_nltk'] = unique_to_nltk.reset_index(drop=True)
  df['nltk_segmentation']  = one_nltk.reset_index(drop=True)
  df['spacy_segmentation'] = one_spacy.reset_index(drop=True)
  df = pd.concat([pd.Series(string_text, name='original_text'), df], axis=1)
  return df

In [118]:
df = segment_data(OUTPUT_PATH)
df.to_csv('./segmentation_data.csv', index=False)

### Tokenisation & PosTag

In [9]:
df = pd.read_csv("./segmentation_data.csv")


Set of unique tokens and their respective size

In [10]:
unsegmented_text= str(df['original_text'][0]).lower() # the original text without sentence segmentation
unsegmented_text_clean = re.sub(r'[\s+|\n+]', ' ', unsegmented_text).strip() # remove linebreaks and extra spaces from text before tokenizing

spacy_token_list = [token.text for token in nlp(unsegmented_text_clean)] # get all tokens in the text using spacy
nltk_token_list = nltk.word_tokenize(unsegmented_text_clean) # get all tokens in the text using nltk

uniq_token_nltk = set(nltk_token_list) # remove repetitions by converting to set for nltk
uniq_token_spacy = set(spacy_token_list) # remove repetitions by converting to set for spacy
print("Set of unique tokens (Spacy)", uniq_token_spacy)
print("Set of unique tokens (nltk)", uniq_token_nltk)

Set of unique tokens (Spacy) {'occasional', 'play', 'elam', 'proved', '2017', 'woodworking', 'encountered', 'damage', 'world', 'spill', "'s", 'visitors', 'struck', 'loading', 'retroduction', '5', 'monocultures', 'layers', 'basis', 'formally', 'rest', 'many', 'advisor', 'p.l', 'energy', 'push', 'distress', '100', 'absent', 'actually', 'deficit', '35', 'investigations', 'common', 'charles', 'article', 'sentenced', 'displays', 'trade', 'acheuléen', 'saudi', 'vehicle', '2014', 'removed', 'blockade', 'shimamura', 'lvt', 'amnestia', 'period', 'items', 'unify', 'зубков', 'consisting', 'sum', 'important', 'medium', 'neuropsychologist', 'maladaptive', 'crustaceans', 'first', 'necessary', 'i', 'watch', 'francesco', 'cargo', 'inter', 'distinctive', '>', 'wagner', 'surroundings', 'varuna', 'opération', 'technology.artificial', 'his', '1973', 'patients', 'garry', 'italy', 'm', 'decreasing', '1976', 'fleshy', 'environmental', 'shrines', 'anatoly', 'lives', 'country', 'technologies', '1990s', 'nearby

In [11]:
size_vocab_nltk = len(uniq_token_nltk) # number of unique tokens identified by nltk in unsegmented text
size_vocab_spacy = len(uniq_token_spacy) # number of unique tokens identified by spacy in unsegmented text
print("Size of vocab (Spacy) :", size_vocab_spacy)
print("Size of vocab (nltk) :", size_vocab_nltk)

Size of vocab (Spacy) : 3235
Size of vocab (nltk) : 3270


Intersection of these two vocabularies

In [123]:
common_tokens_noseg = uniq_token_nltk.intersection(uniq_token_spacy) # common tokens identified by both spacy and nltk libraries
print(common_tokens_noseg)

{'polyps', 'soil', 'decreasing', 'nominations', 'борис', 'orders', 'many', 'proceeds', 'abuse', 'works', 'assumes', 'metropolitan', 'before', 'he', 'integration', 'emetic', 'tuning', '2016.the', 'assessed', 'force', 'emits', 'challenging', 'shallow', 'around', 'ben', 'amnesty', 'opening', 'reflex', 'parliament', 'it', 'been', 'kub', 'birmingham', 'thrasybulus', 'cançado', 'april', 'ukraine', 'held', 'equivalent.in', 'consensus', 'erie', 'g.', 'into', 'buffalo', 'testing', 'videos', 'italy', 'pioneered', 'creation', 'changed', 'adviser', 'diplomatic', 'suggested', 'third', 'task', 'earlier', 'movement', 'slovakia', 'complex', 'coined', 'fraud', 'fertility', 'functions', 'over', 'passively', 'gorgonin', 'key', 'semantic', 'cultural', 'park', 'fired', 'heavy', 'named', 'unnacepted', 'served', 'reflexes', 'landing', 'chatgpt', 'boris', 'other', 'products', '1977', 'claimants', 'vessels', '94', 'export', 'credited', 'individual', 'airavatesvara', 'anne', 'agent', 'continent', 'atlantic', 't

Now we will extract, sharedTokensInSentences, but we will also at the same time extract the pos-tags for each library to ease the processing later

In [12]:
# Create both tokenisations in sentences
spacy_tokens_sentences  = []
nltk_tokens_sentences = []
shared_token_in_sentences = []
for sentence in df['shared_sentences']: # loop to get tokens after sentence segmentation 
    segmented_text_clean = re.sub(r'[\s+|\n+]', ' ', str(sentence)).strip()
    spacy_segm = [token for token in nlp(str(sentence))]
    nltk_segm = nltk.pos_tag(nltk.word_tokenize(str(sentence)), tagset='universal')
    spacy_tokens_sentences.append(spacy_segm)
    nltk_tokens_sentences.append(nltk_segm)

In [13]:
# Extract shared tokens as well as each of nltk and spacy tags
for i, sent in enumerate(spacy_tokens_sentences):
    for token in sent:
        # Split nltk postag list into 2, one for token and one for tag
        nltk_token_tag = list(zip(*nltk_tokens_sentences[i]))
        if token.text in nltk_token_tag[0]:
            # Lookup for token index in nltk token list
            nltk_tag = nltk_token_tag[1][nltk_token_tag[0].index(token.text)]
            shared_token_in_sentences.append((token.text, token.pos_, nltk_tag))

Set of shared tokens in sentences

In [126]:
print(set(list(zip(*shared_token_in_sentences))[0]))

{'polyps', 'soil', 'decreasing', 'nominations', 'orders', 'many', 'proceeds', 'abuse', 'Instead', 'works', 'assumes', 'before', 'metropolitan', 'he', 'integration', 'tuning', 'Simon', 'around', 'challenging', 'amnesty', 'Export', 'reflex', 'it', 'been', 'Alba', 'held', 'consensus', 'into', 'testing', 'videos', 'Magdalenian', 'Italy', 'diplomatic', 'Sebutinde', 'Thus', 'suggested', 'third', 'task', 'earlier', 'fraud', 'complex', 'French', 'fertility', 'over', 'functions', 'key', 'semantic', 'cultural', 'named', 'served', 'Bay', 'other', 'US', '1977', 'products', 'claimants', '94', 'vessels', 'credited', 'individual', 'agent', 'continent', 'tang', 'Neuroscience', 'selectively', 'aversives', 'oldest', 'under', 'newly', 'source', 'that', 'include', 'activities', 'conditioning', 'sculptor', 'cattle', 'reef', 'Regional', 'children', '1817', 'synchronously', 'breeding', 'Dordogne', 'immediately', 'owner', 'practice', 'generative', 'strongly', 'structure', 'Scleractinia', 'whilst', 'coronation

Get the number of times & ratio for which each token is assigned to the same tag by both libraries

In [127]:
from collections import defaultdict, Counter
token_count = Counter(list(zip(*shared_token_in_sentences))[0])
agreement = defaultdict(int)
for token in shared_token_in_sentences :
    if token[1]==token[2]:
        agreement[token[0]]+=1

print("Number of times spacy and nltk agree for each token: ", agreement)
ratios = {k:agreement[k]/token_count[k] for k in agreement}
print("Ratio of agreement: ", ratios)

Number of times spacy and nltk agree for each token:  defaultdict(<class 'int'>, {'The': 52, '2005': 2, 'held': 14, 'between': 15, '14': 3, '16': 2, 'a': 122, 'summit': 1, 'meeting': 1, 'the': 414, '2000': 1, 'which': 23, 'led': 3, 'of': 304, 'Representatives': 1, 'including': 13, 'nearly': 1, '200': 2, 'leaders': 1, 'then': 3, '191': 1, 'member': 3, 'states': 2, 'met': 1, 'in': 155, 'for': 40, 'what': 4, 'described': 4, 'opportunity': 1, 'take': 3, 'bold': 1, 'decisions': 1, 'areas': 2, 'development': 3, 'security': 1, 'human': 10, 'rights': 1, 'reform': 1, 'Adzes': 1, 'used': 10, 'They': 7, 'smoothing': 1, 'carving': 1, 'wood': 2, 'hand': 3, 'woodworking': 1, 'hoe': 1, 'agriculture': 1, 'horticulture': 1, 'Two': 2, 'basic': 1, 'forms': 1, 'an': 17, 'adze': 3, 'tool': 3, 'with': 28, 'one': 10, 'foot': 2, 'capable': 2, 'powerful': 1, 'swings': 1, 'using': 1, 'hands': 1, 'edge': 1, 'usually': 4, 'striking': 2, 'at': 23, 'shin': 1, 'level': 4, 'A': 8, 'similar': 4, 'called': 5, 'mattock'

Frequency mapping for each PosTag

In [128]:
# ref:spacy row:nltk
print(ConfusionMatrix(list(zip(*shared_token_in_sentences))[1], list(zip(*shared_token_in_sentences))[2]).pretty_format(show_percents=True))

      |                                         C                                                       P             P      S               |
      |                                         C      C             I      N             P      P      R             U      C      V        |
      |             A      A      A      A      O      O      D      N      O      N      A      R      O      P      N      O      E        |
      |             D      D      D      U      N      N      E      T      U      U      R      O      P      R      C      N      R        |
      |      .      J      P      V      X      J      J      T      J      N      M      T      N      N      T      T      J      B      X |
------+--------------------------------------------------------------------------------------------------------------------------------------+
    . |     <.>     .      .      .      .      .      .      .      .      .      .      .      .      .      .      .      .      .      . |

### Bonus

### Agreement on dependency relations

In [None]:
!unzip ./stanford-corenlp-4.2.2.zip

In [31]:
jar_path = './stanford-corenlp-4.2.2/stanford-corenlp-4.2.2.jar'

# Path to CoreNLP model jar
models_jar_path = './stanford-corenlp-4.2.2-models-english.jar'

dependency_parser = StanfordDependencyParser(path_to_jar=jar_path, path_to_models_jar=models_jar_path)

Please use [91mnltk.parse.corenlp.CoreNLPDependencyParser[0m instead.
  


In [7]:
# nltk dependency parsing (based on Stanford parser)
nltk_dep_sentences = []
for sentence in df['shared_sentences']: 
    segmented_text_clean = re.sub(r'[\s+|\n+]', ' ', str(sentence)).strip()
    nltk_segm = dependency_parser.raw_parse(str(sentence))
    dependency = nltk_segm.__next__()
    nltk_dep_sentences.append([(link[0][0], link[1], link[2][0]) for link in list(dependency.triples())])

In [26]:
shared_rel_in_sentences = []
for i, sent in enumerate(spacy_tokens_sentences):
    for token in sent:
        # Split nltk dependency list into 3 lists:
        # 0: head, 1: relation, 2:governee
        nltk_deps = list(zip(*nltk_dep_sentences[i]))
        if len(nltk_deps) >= 3:
            if token.text in nltk_deps[2] and token.head.text in nltk_deps[0]:
                rel_idx = nltk_deps[2].index(token.text)
                nltk_rel = nltk_deps[1][rel_idx]
                shared_rel_in_sentences.append((token.text, token.head.text, nltk_rel, token.dep_))
            else:
                # if spacy identified one relation and nltk didnt we mark the relation as NLTKNOREL
                shared_rel_in_sentences.append((token.text, token.head.text, token.dep_, "NLTKNOREL"))

Frequency mapping for each dependency

In [29]:
print(ConfusionMatrix(list(zip(*shared_rel_in_sentences))[2], list(zip(*shared_rel_in_sentences))[3]).pretty_format(show_percents=True))

             |                                                                                                                                                  c                                                                                                                                                                                                                                                             |
             |                                                                                                                                                  o                                                                                                                                                                                                                                                             |
             |                                                                                                                             c                    m       