In [20]:
import nltk

from nltk.tokenize import word_tokenize
import spacy
import pandas as pd
import re
import pandas as pd
import spacy
import nltk 
nltk.download('punkt')
import requests
import wikipedia
import wptools
import os

nlp = spacy.load("en_core_web_sm")


### Data Collection

In [21]:
def scarpper(src, n): #function to get text from web
    count = 0 # counter to count that the loop runs n times only
    try:
        page = wikipedia.page(src) #wikipedia page for the item
        l = page.links #get all links from the page
        
        for link in l: # loop to extract text and make n text files from these wiki links in the said wikipedia page
            if count < n+1:
                p = wikipedia.page(title = link)
                path = '/content/arcs'
                file_name = "{}.txt".format(p.title)
                complete_path = os.path.join(path, file_name)
                with open(complete_path, "w", encoding= 'utf-8' ) as text_file: # writing text into text files
                    text_file.write(p.summary)
                count = count +1    
            else:
                break    
            
    except Exception as e:
        print(e)

### Sentence segmentation

In [None]:
#### reading files from folder and extracting texts
def segment_data(directory:str):
  ''' the function takes a directory in a string format as an input to read files from a directory then outputs the result in a string format
  that is compatible with spacy and nltk for further preprocessing of data.
  Then it creates a global object: 'string_text' to be used for other functions if needed.
  then the function calls for spacy and nltk to preprocess the text and segment the sentences. THE FINAL OUTPUTS IS :
  a global dataframe with four columns that contain:
  orignal_text, shared_sentences, unique_to_spacy, unique_to_nltk '''
  print('FETCHING ARTICLES FROM DIRECTORY', '*' * 60)
  files = os.scandir(directory) # creating an os object 
  content = []  
  print("\nOBJECTS : 'string_text' and 'df' ARE GLOBAL OBJECTS FOR FURTHER MANIPULATION OF YOUR CHOICE")
  for entry in files: # reading each file in the object
    if entry.is_file():
      with open(entry.path,'r') as f: # opening each file
        summaries = f.read()
        content.append(summaries) # append text data into content list
        ################################
        global string_text # making string_text global to be used in other preprocessing functions
        string_text = ' '.join(content) # converting all list elements of content object into one long string to be accepted by spacy and nltk
  print('\nDataFrame' , '*' * 60)
  nlp = spacy.load('en_core_web_sm')
  nlp.max_length = 20000000
  spacy_ = nlp(string_text)
  spacy_sents1 = [sent.text for sent in spacy_.sents] # creating list of segmented sentences of spacy
  print('\nSpacy Done With No Errors!')
  print('*'* 50)
  sentences_nltk = nltk.sent_tokenize(string_text) # segmentation nltk
  print('Nltk Done With No Errors!')
  print('\nBe careful the resulting dataframe may be unbalanced due to different results from spacy and nltk in the unique sentences:')
  global df # making the df global so we can manipulate it after if needed
  df = pd.DataFrame()
  one_spacy = pd.Series(spacy_sents1)
  print(len(one_spacy))
  one_nltk = pd.Series(sentences_nltk)
  shared = one_spacy[one_spacy.isin(one_nltk)] # filtering only shared sentences that are segmented the same by the two modules
  unique_to_spacy = one_spacy[~one_spacy.isin(one_nltk)] # filtering unique sentences segmented by spacy
  unique_to_nltk = one_nltk[~one_nltk.isin(one_spacy)] # filtering unique sentences segmented by spacy
  df['shared_sentences'] = shared
  df['unique_to_spacy'] = unique_to_spacy.reset_index(drop=True)
  df['unique_to_nltk'] = unique_to_nltk.reset_index(drop=True)
  df['nltk_segmentation']  = one_nltk.reset_index(drop=True)
  df['spacy_segmentation'] = one_spacy.reset_index(drop=True)
  df = pd.concat([pd.Series(string_text, name='original_text'), df], axis=1) # create new Series for original text and concatenate with DataFrame  df = df.reset_index(drop=True)
  return df

### Tokenisation & PosTag

the vocabulary (set of unique tokens) recognised by each library
and their respective size

• the intersection of these two vocabularies (which tokens are part of
both vocabularies?)
• the sets of tokens that is specific to each library (i.e., SpacyOnly:
tokens that are in Spacy vocabulary but not in Stanza’s vs. Stanza-
Only: tokens that are in Stanza vocabulary but not in Spacy’s).
• the set of shared token occurrences i.e., tokens that are identified
by both libraries before and after

In [2]:
df = pd.read_csv("./segmentation_data.csv")


In [3]:
unsegmented_text= str(df['original_text'][0]).lower() # the original text without sentence segmentation
unsegmented_text_clean = re.sub(r'[\s+|\n+]', ' ', unsegmented_text).strip() # remove linebreaks and extra spaces from text before tokenizing

spacy_token_list = [token.text for token in nlp(unsegmented_text_clean)] # get all tokens in the text using spacy
nltk_token_list = nltk.word_tokenize(unsegmented_text_clean) # get all tokens in the text using nltk

uniq_token_nltk = set(nltk_token_list) # remove repetitions by converting to set for nltk
uniq_token_spacy = set(spacy_token_list) # remove repetitions by converting to set for spacy
print("Set of unique tokens (Spacy)", uniq_token_spacy)
print("Set of unique tokens (nltk)", uniq_token_nltk)

Set of unique tokens (Spacy) {'detention', 'political', 'underwater', 'old', 'specialists', 'sociedad', "tul'skiy", 'supervised', 'sum', 'bimonthly', 'sunflower', 'bodied', 'potato', 'transitional', 'bycatch', 'lectured', 'strategies.cbt', 'crescent', 'put', 'emblematic', 'student', '21st', 'glycerol', 're', 'masterminded', 'subtropical', '.', '216', 'spain', '—', 'kingdom', 'tv', 'josé', 'hypothesized', '"the', 'capability', 'fraud', 'islands', 'travels', 'exercised', 'tharsis', 'neuron', 'alcohol', 'ufc', 'test', 'introduced', 'austrian', 'criteria', 'harmful', 'sprouts', 'hunting', 'h.', 'centered', 'even', 'conversion', 'membrane', 'evoking', 'contemporary', '33', 'somewhat', 'underside', 'switzerland', 'beating', 'cues', 'asm', 'features', 'extremely', 'publishes', 'carlsbad', 'diaphragm', 'areas.the', 'tomahawk', 'permitted', 'recollected', 'emperor', 'standards', 'bacteria', 'behaviorism', 'altepemeh', 'buoyancy', 'depends', 'sorties', 'negotiations', 'pay', 'mi', 'disproven', '

In [4]:
size_vocab_nltk = len(uniq_token_nltk) # number of unique tokens identified by nltk in unsegmented text
size_vocab_spacy = len(uniq_token_spacy) # number of unique tokens identified by spacy in unsegmented text
print("Size of vocab (Spacy) :", size_vocab_spacy)
print("Size of vocab (nltk) :", size_vocab_nltk)

Size of vocab (Spacy) : 7717
Size of vocab (nltk) : 7889


In [28]:
common_tokens_noseg = uniq_token_nltk.intersection(uniq_token_spacy) # common tokens identified by both spacy and nltk libraries
print(common_tokens_noseg)

{'crimes', 'lands', 'organic', 'lothar', 'quantitative', 'shaped', 'remained', 'enacted', 'bay', 'spheres', 'odyssey.odyssey', '100–150', 'gormley', 'passing', 'weaving', 'seals', 'viktor', 'centre', 'predisposed', '1994', 'italian', 'metal.in', 'recordings', 'newcastle', 'professional', 'city.in', 'evaluated', 'unnacepted', 'ravaged', 'salvador', 'reported', 'am', 'min', 'define', 'decade', 'ingredient', 'torture', 'strategic', 'sangoan', 'kubrick', 'cases', 'f4', 'gaining', 'logical', 'raised', 'paleolithic', 'sailing', 'travelling', 'ufc', 'sometimes', 'latest', 'interacting', 'elsevier', 'tangible', 'surgeonfishes', 'frequently', 'gambier', 'bottom', 'hippocampus', 'trains', 'mexica', 'operant', 'societies', 'land', 'peace', 'contracts', 'failure', 'integration', 'labour', '1905–1946', 'blanche', 'graduates', 'unexpected', 'laysan', 'articles', 'mimbreño', 'irwin', 'monday', 'g15', 'seizing', 'repairs', 'éké', 'arámburu', 'liga', 'unconscious', '9,900', 'injuries', 'along', '10th',

Now we will extract, sharedTokensInSentences, but we will also at the same time extract the pos-tags for each library to ease the processing later

In [5]:
spacy_tokens_sentences  = []
nltk_tokens_sentences = []
shared_token_in_sentences = []
for sentence in df['shared_sentences']: # loop to get tokens after sentence segmentation 
    segmented_text_clean = re.sub(r'[\s+|\n+]', ' ', str(sentence)).strip()
    spacy_segm = [token for token in nlp(str(sentence))]
    nltk_segm = nltk.pos_tag(nltk.word_tokenize(str(sentence)), tagset='universal')
    spacy_tokens_sentences.append(spacy_segm)
    nltk_tokens_sentences.append(nltk_segm)

In [None]:
tagset_map = {
    "":""
}

In [6]:
for i, sent in enumerate(spacy_tokens_sentences):
    for token in sent:
        nltk_token_tag = list(zip(*nltk_tokens_sentences[i]))
        if token.text in nltk_token_tag[0]:
            nltk_tag = nltk_token_tag[1][nltk_token_tag[0].index(token.text)]
            shared_token_in_sentences.append((token.text, token.pos_, nltk_tag))

In [34]:
# The set of shared tokens in sentences
print(set(list(zip(*shared_token_in_sentences))[0]))

{'organic', 'Indiana', 'quantitative', 'shaped', 'enacted', 'remained', '100–150', 'passing', 'Norte', 'Summit', 'centre', 'Law', 'predisposed', '1994', 'Online', 'recordings', 'professional', 'ravaged', 'reported', 'Dodan', 'Harris', 'define', 'decade', 'Ben', 'Pressearchiv', 'ingredient', 'torture', 'strategic', 'Ismenius', 'Buttes', 'cases', 'Austria', 'gaining', 'logical', 'raised', 'sailing', 'travelling', 'Arévalo', 'sometimes', 'latest', 'interacting', 'tangible', 'surgeonfishes', 'frequently', 'trains', 'Random', 'societies', 'land', 'peace', 'contracts', 'failure', 'integration', 'Dover', 'labour', 'articles', 'Fuentes', 'Doubleday', 'seizing', 'repairs', 'éké', 'unconscious', '9,900', 'Prizes', 'injuries', 'along', '10th', 'methods', 'complement', 'attitudes', 'addressed', 'sentiments', 'presentations', 'toxic', 'inhibitor', 'Medicine', 'signature', 'lake', 'Behring', 'Allen', 'ingested', 'sulphuric', 'news', 'vagrants', 'Ports', 'Sangoan', 'infamous', 'Altepetl', 'vary', 'be

Get the number of times & ratio for which each token is assigned to the same tag by both libraries

In [7]:
from collections import defaultdict, Counter
token_count = Counter(list(zip(*shared_token_in_sentences))[0])
agreement = defaultdict(int)
for token in shared_token_in_sentences :
    if token[1]==token[2]:
        agreement[token[0]]+=1

print("Number of times spacy and nltk agree: ", agreement)
ratios = {k:agreement[k]/token_count[k] for k in agreement}
print("Ratio of agreement: ", ratios)

Number of times spacy and nltk agree:  defaultdict(<class 'int'>, {'judges': 11, 'of': 940, 'the': 1528, 'elected': 14, 'during': 30, 'session': 11, 'scheduled': 1, 'for': 176, '8': 4, '17': 3, '2014': 9, 'in': 607, 'The': 270, 'terms': 9, 'nine': 1, 'years': 30, 'took': 17, 'office': 4, 'on': 151, '11': 7, '2015': 4, 'Accord': 1, 'satisfaction': 4, 'a': 432, 'contract': 5, 'law': 13, 'concept': 2, 'about': 15, 'purchase': 1, 'release': 2, 'from': 137, 'debt': 2, 'obligation': 3, 'It': 74, 'one': 43, 'methods': 4, 'by': 197, 'which': 91, 'parties': 5, 'may': 17, 'terminate': 1, 'agreement': 4, 'completed': 2, 'transfer': 3, 'valuable': 1, 'consideration': 2, 'that': 70, 'must': 4, 'actual': 1, 'performance': 2, 'itself': 5, 'accord': 4, 'discharge': 3, 'legal': 5, 'binds': 1, 'A': 26, 'valid': 1, 'prior': 4, 'instead': 5, 'it': 59, 'suspends': 1, 'right': 2, 'enforce': 2, 'accordance': 1, 'with': 117, 'will': 5, 'both': 10, 'contracts': 1, 'person': 4, 'sued': 1, 'over': 32, 'an': 86, 

In [19]:
from nltk.metrics import ConfusionMatrix
# ref:spacy row:nltk
print(ConfusionMatrix(list(zip(*shared_token_in_sentences))[1], list(zip(*shared_token_in_sentences))[2]).pretty_format(show_percents=True))

      |                                         C                                                P             P      S                      |
      |                                         C      C             N             P      P      R             U      C             V        |
      |             A      A      A      A      O      O      D      O      N      A      R      O      P      N      O      S      E        |
      |             D      D      D      U      N      N      E      U      U      R      O      P      R      C      N      Y      R        |
      |      .      J      P      V      X      J      J      T      N      M      T      N      N      T      T      J      M      B      X |
------+--------------------------------------------------------------------------------------------------------------------------------------+
    . |     <.>     .      .      .      .      .      .      .      .      .      .      .      .      .      .      .      .      .      . |