In [1]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m


In [2]:
# Import nltk and the brown module
import nltk
from nltk.corpus import brown

In [3]:
# Download the ncessary data from the dataset
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [4]:
# Load in our corpus using the universal tag-set. The dataset is in the form of a ist of lists of tuples
corpus = brown.tagged_sents(tagset='universal')
corpus

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [5]:
# Separate the inputs and targets to make the dataset usable to hugingface
inputs = []
targets = []

for sentence_tag_pairs in corpus:
  tokens = []
  target = []
  for token, tag in sentence_tag_pairs:
    tokens.append(token)
    target.append(tag)
  inputs.append(tokens)
  targets.append(target)

In [6]:
# Save data to json format
import json

with open('data.json', 'w') as f:
  for x, y in zip(inputs,targets):
    j = {'inputs':x, 'targets': y}
    s = json.dumps(j)
    f.write(f"{s}\n")


In [7]:
# Import function
from datasets import load_dataset

In [8]:
# Call load_dataset
data = load_dataset("json", data_files='data.json')

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-359176ab57eb26fb/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-359176ab57eb26fb/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Inspect the data
data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 57340
    })
})

In [10]:
# 60k samples is to much, shuffle the dataset and take the first 20k samples
small = data["train"].shuffle(seed=42).select(range(20_000))
small

Dataset({
    features: ['inputs', 'targets'],
    num_rows: 20000
})

In [11]:
# Do a train test split
data = small.train_test_split(seed=42)

In [12]:
# Check out the first sample of the dataset, the input is a list of words, the target is a list of tags
data["train"][0]

{'inputs': ['Ulyate',
  'and',
  'Kearton',
  'climbed',
  'on',
  'toward',
  'the',
  'sound',
  'of',
  'the',
  'barking',
  'of',
  'the',
  'dogs',
  'and',
  'the',
  'sporadic',
  'roaring',
  'of',
  'the',
  'lion',
  ',',
  'till',
  'they',
  'came',
  ',',
  'out',
  'of',
  'breath',
  ',',
  'to',
  'the',
  'crest',
  ',',
  'and',
  'peering',
  'through',
  'the',
  'branches',
  'of',
  'a',
  'bush',
  ',',
  'this',
  'is',
  'what',
  'Ulyate',
  'saw',
  ':',
  'Jones',
  'who',
  'had',
  'apparently',
  '(',
  'and',
  'actually',
  'had',
  ')',
  'ridden',
  'up',
  'the',
  'nearly',
  'impassable',
  'hillside',
  ',',
  'sitting',
  'calmly',
  'on',
  'his',
  'horse',
  'within',
  'forty',
  'feet',
  'of',
  'a',
  'full-grown',
  'young',
  'lioness',
  ',',
  'who',
  'was',
  'crouched',
  'on',
  'a',
  'flat',
  'rock',
  'and',
  'seemed',
  'just',
  'about',
  'to',
  'charge',
  'him',
  ',',
  'while',
  'the',
  'dogs',
  'whirled',
  'aroun

In [13]:
# Check out the features attribute of our dataset
data["train"].features

{'inputs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'targets': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [14]:
# Map targets to ints
target_set = set()
for target in targets:
  target_set = target_set.union(target)
target_set


{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

In [15]:
# Create id2label and label2id
target_list = list(target_set)
id2label = {k: v for k, v in enumerate(target_list)}
label2id = {v: k for k, v in id2label.items()}

In [16]:
# Create the auto-tokenizer, you can try to use bert if u wish and compare the results
from transformers import AutoTokenizer

# Also try using Bert
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [17]:
# Test the tokenizer on the first sample of our dataset
idx = 0
t = tokenizer(data["train"][idx]["inputs"], is_split_into_words=True)
t

{'input_ids': [101, 158, 25928, 1566, 1105, 26835, 9349, 1320, 5998, 1113, 1755, 1103, 1839, 1104, 1103, 26635, 1104, 1103, 6363, 1105, 1103, 188, 27695, 23041, 1104, 1103, 11160, 117, 6174, 1152, 1338, 117, 1149, 1104, 2184, 117, 1106, 1103, 13468, 117, 1105, 19205, 1194, 1103, 5020, 1104, 170, 13771, 117, 1142, 1110, 1184, 158, 25928, 1566, 1486, 131, 2690, 1150, 1125, 4547, 113, 1105, 2140, 1125, 114, 17698, 1146, 1103, 2212, 24034, 11192, 1895, 25068, 117, 2807, 13285, 1113, 1117, 3241, 1439, 5808, 1623, 1104, 170, 1554, 118, 4215, 1685, 11160, 5800, 117, 1150, 1108, 15062, 1113, 170, 3596, 2067, 1105, 1882, 1198, 1164, 1106, 2965, 1140, 117, 1229, 1103, 6363, 18370, 1213, 1123, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [18]:
# The output is not a dict
type(t)

transformers.tokenization_utils_base.BatchEncoding

In [19]:
# the batchencoding object has a tokens method
t.tokens()

['[CLS]',
 'U',
 '##lya',
 '##te',
 'and',
 'Ke',
 '##art',
 '##on',
 'climbed',
 'on',
 'toward',
 'the',
 'sound',
 'of',
 'the',
 'barking',
 'of',
 'the',
 'dogs',
 'and',
 'the',
 's',
 '##poradic',
 'roaring',
 'of',
 'the',
 'lion',
 ',',
 'till',
 'they',
 'came',
 ',',
 'out',
 'of',
 'breath',
 ',',
 'to',
 'the',
 'crest',
 ',',
 'and',
 'peering',
 'through',
 'the',
 'branches',
 'of',
 'a',
 'bush',
 ',',
 'this',
 'is',
 'what',
 'U',
 '##lya',
 '##te',
 'saw',
 ':',
 'Jones',
 'who',
 'had',
 'apparently',
 '(',
 'and',
 'actually',
 'had',
 ')',
 'ridden',
 'up',
 'the',
 'nearly',
 'imp',
 '##ass',
 '##able',
 'hillside',
 ',',
 'sitting',
 'calmly',
 'on',
 'his',
 'horse',
 'within',
 'forty',
 'feet',
 'of',
 'a',
 'full',
 '-',
 'grown',
 'young',
 'lion',
 '##ess',
 ',',
 'who',
 'was',
 'crouched',
 'on',
 'a',
 'flat',
 'rock',
 'and',
 'seemed',
 'just',
 'about',
 'to',
 'charge',
 'him',
 ',',
 'while',
 'the',
 'dogs',
 'whirled',
 'around',
 'her',
 '.',
 

In [20]:
# Value of i indicates it is the i'th word
# In the input sentence (counting from 0)
t.word_ids()

[None,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 46,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 62,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 75,
 75,
 76,
 77,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 None]

In [21]:
# Define method, tak in list of labels and word id's, transforming frpm string to int
def align_targets(labels, word_ids):
  aligned_labels = []
  for word in word_ids:
    if word is None:
      # It's a token like [CLS]
      label = -100
    else:
      # It's a real word
      label = label2id[labels[word]]

    # Add the label
    aligned_labels.append(label)

  return aligned_labels

In [22]:
# Try our function
labels = data['train'][idx]['targets']
word_ids = t.word_ids()
aligned_targets = align_targets(labels, word_ids)
aligned_targets

[-100,
 11,
 11,
 11,
 10,
 11,
 11,
 11,
 6,
 0,
 4,
 7,
 11,
 4,
 7,
 11,
 4,
 7,
 11,
 10,
 7,
 1,
 1,
 11,
 4,
 7,
 11,
 2,
 4,
 9,
 6,
 2,
 0,
 4,
 11,
 2,
 4,
 7,
 11,
 2,
 10,
 6,
 4,
 7,
 11,
 4,
 7,
 11,
 2,
 7,
 6,
 7,
 11,
 11,
 11,
 6,
 2,
 11,
 9,
 6,
 8,
 2,
 10,
 8,
 6,
 2,
 6,
 4,
 7,
 8,
 1,
 1,
 1,
 11,
 2,
 6,
 8,
 4,
 7,
 11,
 4,
 5,
 11,
 4,
 7,
 1,
 1,
 1,
 1,
 11,
 11,
 2,
 9,
 6,
 6,
 4,
 7,
 1,
 11,
 10,
 6,
 8,
 8,
 0,
 6,
 9,
 2,
 4,
 7,
 11,
 6,
 4,
 9,
 2,
 -100]

In [23]:
# Print out the aligned labels with the tokenized inputs
aligned_labels = [id2label[i] if i >= 0 else None for i in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
  print(f"{x}\t{y}")

[CLS]	None
U	NOUN
##lya	NOUN
##te	NOUN
and	CONJ
Ke	NOUN
##art	NOUN
##on	NOUN
climbed	VERB
on	PRT
toward	ADP
the	DET
sound	NOUN
of	ADP
the	DET
barking	NOUN
of	ADP
the	DET
dogs	NOUN
and	CONJ
the	DET
s	ADJ
##poradic	ADJ
roaring	NOUN
of	ADP
the	DET
lion	NOUN
,	.
till	ADP
they	PRON
came	VERB
,	.
out	PRT
of	ADP
breath	NOUN
,	.
to	ADP
the	DET
crest	NOUN
,	.
and	CONJ
peering	VERB
through	ADP
the	DET
branches	NOUN
of	ADP
a	DET
bush	NOUN
,	.
this	DET
is	VERB
what	DET
U	NOUN
##lya	NOUN
##te	NOUN
saw	VERB
:	.
Jones	NOUN
who	PRON
had	VERB
apparently	ADV
(	.
and	CONJ
actually	ADV
had	VERB
)	.
ridden	VERB
up	ADP
the	DET
nearly	ADV
imp	ADJ
##ass	ADJ
##able	ADJ
hillside	NOUN
,	.
sitting	VERB
calmly	ADV
on	ADP
his	DET
horse	NOUN
within	ADP
forty	NUM
feet	NOUN
of	ADP
a	DET
full	ADJ
-	ADJ
grown	ADJ
young	ADJ
lion	NOUN
##ess	NOUN
,	.
who	PRON
was	VERB
crouched	VERB
on	ADP
a	DET
flat	ADJ
rock	NOUN
and	CONJ
seemed	VERB
just	ADV
about	ADV
to	PRT
charge	VERB
him	PRON
,	.
while	ADP
the	DET
dogs	NOUN
whirled	VER

In [24]:
# Tokenize both inputs and targets
def tokenize_fn(batch):
  # tokenize the input sequence first
  # this populates input_ids. attention_mask, etc
  tokenized_inputs = tokenizer(
      batch['inputs'], truncation=True, is_split_into_words=True
  )

  labels_batch = batch['targets'] # original targets
  aligned_labels_batch = []
  for i, labels in enumerate(labels_batch):
    word_ids = tokenized_inputs.word_ids(i)
    aligned_labels_batch.append(align_targets(labels, word_ids))

    # recall: the 'target' must be stored in key called 'labels'
    tokenized_inputs['labels'] = aligned_labels_batch

    return tokenized_inputs

In [25]:
# Remove XX from model inputs - they are neither inputs nor targets
data["train"].column_names

['inputs', 'targets']

In [26]:
print(len(data
          ))
print(data)

2
DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 5000
    })
})


In [27]:
# Map the tokenize function to each sample in our dataset, batching to make it more efficent and we remove the colums
tokenized_datasets = data.map(
    tokenize_fn,
    batched=True, batch_size=1, num_proc=2,
    remove_columns=data["train"].column_names,
)

Map (num_proc=2):   0%|          | 0/15000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [28]:
# Check the outputs
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [29]:
 # Create the data collator
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [30]:
# Flattens a list of lists
def flatten(list_of_lists):
  flattened = [val for sublist in list_of_lists for val in sublist]
  return flattened

In [31]:
# Compute metrics function
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(logits_and_labels):
  logits, labels = logits_and_labels
  preds = np.argmax(logits, axis= -1)

  # remove -100 from labels and predictions
  labels_jagged = [[t for t in label if t != -100] for label in labels]

  # do the same for predictions whenever true label is -100
  preds_jagged = [[p for p , t in zip(ps, ts) if t != -100] \
      for ps, ts in zip(preds, labels)
  ]

  # flatten labels nd preds
  labels_flat = flatten(labels_jagged)
  preds_flat = flatten(preds_jagged)

  acc = accuracy_score(labels_flat, preds_flat)
  f1 = f1_score(labels_flat, preds_flat, average='macro')

  return {
      'f1':f1,
      'accuracy':acc,
  }


In [32]:
# Test our compute metrics function
labels = [[-100, 0, 0, 1, 2, 1, -100]]
logits = np.array([[
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1],
    [0.8, 0.1, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
]])
compute_metrics((logits, labels))

{'f1': 0.6, 'accuracy': 0.8}

In [33]:
# Load up our pre-trained model
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

In [34]:
# Create the training-arguments object
from transformers import TrainingArguments

training_args = TrainingArguments(
    "distilbert-finetuned-ner",
    evaluation_strategy="epoch",
    num_train_epochs=2,
)

In [35]:
# Create the trainer object an passing in what we've created above
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)
trainer.train() # Begins the fine-tuning process

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0689,0.056895,0.938691,0.983666
2,0.0253,0.053224,0.953981,0.985421


TrainOutput(global_step=3750, training_loss=0.06740278498331706, metrics={'train_runtime': 247.7399, 'train_samples_per_second': 121.095, 'train_steps_per_second': 15.137, 'total_flos': 386431490254464.0, 'train_loss': 0.06740278498331706, 'epoch': 2.0})

In [37]:
# Save our model
trainer.save_model('my_saved_model')

In [41]:
# Load up our model as a pipeline object
from transformers import pipeline

pipe = pipeline(
    "token-classification",
    model='my_saved_model',
    device=0,
)

In [43]:
# Test the pipeline on a simple sentence
s = "Bill Gates was the CEO of Microsoft in Seattle, Washington."
pipe(s)

[{'entity': 'NOUN',
  'score': 0.9996824,
  'index': 1,
  'word': 'Bill',
  'start': 0,
  'end': 4},
 {'entity': 'NOUN',
  'score': 0.99937457,
  'index': 2,
  'word': 'Gates',
  'start': 5,
  'end': 10},
 {'entity': 'VERB',
  'score': 0.9998348,
  'index': 3,
  'word': 'was',
  'start': 11,
  'end': 14},
 {'entity': 'DET',
  'score': 0.99989736,
  'index': 4,
  'word': 'the',
  'start': 15,
  'end': 18},
 {'entity': 'NOUN',
  'score': 0.9995834,
  'index': 5,
  'word': 'CEO',
  'start': 19,
  'end': 22},
 {'entity': 'ADP',
  'score': 0.9998374,
  'index': 6,
  'word': 'of',
  'start': 23,
  'end': 25},
 {'entity': 'NOUN',
  'score': 0.9995592,
  'index': 7,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity': 'ADP',
  'score': 0.9997334,
  'index': 8,
  'word': 'in',
  'start': 36,
  'end': 38},
 {'entity': 'NOUN',
  'score': 0.9998109,
  'index': 9,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity': '.',
  'score': 0.99982244,
  'index': 10,
  'word': ',',
  'st

In [44]:
s = "Bruce Wayne livs in Gotham City, he goes under the name Batman."
pipe(s)

[{'entity': 'NOUN',
  'score': 0.9985576,
  'index': 1,
  'word': 'Bruce',
  'start': 0,
  'end': 5},
 {'entity': 'NOUN',
  'score': 0.99945265,
  'index': 2,
  'word': 'Wayne',
  'start': 6,
  'end': 11},
 {'entity': 'VERB',
  'score': 0.92318225,
  'index': 3,
  'word': 'l',
  'start': 12,
  'end': 13},
 {'entity': 'VERB',
  'score': 0.8592201,
  'index': 4,
  'word': '##iv',
  'start': 13,
  'end': 15},
 {'entity': 'VERB',
  'score': 0.99387383,
  'index': 5,
  'word': '##s',
  'start': 15,
  'end': 16},
 {'entity': 'ADP',
  'score': 0.99960154,
  'index': 6,
  'word': 'in',
  'start': 17,
  'end': 19},
 {'entity': 'NOUN',
  'score': 0.9995672,
  'index': 7,
  'word': 'Gotham',
  'start': 20,
  'end': 26},
 {'entity': 'NOUN',
  'score': 0.99978787,
  'index': 8,
  'word': 'City',
  'start': 27,
  'end': 31},
 {'entity': '.',
  'score': 0.99987173,
  'index': 9,
  'word': ',',
  'start': 31,
  'end': 32},
 {'entity': 'PRON',
  'score': 0.9997727,
  'index': 10,
  'word': 'he',
  'sta

In [45]:
s = "Peter Parker is an American teenager .He lives in New York.What is he like ? He has got short straight brown hair, brown eyes, a long noseand a square face. He has got glasses. He is tall and slim but he isn't very strong !"
pipe(s)

[{'entity': 'NOUN',
  'score': 0.9997806,
  'index': 1,
  'word': 'Peter',
  'start': 0,
  'end': 5},
 {'entity': 'NOUN',
  'score': 0.9997851,
  'index': 2,
  'word': 'Parker',
  'start': 6,
  'end': 12},
 {'entity': 'VERB',
  'score': 0.99986565,
  'index': 3,
  'word': 'is',
  'start': 13,
  'end': 15},
 {'entity': 'DET',
  'score': 0.9997782,
  'index': 4,
  'word': 'an',
  'start': 16,
  'end': 18},
 {'entity': 'ADJ',
  'score': 0.9987219,
  'index': 5,
  'word': 'American',
  'start': 19,
  'end': 27},
 {'entity': 'NOUN',
  'score': 0.9986669,
  'index': 6,
  'word': 'teenager',
  'start': 28,
  'end': 36},
 {'entity': '.',
  'score': 0.9998491,
  'index': 7,
  'word': '.',
  'start': 37,
  'end': 38},
 {'entity': 'PRON',
  'score': 0.9998318,
  'index': 8,
  'word': 'He',
  'start': 38,
  'end': 40},
 {'entity': 'VERB',
  'score': 0.9997322,
  'index': 9,
  'word': 'lives',
  'start': 41,
  'end': 46},
 {'entity': 'ADP',
  'score': 0.99954456,
  'index': 10,
  'word': 'in',
  's