# Preparation

In [1]:
# Read the training set into a DataFrame
from pandas import read_csv

TRAIN_CSV = "data/train.csv"  # resolved against the current working directory
df = read_csv(TRAIN_CSV)

In [2]:
# Load spacy
import spacy
from spacy import displacy

# Load the English pipeline; `nlp` turns raw text into a parsed document
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

# Working with tokens

In [21]:
# Tokenize one document: parse the text of a single row drawn from the dataframe.
# The fixed random_state makes the draw repeatable, so a Restart & Run All parses
# the same row every time and the outputs of the later cells stay consistent
# with each other. Change the seed to look at a different row.
doc = nlp(df.sample(n=1, random_state=42).iloc[0]["text"])

# Define a function to filter tokens that are stop words or punctuation
def is_token_allowed(token):
    """Return True when `token` is worth keeping for analysis.

    A token is rejected when it is falsy (e.g. None), when its text is empty
    or only whitespace, or when it is flagged as a stop word or punctuation.

    Args:
        token: A spaCy token exposing `text`, `is_stop` and `is_punct`,
            or None.

    Returns:
        bool: False for rejected tokens, True otherwise.
    """
    if not token:
        return False
    # Use `token.text`: the old `token.string` attribute was removed in
    # spaCy v3 and raises AttributeError there. Stripping either one gives
    # the same result, since `string` was the text plus trailing whitespace.
    if not token.text.strip():
        return False
    return not (token.is_stop or token.is_punct)

# Keep only the useful tokens; for each one record its stripped, lowercased
# lemma together with its part-of-speech tag and dependency label
tokens = [
    (token.lemma_.strip().lower(), token.pos_, token.dep_)
    for token in filter(is_token_allowed, doc)
]

# Show one "lemma POS dep" line per kept token
for tok, pos, dep in tokens:
    print(f"{tok} {pos} {dep}")

bayelsa PROPN compound
poll NOUN ROOT
tension NOUN appos
bayelsa PROPN pobj
patience PROPN compound
jonathan PROPN nsubj
plan VERB advcl
hijack VERB xcomp
apc PROPN compound
pdp NOUN dobj
plan NOUN appos
lady PROPN pobj
http://t.co/3ejl9lzlch X punct


What the token attributes mean (the cell above prints the Lemma, POS and Dep of each kept token):
* Text: The original word text.
* Lemma: The base form of the word.
* POS: The simple part-of-speech tag.
* Tag: The detailed part-of-speech tag.
* Dep: Syntactic dependency, i.e. the relation between tokens.
* Shape: The word shape – capitalization, punctuation, digits.
* is alpha: Does the token consist only of alphabetic characters?
* is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [4]:
# Ask spaCy to explain a term, e.g. the dependency label "dobj" from the token output
spacy.explain("dobj")

'direct object'

In [5]:
# Render the dependency parse (tokens and the arcs between them)
# for the sentences of the document
sentence_spans = [sentence for sentence in doc.sents]
displacy.render(sentence_spans, style="dep")

# Working with entities

In [6]:
# List every entity recognized in the document:
# its text, start and end character positions, and label
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")

derma 89 94 PERSON


In [7]:
# Visualize the entities: render the whole document with displaCy's "ent" style
displacy.render(doc, style="ent")