# Preparation

In [1]:
# Read the training set from disk into a dataframe
from pandas import read_csv

df = read_csv(filepath_or_buffer="data/train.csv")

In [2]:
# Bring in spaCy together with its displaCy visualizer
import spacy
from spacy import displacy

# Instantiate the English pipeline that every later cell uses as `nlp`
nlp = spacy.load(name="en_core_web_sm")

# Working with tokens

In [3]:
# Run the pipeline on the text of one randomly drawn dataframe row.
# No seed is set, so a different row is picked on every execution.
doc = nlp(df.sample().iloc[0]["text"])

# Print the linguistic attributes of every token, one token per line
for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )



Udhampur Udhampur PROPN NNP amod Xxxxx True False
terror terror NOUN NN compound xxxx True False
attack attack NOUN NN ROOT xxxx True False
: : PUNCT : punct : False False
Militants militant NOUN NNS compound Xxxxx True False
attack attack VERB VBP nsubj xxxx True False
police police NOUN NNS compound xxxx True False
post post NOUN NN dobj xxxx True False
2 2 NUM CD nummod d False False
SPOs spo NOUN NNS nsubj XXXx True False
injured injure VERB VBN ROOT xxxx True False
: : PUNCT : punct : False False
Suspected suspect VERB VBN amod Xxxxx True False
militants militant NOUN NNS nsubj xxxx True False
tonight tonight NOUN NN npadvmod xxxx True False
attacked attack VERB VBD ROOT xxxx True False
a a DET DT det x True True
p p NOUN NN dobj x True False
... ... PUNCT : punct ... False False
http://t.co/Cwm0ULqu3E http://t.co/Cwm0ULqu3E PROPN NNP ROOT xxxx://x.xx/XxxdXXxxdX False False


What the columns of the output above mean (in the order they are printed):
* Text: The original word text.
* Lemma: The base form of the word.
* POS: The simple part-of-speech tag.
* Tag: The detailed part-of-speech tag.
* Dep: Syntactic dependency, i.e. the relation between tokens.
* Shape: The word shape – capitalization, punctuation, digits.
* is alpha: Does the token consist only of alphabetic characters?
* is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [4]:
# Ask spaCy for a human-readable description of a tag or label,
# here the dependency label "dobj" that appears in the token output above.
# The returned string is shown as the cell's output.
spacy.explain("dobj")

'direct object'

In [5]:
# Collect the document's sentences so that each one is drawn separately
sentence_spans = [*doc.sents]

# Render the tokens and their dependency arcs with displaCy
displacy.render(sentence_spans, style="dep")

# Working with entities

In [6]:
# List every named entity found in the document:
# its text, start/end character offsets, and entity label
for ent in doc.ents:
    print(
        ent.text,
        ent.start_char,
        ent.end_char,
        ent.label_,
    )

Udhampur 0 8 GPE
2 53 54 CARDINAL
tonight 89 96 TIME
http://t.co/Cwm0ULqu3E 113 135 ORG


In [7]:
# Visualize the entities: render the whole document with displaCy's
# entity ("ent") style instead of the dependency ("dep") style used earlier.
displacy.render(doc, style="ent")