In [1]:
# Perform standard imports
import spacy

# Load the small English pipeline (tagger, parser, NER, ...)
nlp = spacy.load("en_core_web_sm")

# Read the whole story into memory.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file could decode differently
# (or fail) on another machine.
# NOTE(review): assumes peterrabbit.txt is UTF-8 encoded - confirm.
with open('peterrabbit.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# 1. Create a Doc object from the file peterrabbit.txt
doc = nlp(text)

In [2]:
# 2. Report, for each token of the third sentence: its text, coarse POS tag,
#    fine-grained tag, and a human-readable explanation of the fine-grained tag.
all_sentences = list(doc.sents)     # materialize the sentence iterator so it can be indexed
third_sentence = all_sentences[2]   # zero-based index 2 -> third sentence
for token in third_sentence:
    print(f"Token: {token.text}, {token.pos_}, {token.tag_}, {spacy.explain(token.tag_)}")

Token: They, PRON, PRP, pronoun, personal
Token: lived, VERB, VBD, verb, past tense
Token: with, ADP, IN, conjunction, subordinating or preposition
Token: their, PRON, PRP$, pronoun, possessive
Token: Mother, PROPN, NNP, noun, proper singular
Token: in, ADP, IN, conjunction, subordinating or preposition
Token: a, DET, DT, determiner
Token: sand, NOUN, NN, noun, singular or mass
Token: -, PUNCT, HYPH, punctuation mark, hyphen
Token: bank, NOUN, NN, noun, singular or mass
Token: ,, PUNCT, ,, punctuation mark, comma
Token: underneath, ADP, IN, conjunction, subordinating or preposition
Token: the, DET, DT, determiner
Token: root, NOUN, NN, noun, singular or mass
Token: of, ADP, IN, conjunction, subordinating or preposition
Token: a, DET, DT, determiner
Token: 
, SPACE, _SP, whitespace
Token: very, ADV, RB, adverb
Token: big, ADJ, JJ, adjective (English), other noun-modifier (Chinese)
Token: fir, NOUN, NN, noun, singular or mass
Token: -, PUNCT, HYPH, punctuation mark, hyphen
Token: tree, N

In [3]:
# 3. Build a frequency table of coarse-grained POS tags over the whole document.
#    Keys are the tag names (token.pos_), values are occurrence counts.
pos_counts = {}

for token in doc:
    if token.pos_ in pos_counts:
        pos_counts[token.pos_] += 1
    else:
        # First time this tag is seen: start its tally at one.
        pos_counts[token.pos_] = 1

print(pos_counts)

{'DET': 90, 'PROPN': 75, 'ADP': 124, 'PUNCT': 172, 'NUM': 8, 'SPACE': 99, 'ADV': 65, 'SCONJ': 20, 'NOUN': 173, 'PRON': 108, 'VERB': 131, 'ADJ': 54, 'CCONJ': 61, 'AUX': 50, 'PART': 28}


In [4]:
# 4. Calculate the percentage of tokens that are nouns
# Only tokens tagged NOUN are counted (PROPN is a separate tag and is excluded).
num_nouns = sum(1 for token in doc if token.pos_ == "NOUN")
# The denominator is every token in the Doc, which includes punctuation and
# whitespace tokens as well as words.
total_tokens = len(doc)
# Guard against an empty document so the cell reports 0% instead of raising
# ZeroDivisionError.
percentage_nouns = (num_nouns / total_tokens) * 100 if total_tokens else 0.0
print(f"Percentage of tokens that are nouns: {percentage_nouns:.2f}%")

Percentage of tokens that are nouns: 13.75%


In [5]:
# 5. Draw the dependency parse of the third sentence inline in the notebook.
#    Relies on `third_sentence` computed in the cell for step 2.
from spacy import displacy

displacy.render(third_sentence, style="dep", jupyter=True)

In [6]:
# 6. Print the first two named entities: surface text, label, and label description.
entities = list(doc.ents[:2])  # slice first, then copy into a list
for ent in entities:
    label = ent.label_
    print(ent.text, label, spacy.explain(label))

The Tale of Peter Rabbit WORK_OF_ART Titles of books, songs, etc.
Beatrix Potter PERSON People, including fictional


In [7]:
# 7. Tally the sentences found by the pipeline's sentence segmentation.
#    doc.sents is consumed lazily here, so no intermediate list is built.
num_sentences = sum(1 for _ in doc.sents)
print(f"Total sentences: {num_sentences}")

Total sentences: 57


In [8]:
# 8. Count the sentences in which at least one token is part of a named entity.
#    A sentence qualifies as soon as any of its tokens has a non-empty ent_type_.
sents_with_ents = [
    sent
    for sent in doc.sents
    if any(token.ent_type_ for token in sent)
]
num_sents_with_ents = len(sents_with_ents)
print(f"Sentences containing named entities: {num_sents_with_ents}")

Sentences containing named entities: 38


In [9]:
spacy.displacy.render(list(doc.sents)[0], style="ent", jupyter=True)