# spaCy NLP Tasks Assignment

In [None]:
# Import spaCy and load the English model
import spacy

# Load spaCy's small English pipeline once; the task cells below all call `nlp`.
nlp = spacy.load(name="en_core_web_sm")

## Task 1: Tokenization

In [None]:
print("---- Task 1: Tokenization ----")
# Sample text with a contraction and sentence-final punctuation so the
# tokenizer's handling of both is visible in the output.
text1 = (
    "The quick brown fox doesn't jump over the lazy dog. "
    "Natural Language Processing is fascinating!"
)
doc1 = nlp(text1)

# One line per token: surface form, syntactic head, lemma, morphology.
for token in doc1:
    print(
        "Token: {}, Head: {}, Lemma: {}, Morph: {}".format(
            token.text, token.head.text, token.lemma_, token.morph
        )
    )

# Questions answered in comments:
# 1. spaCy splits the text into tokens, treating each word and each punctuation mark as its own token. For every token, .text gives the token string, .head its syntactic head, .lemma_ its base form, and .morph its grammatical (morphological) features.
# 2. Punctuation marks such as periods and exclamation marks are treated as separate tokens. For example, the "." after "dog" and the final "!" are each a token on their own.
# 3. Contractions like "doesn't" are split into two separate tokens, "does" and "n't", so each part is printed on its own line with its own head, lemma, and morphology.


## Task 2: Part-of-Speech Tagging

In [None]:
print("\n---- Task 2: Part-of-Speech Tagging ----")
# Reuses doc1 from Task 1; prints the coarse POS and fine-grained tag per token.
for token in doc1:
    print(
        f"Token: {token.text}",
        f"POS: {token.pos_}",
        f"Tag: {token.tag_}",
        sep=", ",
    )

# Questions answered in comments:
# POS tags:
# - "quick": ADJ (adjective)
# - "jump": VERB
# - "is": AUX (auxiliary verb)
# POS tagging helps in understanding the grammatical structure of a sentence.
# This is useful in grammar checking to detect incorrect usage and in machine translation to understand context-sensitive word forms.


## Task 3: Named Entity Recognition (NER)

In [None]:
print("\n---- Task 3: Named Entity Recognition ----")
text2 = (
    "Barack Obama was the 44th President of the United States. "
    "He was born in Hawaii."
)
doc2 = nlp(text2)

# List every entity span the pipeline found together with its label.
for ent in doc2.ents:
    print("Entity: %s, Label: %s" % (ent.text, ent.label_))

# Questions answered in comments:
# Recognized entities: "Barack Obama", "44th", "President", "United States", "Hawaii"
# - "Barack Obama": PERSON
# - "Hawaii": GPE (Geo-Political Entity — e.g., countries, cities, states)


## Task 4: Experimentation

In [None]:
print("\n---- Task 4: Experimentation ----")
# Custom sentence with deliberate misspellings to see how the pipeline reacts.
my_text = "Tim Apple launched a new iPhon in Sillicon Valley. It costed $999 dollars."
doc3 = nlp(my_text)

# Token text followed by its coarse POS tag in parentheses.
print("\nTokens and POS:")
for token in doc3:
    print(token.text, f"({token.pos_})")

# Entity text followed by its label in parentheses.
print("\nNamed Entities:")
for ent in doc3.ents:
    print("{} ({})".format(ent.text, ent.label_))

# Questions answered in comments:
# - spaCy does not correct small typos: "iPhon" is not matched to "iPhone" and "Sillicon" is not matched to "Silicon", so neither is recognized as a known entity.
# - "Tim Apple" is identified as a PERSON (though it's a joke name), "$999" is correctly identified as MONEY.
# - This shows that spaCy’s entity recognition is sensitive to spelling and context, and incorrect spelling can prevent entity recognition.
