In [None]:
!pip install spacy



# Task 1

In [82]:

import spacy

# Small English pipeline; it must already be installed in this environment
nlp = spacy.load("en_core_web_sm")

# Two-sentence sample that includes a contraction and sentence-final punctuation
text = "The quick brown fox doesn't jump over the lazy dog. Natural Language Processing is fascinating!"

# Run the full pipeline once; every token attribute below is read from this Doc
doc = nlp(text)

# One row per token: surface text, lemma, syntactic head and morphological features
# (text, lemma_, head and morph are token attributes, not function calls)
print("=== Tokenization ===")
for token in doc:
    token_row = f"TEXT: {token.text:<15} LEMMA: {token.lemma_:<15} HEAD: {token.head} MORPH: {token.morph}"
    print(token_row)

# Answers to Questions

# 1) - How does spaCy process the various tokens?
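# spaCy splits the text into tokens: each word and each punctuation mark becomes its own token, and contractions are split into their parts (e.g. "doesn't" -> "does" + "n't").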

# 2) - How does spaCy handle punctuation marks like periods and commas?
# Punctuation marks are treated as separate tokens. You can see them printed as individual tokens with their own features (e.g. "." has MORPH PunctType=Peri).

# 3) - What happens when the text includes contractions?
# spaCy splits contractions into separate tokens: "doesn't" becomes "does" and "n't". Each part gets its own lemma ("do" and "not"), so the negation is kept as a distinct token.

=== Tokenization ===
TEXT: The             LEMMA: the             HEAD: fox MORPH: Definite=Def|PronType=Art
TEXT: quick           LEMMA: quick           HEAD: fox MORPH: Degree=Pos
TEXT: brown           LEMMA: brown           HEAD: fox MORPH: Degree=Pos
TEXT: fox             LEMMA: fox             HEAD: jump MORPH: Number=Sing
TEXT: does            LEMMA: do              HEAD: jump MORPH: Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
TEXT: n't             LEMMA: not             HEAD: jump MORPH: Polarity=Neg
TEXT: jump            LEMMA: jump            HEAD: jump MORPH: VerbForm=Inf
TEXT: over            LEMMA: over            HEAD: jump MORPH: 
TEXT: the             LEMMA: the             HEAD: dog MORPH: Definite=Def|PronType=Art
TEXT: lazy            LEMMA: lazy            HEAD: dog MORPH: Degree=Pos
TEXT: dog             LEMMA: dog             HEAD: over MORPH: Number=Sing
TEXT: .               LEMMA: .               HEAD: jump MORPH: PunctType=Peri
TEXT: Natural         L

# Task 2

In [None]:

# Parse the Task 1 sentence into a dedicated name instead of reading the shared
# `doc` variable: other cells rebind `doc` to different texts, so relying on it
# would tag whichever text happened to be parsed last when cells run out of order.
pos_doc = nlp(text)

# Print each token with its pos_ (coarse part of speech) and tag_ (fine-grained tag) attributes
print("=== Part-of-Speech Tagging ===")
for token in pos_doc:
    print(f'Token: {token.text:<15} POS: {token.pos_:<10} TAG: {token.tag_}')

# Answers to Questions

# 1) - Identify the POS tags for "quick", "jumps", and "is"

# "quick" -> ADJ(adjective)
# "jump" -> VERB(verb)
# "is" -> AUX(auxiliary verb)

# 2) - Why might POS tagging be useful for tasks like grammar checking or machine translation?

# POS tagging helps identify the grammatical role of each word in a sentence
# For grammar checking, it allows us to detect incorrect usage of words
# For machine translation, it helps preserve sentence structure and meaning across languages

=== Part-of-Speech Tagging ===
Token: The             POS: DET        TAG: DT
Token: quick           POS: ADJ        TAG: JJ
Token: brown           POS: ADJ        TAG: JJ
Token: fox             POS: NOUN       TAG: NN
Token: does            POS: AUX        TAG: VBZ
Token: n't             POS: PART       TAG: RB
Token: jump            POS: VERB       TAG: VB
Token: over            POS: ADP        TAG: IN
Token: the             POS: DET        TAG: DT
Token: lazy            POS: ADJ        TAG: JJ
Token: dog             POS: NOUN       TAG: NN
Token: .               POS: PUNCT      TAG: .
Token: Natural         POS: PROPN      TAG: NNP
Token: Language        POS: PROPN      TAG: NNP
Token: Processing      POS: NOUN       TAG: NN
Token: is              POS: AUX        TAG: VBZ
Token: fascinating     POS: ADJ        TAG: JJ
Token: !               POS: PUNCT      TAG: .


# Task 3

In [81]:
# Sentence used for the named-entity demo
ner_text = "Barack Obama was the 44th President of the United States. He was born in Hawaii."

# Run the pipeline on the new sentence (this rebinds `doc`)
doc = nlp(ner_text)

# One row per entity span found by the pipeline: its text and its label_ attribute
print('=== Named Entity Recognition (NER) ====')
for ent in doc.ents:
    entity_row = f'ENTITY: {ent.text:<25} LABEL: {ent.label_}'
    print(entity_row)


# Answers to Questions

# 1) - Which entities are recognized by spaCy?

# "Barack Obama", "44th", "the United States", "Hawaii"

# 2) - What entity types are assigned to "Barack Obama" and "Hawaii"?

# "Barack Obama" -> PERSON
# "Hawaii" -> GPE (Geo-Political Entity)

=== Named Entity Recognition (NER) ====
ENTITY: Barack Obama              LABEL: PERSON
ENTITY: 44th                      LABEL: ORDINAL
ENTITY: the United States         LABEL: GPE
ENTITY: Hawaii                    LABEL: GPE


# Task 4

In [101]:
# Self-written sentence mixing player names and league/team abbreviations
custom_text = "I believe Shai Gigeous-Alander of the OKC is going to win MVP over Nikola Jokic of the NBA!"

# Run the pipeline on the custom sentence (this rebinds `doc`)
doc = nlp(custom_text)

# Part-of-speech report: one row per token
print('=== Tokens and POS tags ===')
for token in doc:
    pos_row = f'{token.text:<15} POS: {token.pos_:<10} TAG: {token.tag_}'
    print(pos_row)

# Named-entity report: one row per entity span, separated from the POS report by blank lines
print('\n\n=== Named Entities ===')
for ent in doc.ents:
    ent_row = f'{ent.text:<25} LABEL: {ent.label_}'
    print(ent_row)

# Answers to Questions

# spaCy recognized Shai as a PERSON, which I wasn't sure it would be capable of doing.
# Even when I drastically misspelled his name, it still recognized him as a person.
# It recognized NBA as an ORG, which is correct, but it also labeled MVP as an ORG, which it isn't.
# Additionally, it recognized OKC, the abbreviation for the Oklahoma City Thunder, which surprised me.


=== Tokens and POS tags ===
I               POS: PRON       TAG: PRP
believe         POS: VERB       TAG: VBP
Shai            POS: PROPN      TAG: NNP
Gigeous         POS: PROPN      TAG: NNP
-               POS: PUNCT      TAG: HYPH
Alander         POS: PROPN      TAG: NNP
of              POS: ADP        TAG: IN
the             POS: DET        TAG: DT
OKC             POS: PROPN      TAG: NNP
is              POS: AUX        TAG: VBZ
going           POS: VERB       TAG: VBG
to              POS: PART       TAG: TO
win             POS: VERB       TAG: VB
MVP             POS: PROPN      TAG: NNP
over            POS: ADP        TAG: IN
Nikola          POS: PROPN      TAG: NNP
Jokic           POS: PROPN      TAG: NNP
of              POS: ADP        TAG: IN
the             POS: DET        TAG: DT
NBA             POS: PROPN      TAG: NNP
!               POS: PUNCT      TAG: .


=== Named Entities ===
Shai Gigeous-Alander      LABEL: PERSON
OKC                       LABEL: ORG
MVP              