In [None]:
# Natural Language Processing (NLP) is concerned with the interactions between computers and human (natural) languages.
# It focuses on how computers can process and analyze large amounts of natural language data.
# Unlike numerical data, which computers excel at handling, text data is often highly unstructured.
# NLP uses various techniques to create structure from raw text data.
# Example applications include:
# - Classifying emails as spam or legitimate
# - Performing sentiment analysis on reviews
# - Understanding and processing text commands

In [None]:
# SpaCy: Open Source NLP Library
# Handles NLP tasks with the most efficient implementation of common algorithms.
# Uses a single implementation method (the most efficient currently available).
# Better performance in most common NLP tasks.
# Users cannot choose the algorithmic implementation (defaults to the most efficient).
# Some applications are not included (such as sentiment analysis).

# NLTK: Popular Open Source NLP Library
# Its implementations are generally less efficient than SpaCy's.
# NLTK is easier to use for certain tasks.

In [None]:
# Summary of Keywords:
# - `import spacy`: Imports the SpaCy library for NLP tasks.
# - `nlp`: A SpaCy model object used to process raw text data (e.g., `nlp = spacy.load('en_core_web_sm')`).
# - `spacy.load('en_core_web_sm')`: Loads the English language model for SpaCy.
# - `doc`: A processed SpaCy document object that contains tokens and their linguistic annotations (e.g., `doc = nlp("Tesla is looking at buying U.S. startup for $6 million")`).
# - `token.text`: Accesses the text of a token in a SpaCy document.
# - `token.pos_`: Accesses the part-of-speech tag of a token.
# - `token.dep_`: Accesses the syntactic dependency label of a token.
# - `token.lemma_`: Accesses the lemma (base form) of a token.
# - `doc.sents`: An iterator over sentence spans in the document.
# - `entity`: Refers to a named entity in the text.
# - `entity.label_`: Accesses the label (type) of a named entity.
# - `spacy.explain(entity.label_)`: Provides a human-readable explanation of the entity label.
# - `for token in doc`: Iterates over each token in the SpaCy document.
# - `for entity in doc.ents`: Iterates over each named entity in the SpaCy document.


In [None]:
# Install SpaCy
!pip install spacy

# Download the English language model
!python -m spacy download en_core_web_sm

In [None]:
# Calling nlp() on raw text runs it through SpaCy's processing pipeline: tokenization, part-of-speech tagging, dependency parsing, and more.
# The result is a processed document (a Doc object).


In [21]:
import spacy

# Load the SpaCy model for English; `nlp` is reused by every later cell.
nlp = spacy.load('en_core_web_sm')

# Process raw text data.
# nlp() performs NLP tasks such as tokenization, tagging, and parsing,
# and returns the processed document that is stored in `doc`.
doc = nlp("Tesla is looking at buying U.S. startup for $6 million")


In [22]:
# For every token, show its text, part-of-speech tag, syntactic dependency label, and lemma.
for token in doc:
    print(" ".join((token.text, token.pos_, token.dep_, token.lemma_)))

Tesla PROPN nsubj Tesla
is AUX aux be
looking VERB ROOT look
at ADP prep at
buying VERB pcomp buy
U.S. PROPN compound U.S.
startup NOUN dobj startup
for ADP prep for
$ SYM quantmod $
6 NUM compound 6
million NUM pobj million


In [11]:
# Show the processing pipeline as a list of (name, component) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7b86d5ccef80>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7b86d5ccff40>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7b86d9d04a50>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7b86d5aefe40>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7b86d5aef240>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7b86d9d07d10>)]

In [12]:
# Show only the names of the pipeline components.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [18]:
# The run of extra spaces is kept as its own token (tagged SPACE in the listing that follows).
doc2 = nlp("Tesla isn't     looking for startups anymore.")

In [19]:
# Text, part-of-speech tag, and dependency label for each token of doc2.
for tok in doc2:
    fields = (tok.text, tok.pos_, tok.dep_)
    print(*fields)

Tesla PROPN nsubj
is AUX aux
n't PART neg
     SPACE dep
looking VERB ROOT
for ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [20]:
# A Doc supports integer indexing: grab the first token and read its part-of-speech tag.
doc2[0].pos_

'PROPN'

In [32]:
# A multi-sentence text for exploring sentence boundaries.
doc3 = nlp("This is the first setnence. This is another sentence. This is the last sentence")

In [35]:
# Iterate through the sentences in the doc object and print each sentence
for sentence in doc.sents:
    print(sentence)

Tesla is looking at buying U.S. startup for $6 million


In [34]:
# is_sent_start reports whether the token at this index begins a sentence.
# Index 7 is presumably the "is" of the second sentence, consistent with the False output; TODO confirm.
doc3[7].is_sent_start

False

Tokenization

In [37]:
# Tokenization is the process of breaking up the original text into component pieces (tokens).
# Tokens are the basic building blocks of the doc object.
# Everything that helps understand the meaning of the text is derived from the tokens and their relationships.
# Prefix examples: $, (, "
# Suffix examples: km, ), ,, ., !
# Infix examples: -, --, /, ...
# Exceptions: special-case rules that either split a string into several tokens (e.g., "let's" -> "let", "'s") or keep it as one token where punctuation rules would otherwise split it (e.g., "U.S.").

In [39]:
# Single-quoted Python string that itself contains double quotes; the apostrophe in "We're" is backslash-escaped.
mystring = '"We\'re moving to L.A.!"'

In [40]:
# Echo the string's repr: the backslash escape is still visible.
mystring

'"We\'re moving to L.A.!"'

In [41]:
# print() shows the string as plain text, without the backslash escape.
print(mystring)

"We're moving to L.A.!"


In [42]:
# Run the quoted string through the pipeline to see how its punctuation is tokenized.
doc4 = nlp(mystring)

In [43]:
# One token per line: quotes, the contraction suffix, and "!" come out as separate tokens.
print(*(tok.text for tok in doc4), sep="\n")

"
We
're
moving
to
L.A.
!
"


In [47]:
# Text mixing a contraction, a hyphenated word, an email address, and a URL.
doc5 = nlp("We're here to help! send snail-mail or email nawarturk11@gmail.com or visit www.github.com/nawarturk")

In [48]:
# Collect the token texts first, then print them one per line.
token_texts = [tok.text for tok in doc5]
print("\n".join(token_texts))

We
're
here
to
help
!
send
snail
-
mail
or
email
nawarturk11@gmail.com
or
visit
www.github.com/nawarturk


In [56]:
# Text with abbreviations that contain periods ("St.", "U.S.") plus a contraction.
doc6 = nlp("let's visit St. Louis in the U.S. next year.")

In [57]:
# Walk the document by position and print the text of the token found there.
for position in range(len(doc6)):
    print(doc6[position].text)

let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [58]:
# The length of a Doc is its number of tokens.
len(doc6)

11

In [61]:
# Tokens in a SpaCy Doc cannot be reassigned: a statement such as doc5[5] = 'test' raises an error.

In [69]:
# Sentence containing an organization, a place, and a monetary amount, used for the named-entity demo.
doc7 = nlp('Apple to build a Hong Kong factory for $6 million')

In [64]:
# Print every token followed by a "|" separator, all on one line with no trailing newline.
print("".join(f"{tok.text}|" for tok in doc7), end="")

Apple|to|build|a|Hong|Kong|factory|for|$|6|million|

In [72]:
# Recognize and print the named entities of doc7.
# For each entity: its text, its label, a human-readable explanation of the label,
# then blank lines as a separator.
for ent in doc7.ents:
    description = str(spacy.explain(ent.label_))
    print(ent, ent.label_, description, "\n", sep="\n")


Apple
ORG
Companies, agencies, institutions, etc.


Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [73]:
# Sentence used to demonstrate noun chunks.
doc8 = nlp('Autonomous cars shift insurance liability toward manufacturers.')

In [74]:
# Print each noun chunk of doc8 on its own line.
print(*doc8.noun_chunks, sep="\n")

Autonomous cars
insurance liability
manufacturers


In [None]:
# In the sentence "Tesla is looking at buying Hong Kong startup for $6 million":
# - Tokens are individual words or punctuation marks, e.g., "Tesla", "is", "looking", "at", "buying", "Hong", "Kong", "startup", "for", "$", "6", "million".
# - Entities are specific named objects, e.g., "Tesla" (an organization) and "Hong Kong" (a geopolitical entity).
# - Noun chunks (`doc.noun_chunks`) are phrases built around a noun together with the words describing it, e.g., "Tesla", "Hong Kong startup", and "$6 million".
