In [None]:
! pip install spacy



In [None]:
import spacy
# Load the `en_core_web_sm` pipeline by name and keep it as `nlp` for later cells.
nlp = spacy.load("en_core_web_sm")
# Bare expression: the notebook displays the pipeline object's repr.
nlp

<spacy.lang.en.English at 0x78831b86e650>

In [None]:
# Run the pipeline over a short sample sentence.
introduction_doc = nlp("This tutorial is about Natural Language Processing in speech")
# Display the type of the object returned by `nlp(...)`.
type(introduction_doc)

spacy.tokens.doc.Doc

In [None]:
# List the text of every token in the processed document.
[token.text for token in introduction_doc]

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'speech']

In [None]:
import pathlib

file_name = "Introduction.txt"
# Read the file as UTF-8 and run its full contents through the pipeline.
introduction_doc = nlp(
    pathlib.Path(file_name).read_text(encoding="utf-8")
)
# Show the text of every token as a single list.
print([token.text for token in introduction_doc])

['Walter', 'White', 'is', 'a', 'skilled', 'chemist', 'who', 'co', '-', 'founded', 'a', 'technology', 'firm', 'before', 'he', 'accepted', 'a', 'buy', '-', 'out', 'from', 'his', 'partners', '.', 'While', 'his', 'partners', 'became', 'wealthy', ',', 'Walter', 'became', 'a', 'high', 'school', 'chemistry', 'teacher', 'in', 'Albuquerque', ',', 'barely', 'making', 'ends', 'meet', 'with', 'his', 'family', ':', 'his', 'wife', 'Skyler', '(', 'Anna', 'Gunn', ')', 'and', 'son', 'Walter', 'Jr.', '(', 'RJ', 'Mitte', ')', '.', 'At', 'the', 'start', 'of', 'the', 'series', ',', 'the', 'day', 'after', 'his', '50th', 'birthday', ',', 'White', 'is', 'diagnosed', 'with', 'Stage', 'III', 'lung', 'cancer', '.', 'After', 'this', 'discovery', ',', 'White', 'decides', 'to', 'manufacture', 'and', 'sell', 'methamphetamine', 'with', 'a', 'former', 'student', ',', 'Jesse', 'Pinkman', '(', 'Aaron', 'Paul', ')', ',', 'to', 'ensure', 'his', 'family', "'s", 'financial', 'security', 'after', 'his', 'death', '.', 'Due', 

In [None]:
# Keep the raw string rather than the Doc itself: `about_text` is passed to
# `nlp(...)` in the following cell, and binding the Doc here would feed an
# already-processed document back into the pipeline under a misleading name.
about_text = introduction_doc.text

In [None]:
# Process `about_text` with the pipeline.
about_doc = nlp(about_text)
# Materialise the sentence spans so they can be counted and reused.
sentences = list(about_doc.sents)
# Display how many sentences were detected.
len(sentences)

5

In [None]:
# Preview each sentence by printing its first five tokens followed by " ... ".
for sentence in sentences:
  print(f"{sentence[:5]} ... ")

Walter White is a skilled ... 
While his partners became wealthy ... 
At the start of the ... 
After this discovery, White ... 
Due to his expertise, ... 


In [None]:
# Sample text containing mid-sentence ellipses; the adjacent string literals
# are concatenated by Python into a single string.
ellipsis_text = (
"Gus Fring, can you, ... never mind, I forgot"
" what I was saying. So, do you think"
" we should ... cook?" )

In [None]:
from spacy.language import Language


@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
  """Mark the token that follows each ellipsis as the start of a sentence.

  Args:
    doc: The document being processed by the pipeline.

  Returns:
    The same document, with `is_sent_start` set to True on every token that
    directly follows a "..." token.
  """
  # The last token is skipped because it has no following token to flag.
  for token in doc[:-1]:
    # Compare against the bare ellipsis: a token's text does not include the
    # spaces around it, so a padded " ... " would never match anything.
    if token.text == "...":
      doc[token.i + 1].is_sent_start = True
  return doc

In [None]:
# Load a second pipeline instance, separate from `nlp`.
custom_nlp = spacy.load("en_core_web_sm")
# Insert the registered "set_custom_boundaries" component ahead of the parser.
custom_nlp.add_pipe("set_custom_boundaries", before="parser")

In [None]:
# Process the sample text with the pipeline held in `custom_nlp`.
custom_ellipsis_doc = custom_nlp(ellipsis_text)
# Materialise the sentence spans once (the assignment was previously repeated).
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
# Print one detected sentence per line.
for sentence in custom_ellipsis_sentences:
  print(sentence)

Gus Fring, can you, ... never mind, I forgot what I was saying.
So, do you think we should ... cook?


In [None]:
import spacy
# NOTE(review): this rebinds `nlp`, but the loop below reads `about_doc`, which was built in an earlier cell.
nlp = spacy. load("en_core_web_sm")
# Print each token followed by its `idx` value.
for token in about_doc:
  print (token, token.idx)

Walter 0
White 7
is 13
a 16
skilled 18
chemist 26
who 34
co 38
- 40
founded 41
a 49
technology 51
firm 62
before 67
he 74
accepted 77
a 86
buy 88
- 91
out 92
from 96
his 101
partners 105
. 113
While 115
his 121
partners 125
became 134
wealthy 141
, 148
Walter 150
became 157
a 164
high 166
school 171
chemistry 178
teacher 188
in 196
Albuquerque 199
, 210
barely 212
making 219
ends 226
meet 231
with 236
his 241
family 245
: 251
his 253
wife 257
Skyler 262
( 269
Anna 270
Gunn 275
) 279
and 281
son 285
Walter 289
Jr. 296
( 300
RJ 301
Mitte 304
) 309
. 310
At 312
the 315
start 319
of 325
the 328
series 332
, 338
the 340
day 344
after 348
his 354
50th 358
birthday 363
, 371
White 373
is 379
diagnosed 382
with 392
Stage 397
III 403
lung 407
cancer 412
. 418
After 420
this 426
discovery 431
, 440
White 442
decides 448
to 456
manufacture 459
and 471
sell 475
methamphetamine 480
with 496
a 501
former 503
student 510
, 517
Jesse 519
Pinkman 525
( 533
Aaron 534
Paul 540
) 544
, 545
to 547
ensure 5

In [None]:
# Header for the token-attribute table. Column widths are 22, 15 and 18.
# The second label is "Is Alphabetic?" because it fits the 15-character column
# (the earlier 16-character label overflowed and pushed the later headers one
# column to the right) and because the column reports `token.is_alpha`.
header_row = (
    f'{"Text with Whitespace":22}'
    f'{"Is Alphabetic?":15}'
    f'{"Is Punctuation?":18}'
    f'{"Is Stop Word?"}'
)
print(header_row)

Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?


In [None]:
# One table row per token: text (with trailing whitespace), then the
# is_alpha / is_punct / is_stop flags, padded to widths 22, 15 and 18.
for token in about_doc:
    print(
        f"{token.text_with_ws!s:22}",
        f"{token.is_alpha!s:15}",
        f"{token.is_punct!s:18}",
        f"{token.is_stop!s}",
        sep="",
    )

Walter                True           False             False
White                 True           False             False
is                    True           False             True
a                     True           False             True
skilled               True           False             False
chemist               True           False             False
who                   True           False             True
co                    True           False             False
-                     False          True              False
founded               True           False             False
a                     True           False             True
technology            True           False             False
firm                  True           False             False
before                True           False             True
he                    True           False             True
accepted              True           False             False
a                     True    

In [None]:
# Collect the text of tokens 8 through 14 (the slice end, 15, is exclusive).
tokens_to_print = [token.text for token in about_doc[8:15]]
print(tokens_to_print)

['-', 'founded', 'a', 'technology', 'firm', 'before', 'he']


In [None]:
import re
import spacy
from spacy.tokenizer import Tokenizer

# Pipeline whose tokenizer is replaced below. This rebinds `custom_nlp`.
custom_nlp = spacy.load("en_core_web_sm")

# Reuse the default prefix and suffix rules unchanged.
prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)

# Add "@" to the default infix patterns so that "London@based" is split.
custom_infixes = [r"@"]
infix_re = spacy.util.compile_infix_regex(
    list(custom_nlp.Defaults.infixes) + custom_infixes
)

# Install a tokenizer built from the compiled prefix/suffix/infix regexes.
custom_nlp.tokenizer = Tokenizer(
    custom_nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None,
)

# Sample text containing the "@" infix; adjacent literals are concatenated.
custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London@based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
custom_tokenizer_about_doc = custom_nlp(custom_about_text)
# Show tokens 8 through 14, the slice that spans "London@based".
print([token.text for token in custom_tokenizer_about_doc[8:15]])

In [None]:
# NOTE(review): absolute path; an earlier cell reads "Introduction.txt" by relative path — confirm both refer to the same file.
file_name = "/content/Introduction.txt"
# Uses `pathlib` and `nlp` bound in earlier cells; reads the file as UTF-8 and processes it.
introduction_doc = nlp(pathlib.Path(file_name).read_text(encoding="utf-8"))
# Print the text of every token as one list.
print ([token. text for token in introduction_doc])

['Walter', 'White', 'is', 'a', 'skilled', 'chemist', 'who', 'co', '-', 'founded', 'a', 'technology', 'firm', 'before', 'he', 'accepted', 'a', 'buy', '-', 'out', 'from', 'his', 'partners', '.', 'While', 'his', 'partners', 'became', 'wealthy', ',', 'Walter', 'became', 'a', 'high', 'school', 'chemistry', 'teacher', 'in', 'Albuquerque', ',', 'barely', 'making', 'ends', 'meet', 'with', 'his', 'family', ':', 'his', 'wife', 'Skyler', '(', 'Anna', 'Gunn', ')', 'and', 'son', 'Walter', 'Jr.', '(', 'RJ', 'Mitte', ')', '.', 'At', 'the', 'start', 'of', 'the', 'series', ',', 'the', 'day', 'after', 'his', '50th', 'birthday', ',', 'White', 'is', 'diagnosed', 'with', 'Stage', 'III', 'lung', 'cancer', '.', 'After', 'this', 'discovery', ',', 'White', 'decides', 'to', 'manufacture', 'and', 'sell', 'methamphetamine', 'with', 'a', 'former', 'student', ',', 'Jesse', 'Pinkman', '(', 'Aaron', 'Paul', ')', ',', 'to', 'ensure', 'his', 'family', "'s", 'financial', 'security', 'after', 'his', 'death', '.', 'Due', 

In [None]:
import string
from collections import Counter

# Characters a word may consist of: ASCII letters plus ASCII punctuation.
# Built once here instead of being re-concatenated for every character checked.
_VALID_WORD_CHARS = frozenset(string.ascii_letters + string.punctuation)


def is_valid_word(word, max_length=15):
    """Return True if `word` is short enough and uses only allowed characters.

    Args:
        word: The candidate string.
        max_length: Longest accepted length, inclusive. Defaults to 15.

    Returns:
        True when `len(word) <= max_length` and every character of `word` is
        an ASCII letter or an ASCII punctuation character; otherwise False.
    """
    # The length test is the cheap one, so it runs first.
    return len(word) <= max_length and all(c in _VALID_WORD_CHARS for c in word)

# Gather the text of every token that is neither a stop word nor punctuation
# and that `is_valid_word` accepts.
words = []
for token in introduction_doc:
    if token.is_stop or token.is_punct:
        continue
    if is_valid_word(token.text):
        words.append(token.text)

# Show the 50 most frequent entries together with their counts.
print(Counter(words).most_common(50))


[('Jesse', 746), ('Walt', 389), ('Saul', 116), ('meth', 81), ('Gus', 71), ('Todd', 71), ('Jane', 60), ('money', 56), ('like', 52), ('Hank', 50), ('Walter', 49), ('know', 46), ('Mike', 42), ('drug', 40), ('Tuco', 39), ('cook', 35), ('car', 32), ('RV', 31), ('kill', 31), ('tells', 30), ('time', 29), ('Jack', 29), ('help', 28), ('death', 28), ('mean', 28), ('want', 27), ('later', 27), ('lab', 26), ('White', 25), ('Badger', 25), ('gun', 25), ('house', 24), ('got', 24), ('man', 23), ('way', 23), ('says', 23), ('Brock', 23), ('asks', 22), ('right', 21), ('good', 20), ('El', 20), ('door', 20), ('meeting', 20), ('new', 20), ('apartment', 20), ('Andrea', 20), ('Neil', 20), ('Camino', 19), ('Emilio', 18), ('DEA', 18)]


In [None]:
import pathlib
import spacy
from collections import Counter
import string

def is_valid_word(word, max_length=15):
    """Return True if `word` has at most `max_length` characters.

    Only the length is checked; the characters of `word` are not inspected.
    NOTE: this redefinition replaces any `is_valid_word` bound earlier in the
    same kernel session.

    Args:
        word: The candidate string.
        max_length: Longest accepted length, inclusive. Defaults to 15.

    Returns:
        True when `len(word) <= max_length`, otherwise False.
    """
    return len(word) <= max_length

# Load the spaCy model (ensure you have a model like 'en_core_web_sm' installed)
nlp = spacy.load("en_core_web_sm")

# Read the whole file as UTF-8 text
file_name = "goodguys.txt"
text = pathlib.Path(file_name).read_text(encoding="utf-8")

# Split the text into fixed-size chunks so each pipeline call sees at most
# `chunk_size` characters.
# NOTE(review): presumably sized to stay within spaCy's default `nlp.max_length` — confirm.
# NOTE(review): cuts fall at arbitrary character offsets, so a word straddling
# a chunk edge is tokenized as two pieces.
chunk_size = 1000000  # 1 million characters per chunk
chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# Process each chunk and collect lower-cased words
words = []
for chunk in chunks:
    doc = nlp(chunk)
    words.extend(
        token.text.lower()  # Convert the word to lowercase
        for token in doc
        # Skip stop words, punctuation and whitespace-only tokens; without the
        # `is_space` test, newline runs such as "\n" were counted as words.
        if not (token.is_stop or token.is_punct or token.is_space)
        and is_valid_word(token.text)
    )

# Print the 50 most common valid words
print(Counter(words).most_common(50))


[('\n', 22331), ('shree', 5567), ('prasad', 5502), ('>', 4457), ('<', 4361), ('media', 4097), ('omitted', 4095), ('neelraj', 3679), ('avv', 3527), ('reddy', 3506), ('tharun', 2571), ('gk', 2535), ('aie', 2163), ('pradeep', 2038), ('kumar', 1928), ('7/3/24', 1701), ('guru', 1378), ('big', 1358), ('fish', 1335), ('🐠', 1331), ('shyam', 864), ('praveen', 860), ('da', 794), ('mahendra', 768), ('sudhesh', 706), ('jeevan', 678), ('\n\n', 660), ('tamil', 640), ('1', 572), ('5/18/24', 481), ('5/22/24', 480), ('=', 455), ('2', 450), ('la', 432), ('5/30/23', 423), ('6/1/23', 423), ('nikhil', 421), ('5/25/23', 421), ('krishna', 415), ('5/26/23', 408), ('message', 396), ('6/9/23', 383), ('7/20/23', 373), ('6/2/23', 369), ('v', 368), ('g', 365), ('5/31/23', 362), ('6/29/24', 345), ('ah', 344), ('21:00', 343)]


In [None]:
# Print every distinct entry in `words` with its count, most frequent first (no limit is passed).
print(Counter(words).most_common())



In [None]:
import spacy

# Pre-trained English pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over one example sentence.
text = "SpaCy is a popular library for natural language processing."
doc = nlp(text)

# One line per token: its text, a colon, then its `pos_` tag.
for token in doc:
    print(token.text, token.pos_, sep=": ")


SpaCy: PROPN
is: AUX
a: DET
popular: ADJ
library: NOUN
for: ADP
natural: ADJ
language: NOUN
processing: NOUN
.: PUNCT


In [None]:
import spacy
from spacy import displacy

# Load the pre-trained spaCy model
nlp = spacy.load('en_core_web_sm')

# Sample text
text = "SpaCy is a popular library for natural language processing."

# Process the text
doc = nlp(text)

# Visualize dependency parsing (using displaCy); the rendering is shown as the cell's output.
displacy.render(doc, style='dep', jupyter=True)  # Use `jupyter=True` for Jupyter notebooks, False for other environments

# Alternative call for entity visualization, left disabled:
# displacy.render(doc, style='ent', jupyter=True)


In [None]:
import spacy

# Pre-trained English pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over one example sentence.
text = "SpaCy is a great library for Natural Language Processing!"
doc = nlp(text)

# Keep the lower-cased text of every token that is neither a stop word nor
# punctuation.
tokens = []
for token in doc:
    if token.is_stop or token.is_punct:
        continue
    tokens.append(token.text.lower())

# Show the surviving tokens.
print(tokens)


['spacy', 'great', 'library', 'natural', 'language', 'processing']


In [None]:
import spacy
from spacy.matcher import Matcher

# Load the pre-trained spaCy model
nlp = spacy.load('en_core_web_sm')

# Create a Matcher instance bound to the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

# Pattern: "spacy", then any run of non-punctuation tokens, then "library"
# (both words matched case-insensitively via LOWER).
# The earlier pattern required exactly one punctuation token directly between
# the two words, which the sample sentence below never contains, so the cell
# printed nothing.
pattern = [
    {"LOWER": "spacy"},
    {"IS_PUNCT": False, "OP": "*"},
    {"LOWER": "library"},
]

# Add the pattern to the matcher
matcher.add("SPACY_LIBRARY_PATTERN", [pattern])

# Sample text
text = "spacy is a great library, especially for natural language processing."

# Process the text
doc = nlp(text)

# Apply matcher to the doc
matches = matcher(doc)

# Print matching results; each match is a (match_id, start, end) triple of token indices
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Matched span: '{span.text}'")


In [None]:
import spacy

# Pre-trained English pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over one example sentence.
text = "SpaCy is an open-source library."
doc = nlp(text)

# One line per token: token text, its dependency label, and its head's text.
for token in doc:
    print(token.text, token.dep_, token.head.text, sep=" -> ")


SpaCy -> nsubj -> is
is -> ROOT -> is
an -> det -> library
open -> amod -> source
- -> punct -> source
source -> compound -> library
library -> attr -> is
. -> punct -> is


In [None]:
import spacy

# Pre-trained English pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over one example sentence.
text = "Me and my friends went to New York to meet Jeffrey, who is a famous person"
doc = nlp(text)

# One line per recognised entity: its text followed by its label in parentheses.
for ent in doc.ents:
    print("{} ({})".format(ent.text, ent.label_))


New York (GPE)
Jeffrey (PERSON)
