In [4]:
import re
import spacy
import nltk
from spacy.matcher import Matcher
from nltk.corpus import stopwords

**Tokenization and Text Processing:**

In [10]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import logging

# Raise the threshold of the 'nltk' logger to CRITICAL so that any
# lower-severity records sent to it are dropped.
nltk_logger = logging.getLogger('nltk')
nltk_logger.setLevel(logging.CRITICAL)

# Fetch the 'punkt' resource used by the tokenizers below; quiet=True keeps
# the download call from printing progress messages.
nltk.download('punkt', quiet=True)

# Load the whole sample document into memory as a single string.
with open('biblio.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split the same text two ways: into word-level tokens and into sentences.
tokens = word_tokenize(text)
sentences = sent_tokenize(text)

# Report both results (word tokens first, then sentences).
print("Word Tokens:", tokens)
print("Sentences:", sentences)


Word Tokens: ['This', 'is', 'a', 'sample', 'text', 'file', 'for', 'testing', 'NLTK', 'and', 'spaCy', '.', 'It', 'contains', 'multiple', 'sentences', 'and', 'various', 'linguistic', 'elements', '.', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'an', 'exciting', 'field', 'in', 'computer', 'science', '.', 'Named', 'entities', 'like', 'Apple', 'Inc.', 'and', 'New', 'York', 'may', 'appear', 'in', 'the', 'text', '.', 'The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.', 'Dependency', 'parsing', 'helps', 'in', 'understanding', 'the', 'grammatical', 'structure', 'of', 'sentences', '.', 'Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'individual', 'words', 'or', 'tokens', '.', 'Part-of-speech', 'tagging', 'assigns', 'grammatical', 'categories', 'to', 'each', 'token', '.', 'Word', 'embeddings', 'capture', 'the', 'semantic', 'meaning', 'of', 'words', '.', 'Feel', 'free', 'to', 'modify', 'this', 'text', 'or', 'create', 'your',

**Part-of-Speech Tagging:**

In [12]:
# The tagger model has to be available before nltk.pos_tag can be called;
# quiet=True suppresses the download progress messages.
nltk.download(
    'averaged_perceptron_tagger',
    quiet=True,
)

# Tag every word token produced by the tokenization cell; the result is a
# list of (token, tag) pairs.
pos_tags = nltk.pos_tag(tokens)

print("POS Tags:", pos_tags)

POS Tags: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('text', 'NN'), ('file', 'NN'), ('for', 'IN'), ('testing', 'VBG'), ('NLTK', 'NNP'), ('and', 'CC'), ('spaCy', 'NN'), ('.', '.'), ('It', 'PRP'), ('contains', 'VBZ'), ('multiple', 'JJ'), ('sentences', 'NNS'), ('and', 'CC'), ('various', 'JJ'), ('linguistic', 'JJ'), ('elements', 'NNS'), ('.', '.'), ('Natural', 'JJ'), ('Language', 'NN'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('an', 'DT'), ('exciting', 'JJ'), ('field', 'NN'), ('in', 'IN'), ('computer', 'NN'), ('science', 'NN'), ('.', '.'), ('Named', 'VBN'), ('entities', 'NNS'), ('like', 'IN'), ('Apple', 'NNP'), ('Inc.', 'NNP'), ('and', 'CC'), ('New', 'NNP'), ('York', 'NNP'), ('may', 'MD'), ('appear', 'VB'), ('in', 'IN'), ('the', 'DT'), ('text', 'NN'), ('.', '.'), ('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('Dependency',

**spaCy Examples with Text File:**
### Tokenization and Part-of-Speech Tagging:

In [17]:
import spacy
from itertools import islice

# Read the sample document into a single string.
with open('biblio.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Load the "en_core_web_lg" pipeline and run it over the full text; the
# resulting Doc holds the tokens together with their annotations.
nlp = spacy.load("en_core_web_lg")
doc = nlp(text)

# Only the first ten tokens are shown to keep the cell output short.
leading_tokens = islice(doc, 10)
for token in leading_tokens:
    print(f"Token: {token.text}, POS: {token.pos_}")


Token: This, POS: PRON
Token: is, POS: AUX
Token: a, POS: DET
Token: sample, POS: NOUN
Token: text, POS: NOUN
Token: file, POS: NOUN
Token: for, POS: ADP
Token: testing, POS: NOUN
Token: NLTK, POS: PROPN
Token: and, POS: CCONJ


**Named Entity Recognition (NER):**

In [18]:
# Named Entity Recognition (NER): run the pipeline over the text and list
# every entity span it found together with its label.
doc = nlp(text)
entities = doc.ents
for ent in entities:
    print(f"Entity: {ent.text}, Label: {ent.label_}")

Entity: NLTK, Label: LOC
Entity: Natural Language Processing, Label: ORG
Entity: Apple Inc., Label: ORG
Entity: New York, Label: GPE


**Word Embeddings and Similarity:**

In [21]:
import numpy as np


def cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two vectors.

    The value is the dot product of the vectors divided by the product of
    their Euclidean norms, so it lies in [-1, 1]: 1 means the vectors point
    in the same direction, 0 means they are orthogonal.

    Parameters:
        vec_a, vec_b: array-likes of equal length accepted by ``np.dot``.

    Returns:
        The cosine similarity as a NumPy scalar, or ``0.0`` when either
        vector has zero norm (the cosine is undefined there; returning 0.0
        avoids a division by zero that would otherwise produce NaN).
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        return 0.0
    return np.dot(vec_a, vec_b) / norm_product


# Each call runs the pipeline on a one-word text; `.vector` is the vector
# spaCy assigns to that text.
word_vector_apple = nlp("apple").vector
word_vector_orange = nlp("orange").vector

# Compare the two word vectors by the angle between them rather than by
# their magnitudes.
similarity = cosine_similarity(word_vector_apple, word_vector_orange)

print("Similarity between 'apple' and 'orange':", similarity)

Similarity between 'apple' and 'orange': 0.6135188


**Dependency Parsing:**

In [23]:
# Dependency Parsing: for the first ten tokens, show the token's dependency
# relation and the text of the head token it attaches to.
doc = nlp(text)
for token in islice(doc, 10):
    print(f"Token: {token.text}, Dependency: {token.dep_}, Head: {token.head.text}")

# Illustrative sketch of a dependency tree for "The cat sat on the mat"
# (each head is drawn above its dependents). It is kept as comments rather
# than a string literal: a bare string at the end of a cell is echoed as the
# cell's output, and a backslash at the end of a line inside a normal string
# is a line continuation that would corrupt the drawing.
#
#         sat
#        /   \
#     cat     on
#      |       |
#     The     mat
#              |
#             the

Token: This, Dependency: nsubj, Head: is
Token: is, Dependency: ROOT, Head: is
Token: a, Dependency: det, Head: file
Token: sample, Dependency: compound, Head: file
Token: text, Dependency: compound, Head: file
Token: file, Dependency: attr, Head: is
Token: for, Dependency: prep, Head: file
Token: testing, Dependency: compound, Head: NLTK
Token: NLTK, Dependency: pobj, Head: for
Token: and, Dependency: cc, Head: NLTK


'\n    sat\n   /   The    cat\n |      |\non     mat\n\n'

In [34]:
from spacy import displacy

# Run the pipeline over the complete text to get a parsed Doc.
doc = nlp(text)

# Draw the dependency arcs inline in the notebook (jupyter=True); the
# 'distance' option is passed through to the dependency renderer.
render_options = {'distance': 90}
displacy.render(
    doc,
    style='dep',
    options=render_options,
    jupyter=True,
)