In [35]:
import nltk
from nltk import word_tokenize, pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krsid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\krsid\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\krsid\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\krsid\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [36]:
text = "Steve Jobs, the co-founder of Apple Inc., introduced the first iPhone in 2007."

# Tokenize the text
tokens = word_tokenize(text)

# Perform part-of-speech tagging
pos_tags = pos_tag(tokens)

In [37]:
# Pattern matching for person and organization names
person_names = []
org_names = []
for token, pos in pos_tags:
    if pos.startswith('NNP'):
        if token == 'Jobs' or token == 'Steve':
            person_names.append(token)
        elif token == 'Apple':
            org_names.append(token)

print("Person Names:", person_names)
print("Organization Names:", org_names)

Person Names: ['Steve', 'Jobs']
Organization Names: ['Apple']


In [38]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Michael Jordan, a former basketball player for the Chicago Bulls, now resides in Florida."

doc = nlp(text)

for ent in doc.ents:
    print(f"{ent.text:{16}} {ent.label_}")

Michael Jordan   PERSON
the Chicago Bulls ORG
Florida          GPE


In [39]:
nlp = spacy.load("en_core_web_sm")

text1 = "Apple Inc. released a new iPhone model."
text2 = "I ate a juicy apple for breakfast."

doc1 = nlp(text1)
doc2 = nlp(text2)

for ent in doc1.ents:
    print(f"{ent.text:{16}} {ent.label_}")

for ent in doc2.ents:
    print(f"{ent.text:{16}} {ent.label_}")

Apple Inc.       ORG
iPhone           ORG


In [40]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Load pre-trained word vectors
wv = api.load("word2vec-google-news-300")

# Calculate similarity between words
sim = wv.similarity('apple', 'company')
print(f"Similarity between 'apple' and 'company': {sim}")

sim = wv.similarity('apple', 'fruit')
print(f"Similarity between 'apple' and 'fruit': {sim}")

Similarity between 'apple' and 'company': 0.07217501103878021
Similarity between 'apple' and 'fruit': 0.6410146951675415


In [41]:
text = "Elvis Presley, the legendary musician, and Barack Obama, the former US President, were both influential figures."

# Tokenize and perform part-of-speech tagging
tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)

# Define fine-grained entity categories
entities = []
for token, pos in pos_tags:
    if pos.startswith('NNP'):
        if token == 'Elvis' or token == 'Presley':
            entities.append(('PERSON-ARTIST', token))
        elif token == 'Barack' or token == 'Obama':
            entities.append(('PERSON-POLITICIAN', token))

print("Identified Entities:")
for entity in entities:
    print(f"{entity[1]:{16}} {entity[0]}")

Identified Entities:
Elvis            PERSON-ARTIST
Presley          PERSON-ARTIST
Barack           PERSON-POLITICIAN
Obama            PERSON-POLITICIAN


In [42]:
nlp = spacy.load("en_core_web_sm")
text = "Elvis Presley, the legendary musician, and Barack Obama, the former US President, were both influential figures."

doc = nlp(text)

for ent in doc.ents:
    if ent.label_ == 'PERSON':
        if 'musician' in ent.sent.text.lower():
            print(f"{ent.text:{16}} PERSON-ARTIST")
        elif 'president' in ent.sent.text.lower():
            print(f"{ent.text:{16}} PERSON-POLITICIAN")

Elvis Presley    PERSON-ARTIST
Barack Obama     PERSON-ARTIST


In [43]:
# Load English and Spanish NER models
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")

# English text
text_en = "Michael Jordan played for the Chicago Bulls in the NBA."
doc_en = nlp_en(text_en)

print("English Entities:")
for ent in doc_en.ents:
    print(f"{ent.text:{16}} {ent.label_}")

# Spanish text
text_es = "Lionel Messi juega para el Barcelona en La Liga."
doc_es = nlp_es(text_es)

print("\nSpanish Entities:")
for ent in doc_es.ents:
    print(f"{ent.text:{16}} {ent.label_}")

English Entities:
Michael Jordan   PERSON
the Chicago Bulls ORG
NBA              ORG

Spanish Entities:
Lionel Messi     PER
Barcelona        ORG
La Liga          ORG


In [47]:
from nltk import word_tokenize, pos_tag, ne_chunk

# Load pre-trained NER tagger
# tagger = nltk.data.load('chunkers/maxent_ne_chunker/english.bin')

# Open file for simulating text stream
# with open('text_stream.txt', 'r') as f:
#     for line in f:
#         # Tokenize and perform part-of-speech tagging
#         tokens = nltk.word_tokenize(line)
#         pos_tags = nltk.pos_tag(tokens)

#         # Perform NER
#         entities = tagger.parse(pos_tags)

#         # Print identified entities
#         print("Entities in line:", line)
#         for entity in entities:
#             if isinstance(entity, nltk.tree.Tree):
#                 if entity.label() == 'PERSON':
#                     print(f"  {' '.join(c[0] for c in entity):{16}} {entity.label()}")


# Simulated text stream as a list of strings
text_stream = [
    "Michael Jordan played for the Chicago Bulls in the NBA.",
    "Lionel Messi plays for Paris Saint-Germain in Ligue 1."
]

# Process each line in the simulated text stream
for line in text_stream:
    # Tokenize and perform part-of-speech tagging
    tokens = word_tokenize(line)
    pos_tags = pos_tag(tokens)

    # Perform NER
    entities = ne_chunk(pos_tags)

    # Print identified entities
    print("Entities in line:", line)
    for entity in entities:
        if isinstance(entity, nltk.Tree):
            if entity.label() == 'PERSON':
                print(f"  {' '.join(c[0] for c in entity):{16}} {entity.label()}")

Entities in line: Michael Jordan played for the Chicago Bulls in the NBA.
  Michael          PERSON
  Jordan           PERSON
Entities in line: Lionel Messi plays for Paris Saint-Germain in Ligue 1.
  Lionel           PERSON
  Paris            PERSON


In [49]:
# # Load NER model
# nlp = spacy.load("en_core_web_sm")

# # Open file for simulating text stream
# with open('text_stream.txt', 'r') as f:
#     for line in f:
#         doc = nlp(line)

#         # Print identified entities
#         print("Entities in line:", line)
#         for ent in doc.ents:
#             print(f"  {ent.text:{16}} {ent.label_}")


# Load NER model
nlp = spacy.load("en_core_web_sm")

for line in text_stream:
    # Tokenize and perform part-of-speech tagging
    tokens = word_tokenize(line)
    pos_tags = pos_tag(tokens)

    # Perform NER with NLTK
    entities = ne_chunk(pos_tags)

    # Print identified entities with NLTK
    print("Entities (NLTK) in line:", line)
    for entity in entities:
        if isinstance(entity, nltk.Tree):
            if entity.label() == 'PERSON':
                print(f"  {' '.join(c[0] for c in entity):{16}} {entity.label()}")

    # Perform NER with SpaCy
    doc = nlp(line)

    # Print identified entities with SpaCy
    print("Entities (SpaCy) in line:", line)
    for ent in doc.ents:
        print(f"  {ent.text:{16}} {ent.label_}")

Entities (NLTK) in line: Michael Jordan played for the Chicago Bulls in the NBA.
  Michael          PERSON
  Jordan           PERSON
Entities (SpaCy) in line: Michael Jordan played for the Chicago Bulls in the NBA.
  Michael Jordan   PERSON
  the Chicago Bulls ORG
  NBA              ORG
Entities (NLTK) in line: Lionel Messi plays for Paris Saint-Germain in Ligue 1.
  Lionel           PERSON
  Paris            PERSON
Entities (SpaCy) in line: Lionel Messi plays for Paris Saint-Germain in Ligue 1.
  Messi            PERSON
  Paris Saint-Germain ORG
  Ligue 1          GPE
