In [1]:
# Rule-based named-entity extraction with NLTK (POS tagging + regex chunking)

In [2]:
import os

import nltk
import requests
from nltk import word_tokenize, pos_tag
from nltk.chunk import RegexpParser

In [24]:
# Download the NLTK resources this notebook relies on.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. `punkt_tab`, `averaged_perceptron_tagger_eng`) - confirm against the
# installed NLTK version.
nltk.download('punkt')                       # models used by sent_tokenize / word_tokenize
nltk.download('averaged_perceptron_tagger')  # model used by nltk.pos_tag in the tagging cell
# The two packages below support nltk.ne_chunk, which the visible cells do not
# call; they are kept so existing behaviour is unchanged.
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [34]:
# Read the NewsAPI key from the environment instead of committing it to the
# notebook. Export NEWSAPI_KEY before starting the kernel; a missing variable
# raises KeyError here rather than producing an unauthenticated request later.
# NOTE(review): the key that was previously hardcoded in this cell should be
# revoked and replaced.
api_key = os.environ["NEWSAPI_KEY"]

# Query: articles matching "apple" published 2024-08-03..2024-08-04, most popular first.
api_url = (
    "https://newsapi.org/v2/everything"
    "?q=apple&from=2024-08-03&to=2024-08-04&sortBy=popularity"
    f"&apiKey={api_key}"
)

In [35]:
# Fetch the news articles.
# api_url already carries the apiKey query parameter, so nothing else is passed
# as a positional argument: the second positional parameter of requests.get is
# `params`, and a bare string there is appended verbatim to the query string.
response = requests.get(api_url, timeout=10)  # timeout so a stalled connection cannot hang the kernel
response.raise_for_status()                   # surface HTTP errors (bad key, rate limit) immediately
data = response.json()

In [37]:
# Use the first article's 'description' as the text to analyse, falling back to
# 'content' and then 'title' when the description is missing or empty.
articles = data.get('articles') or []
if not articles:
    # Covers both an empty result set and a payload without an 'articles' key.
    raise ValueError(f"No articles in API response: {data!r}")

first_article = articles[0]
article = (
    first_article.get('description')
    or first_article.get('content')
    or first_article.get('title')
)
if not article:
    raise ValueError("First article has no description, content, or title to analyse")

print(f"Fetched Article:\n{article}\n")

Fetched Article:
Google's Pixel Watch 3 is becoming official on August 13, and for the first time in the series it will come in two sizes: 41mm and 45mm.

Today the pricing for all versions has been leaked. The 41mm Wi-Fi and Bluetooth Pixel Watch 3 will be $349, while the 45…



In [38]:
# Split the article into sentences
sentences = nltk.sent_tokenize(article)

# Break every sentence into word tokens
tokenized_sentences = list(map(word_tokenize, sentences))

# Attach a part-of-speech tag to each token, sentence by sentence
pos_tagged_sentences = list(map(pos_tag, tokenized_sentences))

In [40]:
# Chunk grammar: an NE chunk is one or more consecutive tokens tagged NNP
chunk_pattern = r"""
    NE: {<NNP>+}
"""

# Build the regex-based chunk parser from that grammar
chunk_parser = RegexpParser(grammar=chunk_pattern)

# Accumulator for the entity strings found by the chunker
named_entities = list()

In [41]:
# Chunk every POS-tagged sentence and collect the text of each top-level NE chunk.
# The list is rebuilt from scratch on every run, so re-executing this cell does
# not append the same entities a second time.
named_entities = [
    " ".join(token for token, _pos in subtree.leaves())
    for sentence in pos_tagged_sentences
    for subtree in chunk_parser.parse(sentence)
    # Plain (token, tag) tuples are unchunked tokens; only Tree nodes labelled NE count.
    if isinstance(subtree, nltk.Tree) and subtree.label() == 'NE'
]

In [42]:
# Show the entities found by the chunker, one per line under a header
report_lines = ["Extracted Named Entities:"]
report_lines += [f"Entity: {name}" for name in named_entities]
print("\n".join(report_lines))

Extracted Named Entities:
Entity: Google
Entity: Pixel Watch
Entity: August
Entity: Bluetooth Pixel Watch


In [23]:
# ML-based named-entity recognition with spaCy

In [43]:
import requests
import spacy

# Load the pre-trained spaCy pipeline used for entity recognition below
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [44]:
# Read the NewsAPI key from the environment instead of committing it to the
# notebook. Export NEWSAPI_KEY before starting the kernel; a missing variable
# raises KeyError here.
# NOTE(review): the key that was previously hardcoded in this cell should be
# revoked and replaced.
api_key = os.environ["NEWSAPI_KEY"]

# Query: articles matching "apple" published 2024-08-03..2024-08-04, most popular first.
api_url = (
    "https://newsapi.org/v2/everything"
    "?q=apple&from=2024-08-03&to=2024-08-04&sortBy=popularity"
    f"&apiKey={api_key}"
)

# Fetch the news articles.
# api_url already carries the apiKey query parameter; the second positional
# parameter of requests.get is `params`, so the key must not be passed there.
response = requests.get(api_url, timeout=10)  # timeout so a stalled connection cannot hang the kernel
response.raise_for_status()                   # surface HTTP errors (bad key, rate limit) immediately
data = response.json()

In [45]:
# Pick the text of the first article, preferring 'content', then 'description',
# then 'title' (the first non-empty field wins).
# NOTE(review): the captured output ends in "[+938 chars]", so 'content'
# appears to be truncated by the API - confirm whether 'description' would be
# the better primary field.
articles = data.get('articles') or []
if not articles:
    # Covers both an empty result set and a payload without an 'articles' key.
    raise ValueError(f"No articles in API response: {data!r}")

first_article = articles[0]
article = (
    first_article.get('content')
    or first_article.get('description')
    or first_article.get('title')
)
if not article:
    raise ValueError("First article has no content, description, or title to analyse")

print(f"Fetched Article:\n{article}\n")

Fetched Article:
Google's Pixel Watch 3 is becoming official on August 13, and for the first time in the series it will come in two sizes: 41mm and 45mm.
Today the pricing for all versions has been leaked. The 41mm … [+938 chars]



In [46]:
# Run the spaCy pipeline over the article text
doc = nlp(article)

# Report every entity span together with its predicted label
entity_report = ["Extracted Named Entities:"]
entity_report += [f"Entity: {span.text}, Type: {span.label_}" for span in doc.ents]
print("\n".join(entity_report))

Extracted Named Entities:
Entity: Google, Type: ORG
Entity: 3, Type: CARDINAL
Entity: August 13, Type: DATE
Entity: first, Type: ORDINAL
Entity: two, Type: CARDINAL
Entity: 41mm and, Type: QUANTITY
Entity: 45mm, Type: QUANTITY
Entity: Today, Type: DATE
Entity: 41mm, Type: QUANTITY
