In [2]:

import pandas as pd  # Importing the pandas library for data manipulation and analysis
import spacy # Importing the spacy library for natural language processing
import requests # Importing the requests library to make HTTP requests to web pages
from spacy import displacy # Importing the displacy module from spacy for visualizing the entities in a document
from bs4 import BeautifulSoup # Importing the BeautifulSoup class from the bs4 module to parse HTML/XML documents

# Shared spaCy pipeline (small English model) reused by every cell below
nlp = spacy.load("en_core_web_sm")

# Let pandas show up to 200 rows so longer entity tables are not truncated
pd.options.display.max_rows = 200




In [3]:
# Sample news paragraph (Mahua Moitra's expulsion from the Lok Sabha) used as NER input.
# Adjacent string literals are concatenated by Python, so this is a single string.
content = (
    "Trinamool Congress leader Mahua Moitra has moved the Supreme Court "
    "against her expulsion from the Lok Sabha over the cash-for-query "
    "allegations against her. Moitra was ousted from the Parliament last week "
    "after the Ethics Committee of the Lok Sabha found her guilty of "
    "jeopardising national security by sharing her parliamentary portal's "
    "login credentials with businessman Darshan Hiranandani."
)

# Run the spaCy pipeline over the paragraph; the result exposes its entities as doc.ents
doc = nlp(content)

# One line per detected entity: text, character span [start, end) within `content`, label
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label_}")


Congress 10 18 ORG
Mahua Moitra 26 38 PERSON
the Supreme Court 49 66 ORG
the Lok Sabha 94 107 PERSON
Moitra 157 163 ORG
Parliament 184 194 ORG
last week 195 204 DATE
the Ethics Committee 211 231 ORG
Darshan Hiranandani 373 392 PERSON


In [4]:
# Visualize the processed document with displacy.
# style="ent" selects the named-entity view (entity spans highlighted with their labels)
# rather than the dependency-parse view.
displacy.render(docs=doc, style="ent")

In [5]:
# Gather one (text, label, lemma) tuple per named entity found in `doc`.
# The lemma is the base/dictionary form of the entity's words (e.g. 'running' -> 'run').
entities = [
    (span.text, span.label_, span.lemma_)
    for span in doc.ents
]

# Tabulate the tuples; column order matches the tuple order above
df = pd.DataFrame(data=entities, columns=["text", "type", "lemma"])

# Show the entity table as plain text
print(df)


                   text    type                 lemma
0              Congress     ORG              Congress
1          Mahua Moitra  PERSON          Mahua Moitra
2     the Supreme Court     ORG     the Supreme Court
3         the Lok Sabha  PERSON         the Lok Sabha
4                Moitra     ORG                Moitra
5            Parliament     ORG            Parliament
6             last week    DATE             last week
7  the Ethics Committee     ORG  the Ethics Committee
8   Darshan Hiranandani  PERSON   Darshan Hiranandani
