In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii. He served as the 44th President of the United States."

In [6]:
doc = nlp(text)
print("Entities found in the text:")
for ent in doc.ents:
    print(f"Text: {ent.text}, Label: {ent.label_}")

Entities found in the text:
Text: Barack Obama, Label: PERSON
Text: August 4, 1961, Label: DATE
Text: Honolulu, Label: GPE
Text: Hawaii, Label: GPE
Text: 44th, Label: ORDINAL
Text: the United States, Label: GPE


In [7]:
import pandas as pd
df = pd.read_csv("NER dataset.csv",encoding='latin-1')

In [8]:
df.head(n=5)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [14]:
df['Sentence #'].fillna(method='ffill', inplace=True)

# Display the first few rows

# drop the nan from Word
df.dropna(inplace=True)
print(df.head())

  df['Sentence #'].fillna(method='ffill', inplace=True)


    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1  Sentence: 1             of   IN   O
2  Sentence: 1  demonstrators  NNS   O
3  Sentence: 1           have  VBP   O
4  Sentence: 1        marched  VBN   O


In [16]:
sentences = []
entities = []

for sentence, group in df.groupby('Sentence #'):
    words = group['Word'].tolist()
    tags = group['Tag'].tolist()

    # Join words to form a sentence
    sentences.append(" ".join(words))
    entities.append(tags)

# Display a sample sentence with its tags
for i in range(2):  # Display the first 2 sentences
    print(f"Sentence: {sentences[i]}")
    print(f"Tags: {entities[i]}")
    print()

Sentence: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']

Sentence: Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .
Tags: ['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O']



In [17]:
# unique value of the Tag
df['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [18]:
# unique value of the POS
df['POS'].unique()

array(['NNS', 'IN', 'VBP', 'VBN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'CC',
       'JJ', '.', 'VBD', 'WP', '``', 'CD', 'PRP', 'VBZ', 'POS', 'VBG',
       'RB', ',', 'WRB', 'PRP$', 'MD', 'WDT', 'JJR', ':', 'JJS', 'WP$',
       'RP', 'PDT', 'NNPS', 'EX', 'RBS', 'LRB', 'RRB', '$', 'RBR', ';',
       'UH', 'FW'], dtype=object)

In [19]:
from collections import Counter

all_tags = [tag for sentence_tags in entities for tag in sentence_tags if tag != 'O']
tag_counts = Counter(all_tags)

# Print the most common tags
for tag, count in tag_counts.items():
    print(f"Tag: {tag}, Count: {count}")

Tag: B-geo, Count: 20707
Tag: B-gpe, Count: 8852
Tag: B-tim, Count: 11094
Tag: B-org, Count: 10952
Tag: I-geo, Count: 4144
Tag: B-per, Count: 9438
Tag: I-per, Count: 9759
Tag: I-org, Count: 9020
Tag: I-tim, Count: 3536
Tag: B-art, Count: 217
Tag: I-art, Count: 167
Tag: B-nat, Count: 114
Tag: I-gpe, Count: 121
Tag: I-nat, Count: 34
Tag: B-eve, Count: 200
Tag: I-eve, Count: 162


In [20]:
def extract_named_entities(words, tags):
  entities= []
  curr_entity = []
  current_tag = None

  for word, tag in zip(words, tags):
    if tag.startswith('B-'):
      if curr_entity:
        entities.append(( ' '.join(curr_entity),current_tag))
      curr_entity = [word]
      current_tag = tag[2:]
    elif tag.startswith('I-') and tag[2:] == current_tag:
      curr_entity.append(word)
    else:
      if curr_entity:
        entities.append(( ' '.join(curr_entity),current_tag))
        curr_entity = []
        current_tag = None
  if curr_entity:
    entities.append(( ' '.join(curr_entity),current_tag))
  return entities


for i in range(2):  # Process the first 2 sentences
    words = sentences[i].split()
    tags = entities[i]
    named_entities = extract_named_entities(words, tags)

    print(f"Sentence: {sentences[i]}")
    print("Named Entities:")
    for entity, tag in named_entities:
        print(f"  {entity}: {tag}")
    print()

Sentence: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
Named Entities:
  London: geo
  Iraq: geo
  British: gpe

Sentence: Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .
Named Entities:
  Iranian: gpe
  Wednesday: tim
  IAEA: org



# Using Spacy

In [22]:
import spacy

# Load pre-trained SpaCy model
nlp = spacy.load("en_core_web_sm")

# Process the new sentence
doc = nlp("Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .")

# Extract entities
print("Named Entities:")
for ent in doc.ents:
    print(f"  {ent.text}: {ent.label_}")

Named Entities:
  Iranian: NORP
  Wednesday: DATE
  IAEA: ORG


In [23]:
results = []
for entity, tag in named_entities:
    results.append({"Entity": entity, "Type": tag})

# Convert to DataFrame
results_df = pd.DataFrame(results)
print(results_df)

      Entity Type
0    Iranian  gpe
1  Wednesday  tim
2       IAEA  org
