pip install flair

In [2]:
import pandas as pd
import spacy
from flair.models import SequenceTagger
from flair.data import Sentence
from transformers import BertTokenizerFast, BertForTokenClassification, pipeline

2025-05-04 11:22:17.029529: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746357737.265670      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746357737.329034      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
df = pd.read_csv('/kaggle/input/news-articles-classification-dataset-for-nlp-and-ml/education_data.csv')
texts = df['description'].dropna().astype(str).tolist()[:3] 

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
# NER with spaCy
print("Named Entity Recognition with spaCy")
nlp_spacy = spacy.load("en_core_web_sm")

for i, text in enumerate(texts, 1):
    doc = nlp_spacy(text)
    print(f"\n Text {i}: {text}\nEntities:")
    for ent in doc.ents:
        print(f" - {ent.text:30} ➤ {ent.label_}")

Named Entity Recognition with spaCy

 Text 1: CUET PG 2024: UGC said that the list of participating institutions displayed on the NTA website is dynamic and will be updated to include the name of the new universities after they are registered.
Entities:
 - CUET                           ➤ ORG
 - 2024                           ➤ CARDINAL
 - UGC                            ➤ ORG
 - NTA                            ➤ ORG

 Text 2: On April 10, 2023 TCS had announced that it had been selected by Oxford University for the delivery of most of the university’s admissions tests from 2023 onwards.
Entities:
 - April 10, 2023                 ➤ DATE
 - TCS                            ➤ ORG
 - Oxford University              ➤ ORG
 - 2023                           ➤ DATE

 Text 3: AISHE Report 2021-22: The enrollment in STEM (at UG, PG, MPhil and PhD levels) is 98,49,488, out of which 56,56,488 are men and 41,93,000 are women.
Entities:
 - AISHE Report                   ➤ ORG
 - 2021-22                

In [6]:
print("Named Entity Recognition with BERT")

tokenizer_bert_ner = BertTokenizerFast.from_pretrained("dslim/bert-base-NER")
model_bert_ner = BertForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=model_bert_ner, tokenizer=tokenizer_bert_ner, aggregation_strategy="simple")

for idx, text in enumerate(texts, 1):
    print(f"\n Text {idx}: {text}\nEntities:")
    ner_results = ner_pipeline(text)
    for ent in ner_results:
        print(f" • {ent['word']:25s} ➤ {ent['entity_group']} ({ent['score']:.2f})")

Named Entity Recognition with BERT


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu



 Text 1: CUET PG 2024: UGC said that the list of participating institutions displayed on the NTA website is dynamic and will be updated to include the name of the new universities after they are registered.
Entities:
 • P                         ➤ ORG (0.54)
 • UGC                       ➤ ORG (1.00)
 • NTA                       ➤ ORG (0.98)

 Text 2: On April 10, 2023 TCS had announced that it had been selected by Oxford University for the delivery of most of the university’s admissions tests from 2023 onwards.
Entities:
 • TCS                       ➤ ORG (1.00)
 • Oxford University         ➤ ORG (1.00)

 Text 3: AISHE Report 2021-22: The enrollment in STEM (at UG, PG, MPhil and PhD levels) is 98,49,488, out of which 56,56,488 are men and 41,93,000 are women.
Entities:
 • AISHE                     ➤ ORG (0.72)
 • Report                    ➤ MISC (0.68)
 • ST                        ➤ MISC (0.55)
 • MP                        ➤ MISC (0.58)


In [7]:
print("POS Tagging with spaCy")

for i, text in enumerate(texts, 1):
    doc = nlp_spacy(text)
    print(f"\n Text {i}: {text[:90]}...")
    print("POS Tags (spaCy):")
    for token in doc[:20]:
        print(f"{token.text:<12} ➤ {token.pos_}")

POS Tagging with spaCy

 Text 1: CUET PG 2024: UGC said that the list of participating institutions displayed on the NTA we...
POS Tags (spaCy):
CUET         ➤ PROPN
PG           ➤ PROPN
2024         ➤ NUM
:            ➤ PUNCT
UGC          ➤ PROPN
said         ➤ VERB
that         ➤ SCONJ
the          ➤ DET
list         ➤ NOUN
of           ➤ ADP
participating ➤ VERB
institutions ➤ NOUN
displayed    ➤ VERB
on           ➤ ADP
the          ➤ DET
NTA          ➤ PROPN
website      ➤ NOUN
is           ➤ AUX
dynamic      ➤ ADJ
and          ➤ CCONJ

 Text 2: On April 10, 2023 TCS had announced that it had been selected by Oxford University for the...
POS Tags (spaCy):
On           ➤ ADP
April        ➤ PROPN
10           ➤ NUM
,            ➤ PUNCT
2023         ➤ NUM
TCS          ➤ PROPN
had          ➤ AUX
announced    ➤ VERB
that         ➤ SCONJ
it           ➤ PRON
had          ➤ AUX
been         ➤ AUX
selected     ➤ VERB
by           ➤ ADP
Oxford       ➤ PROPN
University   ➤ PROPN
for          

In [8]:
print(" POS Tagging with BERT-based Flair")

tagger = SequenceTagger.load("pos")

for i, text in enumerate(texts, 1):
    sentence = Sentence(text[:100])
    tagger.predict(sentence)
    print(f"\n BERT-based POS Tags for Text {i}:")
    for token in sentence.tokens[:20]:
        tag = token.annotation_layers['pos'][0].value if 'pos' in token.annotation_layers else 'N/A'
        print(f"{token.text:<12} ➤ {tag}")


 POS Tagging with BERT-based Flair


pytorch_model.bin:   0%|          | 0.00/249M [00:00<?, ?B/s]

2025-05-04 11:22:42,683 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD

 BERT-based POS Tags for Text 1:
CUET         ➤ NNP
PG           ➤ NNP
2024         ➤ CD
:            ➤ :
UGC          ➤ NNP
said         ➤ VBD
that         ➤ IN
the          ➤ DT
list         ➤ NN
of           ➤ IN
participating ➤ VBG
institutions ➤ NNS
displayed    ➤ VBN
on           ➤ IN
the          ➤ DT
NTA          ➤ NNP
website      ➤ NN
is           ➤ VBZ
d            ➤ NN

 BERT-based POS Tags for Text 2:
On           ➤ IN
April        ➤ NNP
10           ➤ CD
,            ➤ ,
2023         ➤ CD
TCS          ➤ NNP
had          ➤ VBD
announced    ➤ VBN
that         ➤ IN
it           ➤ PRP
had          ➤ VBD
been         ➤ VBN
selected     ➤ VBN
by           ➤ IN
Oxford       ➤ NNP


In [9]:
from sklearn.metrics import classification_report

tagger_flair = SequenceTagger.load("flair/upos-english")  # Universal POS tags

true_tags = []
pred_tags = []

for text in texts:
    doc_spacy = nlp_spacy(text)
    sentence_flair = Sentence(text)
    tagger_flair.predict(sentence_flair)

    min_len = min(len(doc_spacy), len(sentence_flair))

    for i in range(min_len):
        spacy_tag = doc_spacy[i].pos_
        flair_tag = sentence_flair[i].labels[0].value if sentence_flair[i].labels else 'N/A'
        true_tags.append(spacy_tag)
        pred_tags.append(flair_tag)

print("\n POS Tagging Evaluation (Flair vs spaCy):\n")
print(classification_report(true_tags, pred_tags, zero_division=0))

pytorch_model.bin:   0%|          | 0.00/244M [00:00<?, ?B/s]

2025-05-04 11:22:56,721 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, VERB, PUNCT, ADP, DET, PROPN, PRON, ADJ, ADV, CCONJ, PART, NUM, AUX, INTJ, SYM, X, <START>, <STOP>

 POS Tagging Evaluation (Flair vs spaCy):

              precision    recall  f1-score   support

         ADJ       1.00      0.67      0.80         3
         ADP       0.53      0.62      0.57        13
         ADV       0.00      0.00      0.00         1
         AUX       1.00      0.10      0.18        10
       CCONJ       0.25      0.33      0.29         3
         DET       1.00      0.57      0.73         7
        NOUN       0.75      0.69      0.72        13
         NUM       0.38      0.89      0.53         9
        PART       1.00      0.50      0.67         2
        PRON       0.50      0.67      0.57         3
       PROPN       0.73      0.57      0.64        14
       PUNCT       0.40      0.40      0.40        10
       SCONJ       0.00      0.00      0.00         3
         SYM 

In [10]:
import pandas as pd

spacy_entities = []
for i, text in enumerate(texts, 1):
    doc = nlp_spacy(text)
    for ent in doc.ents:
        spacy_entities.append({'Text ID': i, 'Entity': ent.text.strip(), 'Label (spaCy)': ent.label_})

bert_entities = []
for i, text in enumerate(texts, 1):
    ner_results = ner_pipeline(text)
    for ent in ner_results:
        bert_entities.append({'Text ID': i, 'Entity': ent['word'].strip(), 'Label (BERT)': ent['entity_group']})

df_spacy = pd.DataFrame(spacy_entities)
df_bert = pd.DataFrame(bert_entities)
merged = pd.merge(df_spacy, df_bert, on=["Text ID", "Entity"], how="outer")

merged['Label (spaCy)'] = merged['Label (spaCy)'].fillna("None")
merged['Label (BERT)'] = merged['Label (BERT)'].fillna("None")

pd.set_option('display.max_rows', None)
print(merged.to_string(index=False))

 Text ID            Entity Label (spaCy) Label (BERT)
       1              2024      CARDINAL         None
       1              CUET           ORG         None
       1               NTA           ORG          ORG
       1                 P          None          ORG
       1               UGC           ORG          ORG
       2              2023          DATE         None
       2    April 10, 2023          DATE         None
       2 Oxford University           ORG          ORG
       2               TCS           ORG          ORG
       3           2021-22          DATE         None
       3         41,93,000      CARDINAL         None
       3         56,56,488      CARDINAL         None
       3         98,49,488          DATE         None
       3             AISHE          None          ORG
       3      AISHE Report           ORG         None
       3                MP          None         MISC
       3             MPhil           ORG         None
       3                PG  