<a href="https://colab.research.google.com/github/Harika-Mullaguri/NLP/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import regex as re


# Sample sentence scanned by the rule-based extractor below.
text = "Elon Musk founded SpaceX in 2002. He was born in Pretoria, South Africa."

# Entity label -> regular expression. Every expression has exactly one capture
# group, so findall() returns the matched strings themselves.
patterns = dict(
    PERSON=r"\b(Elon Musk)\b",
    ORG=r"\b(SpaceX)\b",
    DATE=r"\b(2002)\b",
    LOCATION=r"\b(Pretoria|South Africa)\b",
)

# Collect matches per label; labels whose pattern never matched are left out.
entities = {}
for entity, pattern in patterns.items():
    matches = re.findall(pattern, text)
    if not matches:
        continue
    entities[entity] = matches

print(entities)

{'PERSON': ['Elon Musk'], 'ORG': ['SpaceX'], 'DATE': ['2002'], 'LOCATION': ['Pretoria', 'South Africa']}


In [None]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline; `nlp` is later called directly on raw text.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample paragraph fed to the spaCy pipeline.
# NOTE(review): two sentence boundaries have no space after the period ("California.He",
# "intelligence.This"). Left untouched because this string is the model input and editing
# it may change the entities that get printed.
text = "Sundar Pichai is the CEO of Google, headquartered in California.He is the CEO of Alphabet Inc. and Google. In 2025, under his leadership, Google increased its focus on artificial intelligence.This included the launch of Gemini 3 and large investments in AI infrastructure"

In [None]:
# Run the loaded pipeline over the sample text; the result's `.ents` is iterated below.
doc = nlp(text)

In [None]:
# Print one line per entity found in `doc`: its surface text and its label string.
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")

Sundar Pichai -> PERSON
Google -> ORG
California -> GPE
Alphabet Inc. -> ORG
Google -> ORG
2025 -> DATE
Google -> ORG
Gemini 3 -> ORG
AI -> GPE


In [None]:
pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


In [None]:
import nltk
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
from nltk.corpus import conll2002

In [None]:
import nltk
# Download the 'conll2002' corpus package into the NLTK data directory so that
# conll2002.iob_sents(...) can read it.
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2002.zip.


True

In [None]:
# IOB-tagged sentences from the CoNLL-2002 corpus: 'esp.train' is used for fitting and
# 'esp.testb' for evaluation (the 'esp.*' files are presumably the Spanish portion — confirm).
# Each token is later unpacked as three fields, the last one being the label.
train_sents = conll2002.iob_sents('esp.train')
test_sents = conll2002.iob_sents('esp.testb')

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of tokens; item ``[0]`` of a token is its word string
            and item ``[1]`` its POS tag.
        i: Index of the token to describe.

    Returns:
        dict with the word, its POS tag, three casing/digit flags, the
        neighbouring words when they exist, and ``'BOS'`` / ``'EOS'`` set to
        True when the token has no left / right neighbour.
    """
    token = sent[i][0]
    features = dict(
        word=token,
        postag=sent[i][1],
        is_upper=token.isupper(),
        is_title=token.istitle(),
        is_digit=token.isdigit(),
    )

    # Left context: previous word, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        features['word_prev'] = sent[i - 1][0]

    # Right context: next word, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        features['word_next'] = sent[i + 1][0]

    return features

In [None]:
def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the third field (the label) of every three-field token in ``sent``."""
    tags = []
    for _first, _second, tag in sent:
        tags.append(tag)
    return tags

In [None]:
# Convert every sentence of each split into parallel feature and label sequences.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))
X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [None]:
# Configure a CRF trained with the 'lbfgs' algorithm, capped at 100 iterations, then fit it
# on the training features/labels.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation coefficients — confirm
# against the sklearn-crfsuite documentation.
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100)
crf.fit(X_train, y_train)

In [None]:
# Predict a label sequence for every test sentence.
y_pred = crf.predict(X_test)

# Classification report: per-label precision / recall / F1 for the predictions.
# The 'O' label is part of the report and accounts for most tokens, so the overall
# accuracy figure is dominated by it; read the per-entity rows instead.
print(flat_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-LOC       0.81      0.76      0.79      1084
      B-MISC       0.73      0.52      0.60       339
       B-ORG       0.81      0.83      0.82      1400
       B-PER       0.80      0.87      0.84       735
       I-LOC       0.76      0.63      0.69       325
      I-MISC       0.66      0.53      0.59       557
       I-ORG       0.83      0.81      0.82      1104
       I-PER       0.86      0.94      0.90       634
           O       0.99      1.00      0.99     45355

    accuracy                           0.97     51533
   macro avg       0.80      0.76      0.78     51533
weighted avg       0.97      0.97      0.97     51533



In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [None]:
# Two hand-written, pre-tokenised toy sentences ...
sentences = [["John", "lives", "in", "New", "York"],
             ["Mary", "works", "at", "Google"]]
# ... and one tag per token, aligned position-by-position with `sentences`.
labels = [["B-PER", "O", "O", "B-LOC", "I-LOC"],
          ["B-PER", "O", "O", "B-ORG"]]

In [None]:
# Word vocabulary: ids start at 1 so that 0 stays free for padding (pad_sequences fills with 0).
# sorted() makes the id assignment reproducible; iterating a bare set of strings can yield a
# different order after every kernel restart.
word2idx = {word: i + 1 for i, word in enumerate(sorted({word for sent in sentences for word in sent}))}

# Label vocabulary: id 0 is reserved for an explicit "PAD" tag so that zero-padded label
# positions are not mistaken for a real entity tag; real tags get ids 1..N in sorted order.
label2idx = {"PAD": 0}
label2idx.update({label: i + 1 for i, label in enumerate(sorted({label for sent in labels for label in sent}))})

In [None]:
# Encode tokens and tags as integer ids; at this point X and y are still ragged lists of lists.
X = [[word2idx[word] for word in sent] for sent in sentences]
y = [[label2idx[label] for label in sent] for sent in labels]

In [None]:
# Pad every sequence at the end ("post") up to the length of the longest sentence so X and y
# become rectangular arrays. pad_sequences fills with its default value (0).
max_len = max(len(sent) for sent in sentences)
X = pad_sequences(X, maxlen=max_len, padding="post")
y = pad_sequences(y, maxlen=max_len, padding="post")

In [None]:
# BiLSTM tagger: embedding -> bidirectional LSTM emitting one vector per timestep ->
# per-timestep softmax over the label ids.
# input_dim is len(word2idx) + 1 because word ids start at 1 and id 0 is used for padding.
# `input_length` is deliberately not passed: it is deprecated in Keras 3 and was optional
# before that; the sequence length is taken from the data the model is first called on.
model = Sequential([
    Embedding(input_dim=len(word2idx) + 1, output_dim=50),
    Bidirectional(LSTM(units=50, return_sequences=True)),
    TimeDistributed(Dense(len(label2idx), activation="softmax"))
])



In [None]:
# The sparse variant of categorical cross-entropy is used because the targets are integer
# label ids rather than one-hot vectors.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Give the labels a trailing singleton axis to match the per-timestep output shape.
# The ndim guard keeps this cell idempotent: re-running it must not keep appending axes to y.
if np.ndim(y) == 2:
    y = np.expand_dims(y, -1)
model.fit(X, y, epochs=5, batch_size=2)

Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.1000 - loss: 1.6138
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.3000 - loss: 1.6083
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.4000 - loss: 1.6028
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.3000 - loss: 1.5974
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.4000 - loss: 1.5919


<keras.src.callbacks.history.History at 0x7d4c0e7ce6c0>

In [None]:
# Unseen sentence used as a quick sanity check of the trained tagger.
test_sentence = ["Alice", "works", "at", "Microsoft"]
# Words missing from word2idx fall back to id 0 — the same id that padding positions carry.
test_X = [[word2idx.get(word, 0) for word in test_sentence]]
# Pad to max_len, the same length the training sequences were padded to.
test_X = pad_sequences(test_X, maxlen=max_len, padding="post")
prediction = model.predict(test_X)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 389ms/step


In [None]:
# Invert the label map once instead of rebuilding list(label2idx.keys()) for every timestep
# and relying on key order matching the ids.
idx2label = {idx: label for label, idx in label2idx.items()}

# prediction[0] holds one row per padded timestep. The input was padded at the end, so only
# the leading rows belong to real tokens; trailing padding rows are dropped rather than
# being reported as tags.
n_tokens = min(len(test_sentence), len(prediction[0]))
predicted_labels = [idx2label[int(np.argmax(pred))] for pred in prediction[0][:n_tokens]]
print(predicted_labels)

['O', 'O', 'O', 'O', 'O']
