In [None]:
import nltk
from nltk.corpus import conll2002
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score

# Make sure the CoNLL-2002 corpus is available locally before reading it
nltk.download('conll2002')

# Every sentence of the corpus as a sequence of IOB-annotated token rows
data = conll2002.iob_sents()

# Hold out 20% of the sentences for evaluation; the fixed seed keeps the
# split reproducible between runs
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Define features for CRF
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Indexable sequence of tokens. Each token is either a tuple-like
            row whose first element is the word (e.g. ``(word, pos, label)``)
            or a bare word string.
        i: Index of the token in ``sent``.

    Returns:
        A dict with the single feature ``'word'`` mapped to the token's word.
    """
    token = sent[i]
    # A bare string must be used whole: indexing it with [0] would silently
    # yield only its first character as the "word" feature.
    word = token if isinstance(token, str) else token[0]
    return {'word': word}

def sent2features(sent):
    """Return the per-token feature dicts for ``sent``, in token order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the label of every ``(token, pos, label)`` row in ``sent``."""
    labels = []
    # Unpack strictly into three fields so a malformed row still raises.
    for _token, _pos, tag in sent:
        labels.append(tag)
    return labels

# Turn each sentence into a list of feature dicts and a parallel list of labels
X_train = list(map(sent2features, train_data))
y_train = list(map(sent2labels, train_data))
X_test = list(map(sent2features, test_data))
y_test = list(map(sent2labels, test_data))

# Fit a CRF with its default settings on the training sentences
crf = CRF()
crf.fit(X_train, y_train)

# Tag the held-out sentences and report the weighted F1 over all tokens
y_pred = crf.predict(X_test)
f1_score = flat_f1_score(y_test, y_pred, average='weighted')
print(f'Weighted F1 Score: {f1_score}')

new_sentence = "hello navadiya"

# Use the NER model to predict named entities.
# The model was trained on features extracted from (word, pos, label) rows, so
# wrap each whitespace-separated word in a 1-tuple to give it the same token
# shape; this way the word itself, not a fragment of it, becomes the feature.
new_tokens = [(word,) for word in new_sentence.split()]
new_prediction = crf.predict([sent2features(new_tokens)])

print(f'Predicted Entities: {new_prediction[0]}')

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\HARSHIT\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!
