# NER Using sklearn and CRF

In [3]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn-crfsuite-0.3.6


In [9]:
import sklearn_crfsuite
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
data = [
    [("Apple", "ORG"), ("is", "O"), ("a", "O"), ("great", "O"), ("company", "O")],
    [("I", "O"), ("like", "O"), ("to", "O"), ("eat", "O"), ("bananas", "PRODUCT")],
]
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
        'word[-3:]': word[-3:],
    }
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for word, label in sent]

data = [[(word, label) for word, label in sentence] for sentence in data]
X = [sent2features(s) for s in data]
y = [sent2labels(s) for s in data]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=1.0,
    c2=1e-3,
    max_iterations=50,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_test)

y_test_flat = [label for labels in y_test for label in labels]
y_pred_flat = [label for labels in y_pred for label in labels]

report = classification_report(y_test_flat, y_pred_flat)
print(report)


              precision    recall  f1-score   support

           O       0.80      1.00      0.89         4
     PRODUCT       0.00      0.00      0.00         1

    accuracy                           0.80         5
   macro avg       0.40      0.50      0.44         5
weighted avg       0.64      0.80      0.71         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
X

[[{'word': 'Apple',
   'is_upper': False,
   'is_title': True,
   'is_digit': False,
   'word[-3:]': 'ple'},
  {'word': 'is',
   'is_upper': False,
   'is_title': False,
   'is_digit': False,
   'word[-3:]': 'is'},
  {'word': 'a',
   'is_upper': False,
   'is_title': False,
   'is_digit': False,
   'word[-3:]': 'a'},
  {'word': 'great',
   'is_upper': False,
   'is_title': False,
   'is_digit': False,
   'word[-3:]': 'eat'},
  {'word': 'company',
   'is_upper': False,
   'is_title': False,
   'is_digit': False,
   'word[-3:]': 'any'}],
 [{'word': 'I',
   'is_upper': True,
   'is_title': True,
   'is_digit': False,
   'word[-3:]': 'I'},
  {'word': 'like',
   'is_upper': False,
   'is_title': False,
   'is_digit': False,
   'word[-3:]': 'ike'},
  {'word': 'to',
   'is_upper': False,
   'is_title': False,
   'is_digit': False,
   'word[-3:]': 'to'},
  {'word': 'eat',
   'is_upper': False,
   'is_title': False,
   'is_digit': False,
   'word[-3:]': 'eat'},
  {'word': 'bananas',
   'is_uppe

#NER using Spacy

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

2023-10-19 01:01:19.350832: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from pprint import pprint
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')

sentence = '''Prime Minister Jacinda Ardern has claimed that New Zealand had won a big
battle over the spread of coronavirus. Her words came as the country begins to exit from its lockdown.'''
entities= nlp(sentence)
pprint([ (X, X.ent_iob_, X.ent_type_) for X in entities] )

print("Named entities in this text are\n")
for ent in entities.ents:
    print(ent.text,ent.label_)

displacy.render(entities, style='ent', jupyter=True)

[(Prime, 'O', ''),
 (Minister, 'O', ''),
 (Jacinda, 'B', 'PERSON'),
 (Ardern, 'I', 'PERSON'),
 (has, 'O', ''),
 (claimed, 'O', ''),
 (that, 'O', ''),
 (New, 'B', 'GPE'),
 (Zealand, 'I', 'GPE'),
 (had, 'O', ''),
 (won, 'O', ''),
 (a, 'O', ''),
 (big, 'O', ''),
 (
, 'O', ''),
 (battle, 'O', ''),
 (over, 'O', ''),
 (the, 'O', ''),
 (spread, 'O', ''),
 (of, 'O', ''),
 (coronavirus, 'O', ''),
 (., 'O', ''),
 (Her, 'O', ''),
 (words, 'O', ''),
 (came, 'O', ''),
 (as, 'O', ''),
 (the, 'O', ''),
 (country, 'O', ''),
 (begins, 'O', ''),
 (to, 'O', ''),
 (exit, 'O', ''),
 (from, 'O', ''),
 (its, 'O', ''),
 (lockdown, 'O', ''),
 (., 'O', '')]
Named entities in this text are

Jacinda Ardern PERSON
New Zealand GPE


# Task
Go through the tutorial and run  the code
  https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2
  https://www.kaggle.com/code/abhinavwalia95/how-to-loading-and-fitting-dataset-to-scikit/input

1. Run the same model with different dataset and submit your results.
2. Use the same model i.e. CRF but predict POS tags.
