In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

### DATASET

The Dataset can be downloaded here:
https://www.kaggle.com/code/abhinavwalia95/how-to-loading-and-fitting-dataset-to-scikit/input

In [2]:
ner_df = pd.read_csv('ner_dataset.csv.zip', encoding = "ISO-8859-1")
ner_df = ner_df[:100000]
ner_df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
ner_df.isnull().sum()

Sentence #    95456
Word              0
POS               0
Tag               0
dtype: int64

In [4]:
ner_df.ffill(inplace=True)
ner_df['Sentence #'].nunique(), ner_df.Word.nunique(), ner_df.Tag.nunique()

(4544, 10922, 17)

In [5]:
print(ner_df)

           Sentence #           Word  POS Tag
0         Sentence: 1      Thousands  NNS   O
1         Sentence: 1             of   IN   O
2         Sentence: 1  demonstrators  NNS   O
3         Sentence: 1           have  VBP   O
4         Sentence: 1        marched  VBN   O
...               ...            ...  ...  ..
99995  Sentence: 4543           some   DT   O
99996  Sentence: 4543      seriously   RB   O
99997  Sentence: 4543              .    .   O
99998  Sentence: 4544  Demonstrators  NNS   O
99999  Sentence: 4544       chanting  VBG   O

[100000 rows x 4 columns]


In [6]:
ner_df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-art,75
1,B-eve,53
2,B-geo,3303
3,B-gpe,1740
4,B-nat,30
5,B-org,1876
6,B-per,1668
7,B-tim,1823
8,I-art,43
9,I-eve,47


In [7]:
X = ner_df.drop('Tag', axis=1)
v = DictVectorizer(sparse=False)

X = v.fit_transform(X.to_dict('records'))
y = ner_df.Tag.values

In [8]:
classes = np.unique(y)
classes = classes.tolist()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)
X_train.shape, y_train.shape

((67000, 15507), (67000,))

In [10]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 11.53, NNZs: 113, Bias: -3.000000, T: 67000, Avg. loss: 0.001060
Total training time: 2.85 seconds.
-- Epoch 1
Norm: 8.43, NNZs: 57, Bias: -3.000000, T: 67000, Avg. loss: 0.000567
Total training time: 2.86 seconds.
-- Epoch 1
Norm: 68.07, NNZs: 2642, Bias: -4.000000, T: 67000, Avg. loss: 0.041776
Total training time: 2.90 seconds.
-- Epoch 1
Norm: 56.87, NNZs: 2044, Bias: -4.000000, T: 67000, Avg. loss: 0.034970
Total training time: 3.02 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done   4 out of  17 | elapsed:    6.2s remaining:   20.2s


Norm: 13.42, NNZs: 162, Bias: -4.000000, T: 67000, Avg. loss: 0.001642
Total training time: 3.19 seconds.
-- Epoch 1
Norm: 49.90, NNZs: 1337, Bias: -4.000000, T: 67000, Avg. loss: 0.015328
Total training time: 3.23 seconds.
-- Epoch 1
Norm: 44.41, NNZs: 1127, Bias: -4.000000, T: 67000, Avg. loss: 0.017164
Total training time: 3.26 seconds.
-- Epoch 1
Norm: 48.83, NNZs: 1578, Bias: -4.000000, T: 67000, Avg. loss: 0.022328
Total training time: 3.36 seconds.
-- Epoch 1


[Parallel(n_jobs=-1)]: Done   6 out of  17 | elapsed:    6.4s remaining:   11.7s
[Parallel(n_jobs=-1)]: Done   8 out of  17 | elapsed:    6.5s remaining:    7.3s


Norm: 10.44, NNZs: 106, Bias: -3.000000, T: 67000, Avg. loss: 0.001060
Total training time: 2.90 seconds.
-- Epoch 1
Norm: 11.45, NNZs: 96, Bias: -3.000000, T: 67000, Avg. loss: 0.000776
Total training time: 2.90 seconds.


[Parallel(n_jobs=-1)]: Done  10 out of  17 | elapsed:    8.9s remaining:    6.2s


Norm: 11.00, NNZs: 102, Bias: -3.000000, T: 67000, Avg. loss: 0.001209
Total training time: 2.97 seconds.
Norm: 35.13, NNZs: 803, Bias: -4.000000, T: 67000, Avg. loss: 0.011149
Total training time: 3.15 seconds.
Norm: 6.24, NNZs: 31, Bias: -3.000000, T: 67000, Avg. loss: 0.000209
Total training time: 2.93 seconds.


[Parallel(n_jobs=-1)]: Done  12 out of  17 | elapsed:    9.2s remaining:    3.8s


Norm: 30.53, NNZs: 672, Bias: -4.000000, T: 67000, Avg. loss: 0.012030
Total training time: 2.87 seconds.
Norm: 53.57, NNZs: 1703, Bias: -4.000000, T: 67000, Avg. loss: 0.026224
Total training time: 3.08 seconds.
Norm: 60.35, NNZs: 2091, Bias: -6.000000, T: 67000, Avg. loss: 0.026940
Total training time: 3.09 seconds.


[Parallel(n_jobs=-1)]: Done  14 out of  17 | elapsed:    9.4s remaining:    1.9s


Norm: 73.89, NNZs: 2851, Bias: 4.000000, T: 67000, Avg. loss: 0.048866
Total training time: 2.09 seconds.


[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:   11.0s finished


In [11]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim']

In [12]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        24
       B-eve       0.11      0.05      0.07        19
       B-geo       0.56      0.81      0.66      1085
       B-gpe       0.92      0.78      0.84       556
       B-nat       1.00      0.17      0.29        12
       B-org       0.39      0.52      0.44       589
       B-per       0.70      0.46      0.56       564
       B-tim       0.91      0.63      0.75       611
       I-art       0.00      0.00      0.00        12
       I-eve       0.67      0.22      0.33        18
       I-geo       0.75      0.42      0.54       230
       I-gpe       1.00      0.07      0.13        14
       I-nat       0.50      0.50      0.50         2
       I-org       0.48      0.50      0.49       445
       I-per       0.83      0.13      0.22       591
       I-tim       0.36      0.18      0.24       194

   micro avg       0.61      0.54      0.58      4966
   macro avg       0.57   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Linear Classifiers

In [13]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)

In [14]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        24
       B-eve       1.00      0.05      0.10        19
       B-geo       0.76      0.67      0.71      1085
       B-gpe       0.96      0.59      0.73       556
       B-nat       0.00      0.00      0.00        12
       B-org       0.68      0.37      0.48       589
       B-per       0.93      0.36      0.52       564
       B-tim       0.94      0.63      0.75       611
       I-art       0.00      0.00      0.00        12
       I-eve       1.00      0.11      0.20        18
       I-geo       0.82      0.33      0.47       230
       I-gpe       0.00      0.00      0.00        14
       I-nat       0.00      0.00      0.00         2
       I-org       0.39      0.68      0.50       445
       I-per       0.40      0.90      0.56       591
       I-tim       0.00      0.00      0.00       194

   micro avg       0.63      0.56      0.59      4966
   macro avg       0.49   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
