In [3]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [5]:
import pandas as pd
data = pd.read_csv('ner_dataset.csv', encoding= 'unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [6]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [7]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data.head()


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,8118,11
1,,of,IN,O,6796,11
2,,demonstrators,NNS,O,17713,11
3,,have,VBP,O,31568,11
4,,marched,VBN,O,3889,11


In [8]:
data_fillna = data.fillna(method='ffill', axis=0)

In [9]:
data_group = data_fillna.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  data_group = data_fillna.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))


In [10]:
data_group.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[8118, 6796, 17713, 31568, 3889, 32081, 7184, ...","[11, 11, 11, 11, 11, 11, 14, 11, 11, 11, 11, 1..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[19859, 27270, 17144, 11233, 1059, 16968, 1036...","[8, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[9849, 24712, 7958, 7028, 4609, 17587, 1827, 3...","[11, 11, 4, 11, 11, 11, 11, 11, 14, 11, 11, 11..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[29622, 32899, 3891, 18364, 12206, 21887, 2620...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[26873, 23319, 8945, 17189, 13669, 10404, 2735...","[14, 11, 11, 7, 9, 11, 4, 11, 14, 11, 8, 11, 8..."


In [11]:
def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [12]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [13]:
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [14]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  35179 
output_dim:  64 
input_length:  104 
n_tags:  17


In [15]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    #Optimiser 
    # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [16]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [17]:
def train_model(X, y, model):
    loss = list()
    for i in range(25):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [25]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = nlp(
'Jim bought 300 shares of Acme Corp. in 2006. And producing an annotated block of text that highlights the names of entities: [Jim]Person bought 300 shares of [Acme Corp.]Organization in [2006]Time. In this example, a person name consisting of one token, a two-token company name and a temporal expression have been detected and classified.State-of-the-art NER systems for English produce near-human performance. For example, the best system entering MUC-7 scored 93.39% of F-measure while human annotators scored 97.60% and 96.95%.[1][2]'
)
displacy.render(text, style = 'ent', jupyter=True)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [20]:
pip install -U spacy

Collecting spacyNote: you may need to restart the kernel to use updated packages.

  Using cached spacy-3.2.3-cp38-cp38-win_amd64.whl (11.6 MB)
Collecting thinc<8.1.0,>=8.0.12
  Using cached thinc-8.0.13-cp38-cp38-win_amd64.whl (1.0 MB)
Collecting preshed<3.1.0,>=3.0.2
  Using cached preshed-3.0.6-cp38-cp38-win_amd64.whl (113 kB)
Collecting typer<0.5.0,>=0.3.0
  Using cached typer-0.4.0-py3-none-any.whl (27 kB)
Collecting wasabi<1.1.0,>=0.8.1
  Using cached wasabi-0.9.0-py3-none-any.whl (25 kB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Using cached spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting blis<0.8.0,>=0.4.0
  Using cached blis-0.7.6-cp38-cp38-win_amd64.whl (6.6 MB)
Collecting tqdm<5.0.0,>=4.38.0
  Downloading tqdm-4.63.0-py2.py3-none-any.whl (76 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Using cached catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting srsly<3.0.0,>=2.4.1
  Using cached srsly-2.4.2-cp38-cp38-win_amd64.whl (452 kB)
Collecting cymem<2.1.0,>=2.0.2
  Using cac

In [22]:
python -m spacy download en_core_web_sm

SyntaxError: invalid syntax (Temp/ipykernel_159100/581980377.py, line 1)

In [4]:
import spacy.cli
spacy.cli.download("en_core_web_sm")


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [8]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = nlp(
'Jim bought 300 shares of Acme Corp. in 2006. And producing an annotated block of text that highlights the names of entities: [Jim]Person bought 300 shares of [Acme Corp.]Organization in [2006]Time. In this example, a person name consisting of one token, a two-token company name and a temporal expression have been detected and classified.State-of-the-art NER systems for English produce near-human performance. For example, the best system entering MUC-7 scored 93.39% of F-measure while human annotators scored 97.60% and 96.95%.[1][2]'
)
displacy.render(text, style = 'ent', jupyter=True)

In [6]:
import spacy.cli

In [1]:
conda create -n spacy python=3.6 anaconda 


Note: you may need to restart the kernel to use updated packages.


'conda' is not recognized as an internal or external command,
operable program or batch file.


In [1]:
pip install conda

Collecting condaNote: you may need to restart the kernel to use updated packages.


    ERROR: Command errored out with exit status 1:


  Using cached conda-4.3.16.tar.gz (299 kB)
Collecting pycosat>=0.6.1
  Using cached pycosat-0.6.3.zip (66 kB)
Collecting ruamel.yaml>=0.11.14
  Using cached ruamel.yaml-0.17.21-py3-none-any.whl (109 kB)
Collecting conda
  Using cached conda-4.3.13.tar.gz (370 kB)
  Using cached conda-4.2.7.tar.gz (235 kB)
  Downloading conda-4.2.6.tar.gz (235 kB)
  Downloading conda-4.1.6.tar.gz (144 kB)
Building wheels for collected packages: conda, pycosat
  Building wheel for conda (setup.py): started
  Building wheel for conda (setup.py): finished with status 'error'
  Running setup.py clean for conda
  Building wheel for pycosat (setup.py): started
  Building wheel for pycosat (setup.py): finished with status 'done'
  Created wheel for pycosat: filename=pycosat-0.6.3-cp38-cp38-win_amd64.whl size=42375 sha256=4bb0bd460045e80de7b30e69cea3daa1ab0016696520c0364138b7a7c62b19df
  Stored in directory: c:\users\lenovo\appdata\local\pip\cache\wheels\71\5b\2d\0a9247760f0f008abc0eae7c0127a8c1cddfe1145e4b17


     command: 'C:\Users\Lenovo\.conda\envs\tensorflow-sessions\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Lenovo\\AppData\\Local\\Temp\\pip-install-hl9j28rq\\conda_95e5d8450674415cba97772ca6579ba2\\setup.py'"'"'; __file__='"'"'C:\\Users\\Lenovo\\AppData\\Local\\Temp\\pip-install-hl9j28rq\\conda_95e5d8450674415cba97772ca6579ba2\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\Lenovo\AppData\Local\Temp\pip-pip-egg-info-a0ffj2xf'
         cwd: C:\Users\Lenovo\AppData\Local\Temp\pip-install-hl9j28rq\conda_95e5d8450674415cba97772ca6579ba2\
    Complete output (5 lines):
    Traceback (most recent call last):
      File "<string>", line 1, in <module>
      File "C:\Users\Lenovo\AppData\Local\Tem

In [3]:
conda create -n spacy python=3.10 anaconda 


Note: you may need to restart the kernel to use updated packages.


Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\Scripts\conda-script.py", line 11, in <module>
    from conda.cli import main
ModuleNotFoundError: No module named 'conda'


In [1]:
conda create  ner


Note: you may need to restart the kernel to use updated packages.


Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\Scripts\conda-script.py", line 11, in <module>
    from conda.cli import main
ModuleNotFoundError: No module named 'conda'


In [2]:
 conda create --name newEnv python=3.5
    

SyntaxError: invalid syntax (Temp/ipykernel_29344/4169327782.py, line 1)

In [4]:
 conda create --name newEnv 


Note: you may need to restart the kernel to use updated packages.


Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\Scripts\conda-script.py", line 11, in <module>
    from conda.cli import main
ModuleNotFoundError: No module named 'conda'


In [None]:
python -m venv venv