In [95]:
!pip install -q pyspellchecker
!python -m spacy download en_core_web_md en_core_web_sm -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [2]:
import spacy
from spacy import displacy
import pandas as pd

# Load the SpaCy English language model
nlp = spacy.load("en_core_web_sm")
nlp.analyze_pipes()

  from .autonotebook import tqdm as notebook_tqdm


{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att


#Spacy Features

| Name                   | Description                                                                                                   |
|-------------------------------|---------------------------------------------------------------------------------------------------------------|
| Tokenization                  | Segmenting text into words, punctuation marks, etc.                                                            |
| Part-of-speech (POS) Tagging  | Assigning word types to tokens, like verb or noun.                                                              |
| Dependency Parsing            | Assigning syntactic dependency labels, describing the relations between individual tokens, like subject or object. |
| Lemmatization                 | Assigning the base forms of words. For example, the lemma of "was" is "be", and the lemma of "rats" is "rat".  |
| Sentence Boundary Detection   | Finding and segmenting individual sentences.                                                                    |
| Named Entity Recognition (NER)| Labelling named "real-world" objects, like persons, companies, or locations.                                   |
| Entity Linking (EL)           | Disambiguating textual entities to unique identifiers in a knowledge base.                                    |
| Similarity                    | Comparing words, text spans, and documents and how similar they are to each other.                           |
| Text Classification           | Assigning categories or labels to a whole document or parts of a document.                                     |
| Rule-based Matching           | Finding sequences of tokens based on their texts and linguistic annotations, similar to regular expressions.    |
| Training                      | Updating and improving a statistical model's predictions.                                                     |
| Serialization                 | Saving objects to files or byte strings.                                                                       |


# Linguistic Features

## Stopword,Punctuation,Special character  Removal

In [97]:
text = "This is an (exmple) sentence for text preprocesing!, Try to remove 7 & 5."

# 1. Lowercasing
text = text.lower()

# 2. Tokenization
doc = nlp(text)
tokens = [token.text for token in doc]

# 3. Stopword Removal
stop_words = spacy.lang.en.stop_words.STOP_WORDS  # dtype=set()
stopword_filtered = [token for token in doc if not token.is_stop] 
stopword_filtered_sentence = ' '.join([token.text for token in stopword_filtered]) 

# 4. Punctuation Removal
punctuation_filtered = [token for token in doc if not token.is_punct]
punctuation_filtered_sentence = ' '.join([token.text for token in punctuation_filtered]) 

# 5. Special Character Removal
special_character_filtered = [token for token in doc if token.is_alpha]
special_character_filtered_sentence = ' '.join([token.text for token in special_character_filtered ]) 

print("Text : ",doc)
print("Tokens : ",tokens)
print("="*100)
print("Sentence with stopword removed : ",stopword_filtered_sentence)
print("="*100)
print("Sentence with punctuation removed : ",punctuation_filtered_sentence)
print("="*100)
print("Sentence with special character removed : ",special_character_filtered_sentence)

Text :  this is an (exmple) sentence for text preprocesing!, try to remove 7 & 5.
Tokens :  ['this', 'is', 'an', '(', 'exmple', ')', 'sentence', 'for', 'text', 'preprocesing', '!', ',', 'try', 'to', 'remove', '7', '&', '5', '.']
Sentence with stopword removed :  ( exmple ) sentence text preprocesing ! , try remove 7 & 5 .
Sentence with punctuation removed :  this is an exmple sentence for text preprocesing try to remove 7 5
Sentence with special character removed :  this is an exmple sentence for text preprocesing try to remove


## Syntax Analysis



    Tokenizer exception: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied.
    Prefix: Character(s) at the beginning, e.g. $, (, “, ¿.
    Suffix: Character(s) at the end, e.g. km, ), ”, !.
    Infix: Character(s) in between, e.g. -, --, /, ….



In [98]:

doc = nlp("Apple is looking at buying U.K (United Kingdom). startup for $1 billion")
data = []
for token in doc:
    data.append([token.text,token.prefix,token.suffix])

df = pd.DataFrame(data, columns=["Text", "Prefix", "Suffix"])
df

Unnamed: 0,Text,Prefix,Suffix
0,Apple,14862748245026736845,16838316573854262908
1,is,5097672513440128799,3411606890003347522
2,looking,2985121464356781022,7679303661980345986
3,at,11901859001352538922,11667289587015813222
4,buying,15598372446745583797,7679303661980345986
5,U.K,8804677687338025421,15717335729472840852
6,(,12638816674900267446,12638816674900267446
7,United,8804677687338025421,14814004474743358178
8,Kingdom,8656007278630577320,4659205314568820379
9,),3842344029291005339,3842344029291005339


## Dependency Parsing

In [120]:
text = "The cat is sitting on the mat."

doc = nlp(text)

# Perform dependency parsing
data = []
for token in doc:
    data.append([token.text,token.dep_,token.head.text,token.head.pos_])

df = pd.DataFrame(data,columns=["Text","Dependency_Label","Head_Word","POS_Head_Word"])
df

Unnamed: 0,Text,Dependency_Label,Head_Word,POS_Head_Word
0,The,det,cat,NOUN
1,cat,nsubj,sitting,VERB
2,is,aux,sitting,VERB
3,sitting,ROOT,sitting,VERB
4,on,prep,sitting,VERB
5,the,det,mat,NOUN
6,mat,pobj,on,ADP
7,.,punct,sitting,VERB


## Basic Spelling Correction

In [122]:
from spellchecker import SpellChecker

# 7. Spell Checking and Correction

text = "This is a exmplesss senteence from Text preoprocesing!"

# 1. Lowercasing
text = text.lower()

# 2. Tokenization
doc = nlp(text)
tokens = [token.text for token in doc]

spell = SpellChecker()

corrected_tokens = [spell.correction(token) for token in tokens]

# 8. Handling Contractions (using a simple mapping)
contractions = {
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "mightn't": "might not",
    "mustn't": "must not"
}

expanded_tokens = [contractions.get(token, token) for token in corrected_tokens]

# Print the results
print("Orginal Sentence : ",text)
print("Corrected Tokens:", corrected_tokens)
print("Expanded Tokens:", expanded_tokens)


Orginal Sentence :  this is a exmplesss senteence from text preoprocesing!
Corrected Tokens: ['this', 'is', 'a', None, 'sentence', 'from', 'text', None, '!']
Expanded Tokens: ['this', 'is', 'a', None, 'sentence', 'from', 'text', None, '!']


## Normalization

In [101]:

text = "I bought 5 apples and 3 oranges."

doc = nlp(text)

normalized_text = " ".join([token.text.lower() if token.is_alpha else token.text for token in doc])
print("Normalized text : ",normalized_text)

Normalized text :  i bought 5 apples and 3 oranges .


## POS Tagging & chunking



    Text: The original word text.
    Lemma: The base form of the word.
    POS: The simple UPOS part-of-speech tag.
    Tag: The detailed part-of-speech tag.
    Dep: Syntactic dependency, i.e. the relation between tokens.
    Shape: The word shape – capitalization, punctuation, digits.
    is alpha: Is the token an alpha character?
    is stop: Is the token part of a stop list, i.e. the most common words of the language?



### Interrogative,Declarative & Complex Sentence

In [102]:

# Interrogative,Declarative & Complex Sentence
text = "The quick brown fox jumps over the lazy dog."

doc = nlp(text)

data = []
for token in doc:
    data.append([
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop
    ])


df = pd.DataFrame(data, columns=["Text", "Lemma", "POS", "Tag", "Dependency", "Shape", "Is Alpha", "Is Stop"])
df

Unnamed: 0,Text,Lemma,POS,Tag,Dependency,Shape,Is Alpha,Is Stop
0,The,the,DET,DT,det,Xxx,True,True
1,quick,quick,ADJ,JJ,amod,xxxx,True,False
2,brown,brown,ADJ,JJ,amod,xxxx,True,False
3,fox,fox,NOUN,NN,nsubj,xxx,True,False
4,jumps,jump,VERB,VBZ,ROOT,xxxx,True,False
5,over,over,ADP,IN,prep,xxxx,True,True
6,the,the,DET,DT,det,xxx,True,True
7,lazy,lazy,ADJ,JJ,amod,xxxx,True,False
8,dog,dog,NOUN,NN,pobj,xxx,True,False
9,.,.,PUNCT,.,punct,.,False,False


  
    Text of the main/root token in the chunk  "chunk.root.text"
    Dependency label of the root token  "chunk.root.dep_"
    Text of the head word of the root token  "chunk.root.head.text"

### Extract Noun chunking

In [115]:


text = "The quick brown fox jumps over the lazy dog."

doc = nlp(text)

# Extract Noun chunking
data = []
for chunk in doc.noun_chunks:
    data.append([chunk.text,chunk.root.text,chunk.root.dep_,chunk.root.head.text])

df = pd.DataFrame(data,columns=["Text","Root_Token","Dependency_Label","Head_Word"])
df

Unnamed: 0,Text,Root_Token,Dependency_Label,Head_Word
0,The quick brown fox,fox,nsubj,jumps
1,the lazy dog,dog,pobj,over


### Extract Verb phrases

In [116]:
# Extract Verb phrases
verb_phrases = []
for token in doc:
    if token.pos_ == "VERB":
        verb_phrase = token.text
        for child in token.children:
            if child.dep_ in ["aux", "auxpass", "advmod", "prt"]:
                verb_phrase += " " + child.text
        verb_phrases.append(verb_phrase)

# Print the verb phrases
print("Verb Phrases:", verb_phrases)  


print("="*100)
print("-"*100)
print("="*100)

# Extract prepositional phrases
prepositional_phrases = []
for token in doc:
    if token.pos_ == "ADP":  # Check if token is a preposition
        prep_phrase = " ".join([tok.text for tok in token.subtree])
        prepositional_phrases.append(prep_phrase)

# Print the prepositional phrases
print("Prepositional Phrases:", prepositional_phrases)


Verb Phrases: ['jumps']
----------------------------------------------------------------------------------------------------
Prepositional Phrases: ['over the lazy dog']


## Named Entity Recognition



    Text: The original entity text.
    Start: Index of start of entity in the Doc.
    End: Index of end of entity in the Doc.
    Label: Entity label, i.e. type.



In [3]:

text = "Apple Inc. is planning to open a new store in New York City. This is a sample number (555) 555-5555."
# Adding custom Rule
ruler = nlp.add_pipe("entity_ruler")
#List of Entities and Patterns (source: https://spacy.io/usage/rule-based-matching)
patterns = [
                {"label": "PHONE_NUMBER", "pattern": [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
                {"ORTH": "-", "OP": "?"}, {"SHAPE": "dddd"}]}
            ]
#add patterns to ruler
ruler.add_patterns(patterns)

doc = nlp(text)
data = []
for ent in doc.ents:
  data.append([ent.text, ent.start_char, ent.end_char, ent.label_])

df = pd.DataFrame(data,columns=["Text", "Start_Char", "End_Char", "Label"])
df

Unnamed: 0,Text,Start_Char,End_Char,Label
0,Apple Inc.,0,10,ORG
1,New York City,46,59,GPE
2,555,86,89,CARDINAL
3,555,91,94,CARDINAL


In [4]:
displacy.render(doc, style='ent', jupyter=True, options={'distance': 90})

In [5]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

## Word-Vector & Similarity



    Text: The original token text.
    has vector: Does the token have a vector representation?
    Vector norm: The L2 norm of the token’s vector (the square root of the sum of the values squared)
    OOV: Out-of-vocabulary



In [108]:
text = "dog cat banana afskfsd"
doc = nlp(text)

data = []
for token in doc:
    data.append([
        token.text,
        token.has_vector,
        token.vector_norm,
        token.is_oov
    ])


df = pd.DataFrame(data, columns=["Text", "has_Vector", "Vector_Norm", "OOV"])
df


Unnamed: 0,Text,has_Vector,Vector_Norm,OOV
0,dog,True,6.814786,True
1,cat,True,7.370902,True
2,banana,True,7.64607,True
3,afskfsd,True,7.192256,True


In [109]:
nlp2 = spacy.load("en_core_web_md")

doc1 = nlp2("I like salty fries and hamburgers.")
doc2 = nlp2("Fast food tastes very good.")

# Similarity of two documents
print(doc1, "<->", doc2, doc1.similarity(doc2))
# Similarity of tokens and spans
french_fries = doc1[2:4]
burgers = doc1[5]
print(french_fries, "<->", burgers, french_fries.similarity(burgers))


I like salty fries and hamburgers. <-> Fast food tastes very good. 0.691649353055761
salty fries <-> hamburgers 0.6938489675521851


## Vocab, hashes and lexemes 



    Text: The original text of the lexeme.
    Orth: The hash value of the lexeme.
    Shape: The abstract word shape of the lexeme.
    Prefix: By default, the first letter of the word string.
    Suffix: By default, the last three letters of the word string.
    is alpha: Does the lexeme consist of alphabetic characters?
    is digit: Does the lexeme consist of digits?



In [110]:
text = "I love coffee"
doc = nlp(text)


data = []
for word in doc:
    lexeme = doc.vocab[word.text]
    data.append([
        lexeme.text,
        lexeme.orth,
        lexeme.shape_,
        lexeme.prefix_,
        lexeme.suffix_,
        lexeme.is_alpha, 
        lexeme.is_digit, 
        lexeme.is_title, 
        lexeme.lang_
    ])


df = pd.DataFrame(data, columns=["Text", "Orth", "Shape", "Prefix","Suffix","is_Alpha","is_Digit","is_Title","Lang"])
df


Unnamed: 0,Text,Orth,Shape,Prefix,Suffix,is_Alpha,is_Digit,is_Title,Lang
0,I,4690420944186131903,X,I,I,True,False,True,en
1,love,3702023516439754181,xxxx,l,ove,True,False,False,en
2,coffee,3197928453018144401,xxxx,c,fee,True,False,False,en


  
  
  However, hashes cannot be reversed and there’s no way to resolve 3197928453018144401 back to “coffee”. All spaCy can do is look it up in the vocabulary. That’s why you always need to make sure all objects you create have access to the same vocabulary. If they don’t, spaCy might not be able to find the strings it needs.

In [111]:
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = nlp("I love coffee")  # Original Doc
print(doc.vocab.strings["coffee"])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee' 👍

empty_doc = Doc(Vocab())  # New Doc with empty Vocab
# empty_doc.vocab.strings[3197928453018144401] will raise an error :(

empty_doc.vocab.strings.add("coffee")  # Add "coffee" and generate hash
print(empty_doc.vocab.strings[3197928453018144401])  # 'coffee' 👍

new_doc = Doc(doc.vocab)  # Create new doc with first doc's vocab
print(new_doc.vocab.strings[3197928453018144401])  # 'coffee' 👍

3197928453018144401
coffee
coffee
coffee


## Pipeline, Architecture,Serialization 

0


## Traning Custom NER

In [125]:
from google.colab import drive
drive.mount("mydrive")

Mounted at mydrive


In [157]:
!unzip /content/mydrive/MyDrive/NER_Data/custom_ner_spacy.zip

!python -m spacy download en_core_web_lg

Archive:  /content/mydrive/MyDrive/NER_Data/custom_ner_spacy.zip
replace Corona2.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [158]:
nlp = spacy.load("en_core_web_lg")

In [159]:

import json
# https://www.kaggle.com/datasets/finalepoch/medical-ner 
with open('/content/Corona2.json', 'r') as f:
    data = json.load(f)

data['examples'][0].keys()

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])

In [128]:
data['examples'][0]['content']

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [129]:
data['examples'][0]['annotations'][0]

{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
 'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1',
 'end': 371,
 'start': 360,
 'example_id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'tag_name': 'Medicine',
 'value': 'Diosmectite',
 'correct': None,
 'human_annotations': [{'timestamp': '2020-03-21T00:24:32.098000Z',
   'annotator_id': 1,
   'tagged_token_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
   'name': 'Ashpat123',
   'reason': 'exploration'}],
 'model_annotations': []}

In [160]:
training_data = []
for example in data['examples']:
  temp_dict = {}
  temp_dict['text'] = example['content']
  temp_dict['entities'] = []
  for annotation in example['annotations']:
    start = annotation['start']
    end = annotation['end']
    label = annotation['tag_name'].upper()
    temp_dict['entities'].append((start, end, label))
  training_data.append(temp_dict)

In [161]:
training_data[0]

{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'entities': [(360, 371, 'MEDICINE'),
  (383, 408, 'MEDICINE'),
  (104, 112, 'MEDICALCONDITION'),


In [162]:
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin()
     

In [163]:
from spacy.util import filter_spans

for training_example  in tqdm(training_data): 
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy") 

100%|██████████| 31/31 [00:00<00:00, 197.41it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





In [164]:
# https://spacy.io/usage/training#quickstart

!python -m spacy init fill-config base_config.cfg config.cfg # keep reproducible


[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Add output folder + 
Change in config.cfg

    [training]
    
    max_epochs = 10 # change here
   

In [165]:
!python -m spacy train config.cfg --output /content/ --paths.train ./train.spacy --paths.dev ./train.spacy 
     

[38;5;4mℹ Saving to output directory: /content[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-05-21 17:39:55,219] [INFO] Set up nlp object from config
[2023-05-21 17:39:55,254] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-05-21 17:39:55,261] [INFO] Created vocabulary
[2023-05-21 17:39:58,995] [INFO] Added vectors: en_core_web_lg
[2023-05-21 17:40:02,602] [INFO] Finished initializing nlp object
[2023-05-21 17:40:04,598] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    153.29    0.55    0.91    0.39    0.01
  7     200        499.25   3640.10   70.00   82.80   60.63    0.70
[38;5;2m✔ Saved pipeline to output directory[0m
/content/model-last


In [166]:
!zip -r custorm_ner_model.zip /content/model-best 

  adding: content/model-best/ (stored 0%)
  adding: content/model-best/tokenizer (deflated 81%)
  adding: content/model-best/vocab/ (stored 0%)
  adding: content/model-best/vocab/key2row (deflated 16%)
  adding: content/model-best/vocab/lookups.bin (stored 0%)
  adding: content/model-best/vocab/vectors (deflated 8%)
  adding: content/model-best/vocab/vectors.cfg (stored 0%)
  adding: content/model-best/vocab/strings.json (deflated 77%)
  adding: content/model-best/ner/ (stored 0%)
  adding: content/model-best/ner/cfg (deflated 33%)
  adding: content/model-best/ner/model (deflated 8%)
  adding: content/model-best/ner/moves (deflated 62%)
  adding: content/model-best/meta.json (deflated 57%)
  adding: content/model-best/config.cfg (deflated 60%)
  adding: content/model-best/tok2vec/ (stored 0%)
  adding: content/model-best/tok2vec/cfg (stored 0%)
  adding: content/model-best/tok2vec/model (deflated 8%)


In [167]:
import shutil
shutil.copy("/content/custorm_ner_model.zip","/content/mydrive/MyDrive/NER_Data/")

'/content/mydrive/MyDrive/NER_Data/custorm_ner_model.zip'

In [168]:
nlp_ner = spacy.load("model-best")

In [169]:


doc = nlp_ner("While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.")

colors = {"PATHOGEN": "#F67DE3", "MEDICINE": "#7DF6D9", "MEDICALCONDITION":"#a6e22d"}
options = {"colors": colors} 

spacy.displacy.render(doc, style="ent", options= options, jupyter=True)


In [None]:
#