Train Spacy Model with Magic the Gathering Named Entity Recognition

In [1]:
# All imports
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import spacy
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans
import os.path
import re
import requests
import json

In [2]:
# Read card data from csv
df = pd.read_csv("DataCleansed.csv",delimiter="|", na_filter = False) #to not treat N/A as Nan

In [3]:
df.head()

Unnamed: 0,Color,Subtype,Text,Type,Loyalty,Power,Toughness,Keyword,Name,ManaValue
0,['White'],['Plains'],({T}: Add {W}.),['Land'],,,,['N/A'],Plains,0.0
1,['Blue'],['Island'],({T}: Add {U}.),['Land'],,,,['N/A'],Island,0.0
2,['Black'],['Swamp'],({T}: Add {B}.),['Land'],,,,['N/A'],Swamp,0.0
3,['Red'],['Mountain'],({T}: Add {R}.),['Land'],,,,['N/A'],Mountain,0.0
4,['Green'],['Forest'],({T}: Add {G}.),['Land'],,,,['N/A'],Forest,0.0


1) Get a list of each class of named entity in MTG

In [4]:
# CARD SUBTYPES
subtypes_from_df = list(df['Subtype'].unique())

In [5]:
# CARD TYPES
types_from_df = list(df['Type'].unique())

In [6]:
def remove_symbols(mylist,symbol):
    '''
    Remove all instances of symbol1 and symbol2 from all elements of a list.
            
            Parameters:
                    mylist (list): List of strings with symbols to be removed.

            Returns: 
                    no_curly (list): List of strings without symbols1 and 2.
    '''
    no_symbol = []
    for l in mylist:
        l = l.replace(symbol,'')
        no_symbol.append(l)
    return no_symbol

In [7]:
# Remove [] and ''
to_remove = ['[',']',"'"]

for s in to_remove:
    types_from_df = remove_symbols(types_from_df,s)
    subtypes_from_df = remove_symbols(subtypes_from_df,s)

In [8]:
# FIELD ZONES
zones = ["Library","Hand","Battlefield","Graveyard","Stack","Exile","Command","Ante"]

# ABILITIES
abilities = ["Battalion", "Commander", "Bloodrush", "Channel", "Chroma", "Cohort", "Constellation", "Converge", "Delirium", "Domain", "Fateful hour", "Ferocious", "Formidable", "Grandeur", "Hellbent", "Heroic", "Imprint", "Inspired", "Join forces", "Kinship", "Landfall", "Lieutenant", "Metalcraft", "Morbid", "Parley", "Radiance", "Raid", "Rally", "Spell mastery", "Strive", "Sweep", "Tempting offer", "Threshold", "Will of the council", "Adamant", "Addendum", "Council's dilemma", "Eminence", "Enrage", "Hero's Reward", "Kinfall", "Landship", "Legacy", "Revolt", "Underdog", "Undergrowth", "Magecraft", "Teamwork", "Pack tactics", "Coven", "Alliance"]

# KEY ABILITIES
keyAbilities = ["Living weapon", "Jump-start", "Basic landcycling", "Commander ninjutsu", "Legendary landwalk", "Nonbasic landwalk", "Totem armor", "Megamorph", "Haunt", "Forecast", "Graft", "Fortify", "Frenzy", "Gravestorm", "Hideaway", "Level Up", "Infect", "Reach", "Rampage", "Phasing", "Multikicker", "Morph", "Provoke", "Modular", "Ninjutsu", "Replicate", "Recover", "Poisonous", "Prowl", "Reinforce", "Persist", "Retrace", "Rebound", "Miracle", "Overload", "Outlast", "Prowess", "Renown", "Myriad", "Shroud", "Trample", "Vigilance", "Shadow", "Storm", "Soulshift", "Splice", "Transmute", "Ripple", "Suspend", "Vanishing", "Transfigure", "Wither", "Unearth", "Undying", "Soulbond", "Unleash", "Ascend", "Assist", "Afterlife", "Companion", "Fabricate", "Embalm", "Escape", "Fuse", "Menace", "Ingest", "Melee", "Improvise", "Mentor", "Partner", "Mutate", "Scavenge", "Tribute", "Surge", "Skulk", "Undaunted", "Riot", "Spectacle", "Forestwalk", "Islandwalk", "Mountainwalk", "Double strike", "Cumulative upkeep", "First strike", "Encore", "Sunburst", "Deathtouch", "Defender", "Foretell", "Amplify", "Affinity", "Bushido", "Convoke", "Bloodthirst", "Absorb", "Aura Swap", "Changeling", "Conspire", "Cascade", "Annihilator", "Battle Cry", "Cipher", "Bestow", "Dash", "Awaken", "Crew", "Aftermath", "Afflict", "Flanking", "Echo", "Fading", "Fear", "Eternalize", "Entwine", "Epic", "Dredge", "Delve", "Evoke", "Exalted", "Evolve", "Extort", "Dethrone", "Exploit", "Devoid", "Emerge", "Escalate", "Flying", "Haste", "Hexproof", "Indestructible", "Intimidate", "Lifelink", "Horsemanship", "Kicker", "Madness", "Hidden agenda", "Swampwalk", "Desertwalk", "Wizardcycling", "Slivercycling", "Cycling", "Landwalk", "Plainswalk", "Champion", "Enchant", "Plainscycling", "Islandcycling", "Swampcycling", "Mountaincycling", "Forestcycling", "Landcycling", "Typecycling", "Split second", "Flash", "Banding", "Augment", "Double agenda", "Partner with", "Hexproof from", "Boast", "Buyback", "Ward", "Demonstrate", "Devour", "Flashback", "Equip", "Reconfigure", "Compleated", "Daybound", "Nightbound", "Decayed", "Disturb", "Training", "Cleave", "Intensity", "Blitz", "Casualty", "Friends forever", "Protection", "Offering", "Enlist", "Read Ahead"]

# KEY ACTIONS
keyActions = ["Meld", "Bolster", "Clash", "Fateseal", "Manifest", "Monstrosity", "Populate", "Proliferate", "Scry", "Support", "Detain", "Explore", "Fight", "Amass", "Adapt", "Assemble", "Abandon", "Activate", "Attach", "Seek", "Cast", "Counter", "Create", "Destroy", "Discard", "Double", "Exchange", "Exile", "Investigate", "Play", "Regenerate", "Reveal", "Sacrifice", "Set in motion", "Shuffle", "Tap", "Untap", "Vote", "Transform", "Surveil", "Goad", "Planeswalk", "Mill", "Learn", "Conjure", "Exert", "Connive", "Venture into the dungeon"]

# COLORS
colors = ['White', 'Blue', 'Black', 'Red', 'Green', 'Colorless']

# KEY ACTION SYMBOLS
keyAction_symbols = ['{E}','{A}','{T}','{Q}']

In [9]:
def unpack_listoflists(oldlist):
    '''
    Some items in the subtypes and types list have more than 1 type/subtype (ex:  ['Artifact, Creature'] instead of ['Artifact','Creature']). 
    This function goes through the list and unpacks those entries, then removes all the duplicates.

            Parameters:
                    oldlist (list): List we need to unpack.

            Returns:
                    newlist (list): Unpacked list with only the unique values.
    '''
    newlist = []
    for element in oldlist:
        if ',' in element:
            sublist = element.split(', ')
            for subelement in sublist:
                if subelement not in newlist:
                    newlist.append(subelement)
        else:
            if element not in newlist:
                newlist.append(element)
    return newlist

# Apply function:
cardTypes = unpack_listoflists(types_from_df)
cardSubtypes = unpack_listoflists(subtypes_from_df)

In [10]:
# Adding some types/plurals I noticed were missing in cardTypes:
cardTypes.append('Token')
cardTypes.append('Tokens')
cardTypes.append('Lands')
cardTypes.append('Planeswalkers')
cardTypes.append('Creatures')
cardTypes.append('Enchantments')
cardTypes.append('Instants')
cardTypes.append('Sorceries')
cardTypes.append('Artifacts')

In [11]:
get_symbols = requests.get('https://api.scryfall.com/symbology')
symbol_dict = json.loads(get_symbols.text)

In [12]:
# Extracting only the symbols and explanation from the dictionary
symbols = []
for s in symbol_dict["data"]:
    symbols.append((s["symbol"],s["english"]))

In [13]:
# Extracting only mana-related symbols from symbols list
symbols_mana = []
for idx in range(len(symbols)):
    if 'mana' in symbols[idx][1]:
        symbols_mana.append(symbols[idx][0])

In [14]:
# Remove the curly brackets (or else regex will fail later on)
to_remove = ['{','}']

for s in to_remove:
    symbols_mana = remove_symbols(symbols_mana,s)
    keyAction_symbols = remove_symbols(keyAction_symbols,s)

In [15]:
# Dictionary with all the classes and the lists of entities per class

dict_classes = {
    0: {'class' : 'KEYACTION',
    'list' : keyActions},
    1: {'class' : 'KEYABILITY',
    'list' : keyAbilities},
    2: {'class' : 'ABILITY',
    'list' : abilities},
    3: {'class' : 'CARDCOLOR',
    'list' : colors},
    4: {'class' : 'CARDTYPE',
    'list' : cardTypes},
    5: {'class' : 'CARDSUBTYPE',
    'list' : cardSubtypes},
    6: {'class' : 'MANACOST',
    'list' : symbols_mana}, 
    7: {'class' : 'ZONE',
    'list' : zones},
    8: {'class' : 'KEYACTIONSYMBOL',
    'list' : keyAction_symbols}
}

2) Get/prepare card text data

In [16]:
# Shuffle and return a fraction of the dataframe (85%) to train the NER model
df_text = df[['Text']].sample(frac=0.85).reset_index(drop=True)

In [17]:
df_text

Unnamed: 0,Text
0,Green spells you cast cost {1} less to cast.
1,Untap target creature. Prevent all damage that...
2,Incandescent Aria deals 3 damage to each nonto...
3,Fiery Impulse deals 2 damage to target creatur...
4,"When Omnath, Locus of Creation enters the batt..."
...,...
20330,All creatures get -2/-2 until end of turn. If ...
20331,Exalted (Whenever a creature you control attac...
20332,
20333,You may look at the top card of your library a...


In [18]:
def find_entities(df,col_search,entity_list,col_entities):
    '''
    For each card text, search all entities inside the text and their index positions, then add this data as new columns in a dataframe.
            
            Parameters:
                    df (pandas dataframe): Dataframe where the information will be saved in the end.
                    col_search (str): Column of df where the text data is. The function will then look for entities in this text data.
                    col_entities (list): All the entity classes (labels).
                    entity_list (list): All the individual named entities in an entity class.
    '''
    for text,index in zip(df[col_search],range(len(df[col_search]))):
        entities_found = ''
        start_idx = ''
        end_idx = ''
        for word in entity_list:
            if col_entities == 'MANACOST' or col_entities == 'KEYACTIONSYMBOL': # search inside curly brackets {}
                if re.findall(r'\{'+word+'\}', text, re.IGNORECASE):
                    for match in re.finditer(r'\{'+word+'\}', text, re.IGNORECASE):
                        if entities_found == '':
                            entities_found = '{'+word+'}'
                            start = str(match.start()) 
                            end = str(match.end()) 
                            start_idx = start
                            end_idx = end
                        else:
                            entities_found = entities_found + '|' + '{'+word+'}'
                            start = str(match.start()) 
                            end = str(match.end()) 
                            start_idx = start_idx + '|' + start
                            end_idx = end_idx + '|' + end
            else:
                if re.findall(r"\b"+word+r"\b", text, re.IGNORECASE): # Else match word exactly
                    for match in re.finditer(r"\b"+word+r"\b", text, re.IGNORECASE):
                        if entities_found == '':
                            entities_found = word
                            start = str(match.start())
                            end = str(match.end())
                            start_idx = start
                            end_idx = end
                        else:
                            entities_found = entities_found + '|' + word
                            start = str(match.start())
                            end = str(match.end())
                            start_idx = start_idx + '|' + start
                            end_idx = end_idx + '|' + end
        
        # Create a new df column with the entities and index positions found
        df.loc[index,col_entities] = entities_found
        df.loc[index,col_entities+'_start'] = start_idx
        df.loc[index,col_entities+'_end'] = end_idx

In [19]:
# Run function to find_entities() for EACH entity class in the dict_classes dictionary
for key in range(len(dict_classes)):
    find_entities(df_text,'Text', dict_classes[key]['list'], dict_classes[key]['class'])

In [20]:
df_text[0:10]

Unnamed: 0,Text,KEYACTION,KEYACTION_start,KEYACTION_end,KEYABILITY,KEYABILITY_start,KEYABILITY_end,ABILITY,ABILITY_start,ABILITY_end,...,CARDSUBTYPE_end,MANACOST,MANACOST_start,MANACOST_end,ZONE,ZONE_start,ZONE_end,KEYACTIONSYMBOL,KEYACTIONSYMBOL_start,KEYACTIONSYMBOL_end
0,Green spells you cast cost {1} less to cast.,Cast|Cast,17|39,21|43,,,,,,,...,,{1},27,30,,,,,,
1,Untap target creature. Prevent all damage that...,Discard|Untap,97|0,104|5,Cycling,79,86,,,,...,,{2}|{2},87|92,90|95,,,,,,
2,Incandescent Aria deals 3 damage to each nonto...,,,,,,,,,,...,,,,,,,,,,
3,Fiery Impulse deals 2 damage to target creatur...,,,,,,,Spell mastery,49.0,62.0,...,,,,,Graveyard,127,136,,,
4,"When Omnath, Locus of Creation enters the batt...",,,,,,,Landfall,68.0,76.0,...,18.0,{W}|{U}|{R}|{G},252|255|246|249,255|258|249|252,Battlefield|Battlefield,42|106,53|117,,,
5,Enchant creature\nWhen Krovikan Fetish enters ...,,,,Enchant,0,7,,,,...,,,,,Battlefield,49,60,,,
6,"Haste\nWhenever Bomat Courier attacks, exile t...",Discard|Exile|Sacrifice,113|38|132,120|43|141,Haste,0,5,,,,...,,{R},108,111,Library|Hand|Exile,65|126|38,72|130|43,,,
7,"Flying\nWhen Dorothea, Vengeful Victim attacks...",Cast|Sacrifice,115|57,119|66,Flying|Disturb|Disturb,0|88|170,6|95|177,,,,...,,{1}|{W}|{U},96|99|102,99|102|105,Graveyard,140,149,,,
8,Vigilance,,,,Vigilance,0,9,,,,...,,,,,,,,,,
9,"Flying\n{2}, Return a land you control to its ...",,,,Flying,0,6,,,,...,,{2},7,10,Hand,53,57,,,


3. Create spacy file with training data

In [21]:
# FUNCTION: for each card text, annotate all entities found, the class they belong to and their start/end idxs

def create_NER_trainingData(df,col_search):
    '''
    For each card text, criate an annotation with data about the entities found, the class they belong to and their start/end indexes.
    Then, store this data inside a dictionary.
            
            Parameters:
                    df (pandas dataframe): Dataframe where the information will be saved in the end.
                    col_search (str): Column of df where the text data is. The function will then look for entities in this text data.

            Returns:
                    dictionary (dict)
    '''

    dictionary = {
    'classes' : ['KEYACTION', "KEYABILITY", "ABILITY", 'CARDCOLOR', 'CARDTYPE', 'CARDSUBTYPE', 'MANACOST', 'ZONE', 'KEYACTIONSYMBOL'], 
    'annotations' : []
    }

    # Text (examples)
    for text, index in zip(df[col_search],range(len(df[col_search]))):
        temp_dict = {}
        temp_dict['text'] = text
        temp_dict['entities'] = []

        # Entities/classes found in text (annotations)
        for key in range(len(dictionary['classes'])):
            ent_class = dictionary['classes'][key] 
            matchlist = df.loc[index,ent_class].split('|') # string w/ all matches

            if matchlist != ['']: # if there is at least 1 match
                startlist = list(map(int,df.loc[index,ent_class+'_start'].split('|'))) # string w/ all start idxs
                endlist = list(map(int,df.loc[index,ent_class+'_end'].split('|'))) # string w/ all end idxs

                for idx in range(0,len(matchlist)):
                    label = ent_class
                    start = startlist[idx]
                    end = endlist[idx]
                    temp_dict['entities'].append((start, end, label))
                
        dictionary['annotations'].append(temp_dict)
        
    return dictionary

In [22]:
# Get the text+annotation dictionary
NER_training_data = create_NER_trainingData(df_text,'Text')

For more details, check: https://newscatcherapi.com/blog/train-custom-named-entity-recognition-ner-model-with-spacy-v3

In [23]:
# Load new and empty spacy model
nlp = spacy.blank("en")

# Create a DocBin object
doc_bin = DocBin() 

for training_example  in tqdm(NER_training_data['annotations']): 
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

doc_bin.to_disk("mtg_NER_training_data.spacy") # save the docbin object

  8%|▊         | 1676/20335 [00:00<00:03, 5730.92it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

 18%|█▊        | 3621/20335 [00:00<00:02, 6290.89it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

 28%|██▊       | 5664/20335 [00:00<00:02, 6652.22it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

 38%|███▊      | 7782/20335 [00:01<00:01, 6927.44it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

 45%|████▌     | 9215/20335 [00:01<00:01, 7034.94it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

 56%|█████▌    | 11354/20335 [00:01<00:01, 7091.45it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 63%|██████▎   | 12773/20335 [00:01<00:01, 7071.64it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

 74%|███████▍  | 15007/20335 [00:02<00:00, 7300.65it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

 85%|████████▍ | 17216/20335 [00:02<00:00, 7274.79it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

 92%|█████████▏| 18670/20335 [00:02<00:00, 7253.11it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

100%|██████████| 20335/20335 [00:02<00:00, 6922.50it/s]


Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


4. Train the NER model using the TERMINAL

Go to the directory where the mtg_NER_training_data.spacy file is using the 'cd xxx' command

Create a base config on spaCy’s training quickstart page (https://spacy.io/usage/training#quickstart). Use the following configurations: Language/English; Components/ner; Hardware/CPU; Optimize for/Efficiency. Save as 'base_config.cfg'

This is an incomplete file with only our custom options, so we’ll have to fill in the rest with the default values: 'python -m spacy init fill-config base_config.cfg config.cfg'. This command with create a 'config.cfg' file in your directory.

Command to train our model:
'python -m spacy train config.cfg --output ./MTG_NER --paths.train ./mtg_NER_training_data.spacy --paths.dev ./mtg_NER_training_data.spacy'

5. Combine the NER model we just trained with the default en_core_web_sm library from spacy. 
(For more info: https://github.com/explosion/projects/tree/v3/tutorials/ner_double)

In [24]:
# Load existing spacy en library
nlp = spacy.load("en_core_web_sm") 

# Remove the deafult NER pipeline
nlp.remove_pipe("ner")

# Confirm
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']


In [26]:
# Load our MTG NER model
nlp_mtg = spacy.load("MTG_NER\model-best")

# Check pipelines
print(nlp_mtg.pipe_names)

['tok2vec', 'ner']


In [27]:
# Add the NER from our MTG model to the default en library from spacy

nlp_mtg.replace_listeners("tok2vec", "ner", ["model.tok2vec"])

nlp.add_pipe(
    "ner",
    name="ner_mtg",
    source=nlp_mtg
)

# Confirm 
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner_mtg']


In [28]:
# Test NER of our combined model

doc = nlp(list(df['Text'])[0])

spacy.displacy.render(doc, style="ent", jupyter=True)

In [29]:
doc = nlp(list(df['Text'])[10])

spacy.displacy.render(doc, style="ent", jupyter=True)

In [30]:
doc = nlp(list(df['Text'])[20])

spacy.displacy.render(doc, style="ent", jupyter=True)

In [31]:
# Save combined model
nlp.to_disk("./CombinedModel")