In [85]:
import spacy
from sklearn.model_selection import train_test_split

In [2]:
# Display the installed spaCy version so the environment used for this run is recorded.
spacy.__version__

'3.6.0'

In [4]:
from spacy.tokens import DocBin
import pandas as pd
import re

In [86]:
# Read the address spreadsheet into a DataFrame (path is relative to the working directory).
df = pd.read_excel("physical_france_generated.xlsx")

In [87]:
# Replace every missing cell with an empty string so no NaN values remain in the frame.
df = df.fillna('')

In [88]:
# frac=1 samples every row exactly once, i.e. a full shuffle; the fixed
# random_state keeps the row order reproducible between runs.
shuffled_data = df.sample(frac=1, random_state=42)  # Shuffle the dataset randomly

In [91]:
# Number of rows (80% of the shuffled data, rounded down) that go to the training split.
split_index = int(0.8 * len(shuffled_data))  # Calculate the index to split the dataset

8000

In [92]:
# Positional split of the shuffled rows: the first `split_index` rows train, the rest test.
df_train = shuffled_data.iloc[:split_index]  # Training data
df_test = shuffled_data.iloc[split_index:]  # Testing data

In [95]:
def massage_data(address):
    '''Normalise the punctuation of a raw address string.

    The substitutions run in this order:
      1. a comma that is not followed by whitespace becomes ", ";
      2. the two-character sequence backslash + "n" becomes ", "
         (a real newline character is left untouched);
      3. a hyphen that is not followed by whitespace becomes " - ";
      4. every period is removed.

    Returns the cleaned address string.
    '''
    substitutions = (
        (r'(,)(?!\s)', ', '),
        (r'(\\n)', ', '),
        # The leading lookahead inspects the hyphen itself, so it never blocks a match;
        # only the trailing lookahead (no whitespace after the hyphen) is effective.
        (r'(?!\s)(-)(?!\s)', ' - '),
        (r'\.', ''),
    )
    cleansed = address
    for pattern, replacement in substitutions:
        cleansed = re.sub(pattern, replacement, cleansed)
    return cleansed

In [96]:
def get_address_span(address=None,address_component=None,label=None):
    '''Search for an address component inside an address and return its character span.

    The component is normalised the same way massage_data normalises the address
    (periods removed, a hyphen not followed by whitespace turned into " - ") and is
    then searched as a literal, word-boundary-delimited string.

    Parameters:
        address: the (already cleaned) address string to search in.
        address_component: the component text to look for. None, NaN, the strings
            "nan"/"NaN" and empty or whitespace-only strings are treated as missing.
        label: the entity label to attach to the span.

    Returns:
        (start, end, label) with `end` exclusive for the first match, or None when
        the component is missing or does not occur in the address.

    Eg: get_address_span(address="221 B, Baker Street, London",address_component="221",label="BUILDING_NO")
    returns (0, 3, "BUILDING_NO").
    '''
    if address_component is None or pd.isna(address_component):
        return None
    if str(address_component) in ('nan', 'NaN'):
        return None

    # Keep these two substitutions in sync with massage_data, otherwise the component
    # no longer matches the cleaned address text.
    without_dots = re.sub(r'\.', '', address_component)
    normalized = re.sub(r'(?!\s)(-)(?!\s)', ' - ', without_dots)

    # An empty component would build the pattern \b(?:)\b, which matches a
    # zero-length span at the first word boundary instead of reporting "not found".
    if not normalized.strip():
        return None

    match = re.search(r'\b(?:' + re.escape(normalized) + r')\b', address)
    if match is None:
        return None
    return (match.start(), match.end(), label)

In [97]:
def extend_list(entity_list, entity):
    '''Add `entity` to `entity_list` in place and return the same list object.

    A list is merged element by element, a missing value (as judged by pd.isna)
    adds nothing, and any other value is appended as a single element.
    '''
    if isinstance(entity, list):
        additions = entity
    elif pd.isna(entity):
        additions = []
    else:
        additions = [entity]
    entity_list.extend(additions)
    return entity_list

In [98]:
def create_entity_spans(df,tag_list):
    '''Create entity spans for training/test datasets.

    Parameters:
        df: DataFrame with an 'Address' column plus the component columns
            'Subdivision', 'building_name', 'sub_building_types', 'number',
            'Street', 'postcode' and 'city'. The frame is modified in place:
            'Address' is overwritten with its cleaned form and the tag columns,
            'EmptySpan' and 'EntitySpans' are added.
        tag_list: names of the tag columns whose spans are collected, in order.
            'EmptySpan' may be listed; it always holds an empty list and so
            contributes nothing.

    Returns:
        The 'EntitySpans' column: one (address, [(start, end, label), ...])
        tuple per row.
    '''
    # (tag column to create, component column to read, entity label)
    component_tags = (
        ("SubdivisionTag", "Subdivision", "SUBDIVISION"),
        ("building_nameTag", "building_name", "BUILDING_NAME"),
        ("sub_building_types_Tag", "sub_building_types", "SUBBUILDING"),
        ("numberTag", "number", "NUMBER"),
        ("StreetTag", "Street", "STREET"),
        ("PostcodeTag", "postcode", "POSTCODE"),
        ("CityTag", "city", "CITY"),
    )

    df['Address'] = df['Address'].apply(massage_data)

    for tag_column, component_column, label in component_tags:
        spans = [
            get_address_span(address=address, address_component=component, label=label)
            for address, component in zip(df['Address'], df[component_column])
        ]
        # dtype=object keeps each tuple/None as a single cell value.
        df[tag_column] = pd.Series(spans, index=df.index, dtype=object)

    # Kept so callers that list 'EmptySpan' in tag_list keep working; each row gets
    # its own list, and it is never used as the accumulator (extending it with
    # itself is what used to duplicate every span).
    df['EmptySpan'] = pd.Series([[] for _ in range(len(df))], index=df.index, dtype=object)

    entity_spans = []
    for address, *tag_values in zip(df['Address'], *(df[tag] for tag in tag_list)):
        row_spans = []
        for tag_value in tag_values:
            extend_list(row_spans, tag_value)
        entity_spans.append((address, row_spans))

    df['EntitySpans'] = pd.Series(entity_spans, index=df.index, dtype=object)
    return df['EntitySpans']

In [99]:
# Tag columns whose spans are gathered for every address. "EmptySpan" is deliberately
# not listed: it is the per-row accumulator list, and listing it extends that list
# with itself, which duplicates every collected span.
tag_list=["SubdivisionTag","building_nameTag","sub_building_types_Tag","numberTag","StreetTag","PostcodeTag","CityTag"]
# Build (address, spans) pairs from a string-typed copy of the training rows
df_entity_spans = create_entity_spans(df_train.astype(str), tag_list)
training_data = df_entity_spans.tolist()

In [102]:
def get_doc_bin(training_data, nlp):
    '''Convert (text, annotations) pairs into a DocBin.

    Parameters:
        training_data: iterable of (text, annotations) pairs, where annotations
            is an iterable of (start, end, label) character offsets.
        nlp: callable that turns a text into a Doc (e.g. a spaCy pipeline).

    For every text the annotations are taken in order; one is dropped when it
    shares a character position with an annotation already accepted, or when
    doc.char_span returns None for its offsets. The surviving spans become
    doc.ents.

    Returns the DocBin holding one Doc per input pair.
    '''
    doc_bin = DocBin()
    for text, annotations in training_data:
        doc = nlp(text)
        accepted_spans = []
        occupied_chars = set()
        for start, end, label in annotations:
            char_range = range(start, end)
            # First-come-first-served: skip anything overlapping an accepted span.
            if not occupied_chars.isdisjoint(char_range):
                continue
            span = doc.char_span(start, end, label=label)
            if span is None:
                continue
            accepted_spans.append(span)
            occupied_chars.update(char_range)
        doc.ents = accepted_spans
        doc_bin.add(doc)
    return doc_bin

In [103]:
# Create a blank French pipeline. get_doc_bin calls it on each address text to
# build the Doc objects for the training/test sets.
nlp = spacy.blank("fr")

In [104]:
# Convert the training examples to a DocBin and write it to train.spacy
doc_bin_train = get_doc_bin(training_data, nlp)
doc_bin_train.to_disk("train.spacy")

In [105]:
###### Validation dataset prep ###########
# Build (address, spans) pairs from a string-typed copy of the held-out rows
df_entity_spans = create_entity_spans(df_test.astype(str), tag_list)
validation_data = df_entity_spans.tolist()

# Convert the validation examples to a DocBin and write it to test.spacy
doc_bin_test = get_doc_bin(validation_data, nlp)
doc_bin_test.to_disk("test.spacy")

In [82]:
# Load the pipeline saved under output/models/model-best. Forward slashes work on
# every platform and avoid the invalid "\m" escape sequences of the backslash form.
# NOTE(review): the directory must already exist (presumably written by a prior
# training run -- confirm); otherwise spaCy raises OSError E050.
nlp=spacy.load("output/models/model-best")

OSError: [E050] Can't find model 'output\models\model-best'. It doesn't seem to be a Python package or a valid path to a data directory.