[![tetis](https://www.umr-tetis.fr/images/logo-header-tetis.png)](https://www.umr-tetis.fr/index.php)

# 1. Prepare the model
## 1.1 Load the model from Hugging Face

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Hugging Face Hub identifier of the fine-tuned checkpoint. It is defined once
# so the tokenizer and the model are always loaded from the same repository.
MODEL_NAME = "rdecoupes/tetis-geochallenge"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Token-classification model used below by the NER pipeline.
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

## 1.2 Create the pipeline

In [15]:
from transformers import pipeline

def _bilou_to_iob(label):
    """Rewrite a BILOU-style label as IOB: 'L-' becomes 'I-' and 'U-' becomes 'B-'."""
    return label.replace('L-', 'I-').replace('U-', 'B-')


# Build the NER pipeline with the "simple" aggregation strategy.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# The labels are converted from the BILOU format to IOB so that the pipeline
# can aggregate tokens into entities.
nlp.model.config.id2label = dict(
    (label_id, _bilou_to_iob(label))
    for label_id, label in nlp.model.config.id2label.items()
)

## 1.3 Define pre- and post-processing

In [2]:
def preprocessing(tweet_text):
    """Neutralise hashtag markers in a tweet before it is sent to the model.

    Every replacement swaps exactly one character for one character, so the
    returned string has the same length as the input and character offsets
    computed on it remain valid for the original text.

    Args:
        tweet_text: Raw tweet text. An empty string (or ``None``) is returned
            unchanged instead of raising.

    Returns:
        The text with a leading ``#`` replaced by ``'`` and every other ``#``
        replaced by a space.
    """
    if not tweet_text:
        # Nothing to clean; indexing the first character would fail here.
        return tweet_text

    if tweet_text.startswith("#"):
        # A leading '#' becomes "'" rather than a space: per the original
        # author's note, the tokenizer otherwise drops a character. The
        # post-processing step strips this "'" again.
        tweet_text = "'" + tweet_text[1:]

    return tweet_text.replace("#", " ")

In [3]:
def nlp_results_to_location_mentions(entities):
    """Convert NER pipeline entities into location-mention dictionaries.

    Two clean-up steps are applied:

    1. Consecutive entities where one ends exactly where the next starts are
       treated as sub-tokens the pipeline failed to aggregate and are merged
       into a single entity (words concatenated, ``end`` extended). The merged
       entity keeps the remaining fields of its first fragment.
    2. One leading space, one leading ``#`` and one leading ``'`` (the marker
       inserted by the pre-processing step) are stripped from the word, in
       that order. ``start`` is shifted by one for ``#`` and ``'`` but not for
       the space.

    The input list and its dictionaries are left untouched.

    Args:
        entities: List of dicts, each with at least the keys ``"word"``,
            ``"start"`` and ``"end"``.

    Returns:
        A list of ``{"text", "start_offset", "end_offset"}`` dicts, one per
        (merged) entity, in input order.

    Raises:
        KeyError: If an entity lacks ``"word"``, ``"start"`` or ``"end"``.
    """
    # Single left-to-right merge pass: compare each entity with the last kept
    # one, so chains of three or more adjacent fragments collapse as well.
    merged_entities = []
    for ent in entities:
        if merged_entities and merged_entities[-1]["end"] == ent["start"]:
            previous = merged_entities[-1]
            previous["word"] = previous["word"] + ent["word"]
            previous["end"] = ent["end"]
        else:
            # Shallow copy so that merging never alters the caller's dicts.
            merged_entities.append(dict(ent))

    list_location_mentions = []
    for ent in merged_entities:
        word = ent["word"]
        start = ent["start"]
        # The pipeline often puts a white space at the beginning of the word.
        if word.startswith(" "):
            word = word[1:]
        # Drop a leading '#' and move the start offset past it.
        if word.startswith("#"):
            word = word[1:]
            start += 1
        # Drop the "'" that pre-processing substitutes for a leading '#'.
        if word.startswith("'"):
            word = word[1:]
            start += 1
        list_location_mentions.append({
            "text": word,
            "start_offset": start,
            "end_offset": ent["end"],
        })
    return list_location_mentions

# 2. Use the pipeline

In [10]:
import pandas as pd

# Small hand-written sample of sentences, each mentioning at least one place.
example_list = [
    "My name is Sarah and I live in London",
    "My name is Wolfgang and I live in Berlin",
    "My name is Clara and I live in Berkeley, California.",
]

# One row per sentence, stored in a single "text" column.
df = pd.DataFrame({"text": example_list})



In [11]:
# Display the example sentences before any processing is applied.
df

Unnamed: 0,text
0,My name is Sarah and I live in London
1,My name is Wolfgang and I live in Berlin
2,"My name is Clara and I live in Berkeley, Calif..."


In [18]:
# Step 1: clean the hashtags; the "text" column is overwritten with the cleaned text.
cleaned_text = df["text"].apply(preprocessing)
df["text"] = cleaned_text

# Step 2: run the NER pipeline on each cleaned sentence.
raw_predictions = cleaned_text.apply(nlp)
df["predicted"] = raw_predictions

# Step 3: turn the pipeline entities into location mentions with offsets.
df["location_mentions"] = raw_predictions.apply(nlp_results_to_location_mentions)

In [19]:
# Display the frame again, now with the "predicted" and "location_mentions" columns.
df

Unnamed: 0,text,predicted,location_mentions
0,My name is Sarah and I live in London,"[{'entity_group': 'LOC', 'score': 0.9956161, '...","[{'text': 'London', 'start_offset': 31, 'end_o..."
1,My name is Wolfgang and I live in Berlin,"[{'entity_group': 'LOC', 'score': 0.9885152, '...","[{'text': 'Berlin', 'start_offset': 34, 'end_o..."
2,"My name is Clara and I live in Berkeley, Calif...","[{'entity_group': 'LOC', 'score': 0.93376523, ...","[{'text': 'Berkeley', 'start_offset': 31, 'end..."
