# camembert-ner: a model fine-tuned from CamemBERT for the NER task
https://huggingface.co/Jean-Baptiste/camembert-ner

The training data is annotated with the following labels:

- `O`	Outside of a named entity
- `MISC`	Miscellaneous entity
- `PER`	Person’s name
- `ORG`	Organization
- `LOC`	Location

In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# The tokenizer and the token-classification model are loaded from the same
# checkpoint identifier.
checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)


##### Process a text sample (from Wikipedia)

from transformers import pipeline

# French sample sentence used to try the model out.
sample_text = "Apple est créée le 1er avril 1976 dans le garage de la maison d'enfance de Steve Jobs à Los Altos en Californie par Steve Jobs, Steve Wozniak et Ronald Wayne14, puis constituée sous forme de société le 3 janvier 1977 à l'origine sous le nom d'Apple Computer, mais pour ses 30 ans et pour refléter la diversification de ses produits, le mot « computer » est retiré le 9 janvier 2015."

# NER pipeline built from the model and tokenizer loaded above.
nlp = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
nlp(sample_text)



Downloading (…)okenizer_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

[{'entity_group': 'ORG',
  'score': 0.9921588,
  'word': 'Apple',
  'start': 0,
  'end': 5},
 {'entity_group': 'PER',
  'score': 0.99597645,
  'word': 'Steve Jobs',
  'start': 74,
  'end': 85},
 {'entity_group': 'LOC',
  'score': 0.99835855,
  'word': 'Los Altos',
  'start': 87,
  'end': 97},
 {'entity_group': 'LOC',
  'score': 0.9982911,
  'word': 'Californie',
  'start': 100,
  'end': 111},
 {'entity_group': 'PER',
  'score': 0.99870753,
  'word': 'Steve Jobs',
  'start': 115,
  'end': 126},
 {'entity_group': 'PER',
  'score': 0.99879086,
  'word': 'Steve Wozniak',
  'start': 127,
  'end': 141},
 {'entity_group': 'PER',
  'score': 0.99646753,
  'word': 'Ronald Wayne',
  'start': 144,
  'end': 157},
 {'entity_group': 'ORG',
  'score': 0.9449746,
  'word': 'Apple Computer',
  'start': 243,
  'end': 257}]

## Data preprocessing

In [33]:
import os

# Path of a single sample file from the gold-wikiner-tagset directory.
file_path = './gold-wikiner-tagset/42131-0.sample.txt'

def get_max_line_len(filename):
    """Return the highest whitespace-separated token count of any line.

    The file is opened as UTF-8 text. A file with no lines yields 0.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        token_counts = (len(row.strip().split()) for row in handle)
        return max(token_counts, default=0)

# Largest number of whitespace-separated tokens on any line of the sample file.
get_max_line_len(file_path)
                
    

file_dir = './gold-wikiner-tagset'

# Print the maximum number of tokens per line for every .txt file in the
# directory (one number per file, in os.listdir order).
for entry in os.listdir(file_dir):
    if not entry.endswith('.txt'):
        continue
    print(get_max_line_len(f'{file_dir}/{entry}'))

40
54
88
57
24
67
55
71
40
61
45


In [35]:
# Quick check: number of characters in a hyphenated name.
len("Pierre-le-Grand")

15

## Write BERT predictions to file

In [47]:
def write_bert_preds(input_file, nlp, output_file="output.ann"):
    """Run a NER callable over a text file and write the entities to a file.

    One record is written per entity, one record per line, in the form
    ``T<id>\\t<label> <start> <end>\\t<text>``. Offsets are character
    positions counted from the beginning of ``input_file``: the offsets
    returned for a line are shifted by the total length of the lines read
    before it.

    Args:
        input_file: Path of the UTF-8 text file to annotate.
        nlp: Callable taking one line of text and returning an iterable of
            dicts with the keys ``entity_group``, ``start``, ``end`` and
            ``word``.
        output_file: Path of the annotation file to create. It is
            overwritten, so running the function again does not pile up
            duplicate ``T`` identifiers.
    """
    ent_num = 0
    num_chars = 0  # number of characters in the lines consumed so far

    # Open the output once instead of re-opening it for every entity.
    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        for line in src:
            for ent in nlp(line):
                start, end = ent['start'], ent['end']

                # The reported span may include surrounding whitespace
                # (e.g. the space in front of a word); shrink it to the
                # visible text so offsets and text agree with the document.
                span = line[start:end]
                text = span.strip()
                if text:
                    start += len(span) - len(span.lstrip())
                    end = start + len(text)
                else:
                    # Offsets do not point at usable text in this line:
                    # keep them and fall back to the reported surface form.
                    text = ent['word'].strip()

                ent_num += 1
                # Each record must end its own line.
                dst.write(
                    f"T{ent_num}\t{ent['entity_group']} "
                    f"{start + num_chars} {end + num_chars}\t{text}\n"
                )

            num_chars += len(line)
       
    

# Annotate the sample file with a freshly built NER pipeline.
file_path = './gold-wikiner-tagset/42131-0.sample.txt'
nlp = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

write_bert_preds(input_file=file_path, nlp=nlp)