# Converting training data to .spacy format

In [None]:
import spacy
from spacy.tokens import DocBin
import json
import os

# Blank English pipeline; the conversion cell uses it to tokenize raw text via nlp.make_doc.
nlp = spacy.blank("en")
# Collects the annotated Docs until they are serialized to a .spacy file.
doc_bin = DocBin()

In [None]:
# Version tag interpolated into the "../data/{model_version}_processed" and
# "../data/{model_version}_labeled" directory names used by the labeling cells.
model_version = "transformers-ner-0.0.2"

In [None]:
# Root folder of the processed label export; the conversion cell reads the
# files found under <dir>/<dataset>.
# NOTE(review): `dir` shadows the builtin of the same name; it is kept as-is
# because other cells reference this variable.
dir = os.path.join(os.pardir, "data", "entity-extraction", "processed", "2023-05-31_label-export_39-articles")
# Disabled: loading the explicit train/val id lists from data_metrics.json.
# with open(os.path.join(dir, "data_metrics.json"), "r") as f:
#     metrics = json.load(f)

# train_files = metrics['train']['gdd_ids']
# val_files = metrics['val']['gdd_ids']

In [None]:
# Convert one split of the label export into spaCy Docs with entity spans.
# Side effects: rebinds `data`, `dataset`, `doc_bin` and `files`, and prints
# every processed file name.
data = []
dataset = "val"
# Start from an empty DocBin so that re-running this cell (or switching
# `dataset`) never carries over documents from a previous run.
doc_bin = DocBin()
files = os.listdir(os.path.join(dir, dataset))

for f in files:
    print(f)
    # Only the first line of each file is parsed: the export is read as one
    # JSON object per file, stored on a single line.
    with open(os.path.join(dir, dataset, f), 'r', encoding="utf-8") as fin:
        article_data = json.loads(fin.readline())
    text = article_data['task']['data']["text"]

    doc = nlp.make_doc(text)

    entities = []
    misaligned = 0
    for label in article_data['result']:
        start = label['value']['start']
        end = label['value']['end']
        ent = label['value']['labels'][0]

        # char_span returns None when the character offsets do not fall on
        # token boundaries; such annotations cannot be represented as a Span.
        span = doc.char_span(start, end, label=ent)
        if span is None:
            misaligned += 1
            continue
        entities.append(span)

    # doc.ents rejects overlapping spans, so resolve overlaps first
    # (filter_spans keeps the longest span, then the earliest one).
    kept = spacy.util.filter_spans(entities)
    overlapping = len(entities) - len(kept)
    entities = kept
    if misaligned or overlapping:
        print(f"  dropped {misaligned} misaligned and {overlapping} overlapping annotation(s)")

    doc.ents = entities
    doc_bin.add(doc)
    data.append((doc, {"entities": entities}))

In [None]:
# Name the output after the converted split ("val" -> val.spacy,
# "train" -> train.spacy) so that converting another split does not
# overwrite the validation file.
doc_bin.to_disk(f"{dataset}.spacy")

# Fine-tuning Tok2Vec model 

In [None]:
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.training import Example

# Component settings dict that selects spaCy's default tok2vec model architecture.
config = {"model": DEFAULT_TOK2VEC_MODEL}
# Fresh blank English pipeline for the fine-tuning experiment (rebinds `nlp`).
nlp = spacy.blank("en")

In [None]:
# Create the tok2vec component from the `config` dict defined in the previous
# cell, so the chosen model architecture is actually applied instead of being
# left unused.
tok2vec = nlp.add_pipe("tok2vec", config=config)

In [None]:
# Wrap each (Doc, annotations) pair collected during conversion in a spaCy Example.
training_data = [
    Example.from_dict(converted_doc, annotations)
    for converted_doc, annotations in data
]

In [None]:
# Initialize the pipeline; the optimizer it returns is reused for the update below.
optimizer = nlp.initialize()
# Run one update of the tok2vec component over all examples and keep the returned losses.
losses = tok2vec.update(training_data, sgd=optimizer)

In [None]:
# Display the losses returned by the tok2vec update.
losses

## Create the raw-text pretraining file (JSONL format)

In [None]:
import srsly
import os
import json

# Build the raw-text corpus: one {'text': ...} record per processed article,
# written to ./pretrain_data.jsonl.
raw_dir = '../../../data/processed/'
# Sorted so the record order in the output file is reproducible across runs.
files = sorted(os.listdir(raw_dir))
data_files = []
for f in files:
    # The context manager closes each input file; the parsed object is bound
    # to `record` (not `data`) so the converted examples held in `data` by the
    # conversion cell are not clobbered.
    with open(os.path.join(raw_dir, f), 'r', encoding="utf-8") as fin:
        record = json.load(fin)
    data_files.append({'text': record['data']['text']})

srsly.write_jsonl(os.path.join(".", "pretrain_data.jsonl"), data_files)

In [None]:
!python -m spacy init fill-config {config_pretrain.cfg} --pretraining

# Train model command

In [None]:
# Complete spacy_transformer.cfg in place (same path given as base and as output).
!python -m spacy init fill-config spacy_transformer.cfg spacy_transformer.cfg 


In [None]:
# Disabled: pretraining run over the raw-text JSONL file.
# !python -m spacy pretrain config_pretrain.cfg ./pretrain_output --paths.raw_text ./pretrain_data.jsonl --gpu-id 0 --paths.train train.spacy --paths.dev val.spacy --paths.vectors en_core_web_lg 
# Train with config_pretrain.cfg on GPU 0, reading train.spacy / val.spacy and writing to ./output.
!python -m spacy train config_pretrain.cfg --paths.train train.spacy --paths.dev val.spacy --output ./output --gpu-id 0

In [None]:
# Train with spacy_transformer.cfg; no --gpu-id is passed here.
# NOTE(review): this writes to the same ./output directory as the
# config_pretrain.cfg training command — confirm the overwrite is intended.
!python -m spacy train spacy_transformer.cfg --paths.train train.spacy --paths.dev val.spacy --output ./output

In [None]:
# Validate the training config against the train/dev corpora.
# NOTE(review): this checks spacy_transformer_config.cfg, while the fill-config
# and train commands use spacy_transformer.cfg — confirm which file is intended.
!python -m spacy debug config spacy_transformer_config.cfg --paths.train train.spacy --paths.dev val.spacy

# Inference

In [None]:
# Select CPU execution before loading the trained pipeline for the sample run.
spacy.require_cpu()
# Load the trained pipeline from the local output folder (rebinds `nlp`).
nlp = spacy.load("./output/transformer-best/")

In [None]:
# Sample article front matter and abstract used as input for the inference check.
# Written as two adjacent literals inside parentheses because a double-quoted
# string cannot continue across a physical line break; the pieces are joined
# by the single space that ends the first literal.
text = (
    "Quaternary Research 80 ( 2013 ) 482-494 Contents lists available at ScienceDirect Quaternary Research journal homepage : www.elsevier.com/locate/yqres Timing of the last deglaciation in the Sierra Nevada of the Mérida Andes, Venezuela Julien Carcaillet aIsandra Angel b, Eduardo Carrillo b, Franck A. Audemard c, Christian Beck d a ISTerre, Université de Grenoble 1, UMR CNRS 5275, F-38041 Grenoble, France b Instituto de Ciencias de la Tierra, Universidad Central de Venezuela, Apdo. 3805, Caracas 1010-A, Venezuela c Fundación Venezolana de Investigaciones Sismológicas, FUNVISIS, El Llanito, Caracas 1030, Venezuela d ISTerre, Université de Savoie, UMR CNRS 5275, F-73376 Le Bourget-du-Lac, France article info Article history : Received 10 January 2013 Available online 29 September 2013 Keywords : Terrestrial cosmogenic nuclides dating Glacial landforms Andes de Mérida Venezuela Pleistocene Holocene abstract In the tropical Mérida Andes ( northwestern Venezuela -)glacial landforms were found at altitudes between 2600 and 5000 m, corresponding to 600 km2 of ice cover during the maximum glacial extension. However, the lack of sufﬁcient absolute age data prevents detailed reconstruction of the timing of the last deglaciation. On the northwestern ﬂank of the Mucuñuque Massif, successive moraines and striated eroded basement surfaces were sampled for cosmogenic 10Be investigation. Their compilation with published data allows the establishment of a detailed chronology of the post-LGM glacier history. The oldest moraines ( 18.1 and 16.8 ka ) correspond to the Oldest Dryas. Successive moraine ridges indicate stops in the overall retreat between the LGM and the Younger Dryas. The cold and short Older Dryas stadial has been identiﬁed. Results indicate that most of the ice withdrew during the Pleistocene. The dataset supports an intensiﬁcation of the vertical retreat rate from ~ 25 m/ka during the late Pleistocene to ~ 310 m/ka during the Pleistocene/Holocene. "
    "Afterwards, the glacier was conﬁned and located in the higher altitude zones. The altitude difference of the Younger Dryas moraines in the Mucubají, La Victoria and Los Zerpa valleys indicates a strong effect of valley orientation on the altitude of moraine development."
)

In [None]:
# Run the loaded pipeline on the sample text.
doc = nlp(text)

# Walk the predicted entities and print only each entity's label (not its text).
for ent in doc.ents:
    print(ent.label_)

# Create pre-labeled files locally to upload to Label Studio

In [None]:
# Select GPU execution before loading the pipeline used for pre-labeling.
spacy.require_gpu()
# Load the trained pipeline from the repository's training output folder (rebinds `nlp`).
nlp = spacy.load("../src/training/spacy_ner/output/transformer-v3/")

In [None]:
# File names of the processed tasks for this model version; consumed by the labeling loop.
files = os.listdir(f"../data/{model_version}_processed/")

In [None]:
# Pre-annotate every processed task with the pipeline's entities and write the
# updated task JSON to the "<model_version>_labeled" folder.
in_dir = f"../data/{model_version}_processed/"
out_dir = f"../data/{model_version}_labeled/"
# open(..., 'w') does not create missing folders, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for f in files:
    # The context manager closes the input handle instead of leaking it.
    with open(os.path.join(in_dir, f), 'r', encoding="utf-8") as fin:
        data = json.load(fin)

    doc = nlp(data['data']['text'])
    # One labeled-region entry per predicted entity.
    results = [
        {
            "from_name": "label",
            "to_name": "text",
            "type": "labels",
            "value": {
                "start": ent.start_char,
                "end": ent.end_char,
                "text": ent.text,
                # Fixed placeholder score, as in the original notebook.
                "score": 0.5,
                "labels": [
                    ent.label_
                ]
            }
        }
        for ent in doc.ents
    ]

    # Reuse the task's first prediction entry when present; otherwise create
    # one so tasks without any predictions no longer fail here.
    if not data.get('predictions'):
        data['predictions'] = [{"model_version": model_version}]
    data['predictions'][0]['result'] = results

    with open(os.path.join(out_dir, f), 'w', encoding="utf-8") as fout:
        json.dump(data, fout)