# Important steps to run this Colab
- Create the `datasets` folder for XQuAD (or any other dataset you want to use).
- You can download the datasets from https://drive.google.com/drive/folders/1LOH1wXwCs7YiJGE51uD25yXWSA9cMIyX?usp=sharing
- Install the transformers, datasets and scikit-learn (sklearn) libraries.
- Follow the implementation below to save a pretrained model to local files.

In [None]:
mkdir datasets

# Libraries

In [3]:
!pip install datasets transformers sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

#Ixambert Transformer Use

In [None]:
from pathlib import Path

from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "MarcBrun/ixambert-finetuned-squad-eu-en"
ixaDir = "ixaFiles"  # local folder holding the saved tokenizer and model

# Load the local copy when it exists; otherwise download the checkpoint once and
# save it, so the cell works on a fresh runtime without editing any comments.
if (Path(ixaDir) / "config.json").is_file():
    tokenizer = AutoTokenizer.from_pretrained(ixaDir)
    model = AutoModelForQuestionAnswering.from_pretrained(ixaDir)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    tokenizer.save_pretrained(ixaDir)
    model.save_pretrained(ixaDir)

# To get predictions
context = "El Tajo es el río más largo de la península ibérica, a la que atraviesa en su parte central, siguiendo un rumbo este-oeste, con una leve inclinación hacia el suroeste, que se acentúa cuando llega a Portugal, donde recibe el nombre de Tejo. Nace en los montes Universales, en la sierra de Albarracín, sobre la rama occidental del sistema Ibérico y, después de recorrer 1007 km, llega al océano Atlántico en la ciudad de Lisboa. En su desembocadura forma el estuario del mar de la Paja, en el que vierte un caudal medio de 456 m³/s. En sus primeros 816 km atraviesa España, donde discurre por cuatro comunidades autónomas (Aragón, Castilla-La Mancha, Madrid y Extremadura) y un total de seis provincias (Teruel, Guadalajara, Cuenca, Madrid, Toledo y Cáceres)."
question = "¿Por qué provincias pasa el Tajo?"
from functools import lru_cache


@lru_cache(maxsize=None)
def _qaPipeline(checkpoint):
    """Build the question-answering pipeline for `checkpoint` once and reuse it."""
    return pipeline("question-answering", model=checkpoint, tokenizer=checkpoint)


def qaSP(qspvalue :str,cspvalue:str )->str:
    """Answer a question about a passage with the `model_name` checkpoint.

    Args:
        qspvalue: the question text.
        cspvalue: the context passage in which the answer is searched.

    Returns:
        The 'answer' field of the pipeline prediction.
    """
    # The pipeline is cached per checkpoint, so repeated calls do not reload the model.
    qa = _qaPipeline(model_name)
    pred = qa(question=qspvalue,context=cspvalue)
    return pred['answer']

print(qaSP(question,context))

# Finding the UNK token
### This is done by finding what produces the UNK token in the line, using the tokenizer output, e.g. `output.offsets[index]`, where `index` is the position of the UNK token.

# Fine Tuning Section





# Dataset Management

In [1]:
import json
from pathlib import Path

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    One entry is produced per answer, so a question with several answers
    repeats its context and question; a question with no answers adds nothing.

    Args:
        path: location of the JSON file (str or Path).

    Returns:
        (contexts, questions, answers) where each answer is the raw answer
        object taken from the file.
    """
    with Path(path).open('rb') as handle:
        payload = json.load(handle)

    def _rows():
        # Walk data -> paragraphs -> qas -> answers, yielding one triple per answer.
        for article in payload['data']:
            for paragraph in article['paragraphs']:
                passageText = paragraph['context']
                for entry in paragraph['qas']:
                    prompt = entry['question']
                    for gold in entry['answers']:
                        yield passageText, prompt, gold

    contexts, questions, answers = [], [], []
    for passageText, prompt, gold in _rows():
        contexts.append(passageText)
        questions.append(prompt)
        answers.append(gold)

    return contexts, questions, answers
contexts, questions, answers = read_squad("datasets/xquad.es.json")


# Training and Validation Data

In [2]:
from sklearn.model_selection import train_test_split

pTest = 0.2  # fraction of the samples held out for evaluation (20 percent)

# test_size must be passed explicitly; without it scikit-learn falls back to its
# own default instead of pTest. shuffle=False keeps the original order, so the
# validation set is the tail of the data rather than a random sample.
(trainContext, valContext,
 trainQuestions, valQuestions,
 trainAnswers, valAnswers) = train_test_split(
    contexts, questions, answers, test_size=pTest, shuffle=False)


# Start and End Character verification



In [3]:
def addEndId(tAnswers, tContexts):
    """Add 'answer_end' to every answer and repair small start-offset errors.

    For each answer the text is looked for at 'answer_start' and, if it is not
    there, one and then two characters earlier. On a match 'answer_start' and
    'answer_end' (exclusive character index) are written back into the answer
    dict. Answers that match at none of those offsets are left without
    'answer_end'.

    Args:
        tAnswers: list of dicts with 'text' and 'answer_start'; modified in place.
        tContexts: list of context strings, parallel to tAnswers.
    """
    for answer, passage in zip(tAnswers, tContexts):
        text = answer['text']
        startId = answer['answer_start']
        # Gold labels are sometimes off by a character or two: try 0, -1, -2.
        for shift in (0, 1, 2):
            candidate = startId - shift
            # A negative start would index from the end of the passage; skip it.
            if candidate < 0:
                continue
            # Every offset is checked against this answer's own passage.
            if passage[candidate:candidate + len(text)] == text:
                answer['answer_start'] = candidate
                answer['answer_end'] = candidate + len(text)
                break

addEndId(trainAnswers, trainContext)
addEndId(valAnswers, valContext)


# Tokenize

In [4]:
from transformers import AutoTokenizer, DistilBertTokenizerFast

# The encodings must use the vocabulary of the model that is fine-tuned later in
# this notebook (the ixambert QA checkpoint); token ids produced by
# 'distilbert-base-uncased' do not belong to that model's vocabulary.
qaCheckpoint = "MarcBrun/ixambert-finetuned-squad-eu-en"
# assumes the checkpoint has the usual BERT-base limit of 512 positions — TODO confirm in its config
maxSeqLen = 512
tokenizer = AutoTokenizer.from_pretrained(qaCheckpoint, model_max_length=maxSeqLen)
# char_to_token (used to locate the answer tokens) is only available on fast tokenizers.
assert tokenizer.is_fast, "a fast tokenizer is required for char_to_token"

trainEncodings = tokenizer(trainContext, trainQuestions, truncation=True, padding=True)
valEncodings = tokenizer(valContext, valQuestions, truncation=True, padding=True)

# Start and End character to token

In [5]:
def addTokenPositions(encodings, answers):
    """Store the token index of each answer's first and last character.

    The indices are added to `encodings` under 'startPositions' and
    'endPositions'. When char_to_token returns None (the answer passage has
    been truncated) the global tokenizer's model_max_length is stored instead.

    Args:
        encodings: tokenizer output providing char_to_token() and update().
        answers: list of dicts with 'answer_start' and 'answer_end'.
    """
    tokenStarts, tokenEnds = [], []
    for row, gold in enumerate(answers):
        first = encodings.char_to_token(row, gold['answer_start'])
        # answer_end is exclusive, so the last answer character is at answer_end - 1
        last = encodings.char_to_token(row, gold['answer_end'] - 1)
        tokenStarts.append(tokenizer.model_max_length if first is None else first)
        tokenEnds.append(tokenizer.model_max_length if last is None else last)
    encodings.update({'startPositions': tokenStarts, 'endPositions': tokenEnds})

addTokenPositions(trainEncodings, trainAnswers)
addTokenPositions(valEncodings, valAnswers)

# Torch Training

In [6]:
import torch 
class XquadDataset(torch.utils.data.Dataset):
    """Torch dataset over a tokenizer encoding: one item per encoded example."""

    def __init__(self, encodings):
        # Mapping from field name to a per-example sequence; must also expose `.input_ids`.
        self.encodings = encodings

    def __len__(self):
        # The number of examples equals the number of encoded input-id rows.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Pick row `idx` from every field and convert it to a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

trainDataset = XquadDataset(trainEncodings)

valDataset = XquadDataset(valEncodings)


In [19]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Both the tokenizer and the QA model come from the same Hub checkpoint.
ixambertCheckpoint = "MarcBrun/ixambert-finetuned-squad-eu-en"
tokenizer = AutoTokenizer.from_pretrained(ixambertCheckpoint)
model = AutoModelForQuestionAnswering.from_pretrained(ixambertCheckpoint)



Downloading (…)okenizer_config.json:   0%|          | 0.00/355 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/936k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Ixambert  base for QA

In [None]:
import torch
from torch.optim import AdamW  # replaces the deprecated transformers.AdamW
from torch.utils.data import DataLoader
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(trainDataset, batch_size=16, shuffle=True)

# eps and weight_decay are set explicitly to mirror what are believed to be the
# defaults of the transformers optimizer this replaces — TODO confirm; torch's own
# defaults differ (notably a non-zero weight decay).
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The dataset stores the labels under camelCase keys.
        start_positions = batch['startPositions'].to(device)
        end_positions = batch['endPositions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # With start/end positions supplied, the first output element is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

# You can ignore the next part; it is only for testing purposes

# Test

In [6]:
from pathlib import Path

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,pipeline

b2bName = "mrm8488/bert2bert-spanish-question-generation"
b2bDir = "b2bFiles"  # local folder holding the saved tokenizer and model

# Load the local copy when it exists; otherwise download the checkpoint once and
# save it, so the cell works on a fresh runtime without editing any comments.
if (Path(b2bDir) / "config.json").is_file():
    tokenizer = AutoTokenizer.from_pretrained(b2bDir)
    modelQ = AutoModelForSeq2SeqLM.from_pretrained(b2bDir)
else:
    tokenizer = AutoTokenizer.from_pretrained(b2bName)
    modelQ = AutoModelForSeq2SeqLM.from_pretrained(b2bName)
    tokenizer.save_pretrained(b2bDir)
    modelQ.save_pretrained(b2bDir)

def getQuestion(context:str, max_length:int=64)->str:
  """Generate a question for a passage with the global `modelQ` / `tokenizer`.

  Args:
    context: passage the question should be about.
    max_length: maximum length passed to `generate`.

  Returns:
    The decoded question without special tokens or surrounding whitespace.
  """
  inputText = "context: %s </s>" % ( context)
  features = tokenizer([inputText], return_tensors='pt')

  output = modelQ.generate(input_ids=features['input_ids'],
               attention_mask=features['attention_mask'],
               max_length=max_length)

  # str.strip("[SEP]") removes any of the characters '[', 'S', 'E', 'P', ']' from
  # both ends, not the marker itself, so it can eat real letters of the question.
  # Let the tokenizer drop its special tokens instead.
  question = tokenizer.decode(output[0], skip_special_tokens=True)
  return question.strip()



  
context = "El Tajo es el río más largo de la península ibérica, a la que atraviesa en su parte central, siguiendo un rumbo este-oeste, con una leve inclinación hacia el suroeste, que se acentúa cuando llega a Portugal, donde recibe el nombre de Tejo. Nace en los montes Universales, en la sierra de Albarracín, sobre la rama occidental del sistema Ibérico y, después de recorrer 1007 km, llega al océano Atlántico en la ciudad de Lisboa. En su desembocadura forma el estuario del mar de la Paja, en el que vierte un caudal medio de 456 m³/s. En sus primeros 816 km atraviesa España, donde discurre por cuatro comunidades autónomas (Aragón, Castilla-La Mancha, Madrid y Extremadura) y un total de seis provincias (Teruel, Guadalajara, Cuenca, Madrid, Toledo y Cáceres)."


print(getQuestion(context))

###QUESTION GEN
#from transformers import AutoModelForQuestionAnswering
#Load withoud downloading
#model_name = AutoModelForQuestionAnswering.from_pretrained("MarcBrun/ixambert-finetuned-squad-eu-en")
#Load post downloading
#model_name = AutoModelForQuestionAnswering.from_pretrained("ixaFiles")
#To download this model
#To save it once you download it
#model.save_pretrained("ixaFiles")


#def qaSP(question :str,context:str )->str:
 #   qa = pipeline("question-answering", model=model_name, tokenizer=model_name)
  #  pred = qa(question=question,context=context, max_length=256)
   # return pred['answer']
#print(qaSP(question,context))

Downloading (…)okenizer_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/993M [00:00<?, ?B/s]

 ¿ Cuál es el nombre del río que corre a través del río Tajo? 


In [None]:
# Fine-tuning attempt for the question-generation model using the Trainer API.
from transformers import  Trainer, TrainingArguments,AutoModelForSeq2SeqLM

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Rebinds the global `model` to the seq2seq checkpoint saved in "b2bFiles".
model = AutoModelForSeq2SeqLM.from_pretrained("b2bFiles")

# NOTE(review): trainDataset / valDataset are the extractive-QA encodings built earlier
# in this notebook (their label keys are 'startPositions' / 'endPositions'), while the
# model here is a seq2seq question generator — presumably it needs a dataset that
# provides `labels`; confirm before running this cell.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=trainDataset,         # training dataset
    eval_dataset=valDataset             # evaluation dataset
)

trainer.train()

In [None]:
from datasets import load_dataset
dataset = load_dataset("xtreme", "MLQA.es.es")

#Tokenizing the data
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("b2bFiles")

def tokenize_function(dataset):
    """Tokenize the "context" column of a batch with the global tokenizer.

    Every sequence is truncated and padded to the tokenizer's maximum length.
    """
    passages = dataset["context"]
    return tokenizer(passages, truncation=True, padding="max_length")


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Limpieza de duplicados

In [None]:
# Order-preserving de-duplication: dict keys keep their first-seen order.
train_contexts = list(dict.fromkeys(train_contexts))
train_questions = list(dict.fromkeys(train_questions))
# The answers are de-duplicated from train_answers itself (not from the questions).
# Keyed on repr() so that unhashable entries (e.g. answer dicts) are accepted too.
train_answers = list({repr(answer): answer for answer in train_answers}.values())
# NOTE(review): each list is de-duplicated independently, so afterwards the three
# lists may no longer line up index by index — confirm that this is intended.

['¿Cuántos puntos dejaron escapar en defensa los Panthers?',
 '¿Cuántas capturas ha conseguido Jared Allen en su carrera?',
 '¿Cuántos derribos se anotó Luke Kuechly?',
 '¿Cuántos balones interceptó Josh Norman?',
 '¿Quién de todo el equipo ha conseguido más capturas esta temporada?',
 '¿Cuántas intercepciones se le atribuyen a la defensa de los Panthers en 2015?',
 '¿Quién lideró a los Panthers en capturas?',
 '¿Cuántos jugadores defensivos de los Panthers fueron seleccionados para la Pro Bowl?',
 '¿Cuántos balones sueltos forzados logró Thomas Davis?',
 '¿Qué jugador tuvo el mayor número de intercepciones de la temporada?',
 'En la temporada 2015, ¿cuántas intercepciones consiguió la defensa de los Panthers?',
 '¿Quién obtuvo cinco capturas en nueve partidos como titular de los Carolina Panthers?',
 '¿Quién fue el líder en derribos de los Panthers en 2015?',
 '¿Con cuántas intercepciones Josh Norman consiguió touchdowns en 2015?',
 '¿Quién perdió contra los Broncos en la ronda divisi