<a href="https://colab.research.google.com/github/LucaNicoleta/Clickbait-Spoiling/blob/main/models/synopsis_generation/QA_model_multi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face stack used by this notebook.
# `datasets` (plural) is the package behind `from datasets import Dataset`;
# `dataset` (singular) is an unrelated PyPI project.
!pip install evaluate transformers[sentencepiece]
!pip install accelerate tensorflow tqdm datasets numpy

!apt install git-lfs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=2.0.0
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0

In [None]:
from tqdm.auto import tqdm
import collections
import numpy as np
import evaluate

# The "squad" metric from `evaluate`; compute_metrics calls `metric.compute` on it.
metric = evaluate.load("squad")

# How many of the highest start logits and highest end logits compute_metrics
# combines into candidate answer spans for each feature.
n_best=20

def compute_metrics(start_logits, end_logits, features, examples):
    """Decode start/end logits into answer strings and score them with `metric`.

    Every feature belonging to an example contributes candidate spans built
    from its `n_best` highest start logits and `n_best` highest end logits.
    The candidate whose summed logits are largest becomes the prediction for
    that example; an example with no valid candidate is predicted as "".

    Args:
        start_logits: Start scores, indexed by feature position.
        end_logits: End scores, indexed by feature position.
        features: Indexable features, each providing "example_id" and
            "offset_mapping" (an entry is None for a token outside the context).
        examples: Re-iterable examples providing "id", "context" and "answers".

    Returns:
        The result of `metric.compute` for the predictions and references.
    """
    # Group feature positions under the example they were produced from.
    feature_ids_by_example = collections.defaultdict(list)
    for position, feature in enumerate(features):
        feature_ids_by_example[feature["example_id"]].append(position)

    predicted_answers = []
    for example in tqdm(examples):
        example_id, context = example["id"], example["context"]
        candidates = []  # (score, text) pairs in discovery order

        for position in feature_ids_by_example[example_id]:
            start_scores = start_logits[position]
            end_scores = end_logits[position]
            offsets = features[position]["offset_mapping"]

            # Token indices of the n_best largest logits, best first.
            top_starts = np.argsort(start_scores)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(end_scores)[-1 : -n_best - 1 : -1].tolist()

            for start_idx in top_starts:
                if offsets[start_idx] is None:
                    # The span would start outside the context.
                    continue
                for end_idx in top_ends:
                    # Drop spans ending outside the context or before their start.
                    # No upper bound on the span length is applied.
                    if offsets[end_idx] is None or end_idx < start_idx:
                        continue
                    span_text = context[offsets[start_idx][0] : offsets[end_idx][1]]
                    span_score = start_scores[start_idx] + end_scores[end_idx]
                    candidates.append((span_score, span_text))

        # Keep the highest-scoring candidate (the first one on ties).
        if candidates:
            _, best_text = max(candidates, key=lambda pair: pair[0])
        else:
            best_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": best_text})

    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=references)

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize question/context pairs and label them with answer token positions.

    Each pair is encoded as one sequence of at most 384 tokens; only the
    context (second sequence) is truncated and no overflow windows are
    produced. The first answer of every example is mapped from character
    offsets to start/end token indices.

    An example is labelled (0, 0) when its answer list is empty or when its
    first answer is not completely contained in the (possibly truncated)
    context. A partially truncated answer would otherwise receive an end
    position on the special token that follows the context.

    Args:
        examples: Batch with "question", "context" and "answers" columns; each
            answers entry holds parallel "text" and "answer_start" lists.

    Returns:
        The tokenizer output without "offset_mapping", extended with
        "start_positions" and "end_positions" (one value per example).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No gold answer: nothing to locate, so use the (0, 0) label.
        if len(answer["answer_start"]) == 0 or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [None]:

import pandas as pd

# Numeric id for each spoiler tag; every tag not listed here maps to OTHER_TAG_ID.
TAG_TO_ID = {"phrase": 0, "passage": 1}
OTHER_TAG_ID = 2


def load_clickbait_frame(path):
    """Read a clickbait JSONL file and derive the columns used by later cells.

    The returned frame keeps the file's columns, with these changes:
      * "tags": the single-element tag list is unwrapped and encoded as a
        number (0 for 'phrase', 1 for 'passage', 2 for anything else).
      * "lung_par": character length of every paragraph (taken before joining).
      * "nrPar": number of paragraphs.
      * "targetParagraphs" / "postText": lists joined into one string with spaces.
      * "allText": the title, one space, then the joined paragraphs.

    Args:
        path: Location of the JSON-lines file.

    Returns:
        The prepared pandas DataFrame.
    """
    # Read the JSON lines into a dataframe.
    frame = pd.read_json(path, lines=True)

    # The tags column holds an array with a single element, so unwrap it and
    # label the value numerically.
    frame["tags"] = [TAG_TO_ID.get(tags[0], OTHER_TAG_ID) for tags in frame["tags"].tolist()]

    # Paragraph lengths and count must be recorded before the paragraphs are joined.
    frame["lung_par"] = frame["targetParagraphs"].apply(
        lambda paragraphs: [len(paragraph) for paragraph in paragraphs]
    )
    frame["nrPar"] = frame["targetParagraphs"].apply(len)

    # Collapse the arrays of strings into single space-separated strings.
    frame["targetParagraphs"] = frame["targetParagraphs"].apply(" ".join)
    frame["postText"] = frame["postText"].apply(" ".join)
    frame["allText"] = frame[["targetTitle", "targetParagraphs"]].apply(" ".join, axis=1)
    return frame


# Validation split first, then the training split, both prepared the same way.
df_test = load_clickbait_frame("/content/drive/MyDrive/click-data/validation.jsonl")
df = load_clickbait_frame("/content/drive/MyDrive/click-data/train.jsonl")

In [None]:
from datasets import Dataset
def get_real_position(di, i):
    """Return the character offset in `di["allText"]` where spoiler `i` starts.

    The start of the spoiler is read from `di["spoilerPositions"][i][0]` as a
    `(paragraph_index, char_offset)` pair. `allText` is built elsewhere in this
    notebook as the title, one space, then the paragraphs joined by single spaces.

    Args:
        di: Record with "spoilerPositions", "lung_par" (per-paragraph character
            lengths) and "targetTitle".
        i: Index of the spoiler within the record.

    Returns:
        int: The offset, computed as follows.
          * Negative paragraph index: `char_offset` unchanged (presumably the
            spoiler sits in the title, which opens `allText` — TODO confirm).
          * `char_offset` larger than the paragraph's length: `char_offset`
            plus the title length.
          * Otherwise: the lengths of all preceding paragraphs, one joining
            space per preceding paragraph, the title length plus its
            separating space, and `char_offset`.
    """
    start = di["spoilerPositions"][i][0]
    paragraph_idx, char_offset = start[0], start[1]

    if paragraph_idx < 0:
        return char_offset

    paragraph_lengths = di["lung_par"]
    if paragraph_lengths[paragraph_idx] < char_offset:
        # NOTE(review): the offset does not fit in its paragraph, so it is
        # treated as relative to the joined text. The space between title and
        # paragraphs is not added here — confirm that is intended.
        return char_offset + len(di["targetTitle"])

    preceding_chars = sum(paragraph_lengths[:paragraph_idx])
    return preceding_chars + paragraph_idx + char_offset + len(di["targetTitle"]) + 1




def transform_df_in_dict(old):
    """Convert clickbait records into SQuAD-style question-answering examples.

    Args:
        old: Iterable of records providing "uuid", "allText", "postText" and
            "spoiler", plus whatever `get_real_position` reads from a record.

    Returns:
        list[dict]: One example per record with "id", "context", "question"
        and "answers". "answers" holds the spoiler texts under "text" and
        their start offsets (from `get_real_position`) under "answer_start".
    """

    def _as_qa_example(record):
        # One text and one start offset per spoiler, in the same order.
        spoiler_count = len(record["spoiler"])
        return {
            "id": record["uuid"],
            "context": record["allText"],
            "question": record["postText"],
            "answers": {
                "text": [record["spoiler"][k] for k in range(spoiler_count)],
                "answer_start": [get_real_position(record, k) for k in range(spoiler_count)],
            },
        }

    return [_as_qa_example(record) for record in old]

# Training rows: tag id 2, i.e. every tag other than 'phrase' (0) and
# 'passage' (1), despite the `phrase_df` name.
phrase_df = df[df['tags']==2]

# Show the first record's spoilers and positions, then the converted answers,
# to eyeball the character offsets produced by get_real_position.
a = phrase_df.to_dict('records')
print(a[0]["spoiler"])
print(a[0]["spoilerPositions"])
raw_train_data= Dataset.from_list(transform_df_in_dict(a))
print(raw_train_data[0]["answers"])

# Evaluation rows: tag id 0 ('phrase').
# NOTE(review): training uses tag id 2 while evaluation uses tag id 0 —
# confirm this mismatch is intended.
phrase_df_test = df_test[df_test['tags']==0]

# `a` is reused here for the evaluation records.
a = phrase_df_test.to_dict('records')
raw_test_data= Dataset.from_list(transform_df_in_dict(a))

['Purpose connects us to something bigger and in doing so makes us right sized', 'be ruthless with your "No’s."', 'Practice means greatness is doable ... one tiny step after another', 'planning of the SMART goal and number-crunching variety', 'Objectivity — the ability to see the world as it truly is']
[[[11, 25], [11, 101]], [[17, 56], [17, 85]], [[23, 240], [23, 306]], [[28, 65], [28, 120]], [[37, 106], [37, 163]]]
{'answer_start': [2358, 3806, 5146, 6301, 8359], 'text': ['Purpose connects us to something bigger and in doing so makes us right sized', 'be ruthless with your "No’s."', 'Practice means greatness is doable ... one tiny step after another', 'planning of the SMART goal and number-crunching variety', 'Objectivity — the ability to see the world as it truly is']}


In [None]:
# Passed to the tokenizer as `max_length` by the preprocessing functions below.
max_length = 384
# Passed to the tokenizer as `stride` together with return_overflowing_tokens=True
# (overlap between consecutive windows of a long context).
stride = 128
from datasets import Dataset
from transformers import AutoTokenizer
# Checkpoint whose tokenizer is loaded here.
model_checkpoint = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_training_examples(examples):
    """Tokenize question/context pairs into windows labelled with answer positions.

    The tokenizer is called with `max_length`, `stride` and
    `return_overflowing_tokens=True`, so one example can yield several
    features. Only the context (second sequence) is truncated. For every
    feature, the FIRST answer of its source example is mapped from character
    offsets to start/end token indices.

    A feature is labelled (0, 0) when its example has no answer at all
    (previously an IndexError) or when the first answer is not completely
    inside that feature's context window.

    Args:
        examples: Batch with "question", "context" and "answers" columns; each
            answers entry holds parallel "text" and "answer_start" lists.

    Returns:
        The tokenizer output without "offset_mapping" and
        "overflow_to_sample_mapping", extended with "start_positions" and
        "end_positions" (one value per feature).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # Each feature points back to the example it was cut from.
        sample_idx = sample_map[i]
        answer = answers[sample_idx]

        # No gold answer: nothing to locate, so use the (0, 0) label.
        if len(answer["answer_start"]) == 0 or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0).
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Tokenize and label the training examples in batches. The raw columns are
# removed, so only the columns returned by preprocess_training_examples remain.
train_dataset = raw_train_data.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_train_data.column_names,
)
# Number of raw examples vs. number of tokenized features produced from them.
print(len(raw_train_data), len(train_dataset))

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

559 2306


In [None]:
def preprocess_validation_examples(examples):
    """Tokenize evaluation examples and keep what is needed to decode answers.

    The tokenizer is called with `max_length`, `stride` and
    `return_overflowing_tokens=True`, so one example can yield several
    features. Two things are recorded for each feature:
      * "example_id": the id of the example the feature came from;
      * "offset_mapping": the token offsets, with every token whose sequence
        id is not 1 (i.e. not part of the context) replaced by None.

    Args:
        examples: Batch with "question", "context" and "id" columns.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping", with the
        masked "offset_mapping" and the added "example_id" column.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    feature_count = len(inputs["input_ids"])

    # Blank out the offsets of every token that is not part of the context.
    for feature_idx in range(feature_count):
        sequence_ids = inputs.sequence_ids(feature_idx)
        token_offsets = inputs["offset_mapping"][feature_idx]
        inputs["offset_mapping"][feature_idx] = [
            token_offsets[k] if sequence_ids[k] == 1 else None
            for k in range(len(token_offsets))
        ]

    # Remember which example each feature was produced from.
    inputs["example_id"] = [
        examples["id"][feature_to_example[feature_idx]]
        for feature_idx in range(feature_count)
    ]
    return inputs

# Tokenize the evaluation examples in batches. The raw columns are removed, so
# only the columns returned by preprocess_validation_examples remain.
validation_dataset = raw_test_data.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_test_data.column_names,
)
# Number of raw examples vs. number of tokenized features produced from them.
print(len(raw_test_data), len(validation_dataset))


  0%|          | 0/1 [00:00<?, ?ba/s]

335 772


In [None]:
from transformers import Trainer, AutoTokenizer, TrainingArguments
import pickle
from transformers import AutoModelForQuestionAnswering, AutoConfig

model_checkpoint = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
config = AutoConfig.from_pretrained('deepset/roberta-base-squad2')
# NOTE(review): this model is built from the config only and is not passed to
# the Trainer below, which creates its own model through `model_init`.
model = AutoModelForQuestionAnswering.from_config(config)


def model_init():
    """Return a question-answering model loaded from the pretrained checkpoint."""
    return AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)


args = TrainingArguments(
    output_dir='checkpoints',
    # NOTE(review): validation_dataset carries no start/end positions, which
    # presumably explains why the run logs no validation loss — confirm.
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()

# The prediction output is unpacked into the start and end logits per feature.
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions

# Persist the fine-tuned model. The context manager closes the file handle
# even if pickling fails (the bare `open(...)` call left it unclosed).
with open('task2_phrase.sav', 'wb') as model_file:
    pickle.dump(trainer.model, model_file)

# Last expression of the cell: the metric result is displayed as its output.
compute_metrics(start_logits, end_logits, validation_dataset, raw_test_data)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--deepset--roberta-base-squad2/snapshots/d39b8d4166b0683451bbce6f047de1a238c0b5bf/config.json
Model config RobertaConfig {
  "_name_or_path": "deepset/roberta-base-squad2",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /ro

Epoch,Training Loss,Validation Loss
1,No log,No log
2,0.856400,No log
3,0.856400,No log


The following columns in the evaluation set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `RobertaForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 772
  Batch size = 8
Saving model checkpoint to checkpoints/checkpoint-500
Configuration saved in checkpoints/checkpoint-500/config.json
Model weights saved in checkpoints/checkpoint-500/pytorch_model.bin
tokenizer config file saved in checkpoints/checkpoint-500/tokenizer_config.json
Special tokens file saved in checkpoints/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `RobertaForQuestionAnswering.forward`,  you can safely ignore this 

  0%|          | 0/335 [00:00<?, ?it/s]

{'exact_match': 44.47761194029851, 'f1': 55.409111623035905}