NOTE: les outputs de ce notebook sont justes à titre d'exemple et ne représentent pas les résultats finaux

In [3]:
!pip install transformers evaluate datasets wget textacy rouge_score

[0m

In [4]:
!ls

__notebook_source__.ipynb  newsroom-release.tarm_koenag.tmp


In [5]:
import os
from wget import download

if "release" not in os.listdir():
    
    #downloading dataset
    if "newsroom-release.tar" not in os.listdir():
        print("* Downloading Dataset")
        download("https://lil.nlp.cornell.edu/resources/newsroom/r8625bda324/newsroom-release.tar")
    
    #unzipping tar file
    print("\n* Unzipping dataset")
    os.system("tar xvf newsroom-release.tar")
#unzipping gz archives
if "test.jsonl" not in os.listdir("release"):
    print("\n* Unzipping release files")
    os.system("gzip -d release/test.jsonl.gz")
    # os.system("gzip -d release/train.jsonl.gz")
    # os.system("gzip -d release/dev.jsonl.gz")
    print("\n++ Done")

* Downloading Dataset

* Unzipping dataset
release/
release/dev.jsonl.gz
release/test.jsonl.gz
release/train.jsonl.gz

* Unzipping release files

++ Done


In [6]:
!rm  newsroom-release.tar

In [7]:
from evaluate import evaluator,visualization,push_to_hub
from datasets import Dataset
from transformers import BartForConditionalGeneration,BartTokenizerFast
from textacy import preprocessing
from functools import partial
import matplotlib.pyplot as plt

model_checkpoint = "Yahiael1/mymodel_final_v2"



In [8]:
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = BartTokenizerFast.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [9]:
import pandas as pd

df_test = pd.read_json(r'release/test.jsonl', lines=True,keep_default_dates=False,nrows=10000)

In [10]:
df_test = df_test[["text","summary"]] # remove unnecessary columns

In [11]:
test_dataset = Dataset.from_pandas(df_test)

In [12]:
# cleaning the dataset

preproc = preprocessing.make_pipeline( # cleaning pipeline
  partial(preprocessing.replace.urls, repl=""),
  preprocessing.normalize.bullet_points,
  preprocessing.normalize.hyphenated_words,
  preprocessing.normalize.quotation_marks,
  preprocessing.normalize.unicode,
  preprocessing.normalize.whitespace,
  preprocessing.remove.accents,
  preprocessing.remove.brackets,
  preprocessing.remove.html_tags,
  partial(preprocessing.replace.emails, repl=""),
  partial(preprocessing.replace.emojis, repl=""),
  partial(preprocessing.replace.hashtags, repl=""),
  )

def batch_clean(examples):
  for i in range(len(examples)):
    examples["text"][i] = preproc(examples["text"][i])
    examples["summary"][i] = preproc(examples["summary"][i])
  return examples

test_dataset = test_dataset.map(batch_clean,batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
eval = evaluator( task = 'summarization') # metric is rouge by default

In [14]:
print(vars(eval))

{'task': 'summarization', 'default_metric_name': 'rouge'}


In [15]:
# checking if columns are present
eval.check_required_columns(data = test_dataset,columns_names = {'input_column' : 'text', 'label_column' : 'summary'})

In [16]:
results = eval.compute(model_or_pipeline=model,
             data = test_dataset,
             metric = 'rouge',
             tokenizer = tokenizer,
             input_column = 'text',
             label_column = 'summary'
             )

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Your max_length is set to 128, but you input_length is only 74. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Your max_length is set to 128, but you input_length is only 67. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)


In [17]:
print(results)

{'rouge1': 0.4598138564725939, 'rouge2': 0.36707135010626196, 'rougeL': 0.43242902398940225, 'rougeLsum': 0.4315330683468712, 'total_time_in_seconds': 75.46245773999999, 'samples_per_second': 0.13251622461667256, 'latency_in_seconds': 7.546245773999999}


In [19]:
!huggingface-cli login --token # token ici

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [20]:
# upload les resultats sur huggingface
push_to_hub(
  model_id=model_checkpoint,  # model repository on hub
  metric_value=float(results["rouge1"]),                       # metric value
  metric_type="rouge1",                     # metric name, e.g. accuracy.name
  metric_name="rouge1",                     # pretty name which is displayed
  dataset_type="newsroom",                # dataset name on the hub
  dataset_name="newsroom",                # pretty name
  dataset_split="test",                   # dataset split used
  task_type="summarization",            # task id, see https://github.com/huggingface/datasets/blob/master/src/datasets/utils/resources/tasks.json
  task_name="summarization",             # pretty name for task
  overwrite=True
)

Downloading (…)solve/main/README.md:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

'https://huggingface.co/Yahiael1/mymodel_final_v2/blob/main/README.md'

In [21]:
# upload les resultats sur huggingface
push_to_hub(
  model_id=model_checkpoint,  # model repository on hub
  metric_value=float(results["rouge2"]),                       # metric value
  metric_type="rouge2",                     # metric name, e.g. accuracy.name
  metric_name="rouge2",                     # pretty name which is displayed
  dataset_type="newsroom",                # dataset name on the hub
  dataset_name="newsroom",                # pretty name
  dataset_split="test",                   # dataset split used
  task_type="summarization",            # task id, see https://github.com/huggingface/datasets/blob/master/src/datasets/utils/resources/tasks.json
  task_name="summarization",             # pretty name for task
  overwrite=True
)

Downloading (…)solve/main/README.md:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

'https://huggingface.co/Yahiael1/mymodel_final_v2/blob/main/README.md'

In [22]:
# upload les resultats sur huggingface
push_to_hub(
  model_id=model_checkpoint,  # model repository on hub
  metric_value=float(results["rougeL"]),                       # metric value
  metric_type="rougeL",                     # metric name, e.g. accuracy.name
  metric_name="rougeL",                     # pretty name which is displayed
  dataset_type="newsroom",                # dataset name on the hub
  dataset_name="newsroom",                # pretty name
  dataset_split="test",                   # dataset split used
  task_type="summarization",            # task id, see https://github.com/huggingface/datasets/blob/master/src/datasets/utils/resources/tasks.json
  task_name="summarization",             # pretty name for task
  overwrite=True
)

Downloading (…)solve/main/README.md:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

'https://huggingface.co/Yahiael1/mymodel_final_v2/blob/main/README.md'

In [23]:
# upload les resultats sur huggingface
push_to_hub(
  model_id=model_checkpoint,  # model repository on hub
  metric_value=float(results["rougeLsum"]),                       # metric value
  metric_type="rougeLsum",                     # metric name, e.g. accuracy.name
  metric_name="rougeLsum",                     # pretty name which is displayed
  dataset_type="newsroom",                # dataset name on the hub
  dataset_name="newsroom",                # pretty name
  dataset_split="test",                   # dataset split used
  task_type="summarization",            # task id, see https://github.com/huggingface/datasets/blob/master/src/datasets/utils/resources/tasks.json
  task_name="summarization",             # pretty name for task
  overwrite=True
)

Downloading (…)solve/main/README.md:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

'https://huggingface.co/Yahiael1/mymodel_final_v2/blob/main/README.md'