# Mask-Filling task

## Import libraries

In [1]:
import pandas as pd
import numpy as np

In [43]:
import random

In [None]:
import tqdm

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [15]:
import string

In [10]:
import os

In [11]:
os.getcwd()

'/content'

## Dataset

In [12]:
df = pd.DataFrame(np.zeros((1,1)))
with open('data/lepetitprinceexupery.txt', 'r') as f:
     lines = f.readlines()
     for line in lines:
            data = line.strip().split("\t")
            for el in data:
                phrases = el.strip().split(".")
                for phrase in phrases:
                    df = pd.DataFrame(np.append(df.values, [[phrase]], axis=0))

In [16]:
list_punctuation_to_remove = "!#$%&()*+-./:;<=>?@[\]^_{|}~)"

In [17]:
def check_row_is_to_remove(row):
    # convert and remove punctuation
    new_row = row.translate(str.maketrans('','',list_punctuation_to_remove))
    # we want it has at least 5 words.
    len_row = len(new_row.split())
    if len_row < 5:
        to_clean = True
    else:
        to_clean = False
    return [to_clean, new_row]

In [18]:
df.rename(columns = {0:'original'}, inplace = True)

In [19]:
df[['check','new']] = [[check_row_is_to_remove(el)[0],check_row_is_to_remove(el)[1]] for el in df['original']]

In [20]:
df = df.loc[df['check']== False]

In [22]:
df.head(5)

Unnamed: 0,original,check,new
9,"Lorsque j'avais six ans j'ai vu, une fois, une...",False,"Lorsque j'avais six ans j'ai vu, une fois, une..."
10,Ça représentait un serpent boa qui avalait un...,False,Ça représentait un serpent boa qui avalait un...
11,Voilà la copie du dessin,False,Voilà la copie du dessin
14,"On disait dans le livre :""Les serpents boas av...",False,"On disait dans le livre ""Les serpents boas ava..."
15,Ensuite ils ne peuvent plus bouger et ils dor...,False,Ensuite ils ne peuvent plus bouger et ils dor...


In [40]:
df['original'].iloc[0]

'Lorsque j\'avais six ans j\'ai vu, une fois, une magnifique image, dans un livre sur la Forêt Vierge qui s\'appelait"Histoires Vécues"'

## Model

In [23]:
tokenizer = AutoTokenizer.from_pretrained("moussaKam/mbarthez")

model = AutoModelForSeq2SeqLM.from_pretrained("moussaKam/mbarthez")

Example of BART model for mask-filling task in NLP.

A french sequence to sequence pretrained model based on BART.
BARThez is pretrained by learning to reconstruct a corrupted input sentence. A corpus of 66GB of french raw text is used to carry out the pretraining.
Unlike already existing BERT-based French language models such as CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks (such as abstractive summarization), since not only its encoder but also its decoder is pretrained.

## Decode the input

In [24]:
def f_decode_input(txt):
    input_ids = tokenizer([txt], return_tensors="pt")["input_ids"]
    logits = model(input_ids).logits
    masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
    probs = logits[0, masked_index].softmax(dim=0)
    values, predictions = probs.topk(5)
    return tokenizer.decode(predictions).split()

## Example

In [33]:
TXT = "Lorsque j'avais six ans j'ai vu, une fois, une <mask> image."

In [34]:
TXT2 = "J'ai montré mon chef-d'oeuvre aux grandes <mask> et je leur ai demandé si mon dessin leur faisait peur."

In [35]:
TXT3 = "je faisais l'expérience sur <mask> de mon dessin numéro Un"

In [36]:
for txt in [TXT, TXT2, TXT3]:
    print(f_decode_input(txt))

['image', 'fois', 'seule', 'autre', 'photo']
['personnes', 'écoles', 'et', 'heures']
['la', 'le', 'l', 'de', 'les']


## Training

In [55]:
dict_QA = {'question' : [], 'answer' : []}

for row in df.iterrows():
    words = row[1]['new'].split()
    rnd = random.randint(0,len(words)-1)
    to_mask = words[rnd]
    words[rnd] = '<mask>'
    to_txt = ''
    for el in words:
        to_txt += el + ' '
    txt = to_txt[:-1]
    predictions = f_decode_input(txt)
    predictions.append(to_mask)
    dict_QA['question'].append(txt)
    dict_QA['answer'].append(predictions)

## End

In [57]:
df['question'] = dict_QA['question']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question'] = dict_QA['question']


In [61]:
df['answer'] = dict_QA['answer']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer'] = dict_QA['answer']


## Save the output

In [63]:
df.drop(labels = ['original','check'], axis = 1 , inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [66]:
df.to_csv("output/df_QA.csv", index = False)