# Notebook for data preparation and model training

1) Data acquisition from Wikidata  
2) Wikipedia acquisition and matching  
3) Preparing the datasets for GPT-2 and XLNet  
4) Training and generation test for GPT-2 EN  
5) Training and generation test for XLNet EN  
6) Training and generation test for GPT-2 IT (GePpeTto)

In [None]:
!pip install wikipedia
!pip install transformers

import wikipedia
import requests
from requests import utils
import csv
import pandas as pd

from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import AutoTokenizer
import torch

from tqdm.notebook import tqdm

tqdm.pandas()

## Gathering data from Wikidata

In [None]:
def label_item(item):
  """Return the SPARQL triple that binds ``?<item>Label`` to the rdfs:label of ``?<item>``."""
  return f'?{item} rdfs:label ?{item}Label .'

def optional_item(source, item):
  """Wrap the pattern ``?<source> <item> .`` in a SPARQL ``OPTIONAL{...}`` clause."""
  pieces = ['OPTIONAL{?', source, ' ', item, ' . }']
  return ''.join(pieces)

def group_concat(item, as_name):
  """Return a SPARQL projection that comma-joins the distinct values of ``?<item>`` as ``?<as_name>``."""
  return f'(GROUP_CONCAT(DISTINCT ?{item}; separator=", ") AS ?{as_name})'

def get_optionals_string(optionals, item_name):
  """Build one space-separated string of OPTIONAL clauses.

  Each entry of ``optionals`` is indexed as ``(predicate, variable_name)`` and
  becomes ``OPTIONAL{?<item_name> <predicate> ?<variable_name> . }``.
  """
  clauses = []
  for entry in optionals:
    pattern = "{} ?{}".format(entry[0], entry[1])
    clauses.append(optional_item(item_name, pattern))
  return ' '.join(clauses)

# Query service for wikidata (probably dbpedia should be tested as well)
endpoint = "https://query.wikidata.org/sparql"

# Wikidata property (wdt:) and entity (wd:) identifiers used to build the query.
# NOTE(review): birth_place, lived_in, venice and picture are not referenced by
# the query assembled in this cell.
instance_of = "wdt:P31"
human = "wd:Q5"
birth_place = "wdt:P19"
lived_in = "wdt:P551"
citizenship = "wdt:P27"
republic_of_venice = "wd:Q4948"
venice = "wd:Q641"
picture = "wdt:P18"

# The resulting projection names and parameters.
# Multi-valued fields (first names, last names, professions) are collapsed
# into one comma-separated column each via GROUP_CONCAT.
selections = [
              '?item',
              '?itemLabel',
              group_concat("fnameLabel", "fnames"),
              group_concat("lnameLabel", "lnames"),
              '?itemDescription',
              '?birth',
              '?death',
              group_concat("professionLabel", "professions"),
              ]

# Predicates fetched through OPTIONAL clauses
birth = 'wdt:P569'
death = 'wdt:P570'
profession = 'wdt:P106'
last_name = 'wdt:P734'
first_name = 'wdt:P735'

# Optionals which extract the data from the query go here.
# Each entry is (predicate, variable name without the leading '?').
optionals = [
              (birth, "birth"),
              (death, "death"),
              (profession, "profession"),
              (last_name, "lname"),
              (first_name, "fname"),
             ]

# Labels used to extract name by id of the predicate go here
# (each one yields a "?x rdfs:label ?xLabel ." pattern inside the label service)
labels = ["profession", "item", "fname", "lname"]

# Elements for the groupby: every non-aggregated projection must be listed here
groupby = [
           "?item",
           "?itemLabel",
           "?itemDescription",
           "?birth",
           "?death"
]


# Prepare query string based on parameters
selections_string = " ".join(selections)
# The trailing ';' is followed by ' .' when the statement is assembled below,
# so the printed query contains "wd:Q4948; ."
parameters_string = f"{instance_of} {human}; {citizenship} {republic_of_venice};"
optionals_string = get_optionals_string(optionals, "item")
labels_string = " ".join([label_item(label) for label in labels])
groupby_string = " ".join(groupby)

# Language code handed to the wikibase:label service
wikidata_language ="en"

# Assemble the final SPARQL text; the fragments are joined with single spaces
statement = ' '.join(['SELECT DISTINCT',
         selections_string,
         'WHERE { ?item ' + parameters_string + ' .',
         optionals_string,
         'SERVICE wikibase:label { bd:serviceParam wikibase:language "'+ wikidata_language + '".',
         labels_string,
         '?item schema:description ?itemDescription .',
         '}} GROUP BY',
         groupby_string])

# Show the generated query so it can be pasted into the query service for debugging
print(statement)

SELECT DISTINCT ?item ?itemLabel (GROUP_CONCAT(DISTINCT ?fnameLabel; separator=", ") AS ?fnames) (GROUP_CONCAT(DISTINCT ?lnameLabel; separator=", ") AS ?lnames) ?itemDescription ?birth ?death (GROUP_CONCAT(DISTINCT ?professionLabel; separator=", ") AS ?professions) WHERE { ?item wdt:P31 wd:Q5; wdt:P27 wd:Q4948; . OPTIONAL{?item wdt:P569 ?birth . } OPTIONAL{?item wdt:P570 ?death . } OPTIONAL{?item wdt:P106 ?profession . } OPTIONAL{?item wdt:P734 ?lname . } OPTIONAL{?item wdt:P735 ?fname . } SERVICE wikibase:label { bd:serviceParam wikibase:language "en". ?profession rdfs:label ?professionLabel . ?item rdfs:label ?itemLabel . ?fname rdfs:label ?fnameLabel . ?lname rdfs:label ?lnameLabel . ?item schema:description ?itemDescription . }} GROUP BY ?item ?itemLabel ?itemDescription ?birth ?death


In [None]:
# Run the SPARQL query and keep the raw CSV payload in `result`.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# an HTTP error page from being parsed as CSV by the next cell.
response = requests.get(
    endpoint,
    params={'query': statement},
    headers={'Accept': 'text/csv'},
    timeout=120,
)
response.raise_for_status()
result = response.text

In [None]:
# Parse the CSV text returned by the query service into rows of strings.
# The final element is dropped; presumably it is the empty row produced by the
# trailing line terminator of the response.
data = list(csv.reader(result.split("\r\n"), delimiter=',', quotechar='"'))[:-1]
# First row is the header, the rest are records
df = pd.DataFrame(data[1:], columns=data[0])
# Reduce the entity URL to the bare Q-id. regex=False because the prefix is a
# literal string (it contains '.' characters that a regex would treat as wildcards).
df["item"] = df["item"].str.replace("http://www.wikidata.org/entity/", "", regex=False)
df = df.set_index("item")
# Keep only the first four characters of the timestamps (the year, as in the preview)
df["birth"] = df.birth.str[:4]
df["death"] = df.death.str[:4]
df.head()

# We can get more or less 3000 entries this way

Unnamed: 0_level_0,itemLabel,fnames,lnames,itemDescription,birth,death,professions
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q1918086,"Meinhard I, Count of Gorizia",Meinhard,,"Count of Gorizia, Count Palatine in Carinthia",1070,1142,aristocrat
Q18945651,Marsilio Cagnati,,,1543-1612,1543,1612,physician
Q55901911,Angelo Gualandris,Angelo,,botanist (1750-1788),1750,1788,"naturalist, geologist, agronomist, botanist"
Q1522146,Giacomo Nani,Giacomo,Nani,politician (1725-1797),1725,1797,politician
Q723300,Pietro Gradenigo,Pietro,Gradenigo,doge of Venice,1251,1311,politician


## Gathering Wikipedia text

In [None]:
def get_wikipage_from_title(pageTitle, lang='en', content_type='content'):
    """Fetch text from the Wikipedia page whose exact title is ``pageTitle``.

    Args:
        pageTitle: Exact page title, or ``None`` when no page is known.
        lang: Language edition passed to ``wikipedia.set_lang``.
        content_type: Name of the page attribute to return, e.g. ``'content'``
            or ``'summary'``.

    Returns:
        ``None`` when ``pageTitle`` is ``None``; the requested page attribute
        when the lookup succeeds; ``''`` when the lookup hits a disambiguation
        page or a missing page (the title is printed in that case).
    """
    # Bail out before touching the library: set_lang mutates global state
    # and there is nothing to fetch anyway.
    if pageTitle is None:
        return None

    wikipedia.set_lang(lang)

    output = ''
    try:
        page = wikipedia.page(title=pageTitle, auto_suggest=False)
        # Plain attribute lookup instead of eval(): same result for attribute
        # names, without executing arbitrary strings.
        output = getattr(page, content_type)
    except wikipedia.exceptions.DisambiguationError as e:
        print('-----------------')
        print(pageTitle)
        print(e.options[0])
        print('-----------------')
    except wikipedia.exceptions.PageError:
        print('+++++++++++++++++')
        print(pageTitle)
        print('+++++++++++++++++')
    return output

def get_wikipage_from_pageID(pageID):
    """Return the summary of the Wikipedia page with numeric id ``pageID``.

    The previous body referenced the undefined names ``pageTitle`` and
    ``summary`` and therefore always raised ``NameError``.

    Args:
        pageID: Wikipedia page id, or ``None`` when no page is known.

    Returns:
        ``None`` when ``pageID`` is ``None``; the page summary when the lookup
        succeeds; ``''`` when the lookup hits a disambiguation page or a
        missing page (mirrors ``get_wikipage_from_title``).
    """
    if pageID is None:
        return None

    try:
        return wikipedia.page(pageid=pageID).summary
    except wikipedia.exceptions.DisambiguationError as e:
        print('-----------------')
        print(pageID)
        print(e.options[0])
        print('-----------------')
    except wikipedia.exceptions.PageError:
        print('+++++++++++++++++')
        print(pageID)
        print('+++++++++++++++++')
    return ''

In [None]:
def get_wikipedia_title_from_wikidata_id(wikidata_id, lang='en', debug=False):
    """Look up the Wikipedia page title linked to a Wikidata entity.

    Calls the ``wbgetentities`` API for ``wikidata_id`` and reads the sitelink
    keyed ``<lang>wiki``.

    Returns:
        The unquoted page title, or ``None`` when any level of the response
        (entities, entity, sitelinks, sitelink, title) is missing or empty, or
        when ``lang`` is falsy.
    """
    query = '&'.join([
        'action=wbgetentities',
        'props=sitelinks/urls',
        f'ids={wikidata_id}',
        'format=json',
    ])
    url = f'https://www.wikidata.org/w/api.php?{query}'
    json_response = requests.get(url).json()
    if debug:
        print(wikidata_id, url, json_response)

    # Walk down the response one level at a time, giving up at the first gap
    entities = json_response.get('entities')
    if not entities:
        return None

    entity = entities.get(wikidata_id)
    if not entity:
        return None

    sitelinks = entity.get('sitelinks')
    if not sitelinks or not lang:
        return None

    # Only the requested language edition is considered
    sitelink = sitelinks.get(f'{lang}wiki')
    if not sitelink:
        return None

    wiki_title = sitelink.get('title')
    return requests.utils.unquote(wiki_title) if wiki_title else None

In [None]:
# Resolve each Wikidata id (the dataframe index, row.name) to its English
# Wikipedia title and download the page text, one row at a time with a progress bar.
# The column is called "summary", but get_wikipage_from_title defaults to
# content_type='content', so it holds whatever that attribute returns.
df["summary"] = df.progress_apply(lambda row: get_wikipage_from_title(get_wikipedia_title_from_wikidata_id(row.name)), axis=1)

HBox(children=(FloatProgress(value=0.0, max=3284.0), HTML(value='')))




In [None]:
# Split the rows by whether an English page text was obtained and persist both parts.
# NOTE(review): get_wikipage_from_title returns '' (not None) on lookup errors,
# so such rows count as "not null" and end up in df_filtered.
df_none = df[df.summary.isnull()]
df_filtered = df[df.summary.notna()]
df_none.to_csv('dataset_no_en_full.csv')
# NOTE(review): a later cell reads "data/dataset_en_full.csv"; confirm the file
# is moved into data/ before that cell runs.
df_filtered.to_csv("dataset_en_full.csv")

In [None]:
# Same lookup as for the English text, but against the Italian sitelink ('itwiki')
# and the Italian Wikipedia edition; the result is stored in "summary_it".
df["summary_it"] = df.progress_apply(lambda row: get_wikipage_from_title(get_wikipedia_title_from_wikidata_id(row.name, 'it'),'it'), axis=1)

HBox(children=(FloatProgress(value=0.0, max=3280.0), HTML(value='')))




In [None]:
df["summary_it"][745]

'Lazzaro Bastiani (Padova, 1429 – 5 aprile 1512) è stato un pittore italiano.\n\n\n== Biografia ==\n\nNacque probabilmente a Venezia o a Padova nel 1429 circa, figlio di Iacopo.\nSi formò presumibilmente presso la bottega di Antonio Vivarini da Murano, in un contesto impregnato dalla personalità di Iacopo Bellini e di Andrea Mantegna.\nLe prime notizie riguardanti la sua carriera risalirono al 1449, anno in cui risultò essere già pictor indipendente.\nDal 1460 si stabilì stabilmente a Venezia, dove eseguì vari lavori, come quello per la chiesa di S.Samuele (Polittico non più rintracciato) e per la Scuola Grande di San Marco.\nDurante il 1470 divenne membro della Scuola di San Girolamo.. Dopo il 1480, assieme a Gentile Bellini, Vittore Carpaccio e Giovanni Mansueti esegui alcuni \'teleri\' per la Scuola Grande di San Marco.\nNegli anni successivi gli venne affidato uno dei "teleri" per la Scuola di S:Giovanni Evangelista.\nNel 1508 ricevette l\'incarico da Giovanni Bellini, di valutare 

In [None]:
# Keep only the rows for which an Italian page text is present and save them
df_filtered_it = df[df.summary_it.notna()]
df_filtered_it.to_csv("dataset_it_new.csv")

In [None]:
#italian dataset
print(len(df), len(df_filtered_it))

3277 2547


In [None]:
#english dataset
print(len(df), len(df_filtered))

3277 1659


In [None]:
df_filtered_it['summary_it'][876]

'Marco Antonio De Dominis (Arbe, 1560 – Roma, 8 settembre 1624) è stato un arcivescovo cattolico, teologo e scienziato dalmata.'

## English dataset ready for training

In [None]:
# Reload the English dataset from disk (replaces the in-memory df_filtered)
df_filtered = pd.read_csv("data/dataset_en_full.csv")

# Remove the "rubbish sections", and all after them as they are almost exclusively
# in the footnotes anyway
# df_filtered["summary"] = df_filtered.summary.str.split("== [^=]+ ==").str[0]
rubbish_sections = ["References", "See also", "Sources", "Notes", "Bibliography", "Footnotes", "External links"]

# For every unwanted heading keep only the text before its first occurrence;
# applied one heading after another, this cuts at the earliest of them.
for section in rubbish_sections:
  df_filtered["summary"] = df_filtered.summary.str.split(f"== {section} ==").str[0]

# Preview a slice of the cleaned rows
df_filtered[20:50]

Unnamed: 0,item,itemLabel,fnames,lnames,itemDescription,birth,death,professions,summary
20,Q267651,Faustina Bordoni,Regina,Bordoni,opera singer,1697.0,1781.0,"singer, stage actor, opera singer",Faustina Bordoni (30 March 1697 – 4 November 1...
21,Q508079,Simone Luzzatto,Simone,Luzzatto,Italian rabbi,1583.0,1663.0,rabbi,Simone (Simcha) Luzzatto (Hebrew: שמחה לוצאטו‎...
22,Q3847989,Marin Bocconio,Marin,,revolutionary,,1299.0,conspirator,"In 1300, in protest of the Serrata del Maggior..."
23,Q3762999,Giandomenico Coleti,Giovanni,,Italian writer and historian,1727.0,1827.0,"writer, historian","Giovanni Giacomo Coleti or Coletti (May 2, 173..."
24,Q2035003,Andrea Antico,Andrea,Antico,Italian composer and publisher,1470.0,1540.0,"composer, editor","Andrea Antico (also Andrea Antico da Montona, ..."
25,Q20085666,Lucantonio Giunta,Luca,Giunta,Florentine printer publisher active in Venice ...,1457.0,1538.0,publisher,Lucantonio Giunti or Giunta (1457 – 3 April 15...
26,Q2307438,Morto da Feltre,Lorenzo,,Italian painter (1480-1527),1480.0,1527.0,painter,Morto da Feltre was an Italian painter of the ...
27,Q202267,Jacopo Sansovino,Jacopo,,"Italian artist, 1486-1570",1486.0,1570.0,"architect, sculptor",Jacopo d'Antonio Sansovino (2 July 1486 – 27 N...
28,Q774070,Francesco Venier,Francesco,Venier,diplomat (1489-1556),1489.0,1556.0,diplomat,Francesco Venier was the Doge of Venice from 1...
29,Q363741,Gian Antonio Selva,Gian,,Italian architect,1753.0,1819.0,architect,Gian Antonio Selva (2 September 1751 - 22 Janu...


# DATA PREPARATION - GPT2 EN

In [None]:
def data2text(row):
  """Serialise one dataset row into a single training string with control tokens.

  Layout: `` <|start|> label [<|description|> ..] [<|professions|> ..]
  [<|birth|> ..] [<|death|> ..] <|summary|> text<|end|>``. An optional field is
  skipped when its string form is ``"nan"``. Inside the summary every
  ``"\n=="`` is prefixed with ``<|section|>``.
  """
  chunks = [' <|start|> ', row.itemLabel]

  # (control-token name, value) pairs in the order they are emitted
  optional_fields = (
      ('description', row.itemDescription),
      ('professions', row.professions),
      ('birth', row.birth),
      ('death', row.death),
  )
  for tag, value in optional_fields:
    if str(value) == "nan":
      continue
    chunks.append(f' <|{tag}|> {value}')

  body = row.summary.replace("\n==", "<|section|>\n==")
  chunks.extend([" <|summary|> ", body, "<|end|>"])
  return ''.join(chunks)

In [None]:
data2text(df_filtered.loc[46])

' <|start|> Giovanna Dandolo <|birth|> 1500 <|summary|> Giovanna Dandolo was a dogaressa of Venice by marriage to doge Pasquale Malipiero (reign 1457–1462). \nShe was born circa 1400 as the daughter of Antonio Dandolo and married to Pasquale Malipiero in 1414. She had four children: Lorenzo, Antonio, Maddalena and Polo. \nHer spouse was elected doge in 1457.  She was given an elaborate coronation and entry in to Venice as dogaressa in January 1458. As her predecessor before her, Giovanna Dandolo came to play a very public role as dogaressa, performing representational tasks and acting as the protector of trades and individual artists.  She supported the newly introduced art of book printing in Venice, the lace industry of Burano, and acted as a financier for many writers, artists and scientists. She was referred to as the \'Empress of Printing\' and the \'Queen of Lace\' due to her role as the benefactor of these trades.She gathered a circle of \'men of letters\' and writers around her

In [None]:
# Create the dataset from the dataframe:

# We have to create custom dataset class for pytorch to make use of all the automated training pipelines
class WikipediaDataset(torch.utils.data.Dataset):
    """Tokenized ``data2text`` samples exposed as a PyTorch dataset.

    Every item is a dict of tensors built from the tokenizer output plus a
    ``labels`` entry that is the same tensor as ``input_ids``.
    """

    def __init__(self, df, tokenizer):
        """Serialise every row of ``df`` with ``data2text`` and tokenize the result.

        Args:
            df: Dataframe whose rows are accepted by ``data2text``.
            tokenizer: Callable tokenizer; invoked with truncation and padding enabled.
        """
        # The sample is composed of parts of texts, with control tokens in between.
        # Use the dataframe passed in: the previous code read the global
        # df_filtered and silently ignored the `df` argument.
        samples = df.apply(data2text, axis=1)
        samples = [str(sample) for sample in samples.values]

        # We finally tokenize the samples to get the numerical values instead of text
        self.encodings = tokenizer(samples, truncation=True, padding=True)
        # Show the first encoded sample as a quick sanity check
        print(self.encodings['input_ids'][0])

    def __getitem__(self, idx):
        """Return the ``idx``-th encoded sample as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # The inputs double as labels for language-model training
        item['labels'] = item['input_ids']
        return item

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.encodings['input_ids'])

# We have to specify to the tokenizer manually NOT to split the control tokens
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Control tokens emitted by data2text, plus a dedicated padding token
special_tokens = {
    "additional_special_tokens": [
        "<|start|>",
        "<|description|>",
        "<|summary|>",
        "<|professions|>",
        "<|birth|>",
        "<|death|>",
        "<|section|>",
        "<|end|>"
    ],
    "pad_token": "<|pad|>"
}

# Register the tokens; the model's embedding matrix is resized to match in the training cell
tokenizer.add_special_tokens(special_tokens)
# Build the English training set with the GPT-2 tokenizer
dataset = WikipediaDataset(df_filtered, tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…


[50257, 5308, 259, 10424, 314, 11, 2764, 286, 19097, 528, 544, 50258, 12332, 286, 19097, 528, 544, 11, 2764, 3175, 265, 500, 287, 1879, 9304, 544, 50260, 34566, 35128, 50261, 940, 2154, 50262, 1157, 3682, 50259, 5308, 259, 10424, 314, 357, 66, 13, 447, 231, 940, 2154, 784, 1367, 3682, 828, 281, 31836, 286, 262, 15581, 2097, 286, 19097, 528, 544, 357, 5308, 259, 10424, 7274, 30968, 828, 373, 6922, 954, 286, 19097, 528, 544, 422, 13539, 17, 1566, 465, 1918, 13, 679, 635, 2714, 262, 9730, 286, 257, 2764, 6340, 265, 500, 287, 262, 360, 794, 88, 286, 1879, 9304, 544, 355, 880, 355, 35421, 83, 8153, 286, 262, 48665, 378, 286, 11446, 576, 544, 290, 286, 520, 5613, 33128, 287, 262, 2805, 286, 314, 301, 7496, 13, 50263, 855, 5155, 6624, 198, 464, 2185, 259, 10424, 7274, 15581, 1641, 373, 286, 37313, 3699, 8159, 26, 2185, 259, 10424, 338, 2988, 8336, 259, 10424, 318, 12395, 355, 257, 954, 287, 262, 37313, 3699, 350, 5819, 6916, 287, 9796, 22, 13, 383, 30968, 550, 587, 1498, 284, 12831, 1588, 42

In [None]:
test = [data2text(df_filtered.loc[i]) for i in range(10, 20)]
test

[' <|start|> Francesco Aviani <|description|> Italian painter (1662-1715) <|professions|> painter <|birth|> 1662 <|death|> 1715 <|summary|> Francesco Aviani (1662?–1715), a native of Vicenza, flourished about the year 1630. He excelled in painting perspective and architectural views, which were frequently embellished with figures by Giulio Carpioni. His pictures usually represent the most remarkable views in Venice. He also produced some landscapes and seaports.\nHe was born in Venice, most likely on 25 November 1662, by Bernardo and by a Magdalene whose surname is unknown, and was most probably baptized in the cathedral on 3 December  1662. Between 1701 and 1703 he decorated with frescoes (today illegible) the villa Chiericati in Soella (his brother Marco the sculptor was also with him). On 16 October 1703 he married Isabella Carcano. On 26 March 1715 he made a will. On 3 April 1715 he died in Vicenza and it is from his age at the moment of death, about fifty-two years that his date o

# DATA PREPARATION - XLNET EN

In [None]:
# Create the dataset from the dataframe:

# We have to create custom dataset class for pytorch to make use of all the automated training pipelines
class WikipediaDataset(torch.utils.data.Dataset):
    """Tokenized ``data2text`` samples exposed as a PyTorch dataset (XLNet variant).

    Samples are truncated to 1024 tokens and left unpadded. Every item is a
    dict of tensors built from the tokenizer output plus a ``labels`` entry
    that is the same tensor as ``input_ids``.
    """

    def __init__(self, df, tokenizer):
        """Serialise every row of ``df`` with ``data2text`` and tokenize the result.

        Args:
            df: Dataframe whose rows are accepted by ``data2text``.
            tokenizer: Callable tokenizer; invoked with truncation to 1024 tokens
                and without padding.
        """
        # The sample is composed of parts of texts, with control tokens in between.
        # Use the dataframe passed in: the previous code read the global
        # df_filtered and silently ignored the `df` argument.
        samples = df.apply(data2text, axis=1)
        samples = [str(sample) for sample in samples.values]

        # We finally tokenize the samples to get the numerical values instead of text
        self.encodings = tokenizer(samples, truncation=True, padding=False, max_length=1024)

    def __getitem__(self, idx):
        """Return the ``idx``-th encoded sample as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # The inputs double as labels for language-model training
        item['labels'] = item['input_ids']
        return item

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.encodings['input_ids'])

# We have to specify to the tokenizer manually NOT to split the control tokens
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# Control tokens emitted by data2text, plus a dedicated padding token.
# Fixes compared with the previous list:
#  * a missing comma after "<|summary|>" fused it with "<|professions|>" into
#    the single bogus token "<|summary|><|professions|>";
#  * "<|section|>" and "<|end|>" are produced by data2text but were not
#    registered, so they would have been split into sub-word pieces.
special_tokens = {
    "additional_special_tokens": [
        "<|start|>",
        "<|description|>",
        "<|summary|>",
        "<|professions|>",
        "<|birth|>",
        "<|death|>",
        "<|section|>",
        "<|end|>",
    ],
    "pad_token": "<|pad|>"
}

tokenizer.add_special_tokens(special_tokens)
# Build the English training set with the XLNet tokenizer.
# NOTE: this rebinds the notebook-global `tokenizer` and `dataset`.
dataset = WikipediaDataset(df_filtered, tokenizer)

# DATA PREPARATION - GPT2 IT

In [None]:
# Create the dataset from the italian dataframe:
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import AutoTokenizer
import torch

# We have to create custom dataset class for pytorch to make use of all the automated training pipelines
class WikipediaDataset(torch.utils.data.Dataset):
    """Italian samples (label, description, Italian page text) as a PyTorch dataset.

    Every item is a dict of tensors built from the tokenizer output plus a
    ``labels`` entry that is the same tensor as ``input_ids``.
    """

    def __init__(self, df, tokenizer):
        """Concatenate the text columns of ``df`` with control tokens and tokenize them."""
        # Column-wise string concatenation: one sample per dataframe row
        with_label = ' <|start|> ' + df["itemLabel"]
        with_description = with_label + ' <|description|> ' + df["itemDescription"]
        texts = with_description + ' <|summary_it|> ' + df["summary_it"]

        # Turn the text into numerical ids
        self.encodings = tokenizer(list(map(str, texts.values)), truncation=True, padding=True)

    def __getitem__(self, idx):
        """Return the ``idx``-th encoded sample as a dict of tensors."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        # The inputs double as labels for language-model training
        item['labels'] = item['input_ids']
        return item

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.encodings['input_ids'])

# We have to specify to the tokenizer manually NOT to split the control tokens
tokenizer_it = AutoTokenizer.from_pretrained("LorenzoDeMattei/GePpeTto")
# Control tokens used by the Italian samples, plus a dedicated padding token.
# NOTE(review): unlike the English data2text format, the Italian samples carry
# no "<|end|>" marker.
special_tokens = {
    "additional_special_tokens": [
        "<|start|>",
        "<|description|>",
        "<|summary_it|>"
    ],
    "pad_token": "<|pad|>"
}

tokenizer_it.add_special_tokens(special_tokens)
# Build the Italian training set from the rows that have an Italian page text
dataset_it = WikipediaDataset(df_filtered_it, tokenizer_it)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1069.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=546781.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=286907.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=90.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


# MODEL TRAINING GPT2 EN

In [None]:
# English model goes here
# More than one epoch is required to train the model properly!
# NOTE(review): this cell reads the notebook-globals `tokenizer` and `dataset`.
# The XLNet preparation cell rebinds both, so make sure the GPT-2 preparation
# cell was the last of the two to run before training here.

from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    warmup_steps=0,                # number of warmup steps for learning rate scheduler
    weight_decay=0,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    learning_rate=5e-05,
    save_total_limit=1
)

model = GPT2LMHeadModel.from_pretrained("gpt2-medium") # the "gpt2-medium" checkpoint; can be replaced with another size or an Italian model
# Grow the embedding matrix to cover the special tokens added to the tokenizer
model.resize_token_embeddings(len(tokenizer))

# We have everything prepared properly according to the pipeline, so we can train using automatic trainer class
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset         # training dataset
)

trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=718.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1520013706.0, style=ProgressStyle(descr…




Step,Training Loss
100,3.493191
200,1.724989
300,1.383774
400,1.511388
500,1.451156
600,1.498034
700,1.429216
800,1.463037
900,1.519437
1000,1.378483


TrainOutput(global_step=8310, training_loss=1.0864303643953068)

In [None]:
# Write the fine-tuned weights and the extended tokenizer into ./model ...
model.save_pretrained("model")
tokenizer.save_pretrained("model")
# ... and pack that directory into one archive (IPython shell escape)
!tar -czvf model-en-v5.tar.gz model/

model/
model/tokenizer_config.json
model/special_tokens_map.json
model/added_tokens.json
model/pytorch_model.bin
model/config.json
model/merges.txt
model/vocab.json


# MODEL USAGE EN

In [None]:
# Generate based on the trained model
# Same as for the pre-trained one below, we just use different "model" variable
#
# The text is grown one prompt at a time: first the person definition, then one
# "== Section ==" heading per iteration. After every generation step the text
# is cut so that at most one new section survives.

from transformers import pipeline
from tqdm import tqdm

person_definition = "<|start|> Marco Polo <|description|> Painter <|professions|> Painter, Writer <|birth|> 1720 <|death|> 1793 <|summary|> "

sections = [
    "Biography",
    "Death",
    "Legacy"
    ]

prompts = [f"<|section|>== {section} ==" for section in sections]
prompts.insert(0, person_definition)

outputs = ""

# Id of the control token that terminates a sample
end_token = tokenizer.encode("<|end|>")[0]

for i, prompt in enumerate(tqdm(prompts)):
  # NOTE(review): this compares a character count with the 1024-token
  # generation limit, so it is only a rough guard.
  if len(outputs)+len(prompt) >= 1024:
    break

  outputs += prompt

  encoded_prompt = tokenizer.encode(outputs, add_special_tokens=True, return_tensors="pt")
  encoded_prompt = encoded_prompt.to("cuda:0")

  # The input tensor already lives on the GPU; `device` is not a generation
  # argument, so it is no longer passed to generate().
  outputs_enc = model.generate(
      encoded_prompt,
      do_sample=True,
      max_length=1024,
      top_k=50,
      top_p=0.0,
      num_return_sequences=1,
      num_beams=5,
      repetition_penalty=3.0,
      eos_token_id=end_token,
  )[0]

  outputs = tokenizer.decode(outputs_enc)

  # We don't care about end tokens, we may want to add another section afterwards!
  # str.replace returns a new string, so the result has to be assigned back
  # (the previous code discarded it and the end token stayed in the text).
  outputs = outputs.replace("<|end|>", "")

  # Allow only one new section in every iteration
  sections_split = outputs.split("<|section|>")
  outputs = "<|section|>".join(sections_split[:i + 1])
  print(outputs)

print(outputs.split("<|summary|>")[1].replace("<|section|>", "\n\n"))











  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[ASetting `pad_token_id` to `eos_token_id`:50264 for open-end generation.










 25%|██▌       | 1/4 [00:23<01:11, 23.71s/it][A[A[A[A[A[A[A[A[A[ASetting `pad_token_id` to `eos_token_id`:50264 for open-end generation.


<|start|> Marco Polo <|description|> Painter <|professions|> Painter, Writer <|birth|> 1720 <|death|> 1793 <|summary|> Marco Polo (died 1793) was an Italian painter of the late-Baroque period.
Marco Polo was born in Verona and died in Venice. He painted for the church of Santa Maria della Vigna at San Giovanni e Paolo. His works include a series of altarpieces depicting scenes from the life of Marco Polo, as well as frescoes representing the lives of his subjects. 












 50%|█████     | 2/4 [00:38<00:42, 21.06s/it][A[A[A[A[A[A[A[A[A[A

<|start|> Marco Polo <|description|> Painter <|professions|> Painter, Writer <|birth|> 1720 <|death|> 1793 <|summary|> Marco Polo (died 1793) was an Italian painter of the late-Baroque period.
Marco Polo was born in Verona and died in Venice. He painted for the church of Santa Maria della Vigna at San Giovanni e Paolo. His works include a series of altarpieces depicting scenes from the life of Marco Polo, as well as frescoes representing the lives of his subjects. <|section|> == Biography ==
Marco Polo was born in Verona on 16 January 1720.  He is said to have studied under Sebastiano Carpioni; he then went to Rome where he became a pupil of Giacomo Zaccaria. In 1723 he moved to Venice, where he worked mainly with Francesco Contarini, who had been appointed by Pope Clement VIII as ambassador to the Republic of Venice. The following year he returned to Venice, where he painted for the church of San Giorgio e Paolo. On 5 June 1725 he completed a series of altarpieces for the church of Sa

In [None]:
print(outputs.split("<|summary|>")[1].strip())

Marco Polesti (Italian: Мильмена) or "Poland", was a Venetian painter.


== Biography == His father died in 1792 at the age of twenty years and his mother lived until 1819 when he began to work on paintings under an active influence from Venice; some time later this same period saw him becoming well known for many works which are described as being unfinished rather than full engravings by Antonio de Sanzio da Varese who also worked with Piazzetta delle Paolo Luzzo et littoretto). This is considered one such painting that could not be done during their lifetime but it has been shown numerous times before where these exist today."In fact most early attempts were made against them only because they did so long without success".He painted two other large canvases entitled Apella di Domenico e Gallicee d'Alta Festa ("The Sculptor's House"). These had no effect except perhaps making more money while others resulted mainly through commissions paid off already after death," though still impor

# MODEL TRAINING XLNET EN

In [None]:
from transformers import XLNetLMHeadModel, Trainer, TrainingArguments

# NOTE(review): this cell reads the notebook-globals `tokenizer` and `dataset`;
# make sure the XLNet preparation cell was the last one to bind them.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.0,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    fp16 = True,                     # mixed-precision training
    fp16_opt_level = "O3"
)

model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased") # the "xlnet-base-cased" checkpoint; can be replaced with a larger or an Italian model
# Grow the embedding matrix to cover the special tokens added to the tokenizer
model.resize_token_embeddings(len(tokenizer))

# We have everything prepared properly according to the pipeline, so we can train using automatic trainer class
# NOTE(review): the recorded training loss drops to ~1e-4 within a few hundred
# steps. Presumably labels identical to input_ids without a permutation/target
# mask let the model copy its input; verify before trusting this model.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset         # training dataset
)

trainer.train()



Step,Training Loss
100,0.576928
200,0.006851
300,0.018916
400,0.0162
500,0.012494
600,0.003791
700,9.6e-05
800,8e-05
900,0.004183
1000,0.001504


TrainOutput(global_step=1662, training_loss=0.0387805429176303)

# MODEL TRAINING GPT-2 IT

In [None]:
  #Italian training
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, AutoModelWithLMHead

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.0,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
)

model_it =  AutoModelWithLMHead.from_pretrained("LorenzoDeMattei/GePpeTto") # the Italian GePpeTto checkpoint
# Grow the embedding matrix to cover the special tokens added to tokenizer_it
model_it.resize_token_embeddings(len(tokenizer_it))

# We have everything prepared properly according to the pipeline, so we can train using automatic trainer class
trainer = Trainer(
    model=model_it,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset_it         # training dataset
)

trainer.train()



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=485894375.0, style=ProgressStyle(descri…




Step,Training Loss
100,1.588297
200,0.216704
300,0.307778
400,0.226794
500,0.249033
600,0.20528
700,0.221912
800,0.209601
900,0.194266
1000,0.210316


TrainOutput(global_step=1274, training_loss=0.32646845124392904)

# MODEL USAGE IT

In [None]:
model_cpu = model_it.cpu()

In [None]:
# Italian model goes here

from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline, GPT2Tokenizer


# Wrap the fine-tuned Italian model (moved to CPU in the previous cell) and its
# tokenizer in a text-generation pipeline
text_generator = pipeline('text-generation', model=model_cpu, tokenizer=tokenizer_it)
# Prompts follow the training layout and stop right after the summary token,
# so the model continues with the biography text
prompts = ["<|start|> Antonio Giggi <|description|> Religioso  <|summary_it|>",
           "<|start|> Leonardo Lombardi <|description|> scrittore  <|summary_it|>",
           "<|start|> Mario Luigi <|description|> Commerciante  <|summary_it|>"]


samples_outputs = text_generator(
    prompts,
    do_sample=True,
    max_length=128,
    device=0, # NOTE(review): intended as "predict on GPU", but the model was moved to CPU above and this is passed at call time, not to pipeline(); confirm it has any effect
    top_k=50,
    top_p=0.95,
    num_return_sequences=1
)

# print(samples_outputs[0]["generated_text"])

In [None]:
for it in samples_outputs:
  print(it)

[{'generated_text': '<|start|> Antonio Giggi <|description|> Religioso  <|summary_it|>Antonio Giggi (Venezia, 1603 – Venezia, 1753) è stato un monaco benedettino italiano. Gili di Lombardia, 1581 – Venezia, 1599. da una tradizione nota come "Passa di terra" e "poverosa", anche il culto di San Giuseppe e del Bambino della Madonna del Pianto.quoto di Spagna, 1585 – Venezia, 1599. \nSanto Gigliola Zeno, nel 1583, pubblicava una traduzione in lingua'}]
[{'generated_text': '<|start|> Leonardo Lombardi <|description|> scrittore  <|summary_it|>Leonardo Lombardi (Venezia, 20 febbraio 1629 – 1692) è stato uno scrittore italiano. da una lettera indirizzata a Don Francesco Malipiero di San Secondo, a Don Domenico Cavrila, a Girolamo Falesco, alla poetessa italiana Marita Cappelli e ai genitori della celebre scrittrice italiana Elena Candida e al nipote Bernardo Falesco. da due lettere conservate dalla casa editrice nel 1693. scrive anche che il poeta avrebbe scritto in stampella:\nLa lettera vien