# KELM

## Instalamos dependencias y bibliotecas

In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 28.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 3.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

In [None]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output


In [None]:
# Download the KELM corpus from the Hugging Face hub (or reuse the local cache
# on later calls) and keep all of its splits in `dataset_kelm`.
from datasets import load_dataset
dataset_kelm = load_dataset("kelm")

Downloading builder script:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset kelm/default (download: 1.52 GiB, generated: 1.56 GiB, post-processed: Unknown size, total: 3.08 GiB) to /root/.cache/huggingface/datasets/kelm/default/0.0.0/1c5fee915d5686ab150ef3f41b364abc402845c3721d8e028570153a54431fe2...


Downloading data:   0%|          | 0.00/1.31G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/163M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/163M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6371131 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/796471 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/796493 [00:00<?, ? examples/s]

Dataset kelm downloaded and prepared to /root/.cache/huggingface/datasets/kelm/default/0.0.0/1c5fee915d5686ab150ef3f41b364abc402845c3721d8e028570153a54431fe2. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from datasets import get_dataset_infos


In [None]:
# Display the dataset metadata for KELM (description, citation, features, ...).
get_dataset_infos('kelm')

Using custom data configuration default


{'default': DatasetInfo(description='Data-To-Text Generation involves converting knowledge graph (KG) triples of the form (subject, relation, object) into\na natural language sentence(s). This dataset consists of English KG data converted into paired natural language text.\nThe generated corpus consists of ∼18M sentences spanning ∼45M triples with ∼1500 distinct relations.\n', citation='@misc{agarwal2020large,\n      title={Large Scale Knowledge Graph Based Synthetic Corpus Generation for Knowledge-Enhanced Language Model Pre-training},\n      author={Oshin Agarwal and Heming Ge and Siamak Shakeri and Rami Al-Rfou},\n      year={2020},\n      eprint={2010.12688},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n', homepage='https://github.com/google-research-datasets/KELM-corpus', license='', features={'triple': Value(dtype='string', id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='kelm', c

In [None]:
# Pull the individual splits out of the loaded dataset.
# NOTE(review): despite the `train_` prefix, `train_test` holds the *test* split
# and `train_val` holds the *validation* split.
train_kelm = dataset_kelm['train']
train_test = dataset_kelm['test']
train_val = dataset_kelm['validation']

In [None]:
# Inspect one raw training example (a dict with 'sentence' and 'triple' keys).
train_kelm[73]

{'sentence': "The company maintains a botanical garden, the Jardin botanique Yves Rocher de La Gacilly, at Yves Rocher ( company )'s industrial site in La Gacilly.",
 'triple': 'Yves Rocher ( company ) location of formation La Gacilly, headquarters location La Gacilly'}

In [None]:
# Show the training split summary: column names and number of rows.
train_kelm

Dataset({
    features: ['triple', 'sentence'],
    num_rows: 6371131
})

## Limpieza y preprocesamiento de los datos

In [None]:
import json

In [None]:
def _repair_mojibake(text):
    """Reverse 'UTF-8 bytes read as Latin-1' mojibake in a single string.

    If the text cannot be round-tripped (it holds characters outside Latin-1,
    or its Latin-1 bytes are not valid UTF-8), it is assumed to be correctly
    decoded already and is returned unchanged instead of raising.
    """
    try:
        return text.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text


def toUTF8(example):
    """Repair the text encoding of one dataset example.

    Args:
        example: mapping with string fields 'sentence' and 'triple'.

    Returns:
        The same mapping (updated in place) with both fields re-decoded as
        UTF-8 where that is possible, and left untouched otherwise.
    """
    example['sentence'] = _repair_mojibake(example['sentence'])
    example['triple'] = _repair_mojibake(example['triple'])
    return example


In [None]:
# Run the encoding repair over every training example and keep the result
# under a separate name (`kelm_train`).
kelm_train = train_kelm.map(toUTF8)

In [None]:
# Spot-check a repaired sentence that contains an accented character.
print(kelm_train[99]['sentence'])

Lucius Jacques Dupré was born in St. Landry Parish, Louisiana, and served as a state court judge in 1853.


In [None]:
# Pretty-print one repaired example as JSON.
print(json.dumps(kelm_train[72], indent=4, sort_keys=True))

{
    "sentence": "Albert Solomonovich Schwarz (/wrts/; Russian: . . ; born June 24, 1934) is a mathematician and a theoretical physicist educated in the Soviet Union and now a professor at the University of California, Davis.",
    "triple": "Albert Schwarz occupation Mathematician, occupation university teacher, occupation Physicist, date of birth 24 June 1934, Kazan located in the administrative territorial entity Soviet Union"
}


In [None]:
# Pretty-print an example from `train_kelm`; when cells run in order this is
# still the original (unrepaired) split at this point.
print(json.dumps(train_kelm[6], indent=4, sort_keys=True))

In [None]:
# From here on `train_kelm` refers to the encoding-repaired dataset.
train_kelm = kelm_train

In [None]:
# Number of examples in the training split.
print(len(train_kelm))

In [None]:
# How many training examples (taken from the start of the split) are used for fine-tuning.
SIZE = 100000

In [None]:
# Peek at the first five rows.
train_kelm[:5]

In [None]:
# Row count of the training split, shown as the cell output.
len(train_kelm)

In [None]:
# Take the first SIZE rows once and read both columns from that single slice,
# instead of slicing the dataset separately for each column.
subset_kelm = train_kelm[:SIZE]
tripleset_kelm = subset_kelm['triple']
text_kelm = subset_kelm['sentence']


## Carga del tokenizer y del modelo

In [None]:
# AutoModelForSeq2SeqLM replaces the deprecated AutoModelWithLMHead for
# encoder-decoder checkpoints such as T5.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and pretrained weights both come from the "t5-small" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [None]:
# Choose the compute device: the first CUDA GPU when one is available, else the CPU.
use_gpu = torch.cuda.is_available()
dev = torch.device("cuda:0" if use_gpu else "cpu")
print("Running on the GPU" if use_gpu else "Running on the CPU")

# Move the model to the chosen device; as the last expression of the cell,
# the call's return value is shown as the cell output.
model.to(dev)

Running on the CPU


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [None]:

from IPython.display import HTML, display

def progress(loss,value, max=100):
    """Build the HTML for a loss readout followed by a full-width progress bar.

    Args:
        loss: value shown after "Batch loss :".
        value: current position of the bar (also used as its fallback text).
        max: value at which the bar is full (defaults to 100).

    Returns:
        An IPython ``HTML`` object that can be passed to ``display`` / ``update``.
    """
    # No comma between attributes: HTML attributes are separated by whitespace only.
    return HTML(""" Batch loss :{loss}
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(loss=loss,value=value, max=max))

In [None]:
# Training schedule: passes over the data and examples per optimizer step.
num_of_epochs, batch_size = 1, 4

# Whole batches that fit in the SIZE selected examples (any remainder is dropped).
num_of_batches = int(SIZE / batch_size)

In [None]:
from transformers.optimization import Adafactor

# Adafactor with a fixed, explicitly supplied learning rate: relative-step
# sizing, parameter scaling and warm-up initialisation are all switched off.
adafactor_settings = dict(
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)
optimizer = Adafactor(model.parameters(), **adafactor_settings)

## Entrenamiento del modelo

In [None]:
# Put the model in training mode.
model.train()

# Loss of every 10th batch, kept for later inspection.
loss_per_10_steps=[]
for epoch in range(1,num_of_epochs+1):
  print('Running epoch: {}'.format(epoch))

  running_loss=0

  # progress() takes (loss, value, max): start with an empty bar sized for this epoch.
  out = display(progress('n/a', 0, num_of_batches+1), display_id=True)
  for i in range(num_of_batches):
    start = i*batch_size
    stop = start+batch_size

    # NOTE(review): '<eos>' is prepended to the training inputs but not to the
    # prompts used in the inference cells further down -- confirm this is intended.
    source_texts = ['<eos>' + triple + '</s>' for triple in tripleset_kelm[start:stop]]
    target_texts = [sentence + '</s>' for sentence in text_kelm[start:stop]]

    # truncation=True is needed for max_length to actually cap the sequence length.
    encoded_inputs = tokenizer.batch_encode_plus(source_texts,padding=True,truncation=True,max_length=400,return_tensors='pt')
    inputbatch = encoded_inputs["input_ids"].to(dev)
    # The mask keeps the encoder from attending to padding positions.
    inputmask = encoded_inputs["attention_mask"].to(dev)

    labelbatch = tokenizer.batch_encode_plus(target_texts,padding=True,truncation=True,max_length=400,return_tensors="pt")["input_ids"]
    # Padding positions in the labels are set to -100 so the loss ignores them.
    labelbatch[labelbatch == tokenizer.pad_token_id] = -100
    labelbatch = labelbatch.to(dev)

    # Clear the gradients accumulated by the previous step.
    optimizer.zero_grad()

    # Forward pass; passing labels makes the model return the training loss.
    outputs = model(input_ids=inputbatch, attention_mask=inputmask, labels=labelbatch)
    loss = outputs.loss
    loss_num = loss.item()
    running_loss += loss_num
    if i%10 == 0:
      loss_per_10_steps.append(loss_num)
    out.update(progress(loss_num,i, num_of_batches+1))

    # Backward pass, then parameter update.
    loss.backward()
    optimizer.step()

  # Mean batch loss for the epoch (guarded so zero batches does not divide by zero).
  running_loss = running_loss/max(num_of_batches, 1)
  print('Epoch: {} , Running loss: {}'.format(epoch,running_loss))
  

## Guardo el modelo

In [None]:
# Persist the fine-tuned model if no checkpoint directory exists yet, then
# load the checkpoint from disk so the cells below always run on saved weights.
# NOTE(review): an existing "T5_KELM" directory is never overwritten, so in that
# case the weights trained above are replaced by the stored ones.
MODEL_DIR = "T5_KELM"
if not os.path.isdir(MODEL_DIR):
    model.save_pretrained(MODEL_DIR)
model = model.from_pretrained(MODEL_DIR)
model.to(dev)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

## Prueba del modelo

In [None]:
# Switch to inference mode (disables dropout).
model.eval()
input_ids = tokenizer.encode("William Shakespeare birthplace Stratford-upon-Avon, birthdate april 1564, </s>", return_tensors="pt")  # Batch size 1
input_ids=input_ids.to(dev)
# An explicit max_length keeps generate() from stopping at its short default
# length limit before the sentence is finished.
outputs = model.generate(input_ids, max_length=128)
# Drop '<pad>' / '</s>' markers from the decoded text.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
result

'<pad> William Shakespeare was born in Stratford-upon-Avon on 1564.</s>'

In [None]:
# Switch to inference mode (disables dropout).
model.eval()
input_ids = tokenizer.encode("William Shakespeare birthplace Stratford-upon-Avon, birthdate april 1564, occupation playwright </s>", return_tensors="pt")  # Batch size 1
input_ids=input_ids.to(dev)
# An explicit max_length keeps generate() from stopping at its short default
# length limit before the sentence is finished.
outputs = model.generate(input_ids, max_length=128)
# Drop '<pad>' / '</s>' markers from the decoded text.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
result

'<pad> William Shakespeare was born in Stratford-upon-Avon on april'

In [None]:
# Switch to inference mode (disables dropout).
model.eval()
input_ids = tokenizer.encode("Mary likes football, play football with friends </s>", return_tensors="pt")  # Batch size 1
input_ids=input_ids.to(dev)
# An explicit max_length keeps generate() from stopping at its short default
# length limit before the sentence is finished.
outputs = model.generate(input_ids, max_length=128)
# Drop '<pad>' / '</s>' markers from the decoded text.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
result

"<pad> Mary likes football, play football with the club, ''Facility'"