In [None]:
!pip install transformers
!pip install datasets

In [7]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the pretrained Spanish GPT-2 tokenizer from the Hugging Face Hub.
base_tokenizer = AutoTokenizer.from_pretrained("PlanTL-GOB-ES/gpt2-base-bne")

# NOTE(review): a later cell reloads the weights into `base_model` with a custom
# config, so this instance appears to be replaced before training — confirm it is needed.
base_model = AutoModelForCausalLM.from_pretrained("PlanTL-GOB-ES/gpt2-base-bne")

In [None]:
# Define the BOS, EOS and PAD special tokens used to delimit each training example.
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}

# Register the special tokens with the tokenizer (the returned count is not used afterwards).
num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)

# Model configuration carrying the ids of the special tokens registered above.
config = AutoConfig.from_pretrained("PlanTL-GOB-ES/gpt2-base-bne", 
                                    bos_token_id=base_tokenizer.bos_token_id,
                                    eos_token_id=base_tokenizer.eos_token_id,
                                    pad_token_id=base_tokenizer.pad_token_id,
                                    output_hidden_states=False)

# Load the pretrained weights with the customised configuration.
base_model = GPT2LMHeadModel.from_pretrained("PlanTL-GOB-ES/gpt2-base-bne", config=config)

# Resize the embedding matrix so it matches the tokenizer's vocabulary size.
base_model.resize_token_embeddings(len(base_tokenizer))

In [None]:
# Read the title and content columns of the poems CSV; 'title' is renamed to 'text'
# because the rest of the title pipeline works on a column called 'text'.
df = pd.read_csv('/content/drive/MyDrive/Data/NLP/poems.csv', encoding = 'utf-8', usecols=['title', 'content'])\
                    .rename(columns={'title': 'text'})

In [None]:
def process_headlines(df, text_colname):
    """Return a cleaned, single-column frame of unique titles.

    Rows whose `text_colname` value is null or the empty string are dropped,
    duplicate titles are removed (the first occurrence is kept) and every
    remaining title is normalised with `str.capitalize` (first character
    upper-cased, the rest lower-cased).

    Args:
        df: Input DataFrame; it is not modified.
        text_colname: Name of the column holding the titles.

    Returns:
        A new DataFrame containing only `text_colname`, keeping the original
        index labels of the surviving rows.
    """
    is_empty = df[text_colname].isna() | (df[text_colname].str.len() == 0)

    # `.loc` with an explicit column list yields an independent frame, so the
    # transformation below never writes into a slice of the caller's data
    # (the original column assignment raised SettingWithCopyWarning).
    unique_titles = (
        df.loc[~is_empty, [text_colname]]
          .drop_duplicates(subset=[text_colname])
    )

    return unique_titles.assign(
        **{text_colname: unique_titles[text_colname].str.capitalize()}
    )
    
# Replace the raw frame with the cleaned titles; only the 'text' column is kept.
df = process_headlines(df, 'text')

In [None]:
# Wrap every title between the BOS and EOS markers.
# NOTE(review): this overwrites df['text'], so re-running the cell wraps the markers twice.
df['text'] = bos + ' ' + df['text'] + ' ' + eos
# Hold out 10% of the titles for validation; the fixed seed keeps the split reproducible.
df_train, df_val = train_test_split(df, train_size = 0.9, random_state = 77)
print(f'Hay {len(df_train)} titulos de poema y {len(df_val)} para la validación')

In [None]:
# Build the Hugging Face datasets directly from the pandas frames (text column only).
train_dataset = Dataset.from_pandas(df_train[['text']])
val_dataset = Dataset.from_pandas(df_val[['text']])

In [None]:
def tokenize_function(examples):
  """Tokenize the 'text' field of a batch of examples with `base_tokenizer`.

  No padding is applied here: the language-modelling data collator used for
  training pads each training batch to its own longest sequence, so padding
  at tokenization time only stores extra pad tokens (each `map` batch would be
  padded to its longest title). This also matches the tokenization used for
  the poem-body model further down the notebook.

  Args:
    examples: Batch mapping with a 'text' entry holding a list of strings.

  Returns:
    The tokenizer output for the batch.
  """
  return base_tokenizer(examples['text'])

# Tokenize the training split in batches using 5 worker processes; the raw 'text'
# column is dropped so only the tokenizer outputs remain.
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=5,
    remove_columns=['text'],
)
# Same tokenization for the validation split.
tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=5,
    remove_columns=['text'],
)

In [None]:
# Directory where the title model's checkpoints, logs and final weights are written.
model_headlines_path = '/content/drive/MyDrive/Data/NLP/Base_BNE/Titulo'

# Hyper-parameters for fine-tuning the title model. Logs go to the same
# directory as the checkpoints.
training_args = TrainingArguments(
    output_dir=model_headlines_path,          # output directory
    num_train_epochs=2,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=model_headlines_path,            # directory for storing logs
    prediction_loss_only=True,       # evaluation reports only the loss
    save_steps=2300                  # checkpoint interval, in training steps
)

In [None]:
# Batch collator for causal language modelling: mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(
        tokenizer=base_tokenizer,
        mlm=False
    )

In [None]:
# Assemble the trainer for the title model and run the fine-tuning.
trainer = Trainer(
    model=base_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,         # training dataset
    eval_dataset=tokenized_val_dataset            # evaluation dataset
)
trainer.train()

In [None]:
# Persist the fine-tuned title model, and store the tokenizer (with its added
# special tokens) in the title model directory so both can be reloaded together.
trainer.save_model()
base_tokenizer.save_pretrained(model_headlines_path)

In [None]:
# Evaluate the fine-tuned title model on the validation split passed to the trainer.
trainer.evaluate()

In [10]:
def generar_poema(model, tokenizer, input_text, **generate_kwargs):
  """Sample one continuation of `input_text` and return it as plain text.

  Args:
    model: Object exposing `generate(input_ids, **kwargs)`.
    tokenizer: Object exposing `encode(text, return_tensors=...)` and
      `decode(ids, skip_special_tokens=...)`.
    input_text: Prompt the generation starts from.
    **generate_kwargs: Optional overrides for the default sampling settings
      below (for example `max_length=60`); they are forwarded to
      `model.generate`.

  Returns:
    The first generated sequence decoded to a string, special tokens removed.
  """
  # Default sampling configuration. `early_stopping` is intentionally not set:
  # it only affects beam search and is ignored when sampling with one beam.
  sampling_args = {
      'max_length': 125,
      'num_return_sequences': 1,
      'no_repeat_ngram_size': 2,
      'repetition_penalty': 1.5,
      'top_p': 0.92,
      'temperature': 0.95,
      'do_sample': True,
      'top_k': 125,
  }
  sampling_args.update(generate_kwargs)

  text_ids = tokenizer.encode(input_text, return_tensors='pt')
  generated_text_samples = model.generate(text_ids, **sampling_args)

  # Only one sequence is requested by default, so the first one is the result.
  return tokenizer.decode(generated_text_samples[0], skip_special_tokens=True)

In [None]:
# Reload the fine-tuned title model and its tokenizer from the saved directory.
model_headlines_path = '/content/drive/MyDrive/Data/NLP/Base_BNE/Titulo'


headlines_model = AutoModelForCausalLM.from_pretrained(model_headlines_path)
headlines_tokenizer = AutoTokenizer.from_pretrained(model_headlines_path)

In [26]:
# Prompt the title model with the BOS token followed by a seed word, then print the result.
input_text = headlines_tokenizer.bos_token + 'Las'
titulo = generar_poema(headlines_model, headlines_tokenizer, input_text)
print(titulo)

Las leyes de la muerte 


In [None]:
# Special tokens for the poem-body model: BOS/EOS delimit each poem and
# `body` is registered as the separator token.
bos = '<|endoftext|>'
eos = '<|EOS|>'
body = '<|body|>'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'sep_token': body}

# Register the special tokens with the tokenizer.
num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)

# Make sure a pad token exists BEFORE its id is copied into the model config.
# Previously this check ran after the config and the model were built, so when
# the tokenizer had no pad token yet the config kept `pad_token_id=None`.
if base_tokenizer.pad_token is None:
    base_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Model configuration carrying the ids of the special tokens registered above.
config = AutoConfig.from_pretrained("PlanTL-GOB-ES/gpt2-base-bne",
                                    bos_token_id=base_tokenizer.bos_token_id,
                                    eos_token_id=base_tokenizer.eos_token_id,
                                    pad_token_id=base_tokenizer.pad_token_id,
                                    output_hidden_states=False)

# Load fresh pretrained weights with the customised configuration.
base_model = GPT2LMHeadModel.from_pretrained("PlanTL-GOB-ES/gpt2-base-bne", config=config)

# Resize the embedding matrix so it matches the tokenizer's vocabulary size.
base_model.resize_token_embeddings(len(base_tokenizer))

In [None]:
def tokenize_function(examples):
  """Tokenize the 'text' field of a batch of poems with `base_tokenizer`.

  Truncation is enabled so that a long poem cannot produce more tokens than
  the tokenizer's configured maximum length; without it, over-long sequences
  reach the model unchanged.
  NOTE(review): the cap comes from `base_tokenizer.model_max_length` — confirm
  it matches the model's context size (a commented-out variant used 1024).

  Args:
    examples: Batch mapping with a 'text' entry holding a list of strings.

  Returns:
    The tokenizer output for the batch.
  """
  return base_tokenizer(examples['text'], truncation=True)

In [None]:
# Reload the poems CSV for the poem-body model; 'title' is renamed to 'text'.
df2 = pd.read_csv('/content/drive/MyDrive/Data/NLP/poems.csv', encoding = 'utf-8', usecols=['title', 'content'])\
                    .rename(columns={'title': 'text'})

In [None]:
def process_poems(df, title_col, content_col, max_words=100):
    """Return a cleaned copy of the poems frame.

    Rows with a null or empty title or content are dropped, duplicate titles
    are removed (the first occurrence is kept), titles are normalised with
    `str.capitalize`, and each poem is cut to its first `max_words`
    space-separated words.

    Args:
        df: Input DataFrame; it is not modified.
        title_col: Name of the column holding the titles.
        content_col: Name of the column holding the poem bodies.
        max_words: Maximum number of space-separated words kept per poem.

    Returns:
        A new DataFrame with the same columns as `df`, keeping the original
        index labels of the surviving rows.
    """
    title_missing = df[title_col].isna() | (df[title_col].str.len() == 0)
    content_missing = df[content_col].isna() | (df[content_col].str.len() == 0)

    kept = (
        df.loc[~(title_missing | content_missing)]
          .drop_duplicates(subset=[title_col])
    )

    # `assign` builds a new frame instead of writing into the filtered slice,
    # which is what raised SettingWithCopyWarning in the original version.
    return kept.assign(**{
        title_col: kept[title_col].str.capitalize(),
        # Splitting on a single space (not arbitrary whitespace) keeps the
        # line breaks inside the poem attached to their words.
        content_col: kept[content_col].str.split(' ').str[:max_words].str.join(' '),
    })

In [None]:
# Clean the raw frame before building the training text. process_poems drops
# rows with a null/empty title or content (a null content would make the
# ' '.join below fail), removes duplicate titles and caps each poem's length.
df2 = process_poems(df2, 'text', 'content')

# Wrap each poem body between the BOS and EOS markers.
prepare_text = lambda x: ' '.join([bos, x['content'], eos])
#prepare_text = lambda x: ' '.join([bos, x['title'], body, x['content'], eos])
df2['text'] = df2.apply(prepare_text, axis=1)

# Positional access: after cleaning, the index label 0 may no longer exist.
print(df2['text'].iloc[0])
# Split into train and validation (90/10, fixed seed for a reproducible split).
df_train, df_val = train_test_split(df2, train_size = 0.9, random_state = 77)

# Build the Hugging Face datasets from the pandas frames (text column only).
train_dataset = Dataset.from_pandas(df_train[['text']])
val_dataset = Dataset.from_pandas(df_val[['text']])

# Tokenize both splits in batches.
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1
)

tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1
)

In [None]:
# Directory where the poem-body model's checkpoints, logs and final weights are written.
model_poems_path = '/content/drive/MyDrive/Data/NLP/Base_BNE/Content'

In [None]:
# Hyper-parameters for fine-tuning the poem-body model. Logs go to the same
# directory as the checkpoints.
training_args = TrainingArguments(
    output_dir=model_poems_path,          # output directory
    num_train_epochs=2,              # total # of training epochs
    per_device_train_batch_size=3,  # batch size per device during training
    per_device_eval_batch_size=48,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=model_poems_path,            # directory for storing logs
    prediction_loss_only=True,       # evaluation reports only the loss
    save_steps=5000                  # checkpoint interval, in training steps
)

# Batch collator for causal language modelling: mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(
        tokenizer=base_tokenizer,
        mlm=False
    )

# Trainer for the poem-body model; training itself is started in the next cell.
trainer = Trainer(
    model=base_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,         # training dataset
    eval_dataset=tokenized_val_dataset,            # evaluation dataset
    
)

In [None]:
# Run the fine-tuning of the poem-body model.
trainer.train()

In [None]:
# Persist the fine-tuned poem-body model, and store the tokenizer (with its
# added special tokens) in the poem model directory.
trainer.save_model()
base_tokenizer.save_pretrained(model_poems_path)

In [8]:
# Title model: reload the fine-tuned weights and tokenizer.
model_headlines_path = '/content/drive/MyDrive/Data/NLP/Base_BNE/Titulo'

headlines_model = AutoModelForCausalLM.from_pretrained(model_headlines_path)
headlines_tokenizer = AutoTokenizer.from_pretrained(model_headlines_path)
# NOTE(review): `device` is assigned here but nothing in this cell moves the models to it.
device = "cuda"
# The title is generated from the bare BOS token (no seed word).
input_text = headlines_tokenizer.bos_token

# Poem-body model: reload the fine-tuned weights and tokenizer.
model_poems_path = '/content/drive/MyDrive/Data/NLP/Base_BNE/Content'
poem_model = AutoModelForCausalLM.from_pretrained(model_poems_path)
poem_tokenizer = AutoTokenizer.from_pretrained(model_poems_path)

Downloading:   0%|          | 0.00/973 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/250 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [None]:
def pretty_print(text, max_len_line=100):
    words = text.split(' ')
    len_line = 0
    line = ''
    for w in words:
        if w == '\n':
            print(line)
            line = ''
            continue
        if (len(line) + len(w)) > max_len_line:
            print(line)
            line = ''
        line += ' ' + w
    print(line)

In [None]:
def _recortar_a_frases(texto):
    """Cut `texto` to run from its first uppercase letter through its last '.' or '?'.

    Returns '' when there is no uppercase letter, or no terminator after it.
    """
    inicio = next((i for i, caracter in enumerate(texto) if caracter.isupper()), None)
    final = max(texto.rfind('.'), texto.rfind('?'))
    if inicio is None or final < inicio:
        return ''
    return texto[inicio:final + 1]


MIN_PALABRAS = 70   # minimum accepted poem length, in space-separated words
MAX_INTENTOS = 50   # upper bound on sampling attempts so the cell always terminates

# Generate the title first, then prompt the poem model with "<title> <bos>".
tituloV = generar_poema(headlines_model, headlines_tokenizer, input_text)
tituloT = tituloV + ' ' + poem_tokenizer.bos_token

content = ''
mejor_n_palabras = 0
for _ in range(MAX_INTENTOS):
    borrador = generar_poema(poem_model, poem_tokenizer, tituloT)
    # Remove the title echoed back from the prompt.
    borrador = borrador.replace(tituloV, "")
    # Keep only whole sentences. The original loop tested `isupper` without
    # calling it (always truthy) and could reuse or miss `inicio`/`final`.
    borrador = _recortar_a_frases(borrador)

    n_palabras = len(borrador.split(" ")) if borrador else 0
    # Remember the longest candidate so there is something to show even if
    # no attempt reaches the minimum length.
    if n_palabras > mejor_n_palabras:
        content, mejor_n_palabras = borrador, n_palabras
    if mejor_n_palabras >= MIN_PALABRAS:
        break
else:
    print(f'Aviso: ningún intento alcanzó {MIN_PALABRAS} palabras; se muestra el más largo.')

# Title in bold (ANSI escape codes), followed by the wrapped poem.
print('\n\n\033[1m' + tituloV + '\033[0m\n')
pretty_print(content)
print()