# Import Libraries

In [2]:
# Mount Google Drive at /content/drive (needs the google.colab package).
# NOTE(review): no other cell visible here reads from /content/drive — confirm
# the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
# from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import TextDataset, DataCollatorForLanguageModeling
import math

In [3]:
# Read the whole raw poem corpus into memory.
# The context manager closes the handle (it was previously left open), and the
# explicit encoding stops the Devanagari text from depending on the platform's
# default codec.
with open('/content/poem.txt', 'r', encoding='utf-8') as poem_file:
    poem = poem_file.read()

In [4]:
# One corpus entry per line of the file; if the text ends with a newline the
# split leaves a final empty string in the list.
poem_corpus = poem.split("\n")
print(poem_corpus[:5])  # preview the first five entries

['नछाडी जानोस् हे मेरा प्राण ! अकेली मलाई,', 'मनको वनमा ननिभ्ने गरी विरह जलाई !', 'ननिभ्ने गरी विरह जलाई,', 'लोचनका तारा ! हे मेर प्यारा ! यो जोति  बिलाए !', 'के भनूँ? भन्ने म केही थिइन  विष नै पिलाए !']


In [5]:
def remove_noise(sentences):
    """Strip digits and invisible formatting characters from each sentence.

    Characters removed: newline, the byte-order mark (U+FEFF), the ASCII
    digits 0-9, the Devanagari digits (U+0966-U+096F) and the zero-width
    joiner (U+200D). A separate entry for the two-digit Devanagari "10" is
    unnecessary because both of its digits are removed individually.

    Args:
        sentences: Iterable of strings to clean.

    Returns:
        A new list holding the cleaned strings in input order.
    """
    # Derive the Devanagari digits from their code points instead of typing
    # them as literals, so the filter keeps matching real text even if this
    # file is ever decoded or re-saved with the wrong encoding.
    devanagari_digits = ''.join(chr(code) for code in range(0x0966, 0x0970))
    noise_chars = '\n\ufeff\u200d' + '0123456789' + devanagari_digits

    # Every noise item is a single character, so one translate() pass does the
    # work of the whole chain of per-item replace() calls.
    deletion_table = str.maketrans('', '', noise_chars)
    return [sentence.translate(deletion_table) for sentence in sentences]

In [6]:
# Clean every corpus line with remove_noise(); the raw list is kept unchanged
# in poem_corpus.
processed_poem_corpus = remove_noise(poem_corpus)
print(processed_poem_corpus[:5])  # preview the first five cleaned entries

['नछाडी जानोस् हे मेरा प्राण ! अकेली मलाई,', 'मनको वनमा ननिभ्ने गरी विरह जलाई !', 'ननिभ्ने गरी विरह जलाई,', 'लोचनका तारा ! हे मेर प्यारा ! यो जोति  बिलाए !', 'के भनूँ? भन्ने म केही थिइन  विष नै पिलाए !']


In [7]:
# Persist the cleaned corpus, one entry per line.
# UTF-8 is requested explicitly so the Devanagari text is written the same way
# regardless of the platform's default codec.
# NOTE: this path is relative, while the training configuration points at
# /content/processed_poem.txt — that only matches when the working directory
# is /content.
with open('processed_poem.txt', 'w', encoding='utf-8') as f:
  for line in processed_poem_corpus:
    f.write(line + '\n')

In [8]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Wrap a text file in a ``TextDataset``.

    Args:
        file_path: Path of the text file, forwarded as ``file_path``.
        tokenizer: Tokenizer object, forwarded as ``tokenizer``.
        block_size: Forwarded as ``block_size`` (defaults to 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)


def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer object, forwarded as ``tokenizer``.
        mlm: Forwarded as ``mlm`` (defaults to ``False``).

    Returns:
        The constructed collator instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


In [24]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  """Fine-tune a pretrained causal language model on a text file.

  Args:
      train_file_path: Path of the training text file, handed to load_dataset().
      model_name: Name or path given to ``from_pretrained`` for both the
          tokenizer and the model.
      output_dir: Directory that receives the tokenizer, the model and the
          trainer's output.
      overwrite_output_dir: Forwarded to ``TrainingArguments``.
      per_device_train_batch_size: Forwarded to ``TrainingArguments``.
      num_train_epochs: Forwarded to ``TrainingArguments``.
      save_steps: Forwarded to ``TrainingArguments``. It was previously
          accepted but dropped, so the caller's setting had no effect.

  Returns:
      None. The function is used for its side effects (training and saving).
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  # Keep the tokenizer next to the model so output_dir can later be used as
  # the single path for loading both.
  tokenizer.save_pretrained(output_dir)

  model = AutoModelForCausalLM.from_pretrained(model_name)

  # Snapshot of the starting weights, written before any training happens.
  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
          save_steps=save_steps,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  trainer.train()
  trainer.save_model()

In [26]:
# Training configuration: edit these values before running the training cell.
# Each one is passed to train() under the same keyword name.
train_file_path = "/content/processed_poem.txt"  # text file used as the training corpus
model_name = 'Sakonii/distilgpt2-nepali'  # name given to from_pretrained() for tokenizer and model
output_dir = '/content/'  # directory that receives the tokenizer, model and trainer output
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 250
save_steps = 1000

In [27]:
# Start fine-tuning with the configuration variables defined in the cell above.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss
500,2.8798
1000,0.2905
1500,0.11
2000,0.0811
2500,0.0669
3000,0.0594
3500,0.056
4000,0.0537
4500,0.052
5000,0.0513


In [6]:
def load_model(model_path):
    """Return the causal language model loaded from ``model_path``."""
    return AutoModelForCausalLM.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the tokenizer loaded from ``tokenizer_path``."""
    return AutoTokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length,
                  model_path="C:/Users/Ghost/Desktop/gits/Nepali_Poem_Generator/trainings/GPT2/models/dGPT2"):
    """Sample a continuation of ``sequence`` and print the decoded text.

    Args:
        sequence: Prompt to continue; it is converted to ``str`` before
            tokenization.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        model_path: Directory from which both the model and the tokenizer are
            loaded. The default is the machine-specific path that used to be
            hard-coded in the body; pass another directory (for example the
            training ``output_dir``) to run anywhere else.

    Returns:
        None. The generated text is printed.
    """
    # The model and tokenizer are loaded from the same directory on every call.
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        # Use the model's end-of-sequence id as the padding id.
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [7]:
# Both values are read interactively from stdin, so this cell blocks until
# they are entered and cannot be reproduced by an unattended "Run All".
sequence = input()  # prompt text to continue
max_len = int(input()) # e.g. 20; passed to generate_text() as max_length
generate_text(sequence, max_len)



मेरो मनमा जागेको, झल्झली देखी विरह लागेको! वचन तिम्रो तारमा मनको निदाइरहन्छ, सम्झेर आयो झन्कन्छ भित्र, कहानी कहन्छ, दुःखको कानमा सुखको कथा बजाइरहन्छ । पखेटा छैनन् उडेर जान चिडिया उडेका, हेरेर बसी आँसुका थोपा गहमा छुटेका, देखेनौ तिम्ले कतिका थिए छातीमा गुडेका! किन हो किन, यो मेरो मन बादलले ढाक्दछ
