# Import Libraries

In [2]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
# from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import TextDataset, DataCollatorForLanguageModeling
import math

In [3]:
# Read the whole raw poem corpus into memory. The context manager closes the
# handle as soon as the read finishes (the original left it open), and the
# explicit UTF-8 encoding keeps the Devanagari text from being decoded with a
# platform-dependent default.
with open('/content/poem.txt', 'r', encoding='utf-8') as poem_file:
    poem = poem_file.read()

In [4]:
# Split the raw text on newline characters into a list of lines.
poem_corpus = poem.split("\n")
# Preview the first five entries as a sanity check.
print(poem_corpus[:5])

['नछाडी जानोस् हे मेरा प्राण ! अकेली मलाई,', 'मनको वनमा ननिभ्ने गरी विरह जलाई !', 'ननिभ्ने गरी विरह जलाई,', 'लोचनका तारा ! हे मेर प्यारा ! यो जोति  बिलाए !', 'के भनूँ? भन्ने म केही थिइन  विष नै पिलाए !']


In [5]:
def remove_noise(sentences):
    """Strip digits and invisible/control characters from every sentence.

    Removes newlines, the byte-order mark (U+FEFF), the zero-width joiner
    (U+200D), ASCII digits 0-9 and Devanagari digits ०-९. Every other
    character, including punctuation and spacing, is left untouched.

    Args:
        sentences: Iterable of strings to clean.

    Returns:
        A new list holding one cleaned string per input string, in the same
        order as the input.
    """
    # Every unwanted item is a single code point, so one translation table
    # deletes them all in a single pass per sentence. The previous list-based
    # version also carried the two-character entry '१०', which could never
    # match because '१' and '०' had already been removed individually.
    # NOTE(review): U+200D can be meaningful in Devanagari conjuncts; removal
    # is kept here to preserve existing behavior — confirm it is intended.
    noise_chars = '\n\ufeff\u200d' + '0123456789' + '०१२३४५६७८९'
    deletion_table = str.maketrans('', '', noise_chars)
    return [sentence.translate(deletion_table) for sentence in sentences]

In [6]:
# Run the noise-removal pass over every line of the corpus.
processed_poem_corpus = remove_noise(poem_corpus)
# Preview the first five cleaned lines as a sanity check.
print(processed_poem_corpus[:5])

['नछाडी जानोस् हे मेरा प्राण ! अकेली मलाई,', 'मनको वनमा ननिभ्ने गरी विरह जलाई !', 'ननिभ्ने गरी विरह जलाई,', 'लोचनका तारा ! हे मेर प्यारा ! यो जोति  बिलाए !', 'के भनूँ? भन्ने म केही थिइन  विष नै पिलाए !']


In [7]:
# Persist the cleaned corpus, one entry per line. UTF-8 is set explicitly so
# the Devanagari text is written the same way regardless of the platform's
# default encoding.
with open('processed_poem.txt', 'w', encoding='utf-8') as f:
    for line in processed_poem_corpus:
        f.write(line + '\n')

In [8]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a `TextDataset` from a text file.

    Args:
        file_path: Path of the text file handed to `TextDataset`.
        tokenizer: Tokenizer handed to `TextDataset`.
        block_size: Block size handed to `TextDataset`; defaults to 128.

    Returns:
        The constructed `TextDataset` instance.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Build a `DataCollatorForLanguageModeling` for the given tokenizer.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Value forwarded as the collator's `mlm` flag; defaults to False.

    Returns:
        The constructed `DataCollatorForLanguageModeling` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


In [24]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained causal language model on a plain-text file.

    Loads the tokenizer and model named by `model_name`, builds the training
    dataset from `train_file_path`, trains with the Hugging Face `Trainer`,
    and saves the result.

    Args:
        train_file_path: Path of the text file used as the training corpus.
        model_name: Name or path passed to `from_pretrained` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            training outputs.
        overwrite_output_dir: Forwarded to `TrainingArguments`.
        per_device_train_batch_size: Forwarded to `TrainingArguments`.
        num_train_epochs: Forwarded to `TrainingArguments`.
        save_steps: Forwarded to `TrainingArguments` as the checkpoint
            interval.

    Returns:
        None.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so the output directory can be
    # loaded on its own later.
    tokenizer.save_pretrained(output_dir)

    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Writes the not-yet-fine-tuned weights to output_dir before training
    # starts; trainer.save_model() is called again after training.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never forwarded, so the caller's
        # checkpoint interval was silently ignored.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [26]:
# Training configuration: set these values before running the training cell.
train_file_path = "/content/processed_poem.txt"  # text file used as the training corpus
model_name = 'Sakonii/distilgpt2-nepali'  # name passed to from_pretrained for tokenizer and model
output_dir = '/content/'  # directory that receives the saved tokenizer and model
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 250
save_steps = 1000

In [27]:
# Launch fine-tuning with the notebook-level configuration values.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss
500,2.8798
1000,0.2905
1500,0.11
2000,0.0811
2500,0.0669
3000,0.0594
3500,0.056
4000,0.0537
4500,0.052
5000,0.0513


In [6]:
def load_model(model_path):
    """Load a causal language model from `model_path`.

    Args:
        model_path: Name or path passed to
            `AutoModelForCausalLM.from_pretrained`.

    Returns:
        The loaded model.
    """
    return AutoModelForCausalLM.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a tokenizer from `tokenizer_path`.

    Args:
        tokenizer_path: Name or path passed to
            `AutoTokenizer.from_pretrained`.

    Returns:
        The loaded tokenizer.
    """
    return AutoTokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length,
                  model_path="C:/Users/Ghost/Desktop/gits/Nepali_Poem_Generator/trainings/GPT2/models/dGPT2"):
    """Generate a continuation of `sequence` and print it.

    Loads the model and tokenizer from `model_path`, samples a continuation
    with top-k / top-p sampling, and prints the decoded text with special
    tokens skipped.

    Args:
        sequence: Prompt text to continue.
        max_length: Value passed to `model.generate` as `max_length`.
        model_path: Directory (or name) from which both the model and the
            tokenizer are loaded. Defaults to the previously hard-coded
            location, so existing two-argument calls behave as before; pass
            a different path to run on another machine.

    Returns:
        None. The generated text is printed.
    """
    # Model and tokenizer are reloaded on every call.
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    ids = tokenizer.encode(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        # The end-of-sequence token id is reused as the padding id.
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [7]:
# Read the prompt text from standard input.
sequence = input()
# Read the maximum generation length from standard input as an integer.
max_len = int(input()) # 20
# Generate and print a continuation of the prompt.
generate_text(sequence, max_len)



मेरो मनमा जागेको, झल्झली देखी विरह लागेको! वचन तिम्रो तारमा मनको निदाइरहन्छ, सम्झेर आयो झन्कन्छ भित्र, कहानी कहन्छ, दुःखको कानमा सुखको कथा बजाइरहन्छ । पखेटा छैनन् उडेर जान चिडिया उडेका, हेरेर बसी आँसुका थोपा गहमा छुटेका, देखेनौ तिम्ले कतिका थिए छातीमा गुडेका! किन हो किन, यो मेरो मन बादलले ढाक्दछ
