## Imports

In [2]:
import json
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
import pronouncing
from datasets import Dataset,load_dataset
from tqdm import tqdm

## Load Datasets

In [3]:
# Poetry corpus published on the Hugging Face Hub under the id "merve/poetry".
merve = load_dataset(path="merve/poetry")

Using custom data configuration merve--poetry-f27664155e5efc39
Found cached dataset csv (/home/shahul/.cache/huggingface/datasets/merve___csv/merve--poetry-f27664155e5efc39/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Location of the Poetry Foundation CSV export. The default keeps the path the
# notebook was originally run with; set the POETRY_FOUNDATION_CSV environment
# variable to point at the file on another machine instead of editing this cell.
POETRY_FOUNDATION_CSV = os.environ.get(
    "POETRY_FOUNDATION_CSV",
    "/home/shahul/Data/PoetryFoundationData.csv",
)
poem_df = pd.read_csv(POETRY_FOUNDATION_CSV)

In [5]:
# Map each poet to the number of rows (poems) they have in the frame ...
author_dict = poem_df['Poet'].value_counts().to_dict()
# ... and attach that count to every row so rows can be filtered by it.
poem_df["poems_counts"] = poem_df["Poet"].map(author_dict)

In [6]:
# Authors with fewer poems than this are dropped from the corpus.
MIN_POEMS_PER_AUTHOR = 3

# Filter, rename to the column names the prompt builders read ("poem name",
# "content", "author", "type"), and add the "age" column in one chain.
# `.assign` returns a new frame, so the column is not written onto a derived
# slice (which is what triggers pandas' SettingWithCopyWarning).
poem_df = (
    poem_df[poem_df["poems_counts"] >= MIN_POEMS_PER_AUTHOR]
    .rename(columns={"Poet": "author", "Poem": "content",
                     "Title": "poem name", "Tags": "type"})
    .loc[:, ["poem name", "content", "author", "type"]]
    # This source has no period information; "age" is filled with the literal
    # string "None" as a placeholder.
    .assign(age="None")
)
poems_dataset = Dataset.from_pandas(poem_df, split="train", preserve_index=False)

In [7]:
from transformers import pipeline

# Text-classification pipeline; `get_emotion` calls it on individual poem
# sentences and reads the "label" of the first result.
nlp = pipeline(
    model='nickwong64/bert-base-uncased-poems-sentiment',
    task='text-classification',
)



In [8]:
def get_best_rhymes(content, window=10):
    """Find the largest group of rhyming line-ending words in a poem.

    For every line, the last word (stripped of non-alphanumerics) is looked up
    with ``pronouncing.rhymes`` and intersected with the last words of the
    ``window`` lines starting at that line. The biggest intersection wins.

    Parameters
    ----------
    content : str or None
        Full poem text; lines may be separated by ``\\n`` or ``\\r\\n``.
    window : int, default 10
        Number of consecutive line endings (starting at the current line)
        searched for rhymes.

    Returns
    -------
    tuple
        ``(word, rhymes)`` where ``rhymes`` is ``[seed_word, *rhyming_words]``
        for the best group (empty list when nothing rhymes) and ``word`` is the
        earliest line-ending word that belongs to that group. With no rhymes,
        ``word`` is the last word of the first line; with no text at all it is
        ``""``.
    """
    if not content:
        return "", []

    # Strip carriage returns / surrounding whitespace BEFORE dropping blank
    # lines: with "\r\n" text a stanza break is the line "\r", which would
    # otherwise survive the emptiness check and contribute an empty "word".
    lines = [line.strip() for line in content.replace("\r", "").split("\n")]
    lines = [line for line in lines if line]
    if not lines:
        return "", []

    # Computed unconditionally so the return below is valid for one-line poems
    # (previously this name was unbound in that case).
    last_words = [re.sub('[^A-Za-z0-9]+', '', line.split(" ")[-1]) for line in lines]

    best_rhymes = []
    if len(lines) > 1:
        # enumerate gives the true position of each line ending; list.index()
        # would always return the first occurrence of a repeated word.
        for index, word in enumerate(last_words):
            rhymes = pronouncing.rhymes(word)
            rhymes_words = np.intersect1d(rhymes, last_words[index:index + window])
            if len(rhymes_words) > len(best_rhymes):
                best_rhymes = list(rhymes_words)
                best_rhymes.insert(0, word)

    first_word = 0
    if best_rhymes:
        first_word = min(last_words.index(word) for word in best_rhymes)
    return last_words[first_word], best_rhymes

### Templates

In [10]:
# Interchangeable wording drawn at random while building prompts.
poem_synonyms = [
    "poem",
    "poetry",
]
compose_synonyms = [
    "Write",
    "Help me write",
    "Compose",
    "Please craft",
    "Give me",
]
complete_synonyms = [
    "Complete",
    "Finish",
    "Put the finishing touches to",
]

# Adjectives inserted right before the poem word; the trailing space is part
# of the phrase because it is spliced in without a separator.
sentiment_phrase = dict(
    positive=["positive ", "happy "],
    negative=["negative ", "sad "],
)

# Suffixes appended to a prompt; `{}` receives the author / the period.
writing_style = [
    "in the manner of {}.",
    "in {}'s writing style.",
]
writing_period = [
    " written in {}.",
    " written during {} period",
]

In [11]:
# Prompt templates, grouped first by task ("begining" = write a whole poem,
# "completion" = continue a poem) and then by template family. The first two
# placeholders are always the verb and the poem word; the rest depend on the
# family. Trailing spaces inside some templates are intentional parts of the
# strings and must be kept.
prompts = dict(
    begining=dict(
        default=[
            "{} a {} entitled {}",
        ],
        about=[
            "{} a {} on the topic {}",
            "{} a {} about {} ",
        ],
        rhyming=[
            "{} a {} containing rhyming words for the word '{}' entitled {}.",
            "{} a {} containing rhyming words for the word '{}'",
        ],
        genre_age=[
            "{} a {} on the topic {} ",
            "{} a {} of the genre {} ",
        ],
    ),
    completion=dict(
        completion=[
            "{} the {}",
            "{} the {} entitled {}",
        ],
    ),
)

## Augmentation

In [16]:
def toss_prompt(possible_prompts,prompt_types):
    """Randomly pick a template family and one template from it.

    ``prompt_types`` must hold exactly two family names: the first is drawn
    with probability 0.9, the second with 0.1. A template is then drawn
    uniformly from ``possible_prompts[family]``.

    Returns ``(family, template)``.
    """
    family_weights = [0.9, 0.1]
    chosen_family = np.random.choice(prompt_types, p=family_weights)
    candidates = possible_prompts[chosen_family]
    return chosen_family, np.random.choice(candidates)

def get_emotion(content, max_sentences=3):
    """Estimate a poem's overall sentiment from its opening sentences.

    The text is split on ``.`` and only the first ``max_sentences`` pieces are
    sent to the module-level ``nlp`` classifier.

    Parameters
    ----------
    content : str
        Full poem text.
    max_sentences : int, default 3
        How many leading sentences are classified.

    Returns
    -------
    str or None
        ``"negative"`` or ``"positive"`` when at least two of the classified
        sentences carry that label (negative is checked first), else ``None``.
    """
    # Slice BEFORE classifying: the model call is the expensive part, and
    # results for later sentences were previously computed and thrown away.
    segments = content.split(".")[:max_sentences]

    # Replace line breaks with a space instead of deleting them, so the last
    # word of one line is not fused with the first word of the next.
    labels = [
        nlp(re.sub(r'[\r\n]+', ' ', segment).strip())[0]["label"]
        for segment in segments
    ]

    for candidate in ("negative", "positive"):
        if labels.count(candidate) >= 2:
            return candidate
    return None
        

def build_prompt(possible_prompts,prompt_types,args,sentiment,rhyming_list):
    """Render one instruction prompt for a poem record.

    A template is drawn via ``toss_prompt`` and filled with a random verb, a
    random poem word and fields of ``args`` ("poem name", "type", "age").
    Afterwards, each with a 4-in-5 chance, a sentiment adjective is spliced in
    before the poem word (when ``sentiment`` is given) and a rhyming clause is
    spliced in after it (when ``rhyming_list`` has more than two entries).

    Returns the finished prompt string.
    """
    # All three wording draws happen up front, whether or not each is used.
    compose_verb = np.random.choice(compose_synonyms)
    poem_word = np.random.choice(poem_synonyms)
    complete_verb = np.random.choice(complete_synonyms)

    prompt_type, template = toss_prompt(possible_prompts, prompt_types)

    # Families not listed here leave the template unformatted.
    prompt = template
    if prompt_type == "genre_age":
        # The templates only consume three fields; the extra "age" argument is
        # ignored by str.format.
        prompt = template.format(compose_verb, poem_word, args["type"], args["age"])
    elif prompt_type == "about":
        prompt = template.format(compose_verb, poem_word, args["poem name"].lower())
    elif prompt_type in ("default", "completion"):
        opener = compose_verb if prompt_type == "default" else complete_verb
        prompt = template.format(opener, poem_word, args["poem name"])

    # randint(0, 5) is falsy only for 0, i.e. the splice is skipped 1 time in 5.
    if np.random.randint(0, 5) and sentiment is not None:
        cut = prompt.find(poem_word)
        mood = np.random.choice(sentiment_phrase[sentiment])
        prompt = prompt[:cut] + mood + prompt[cut:]

    if np.random.randint(0, 5) and len(rhyming_list) > 2:
        cut = prompt.find(poem_word) + len(poem_word)
        rhyme_clause = " containing rhyming words for the word '{}'".format(rhyming_list[0])
        prompt = prompt[:cut] + rhyme_clause + prompt[cut:]

    return prompt

def add_author(prompt,author,top_authors):
    """Optionally append an "in the style of <author>" suffix to ``prompt``.

    The suffix (drawn from ``writing_style``) is added with probability 1/2,
    and only when ``author`` is one of ``top_authors``.

    Parameters
    ----------
    prompt : str
        Prompt built so far.
    author : str or None
        Author of the current poem; ``None``/empty leaves the prompt untouched.
    top_authors : iterable of str
        Author names eligible for a style suffix.

    Returns
    -------
    str
        The prompt, with or without the style suffix.
    """
    if not author:
        return prompt

    # Compare case-insensitively on BOTH sides. Lower-casing only `author`
    # (as before) can never match names stored as "William Blake" or
    # "WILLIAM BLAKE", so the suffix was effectively never added.
    known_authors = {name.lower() for name in top_authors if name}

    # Membership is tested first so the coin is only tossed for known authors.
    if author.lower() in known_authors and np.random.randint(0, 2):
        style = np.random.choice(writing_style).format(author)
        prompt = prompt + " " + style

    return prompt

def get_top_authors(dataset, n=100):
    """Return the ``n`` most frequent authors in ``dataset``.

    Parameters
    ----------
    dataset : iterable of mapping
        Records exposing an ``"author"`` key.
    n : int, default 100
        Maximum number of authors to return.

    Returns
    -------
    tuple
        Author names ordered from most to least frequent; empty when the
        dataset has no records (unpacking ``zip(*[])`` used to raise here).
    """
    author_counts = Counter(poem["author"] for poem in dataset)
    return tuple(author for author, _ in author_counts.most_common(n))

# NOTE(review): not referenced by any code visible in this notebook; the rhyme
# threshold actually applied is the `len(rhyming_list) > 2` check in build_prompt.
MIN_RHYMES = 3
def create_poem_instructions(dataset):
    """Turn poem records into ``{"prompt", "response"}`` instruction pairs.

    Each record is randomly treated as either a "begining" task (70%: write
    the whole poem, response = full text) or a "completion" task (30%: the
    prompt carries the first 3-5 lines, response = the remainder).

    Parameters
    ----------
    dataset : iterable of mapping
        Records with the keys ``"poem name"``, ``"content"``, ``"author"``,
        ``"type"`` and ``"age"``. Column order does not matter.

    Returns
    -------
    list of dict
        One ``{"prompt": str, "response": str}`` per usable record. Records
        with no text, or whose completion split leaves nothing to generate,
        are skipped rather than emitted with an empty response.
    """
    top_authors = get_top_authors(dataset)
    all_prompts = []
    for record in tqdm(dataset):
        # Work on a copy so the caller's record is never modified.
        item = dict(record)

        content = item["content"] or ""
        if not content.strip():
            continue

        item["poem name"] = re.sub(r'\r|\n|\[.*\]', '', item["poem name"] or "").strip()

        # Read fields by key: unpacking item.values() positionally silently
        # mixes the fields up whenever the column order differs.
        poem_name = item["poem name"]
        author = item["author"]
        genre = item["type"]
        age = item["age"]

        prompt_type = np.random.choice(["completion", "begining"], p=[0.3, 0.7])

        sentiment = get_emotion(content)
        rh_word, rh_wordslist = get_best_rhymes(content)
        item["rhyming"] = rh_word

        possible_prompts = prompts[prompt_type]

        if prompt_type == "begining":

            if genre not in (None, "") and np.random.randint(0, 2):
                prompt = build_prompt(possible_prompts, ["genre_age", "default"], item, sentiment, rh_wordslist)

                # "None" is the string placeholder used when a corpus has no
                # period information; it must not leak into the prompt as
                # " written in None.".
                has_age = age not in (None, "", "None")
                if has_age and np.random.randint(0, 2):
                    prompt += np.random.choice(writing_period).format(age)

            elif poem_name.lower().startswith("the"):
                prompt = build_prompt(possible_prompts, ["about", "default"], item, sentiment, rh_wordslist)

            else:
                prompt = build_prompt(possible_prompts, ["default", "default"], item, sentiment, rh_wordslist)

            if author:
                prompt = add_author(prompt, author, top_authors)
            response = content.strip()

        else:
            prompt = build_prompt(possible_prompts, ["completion", "completion"], item, sentiment, rh_wordslist)
            if author:
                prompt = add_author(prompt, author, top_authors)
            num_lines = np.random.randint(3, 6)
            poem_lines = content.split("\n")
            prompt = prompt + "\n" + "\n".join(poem_lines[:num_lines])
            response = "\n".join(poem_lines[num_lines:]).strip()

        # A poem shorter than the sampled prefix leaves nothing to complete.
        if not response:
            continue

        all_prompts.append({"prompt": prompt, "response": response})

    return all_prompts
            
                
    

In [17]:
# Trial run on the first 100 rows of the merve/poetry train split; the returned
# list of prompt/response pairs is this cell's displayed output.
create_poem_instructions(
    dataset=merve["train"].select(range(100)),
)

100%|█████████████████████████████████████████████████| 100/100 [01:01<00:00,  1.63it/s]


[{'prompt': 'Compose a negative poem entitled The Phoenix and the Turtle',
  'response': 'Let the bird of loudest lay\r\nOn the sole Arabian tree\r\nHerald sad and trumpet be,\r\nTo whose sound chaste wings obey.\r\n\r\nBut thou shrieking harbinger,\r\nFoul precurrer of the fiend,\r\nAugur of the fever\'s end,\r\nTo this troop come thou not near.\r\n\r\nFrom this session interdict\r\nEvery fowl of tyrant wing,\r\nSave the eagle, feather\'d king;\r\nKeep the obsequy so strict.\r\n\r\nLet the priest in surplice white,\r\nThat defunctive music can,\r\nBe the death-divining swan,\r\nLest the requiem lack his right.\r\n\r\nAnd thou treble-dated crow,\r\nThat thy sable gender mak\'st\r\nWith the breath thou giv\'st and tak\'st,\r\n\'Mongst our mourners shalt thou go.\r\n\r\nHere the anthem doth commence:\r\nLove and constancy is dead;\r\nPhoenix and the Turtle fled\r\nIn a mutual flame from hence.\r\n\r\nSo they lov\'d, as love in twain\r\nHad the essence but in one;\r\nTwo distincts, divisi