In [1]:
import os

import pandas as pd
import torch
import yaml
from tqdm import tqdm_notebook as tqdm
from transformers import BertTokenizer, BertModel

In [2]:
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
config_path = os.path.join(project_root, "config.yaml")

with open(config_path, "r") as f:
    config = yaml.safe_load(f)

In [3]:
csv_path = os.path.join(config['paths']['data_dir'], "data", "pokemon.csv")
df = pd.read_csv(csv_path, encoding='UTF-16', delimiter='\t', index_col=0)

In [4]:
df.head()

Unnamed: 0_level_0,gen,english_name,japanese_name,primary_type,secondary_type,classification,percent_male,percent_female,height_m,weight_kg,...,evochain_1,evochain_2,evochain_3,evochain_4,evochain_5,evochain_6,gigantamax,mega_evolution,mega_evolution_alt,description
national_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I,Bulbasaur,Fushigidane,grass,poison,Seed Pokémon,88.14,11.86,0.7,6.9,...,Level,Ivysaur,Level,Venusaur,,,,,,There is a plant seed on its back right from t...
2,I,Ivysaur,Fushigisou,grass,poison,Seed Pokémon,88.14,11.86,1.0,13.0,...,Level,Ivysaur,Level,Venusaur,,,,,,"When the bulb on its back grows large, it appe..."
3,I,Venusaur,Fushigibana,grass,poison,Seed Pokémon,88.14,11.86,2.0,100.0,...,Level,Ivysaur,Level,Venusaur,,,Gigantamax Venusaur,Mega Venusaur,,Its plant blooms when it is absorbing solar en...
4,I,Charmander,Hitokage,fire,,Lizard Pokémon,88.14,11.86,0.6,8.5,...,Level,Charmeleon,Level,Charizard,,,,,,It has a preference for hot things. When it ra...
5,I,Charmeleon,Lizardo,fire,,Flame Pokémon,88.14,11.86,1.1,19.0,...,Level,Charmeleon,Level,Charizard,,,,,,"It has a barbaric nature. In battle, it whips ..."


In [5]:
print(f'Dataset_length: {len(df)}')
print(f'Dataset_shape: {df.shape}')
print(f'Dataset_dtype: {df.dtypes}')
print(f'Dataset_columns: {df.columns}')

Dataset_length: 898
Dataset_shape: (898, 54)
Dataset_dtype: gen                    object
english_name           object
japanese_name          object
primary_type           object
secondary_type         object
classification         object
percent_male           object
percent_female         object
height_m              float64
weight_kg             float64
capture_rate           object
base_egg_steps          int64
hp                      int64
attack                  int64
defense                 int64
sp_attack               int64
sp_defense              int64
speed                   int64
abilities_0            object
abilities_1            object
abilities_2            object
abilities_hidden       object
against_normal        float64
against_fire          float64
against_water         float64
against_electric      float64
against_grass         float64
against_ice           float64
against_fighting      float64
against_poison        float64
against_ground        float64
against_fl

In [6]:
print(df['description'].iloc[0])

There is a plant seed on its back right from the day this Pokémon is born. The seed slowly grows larger.


In [7]:
interesting_columns = [
    'english_name',
    'japanese_name',
    'primary_type',
    'secondary_type',
    'classification',
    'height_m',
    'weight_kg',
    'abilities_0',
    'abilities_1',
    'abilities_2',
    'abilities_hidden',
    'is_legendary',
    'is_mythical',
    'is_sublegendary',
    'mega_evolution',
    'description'
]
interesting_columns = [
    'english_name',
    'primary_type',
    'secondary_type',
    'classification',
    'description'
]

df['full_description'] = "This is a pokemon description. "
for col in interesting_columns:
    col_name = col.replace('_', ' ').capitalize()
    df['full_description'] += col_name + ' is ' + df[col].astype(str).replace(
        {'0': 'False', '1': 'True'}) + '. ' if col != 'description' else col_name + ' is ' + df[col].astype(
        str).replace({'0': 'False', '1': 'True'})

In [8]:
a = df['full_description'].iloc[0]
print(a, len(a))

This is a pokemon description. English name is Bulbasaur. Primary type is grass. Secondary type is poison. Classification is Seed Pokémon. Description is There is a plant seed on its back right from the day this Pokémon is born. The seed slowly grows larger. 258


In [9]:
df['full_description'].to_csv('text_description.csv', sep=',', encoding='utf-8', index=False)

In [26]:

tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-mini")
model = BertModel.from_pretrained("prajjwal1/bert-mini")

inputs = tokenizer(df['full_description'].iloc[0],
                   return_tensors="pt",
                   return_attention_mask=True)
print(len(inputs['input_ids'][0]))
# Forward pass through the model
outputs = model(**inputs)

# Get last hidden states
last_hidden_states = outputs.last_hidden_state

127


In [27]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [47]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

model_name = "tuner007/pegasus_paraphrase"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
df = pd.read_csv("text_description.csv", header=0)


def generate_paraphrases(text, num_return=10):
    input_text = f"paraphrase this pokemon description, be sure to write the description about the specified pokemom: {text}"
    encoding = tokenizer.encode_plus(input_text, return_tensors="pt", padding=True, truncation=True).to(device)

    outputs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        max_length=168,
        num_beams=20,
        num_return_sequences=num_return,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    results = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    print(results)
    return "<|>".join(list(dict.fromkeys(results)))  # remove duplicates


# Generate for all entries
paraphrased = []
for text in tqdm(df["full_description"].iloc[:5], desc="Generating paraphrases"):
    try:
        paraphrased.append(generate_paraphrases(text))
    except Exception as e:
        print(e)
        paraphrased.append(text)

# Save result
df = pd.DataFrame(paraphrased)
df.to_csv("rewritten_descriptions.csv", index=False, header=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text in tqdm(df["full_description"].iloc[:5], desc="Generating paraphrases"):


Generating paraphrases:   0%|          | 0/5 [00:00<?, ?it/s]

['English and Japanese names are Bulbasaur and Fushigidane, respectively.', 'English and Japanese names are Bulbasaur and Fushigidane.', 'This is a description of a pokemon.', 'The name Bulbasaur is English and Fushigidane is Japanese.', 'The English name is Bulbasaur and the Japanese is Fushigidane.', 'The English name is Bulbasaur, the Japanese is Fushigidane.', 'This is a description of a Pokemon.', 'The English name is Bulbasaur.', 'This is a pokemon description, be sure to write it down.', 'Japanese name is Fushigidane, English is Bulbasaur.']
['English and Japanese names are Ivysaur and Fushigisou, respectively.', 'English and Japanese names are Ivysaur and Fushigisou.', 'The name Ivysaur is English and the name Fushigisou is Japanese.', 'The English name is Ivysaur and the Japanese one is Fushigisou.', 'The English name is Ivysaur and the Japanese is Fushigisou.', 'The name Ivysaur is English and the Japanese name is Fushigisou.', 'This is a description of a pokemon.', 'Japanese