In [1]:
import os

DATA_DIR = "data"  # Relative to the repository root; this may need to be changed on different machines

# Make sure the working directory is one from which DATA_DIR is reachable.
# The notebook lives in src/nb, so when DATA_DIR is not visible from the
# current directory we fall back to the directory two levels up.
if not os.path.exists(DATA_DIR):
    _repo_root = os.path.join(os.pardir, os.pardir)
    # Check the fallback location BEFORE changing directory: if the data directory
    # is missing there too, the kernel stays where it was, so re-running this cell
    # does not keep climbing two more levels on every attempt.
    assert os.path.exists(os.path.join(_repo_root, DATA_DIR)), f"ERROR: DATA_DIR={DATA_DIR} not found"
    os.chdir(_repo_root)

from tqdm.notebook import tqdm
# import numpy as np
import pandas as pd

import torch
# from torch import nn
# get Dataset class
from src.lib.decoder import Decoder
from src.lib.paraphrase_model import Paraphraser
from src.lib.style_classifier import StyleEncoder
from src.lib.style_transfer import StyleTransferer
# from src.lib.util import to_device
# from transformers import GPT2LMHeadModel, AdamW, GPT2Tokenizer

In [2]:

def load_decoder(state_dict_path):
    """Create a ``Decoder`` and load the weights stored at ``state_dict_path`` into it.

    Args:
        state_dict_path: Path to a checkpoint file readable by ``torch.load`` that
            contains the decoder's state dict.

    Returns:
        A freshly constructed ``Decoder`` with the saved state dict loaded.
    """
    # map_location="cpu" remaps every tensor to the CPU while deserializing, so a
    # checkpoint that was saved from a GPU can also be opened on a machine without
    # CUDA (calling .cpu() on each tensor afterwards is too late for that case).
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(state_dict_path, map_location="cpu")
    decoder = Decoder()
    decoder.load_state_dict(state_dict)
    return decoder
    

In [3]:
# Checkpoint holding the trained decoder weights used in this notebook.
path = "training_results/decoder_0_0.0979/model.pth"

# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the individual models, then wire the style encoder and decoder
# into the style transferer that the cells below call.
decoder = load_decoder(path)
paraphraser = Paraphraser()
style_encoder = StyleEncoder()
style_transferer = StyleTransferer(style_encoder, decoder, device)

Some weights of the model checkpoint at models/gpt2_large were not used when initializing GPT2LMHeadModel: ['transformer.extra_embedding_project.weight', 'transformer.extra_embedding_project.bias']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at models/gpt2_large were not used when initializing GPT2LMHeadModel: ['transformer.extra_embedding_project.weight', 'transformer.extra_embedding_project.bias']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architect

In [4]:
# Load the test split. The path is built from DATA_DIR so that changing the
# data location in the first cell is enough on a different machine.
# The first CSV column is used as the row index.
test_df = pd.read_csv(os.path.join(DATA_DIR, "decoded_cds", "balanced", "test.csv"), index_col=0)
test_df

Unnamed: 0,label,text,paraphrase
0,coha_1890,The imprisonment of Grotius was not the worst ...,Grotius was not the worst of all.
1,poetry,The unfettered sun takes his unbounded reign,the unfettered sun is free to reign
2,aae,lol srry fun question but keep your head up do...,"excuse me, but I'm sorry, but I'm sorry, but I'm"
3,coha_1890,"I tried to speak, but could not . ""","I'm trying to talk, but I can't."
4,poetry,"Dancing upon the waves, as if to please","dancing on the waves, as if they were happy"
...,...,...,...
14218,coha_1810,"A high railing ran, rough and irregular, along...","just as we were, the high railing was rough an..."
14219,shakespeare,"Mercy but murders, pardoning those that kill.","mercy, mercy, mercy, mercy."
14220,coha_1810,The house at which I proposed to stop was upwa...,the house I'd like to stop is a mile away.
14221,coha_1990,"The only thing is, Grandma's going a bit batty.","the only thing is, Grandma's a little crazy."


In [6]:
# Transfer the style of a randomly chosen same-label sentence onto every test
# sentence and store the results (with periodic checkpoints) as a CSV file.
output_columns = ["semantic_sentence", "paraphrase", "style_sentence", "label", "transferred_sentence"]
output_path = os.path.join(DATA_DIR, "decoded_cds", "balanced", "test_transferred.csv")

label_groups = test_df.groupby("label")

# Rows are collected in plain lists and turned into a DataFrame only when needed;
# enlarging a DataFrame one row at a time with .loc copies the data on every insert.
records = []
record_index = []
output_df = pd.DataFrame(columns=output_columns)

# use tqdm and iterrows over dataframe
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    label, text = row["label"], row["text"]
    paraphrase = row["paraphrase"]
    # paraphrase = paraphraser.paraphrase(text) # let's not do this for speed reasons, and because there _may_ be something wrong with how we use the paraphraser

    # Choose a random text with the same label to act as the style reference.
    # NOTE(review): the draw is made from the whole label group, so it can return
    # the current row's own text, and no random_state is set, so the pairing
    # differs between runs - confirm both are intended.
    random_text = label_groups.get_group(label).sample(1)["text"].values[0]

    transferred_sentence = style_transferer.transfer_style(text, random_text, truncate=True, max_length=25)

    records.append({
        "semantic_sentence": text,
        "paraphrase": paraphrase,
        "style_sentence": random_text,
        "label": label,
        "transferred_sentence": transferred_sentence,
    })
    record_index.append(i)

    # Periodic checkpoint so an interrupted run keeps most of its progress.
    if i % 10 == 0:
        output_df = pd.DataFrame(records, index=record_index, columns=output_columns)
        output_df.to_csv(output_path)

# Final write: without it, the rows processed after the last checkpoint
# (index not divisible by 10) would never reach the CSV file.
output_df = pd.DataFrame(records, index=record_index, columns=output_columns)
output_df.to_csv(output_path)

    


  0%|          | 0/14223 [00:00<?, ?it/s]

KeyboardInterrupt: 