In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import json
from tqdm.auto import tqdm
import torch

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device)

In [11]:
model_save_path = '/content/drive/My Drive/FYP content/Final thesis/model'
tokenizer_save_path = '/content/drive/My Drive/FYP content/Final thesis/tokenizer'


In [12]:
model = T5ForConditionalGeneration.from_pretrained(model_save_path)


In [13]:
tokenizer = T5Tokenizer.from_pretrained(tokenizer_save_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
file_path = '/content/drive/My Drive/FYP content/Final thesis/components.jsonl'

In [15]:
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            json_data = json.loads(line.strip())
            data.append(json_data)
    return data

In [16]:
data = load_jsonl(file_path)

In [17]:
print(f"Number of components: {len(data)}")

Number of components: 5617


In [18]:
def generate_prompts_for_json(data, model, tokenizer, device):
    model.to(device)
    generated_prompts = []
    for json_input in tqdm(data, desc="Generating Prompts"):
        input_text = f"generate prompt: {json.dumps(json_input)}"  # Adjust the prefix based on how the model was trained
        input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
        output_ids = model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)
        prompt = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        generated_prompts.append(prompt)
    return generated_prompts


In [19]:
generated_prompts = generate_prompts_for_json(data, model, tokenizer, device)

Generating Prompts:   0%|          | 0/5617 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors


In [None]:
#rgdf'
#fssd
#sadf
#fsdf
#snkdfjc
#fadscs
#dgtf
#gsdfgs
#asdfxc
#srgfs
#hfgbjn
#srdfg
#sfgv
#jadfk
#gfgvsf
#sdrfg
#dgrfed
#sfhdfghv
#sdrfgsd
#sfvsx
#ailhudj
#dgchvbdxg
#naidfnkaj
#sdfgvsdf
#sdrfgdf


In [33]:
json_file_path = '/content/drive/My Drive/FYP content/Final thesis/json_only.txt'
prompts_file_path = '/content/drive/My Drive/FYP content/Final thesis/prompts_only.txt'
pairs_file_path = '/content/drive/My Drive/FYP content/Final thesis/json_prompt_pairs.txt'
pairs_for_seq2seq_file_path = '/content/drive/My Drive/FYP content/Final thesis/json_prompt_seq2seq.txt'


In [37]:
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    for json_input in tqdm(data, desc="Saving JSON"):
        json_file.write(json.dumps(json_input) + '\n')

Saving JSON:   0%|          | 0/5617 [00:00<?, ?it/s]

In [36]:
with open(prompts_file_path, 'w', encoding='utf-8') as prompts_file:
    for prompt in tqdm(generated_prompts, desc="Saving Prompts"):
        prompts_file.write(prompt + '\n')

Saving Prompts:   0%|          | 0/5617 [00:00<?, ?it/s]

In [38]:
with open(pairs_file_path, 'w', encoding='utf-8') as pairs_file:
    for json_input, prompt in tqdm(zip(data, generated_prompts), desc="Saving Pairs"):
        pair = {"input": json_input, "output": prompt}
        pairs_file.write(json.dumps(pair) + '\n')


Saving Pairs: 0it [00:00, ?it/s]

In [34]:
with open(pairs_for_seq2seq_file_path, 'w', encoding='utf-8') as seq2seq_file:
    for json_input, prompt in tqdm(zip(data, generated_prompts), desc="Saving Seq2Seq Pairs"):
        formatted_input = json.dumps(json_input)
        seq2seq_pair = f"{formatted_input}[SEP]{prompt}"
        seq2seq_file.write(seq2seq_pair + '\n')


Saving Seq2Seq Pairs: 0it [00:00, ?it/s]

In [43]:
def print_file_lines(file_path, num_lines=5, description=""):
    print(f"\n{description} - First {num_lines} lines:\n" + "-"*50)
    with open(file_path, 'r', encoding='utf-8') as file:
        for _ in range(num_lines):
            print(file.readline().strip())
    print("-"*50)

In [44]:
print_file_lines(json_file_path, description="JSON Only")
print_file_lines(prompts_file_path, description="Prompts Only")
print_file_lines(pairs_file_path, description="JSON-Prompt Pairs")
print_file_lines(pairs_for_seq2seq_file_path, description="Seq2Seq Format")


JSON Only - First 5 lines:
--------------------------------------------------
{"variant_properties": {"color": "rgba(255, 255, 255, 1.0)", "strokes": ["rgba(126, 86, 216, 1.0)"], "strokeWeight": 1.0, "text": "Button CTA", "textColor": "rgba(255, 255, 255, 1.0)", "borderRadius": 10.0, "fontFamily": "Inter", "fontWeight": 500, "fontSize": 14.0, "effects": [{"type": "DROP_SHADOW", "color": "rgba(16, 24, 40, 0.05000000074505806)"}], "padding": 0, "width": 77.0, "height": 20.0, "x": -4619.0, "y": -2135.0, "style": "Professional", "component_name": "Button", "subtype": "Default", "variant_details": {"State": ["Default"], "Size": ["Small"]}}}
{"variant_properties": {"color": "rgba(255, 255, 255, 1.0)", "strokes": ["rgba(126, 86, 216, 1.0)"], "strokeWeight": 1.0, "text": "Button CTA", "textColor": "rgba(255, 255, 255, 1.0)", "borderRadius": 10.0, "fontFamily": "Inter", "fontWeight": 500, "fontSize": 14.0, "effects": [{"type": "DROP_SHADOW", "color": "rgba(16, 24, 40, 0.05000000074505806)"}], 

In [45]:
print(f"Number of components: {len(data)}")

Number of components: 5617


Testing lidsubg

In [47]:
import pandas as pd

In [48]:
def load_file_to_df(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return pd.DataFrame(lines, columns=['text'])

json_df = load_file_to_df(json_file_path)
prompts_df = load_file_to_df(prompts_file_path)

In [49]:
def load_pairs_to_df(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    pairs = [eval(line) for line in lines]  # Use eval carefully, only if you trust the source
    return pd.DataFrame(pairs)

In [50]:
pairs_df = load_pairs_to_df(pairs_file_path)

In [51]:
formatted_pairs = json_df['text'].apply(lambda x: f"translate JSON to prompt: {x.strip()}") + " " + prompts_df['text']


In [52]:
seq2seq_df = pd.DataFrame({
    'input_text': formatted_pairs,
    # Assuming prompts_df['text'] contains the expected output
    'target_text': prompts_df['text'].values
})

In [53]:
print(seq2seq_df.head())


                                          input_text  \
0  translate JSON to prompt: {"variant_properties...   
1  translate JSON to prompt: {"variant_properties...   
2  translate JSON to prompt: {"variant_properties...   
3  translate JSON to prompt: {"variant_properties...   
4  translate JSON to prompt: {"variant_properties...   

                                         target_text  
0  Generate a Professional Button with a stroke w...  
1  Generate a Professional Button with a stroke w...  
2  Generate a Professional Button with a stroke w...  
3  Generate a Professional Button with a stroke w...  
4  Generate a Professional Button with a stroke w...  


In [57]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from transformers import AdamW
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import GPT2Model

In [60]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, df, source_len, target_len):
        self.tokenizer = tokenizer
        self.df = df
        self.source_len = source_len
        self.target_len = target_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        source_text = str(self.df.iloc[index, 'input_text'])
        target_text = str(self.df.iloc[index, 'target_text'])

        # Tokenize source text
        source = self.tokenizer.batch_encode_plus([source_text], max_length=self.source_len, padding='max_length', truncation=True, return_tensors="pt")
        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()

        # Tokenize target text
        target = self.tokenizer.batch_encode_plus([target_text], max_length=self.target_len, padding='max_length', truncation=True, return_tensors="pt")
        target_ids = target['input_ids'].squeeze()

        return {
            'input_ids': source_ids,
            'attention_mask': source_mask,
            'labels': target_ids
        }


In [61]:
class CustomSeq2SeqModel(nn.Module):
    def __init__(self, encoder_pretrained_model_name='gpt2', decoder_hidden_size=768, decoder_output_size=50257, decoder_num_layers=2):
        super(CustomSeq2SeqModel, self).__init__()
        self.encoder = GPT2Model.from_pretrained(encoder_pretrained_model_name)
        self.decoder = nn.LSTM(input_size=decoder_hidden_size, hidden_size=decoder_hidden_size, num_layers=decoder_num_layers, batch_first=True)
        self.fc = nn.Linear(decoder_hidden_size, decoder_output_size)

    def forward(self, input_ids, labels=None):
        encoder_outputs = self.encoder(input_ids=input_ids).last_hidden_state
        decoder_outputs, _ = self.decoder(encoder_outputs)
        logits = self.fc(decoder_outputs)
        return logits

In [63]:
source_len = 512
target_len = 128

custom_dataset = CustomDataset(tokenizer, seq2seq_df, source_len, target_len)
dataloader = DataLoader(custom_dataset, batch_size=1, shuffle=True)

In [65]:
model = CustomSeq2SeqModel()


In [66]:
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import GradScaler, autocast
import torch.optim as optim

In [67]:
optimizer = optim.Adam(model.parameters(), lr=1e-5)
scaler = GradScaler()



In [68]:
model.train()
for epoch in range(1):  # Adjust epochs as needed
    for batch in dataloader:
        optimizer.zero_grad()

        with autocast():
            input_ids = batch['input_ids'].to("cuda")
            labels = batch['labels'].to("cuda")
            logits = model(input_ids=input_ids)
            loss = ... # Define your loss function. You might need a custom loss for Seq2Seq tasks.

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types