In [1]:
import ast
import csv

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

In [2]:
# Directory that holds the prepared dataset and the vocabulary files.
# A raw string cannot end in a single backslash, so the trailing separator is
# written as `\\`; the value therefore ends with two literal backslashes.
base_path = r'C:\Users\mrmrk\OneDrive\Documents\GitHub\recipe_generation\\'

# File names, joined onto base_path by plain string concatenation.
ids_path = "prepeard_data.csv"
ing_vocab_path = 'ingredient_vocab.csv'
rec_vocab_path = 'recipie_vocab.csv'

# Presumably the number of data rows in the prepared CSV (the same figure is
# passed as `num_rows` when the loader is built) -- TODO confirm.
data_len = 2_231_142

In [3]:
def _read_vocab(path):
    """Return the first column of a CSV file, skipping its header row.

    Args:
        path: Location of a CSV file whose first row is a header and whose
            first column holds one vocabulary token per row.

    Returns:
        list[str]: The tokens in file order. Blank rows are ignored instead
        of raising IndexError; an empty file yields an empty list.
    """
    # newline='' is what the csv module asks for so that quoted fields with
    # embedded newlines are parsed correctly.
    with open(path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # drop the header row (no-op on an empty file)
        return [row[0] for row in reader if row]


ingredient_vocab = _read_vocab(base_path + ing_vocab_path)
recipie_vocab = _read_vocab(base_path + rec_vocab_path)
# NOTE(review): this prints the whole vocabulary; consider showing
# len(recipie_vocab) and a short slice instead.
print(recipie_vocab)



In [10]:
# Recipe tokens are placed right after the ingredient tokens in the combined
# input vocabulary, so a recipe-vocab index is shifted by this offset.
rec_input_offset = len(ingredient_vocab)

# Combined vocabulary: ingredients, then recipe tokens, then three marker tokens.
input_vocab = [*ingredient_vocab, *recipie_vocab, "<ING>", "<REC>", "<END>"]

# Output vocabulary: recipe tokens followed by the end-of-sequence marker.
output_vocab = [*recipie_vocab, "<END>"]

len(input_vocab)

12966

In [5]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Toy input: three pairs of variable-length 1-D id tensors.
sequences = [
    [torch.tensor([1, 2, 3]), torch.tensor([1, 2])],
    [torch.tensor([4, 5]), torch.tensor([4])],
    [torch.tensor([6, 7, 8, 9]), torch.tensor([6, 7, 8])]
]

# Pad inside each pair first, then flatten every padded row into one list.
padded_sequences = [
    row
    for pair in sequences
    for row in pad_sequence(pair, batch_first=True, padding_value=0)
]

# Pad all rows to one common width: shape (2 * number_of_pairs, max_len).
padded_sequences = pad_sequence(padded_sequences, batch_first=True, padding_value=0)

y, x = padded_sequences.shape

# Largest even row count that fits.
adjusted_y = (y // 2) * 2

# Group consecutive rows as pairs, then move the pair-member axis to the front.
output_tensor = (
    padded_sequences[:adjusted_y, :]
    .view(adjusted_y // 2, 2, x)
    .permute(1, 0, 2)
)

# After the permute, 'output_tensor' has shape (2, adjusted_y // 2, x):
# index 0 holds the first member of every pair, index 1 the second.
output_tensor


tensor([[[1, 2, 3, 0],
         [4, 5, 0, 0],
         [6, 7, 8, 9]],

        [[1, 2, 0, 0],
         [4, 0, 0, 0],
         [6, 7, 8, 0]]])

In [6]:


# Define the Generator
class Generator(nn.Module):
    """Embedding -> LSTM -> linear head that scores every position of a sequence.

    Args:
        input_vocab_len: Number of rows in the embedding table.
        hidden_size: Width of both the embeddings and the LSTM hidden state.
        rec_vocab_len: Number of output scores produced per position.
    """

    def __init__(self, input_vocab_len, hidden_size, rec_vocab_len):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_vocab_len,
                                      embedding_dim=hidden_size)
        self.rnn = nn.LSTM(input_size=hidden_size,
                           hidden_size=hidden_size,
                           batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=rec_vocab_len)

    def forward(self, input):
        """Map token ids to per-position scores.

        Args:
            input: Integer id tensor; batch-first, since the LSTM is built
                with batch_first=True.

        Returns:
            Tensor with one `rec_vocab_len`-wide score vector per input position.
        """
        token_vectors = self.embedding(input)
        # Only the per-step outputs are used; the final (h, c) state is dropped.
        step_states, _final_state = self.rnn(token_vectors)
        return self.fc(step_states)

# Define the Discriminator
class Discriminator(nn.Module):
    """Embedding -> LSTM -> linear head that emits one logit per position.

    Args:
        input_vocab_len: Number of rows in the embedding table.
        hidden_size: Width of both the embeddings and the LSTM hidden state.
    """

    def __init__(self, input_vocab_len, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_vocab_len,
                                      embedding_dim=hidden_size)
        self.rnn = nn.LSTM(input_size=hidden_size,
                           hidden_size=hidden_size,
                           batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, input):
        """Score a batch of token-id sequences.

        Args:
            input: Integer id tensor; batch-first, since the LSTM is built
                with batch_first=True.

        Returns:
            Tensor holding a single raw logit for every input position
            (the last dimension has size 1).
        """
        token_vectors = self.embedding(input)
        # Only the per-step outputs are used; the final (h, c) state is dropped.
        step_states, _final_state = self.rnn(token_vectors)
        return self.fc(step_states)

In [7]:
class CSVDataLoader:
    """Stream (ingredients, recipe) id lists from a CSV file as padded batches.

    The file is expected to start with a header row; every following row has
    two columns, each holding the text of a Python list of integer ids.

    The loader can be driven manually (`open_csv_file` / `get_next_batch` /
    `close_csv_file`) or used as a context manager, which opens the file on
    entry and closes it on exit.

    Args:
        csv_file_path: Path of the CSV file to read.
        batch_size: Maximum number of rows returned per batch.
        num_rows: Upper bound on the number of data rows to serve before
            `has_next_batch` reports False.
    """

    def __init__(self, csv_file_path, batch_size, num_rows):
        self.csv_file_path = csv_file_path
        self.batch_size = batch_size
        self.num_rows = num_rows
        self.current_index = 0
        self.file_handle = None
        # Despite its name this attribute holds a csv.reader; the name is kept
        # so existing code that touches it keeps working.
        self.csv_writer = None
        self.headers = None
        # Set once the file runs out of rows before `num_rows` is reached.
        self._exhausted = False

    def open_csv_file(self):
        """Open the CSV file and consume its header row.

        Any handle left open by a previous call is closed first so that
        calling this twice does not leak a file descriptor.
        """
        self.close_csv_file()
        self.file_handle = open(self.csv_file_path, 'r', newline='')
        self.csv_writer = csv.reader(self.file_handle)
        # Assuming the first row contains headers; an empty file yields None.
        self.headers = next(self.csv_writer, None)
        self._exhausted = self.headers is None

    def close_csv_file(self):
        """Close the underlying file if it is open; safe to call repeatedly."""
        # getattr guard: __del__ can run on an instance whose __init__ failed.
        handle = getattr(self, 'file_handle', None)
        if handle is not None and not handle.closed:
            handle.close()

    def reset(self):
        """Reopen the file and restart reading from the first data row."""
        self.close_csv_file()
        self.open_csv_file()
        self.current_index = 0

    def has_next_batch(self):
        """Return True while rows remain, in the file and under `num_rows`."""
        return not self._exhausted and self.current_index < self.num_rows

    def get_next_batch(self, pad1, pad2, end_toekn):
        """Read up to `batch_size` rows and return them as one padded tensor.

        Args:
            pad1: Id used to right-pad ingredient sequences.
            pad2: Id used to right-pad recipe sequences.
            end_toekn: Id appended to every recipe sequence before padding.

        Returns:
            torch.LongTensor of shape (2, rows_read, width). Index 0 holds the
            ingredient sequences and index 1 the recipe sequences; `width` is
            the longest sequence in the batch. When no rows are left the
            result has shape (2, 0, 0).

        Raises:
            ValueError: If the file was closed and not reopened, if a row
                does not have exactly two columns, or if a cell is not a
                plain Python literal.
        """
        if self.file_handle is None:
            # Never opened: nothing has been consumed yet, so opening lazily is safe.
            self.open_csv_file()
        elif self.file_handle.closed:
            raise ValueError(
                "CSV file is closed; call open_csv_file() or reset() before reading"
            )

        ingredient_rows = []
        recipe_rows = []
        while len(ingredient_rows) < self.batch_size and self.has_next_batch():
            try:
                row = next(self.csv_writer)
            except StopIteration:
                # The file is shorter than num_rows: stop instead of crashing.
                self._exhausted = True
                break
            if not row:
                continue  # csv.reader yields [] for blank lines
            ing, rec = row
            # ast.literal_eval replaces eval: it only accepts literals, so text
            # in the data file can never run as code. An explicit dtype keeps
            # empty lists integer-typed.
            ingredient_rows.append(
                torch.tensor(list(ast.literal_eval(ing)), dtype=torch.long)
            )
            recipe_rows.append(
                torch.tensor(list(ast.literal_eval(rec)) + [end_toekn], dtype=torch.long)
            )
            self.current_index += 1

        if not ingredient_rows:
            return torch.empty((2, 0, 0), dtype=torch.long)

        # Pad each side with its own pad id directly, which removes the need
        # for a sentinel value that is rewritten afterwards.
        ingredients = pad_sequence(ingredient_rows, batch_first=True, padding_value=pad1)
        recipes = pad_sequence(recipe_rows, batch_first=True, padding_value=pad2)

        width = max(ingredients.shape[1], recipes.shape[1])
        output_tensor = torch.empty((2, len(ingredient_rows), width), dtype=torch.long)
        output_tensor[0] = pad1
        output_tensor[1] = pad2
        output_tensor[0, :, :ingredients.shape[1]] = ingredients
        output_tensor[1, :, :recipes.shape[1]] = recipes
        return output_tensor

    def __enter__(self):
        """Open (or reopen from the start) the file and return the loader."""
        if self.file_handle is None or self.file_handle.closed:
            self.reset()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file; exceptions from the `with` body are not suppressed."""
        self.close_csv_file()
        return False

    def __del__(self):
        self.close_csv_file()

In [12]:
# Smoke-test the loader on a single row.
# The path and row count come from the configuration cell instead of being
# repeated here as a second absolute path and a magic number.
data_path = base_path + ids_path
data_loader = CSVDataLoader(data_path, 1, data_len)
data_loader.open_csv_file()
try:
    d = data_loader.get_next_batch(ingredient_vocab.index("<PAD>"),
                                   recipie_vocab.index("<PAD>") + rec_input_offset,
                                   output_vocab.index("<END>"))
finally:
    # Release the file handle even if reading or parsing the row fails.
    data_loader.close_csv_file()
d

tensor([[[   3,   67,  206, 1827,   96,    4,   59,  345,    2,   23,  742,
            55,  227,    2,    3,   28,   34,   48,  196,  103,    2,    3,
           483,  117,   34,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1]],

        [[  31, 4028,  208,   16,  158,   12,   53,   91,    2,   31,   60,
            16,   44,   12,  208,    2,   21,  185,    4,   57,   62,   20,
            47,   19,   60,    2,   29,    3,  368,    3,   30, 2349,   14,
            77,  105,    2, 8195]]])

In [48]:
# Hyperparameters
# NOTE(review): vocab_size is a fixed figure and is not derived from the
# vocabularies loaded above -- confirm it covers every token id in the data.
vocab_size = 10_000  # Adjust based on your dataset
embedding_size = 128
hidden_size = 256
seq_length = 20  # Adjust based on your prompt length
lr = 1e-3
batch_size = 64

In [None]:
def patch_batch(ings, recs, start, midel, end):
    """Join an ingredient batch and a recipe batch into one marked-up sequence batch.

    The previous body could not run: it read `real_ingredients` before
    assigning it, built 1-D token tensors but concatenated along dim=1, and
    never used `recs`, `midel` or `end`. The layout below is inferred from
    the parameter names -- NOTE(review): confirm it is the intended one.

    Each output row is laid out as

        [start] + ings[i] + [midel] + recs[i] + [end]

    The marker columns are placed around the blocks as given, so any padding
    already inside `ings` / `recs` stays where it is.

    Args:
        ings: 2-D integer tensor, one ingredient sequence per row.
        recs: 2-D integer tensor with the same number of rows as `ings`.
        start: Id placed in the first column of every row.
        midel: Id placed between the ingredient and recipe blocks.
        end: Id placed in the last column of every row.

    Returns:
        Tensor of shape (rows, ings.shape[1] + recs.shape[1] + 3) with the
        dtype and device of `ings`.

    Raises:
        ValueError: If either input is not 2-D or the row counts differ.
    """
    if ings.dim() != 2 or recs.dim() != 2:
        raise ValueError("ings and recs must both be 2-D (batch, sequence) tensors")
    if ings.shape[0] != recs.shape[0]:
        raise ValueError(
            f"batch size mismatch: ings has {ings.shape[0]} rows, recs has {recs.shape[0]}"
        )

    rows = ings.shape[0]
    # new_full keeps the dtype and device of `ings`, so torch.cat needs no casts
    # for the marker columns.
    start_tokens = ings.new_full((rows, 1), start)
    midel_token = ings.new_full((rows, 1), midel)
    end_token = ings.new_full((rows, 1), end)
    return torch.cat(
        (start_tokens, ings, midel_token, recs.to(ings.dtype), end_token), dim=1
    )

In [None]:
# Instantiate Generator and Discriminator
# The generator's output width equals the discriminator's embedding-table
# size (both vocab_size); the soft-embedding trick below relies on that.
generator = Generator(vocab_size, embedding_size, vocab_size)
discriminator = Discriminator(vocab_size, embedding_size)

# Loss and Optimizer
criterion = nn.BCEWithLogitsLoss()
gen_optimizer = optim.Adam(generator.parameters(), lr=lr)
disc_optimizer = optim.Adam(discriminator.parameters(), lr=lr)

# Token ids needed by the loader; they do not change between iterations.
pad1 = ingredient_vocab.index("<PAD>")
pad2 = recipie_vocab.index("<PAD>") + rec_input_offset
end_token = output_vocab.index("<END>")

# The loader may have been built with another batch size and closed by an
# earlier cell, so configure it and restart from the first data row.
data_loader.batch_size = batch_size
data_loader.reset()


def _discriminate_soft(token_probs):
    """Score a relaxed token sequence with the discriminator.

    The discriminator's embedding layer only accepts integer ids, which
    would cut the gradient path back to the generator. Instead, the
    probability-weighted average of the embedding rows is fed to the LSTM.

    Args:
        token_probs: Float tensor (batch, seq, vocab_size) whose last axis
            sums to 1.

    Returns:
        Per-position logits of shape (batch, seq, 1).
    """
    soft_embedded = token_probs @ discriminator.embedding.weight
    hidden, _ = discriminator.rnn(soft_embedded)
    return discriminator.fc(hidden)


# Training Loop
# NOTE: each "epoch" here processes a single batch.
num_epochs = 1000  # Adjust based on your dataset and convergence
for epoch in range(num_epochs):
    # Wrap around to the start of the file once every row has been served.
    if not data_loader.has_next_batch():
        data_loader.reset()
    real_ingredients, real_recepies = data_loader.get_next_batch(pad1, pad2, end_token)

    # Generator logits -> per-position distribution over the vocabulary.
    generated_probs = torch.softmax(generator(real_ingredients), dim=-1)

    # Training the Discriminator
    disc_optimizer.zero_grad()

    real_output = discriminator(real_recepies)
    # Labels are shaped like the discriminator output (one logit per position),
    # so the last, possibly smaller, batch needs no special handling.
    real_loss = criterion(real_output, torch.ones_like(real_output))

    # detach(): the discriminator step must not push gradients into the generator.
    fake_output = _discriminate_soft(generated_probs.detach())
    fake_loss = criterion(fake_output, torch.zeros_like(fake_output))

    disc_loss = real_loss + fake_loss
    disc_loss.backward()
    disc_optimizer.step()

    # Training the Generator
    gen_optimizer.zero_grad()
    fake_output = _discriminate_soft(generated_probs)
    gen_loss = criterion(fake_output, torch.ones_like(fake_output))
    # This also fills discriminator gradients; they are cleared by
    # disc_optimizer.zero_grad() at the start of the next iteration.
    gen_loss.backward()
    gen_optimizer.step()

    # Print losses
    if epoch % 100 == 0:
        print(f'Epoch [{epoch}/{num_epochs}], Disc Loss: {disc_loss.item()}, Gen Loss: {gen_loss.item()}')