<a href="https://colab.research.google.com/github/GarimaChopra/Generative_AI/blob/main/Final_project_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing Libraries
import pandas as pd
import numpy as np


In [None]:
! pip install -U accelerate
! pip install -U transformers

In [None]:
# Read the recipe CSV straight from the project's GitHub repository and preview it.
DATASET_URL = (
    'https://raw.githubusercontent.com/GarimaChopra/Generative_AI/main/'
    'Research%20Project/Indian_Food_Cleaned_Dataset1.csv'
)
df = pd.read_csv(DATASET_URL)
df.head(20)

In [None]:
df.shape

In [None]:
# Remove the columns that are not needed, then discard every row that still
# contains a missing value in any remaining column.
columns_to_drop = ['URL', 'TranslatedIngredients', 'TotalTimeInMins', 'image-url']
df = df.drop(columns=columns_to_drop)
df = df.dropna()
df.shape

In [None]:
df.head(10)

In [None]:
# Dropping irrelevant recipes from other cuisines
# Cuisine labels treated as irrelevant for this project; any row whose 'Cuisine'
# value appears in this list is removed.
cuisines_to_drop = ['Mexican', 'Italian Recipes', 'Thai', 'Chinese', 'Asian', 'Middle Eastern', 'European',
                   'Arab', 'Japanese', 'Vietnamese', 'British', 'Greek', 'French', 'Mediterranean', 'Sri Lankan',
                   'Indonesian', 'African', 'Korean', 'American', 'Carribbean', 'World Breakfast', 'Malaysian', 'Dessert',
                   'Afghan', 'Snack', 'Jewish', 'Brunch', 'Lunch', 'Continental', 'Fusion']

# .copy() makes the filtered result an independent frame, so the column
# assignments performed in later cells do not raise SettingWithCopyWarning.
df = df[~df['Cuisine'].isin(cuisines_to_drop)].copy()
df.shape

In [None]:
# Cleaning recipe names
import re
def clean_recipe(recipe_str):
    """Reduce a raw recipe title to its leading name.

    Parenthesised text is deleted first; the title is then cut at the first
    " - " and at the first " | ", and surrounding whitespace is trimmed.

    Args:
        recipe_str: Raw recipe title.

    Returns:
        The cleaned title as a string.
    """
    title = re.sub(r'\(.*?\)', '', recipe_str)
    # Keep only what precedes each separator, dash first and pipe second.
    for separator in (" - ", " | "):
        title = title.partition(separator)[0]
    return title.strip()

# Replace every raw recipe title with its cleaned form (element-wise).
df["TranslatedRecipeName"] = df["TranslatedRecipeName"].map(clean_recipe)


In [None]:
df.sample(20)

In [None]:
df.shape

In [None]:
#Cleaning Ingredients
# Words removed from every ingredient entry. Built once as a frozenset so each
# membership test is a hash lookup instead of a scan of a list that used to be
# rebuilt on every call.
# NOTE: matching is done word by word, so the multi-word entries below
# ('as per use', 'to taste', 'de seeded', ...) can never match on their own.
_INGREDIENT_DESCRIPTORS = frozenset([
    'teaspoon', 'tablespoon', 'cup', 'cups', 'as per use', 'grams', 'pieces', 'sliced', 'chopped',
    'finely', 'diced', 'desi', 'gms', 'tbsp', 'tsp', 'ml', 'inch', 'large',
    'medium', 'small', 'shredded', 'to taste', 'roughly', 'fresh', 'peeled',
    'de seeded', 'deseeded', 'crushed', 'whole', 'cubes', 'cube', 'round',
    'grated', 'powder', 'optional', 'dry', 'washed', 'soaked', 'cooked',
    'uncooked', 'ripe', 'unripe', 'frozen', 'thin', 'thick', 'cleaned', 'thinly',
    'for', 'into', 'and', 'as per taste', 'to', 'cut', 'overnight', 'leaves',
    'into strips', 'ti', 'minutes', 'whisked', 'salt', 'to taste', 'taste', 'Salt', 'tablespoons',
    'Red', 'Green', 'as', 'Haldi', 'Jeera', 'Dhaniya',  # this comma was missing: 'Dhaniya' and 'oil' fused into 'Dhaniyaoil'
    'oil', 'or', 'a', 'black', 'dry', 'masala', 'deep', 'Oil', 'Teaspoon', 'Leaves', 'powder', 'Dry', 'Whole', 'fresh',
    'Garam', 'gram', 'seeds', 'Required', 'water', 'Water', 'green', 'white', 'per', 'pinch', 'paste', 'slit', '&', 'leaf', 'boiled',
    'yellow', 'garnish', 'curry', 'minced', 'mashed', 'optional', 'few', 'fresh', 'required', 'in', 'use', 'of', 'sprig', 'Badi', 'Kala', 'red',
])


def clean_ingredients(ingredient_str):
    """Strip quantities, units and descriptor words from an ingredient list.

    The input is split on commas. In each entry digits are deleted, dashes
    and slashes become spaces, brackets are deleted, and every
    whitespace-separated word found in the descriptor set is dropped
    (case-sensitive comparison).

    Args:
        ingredient_str: Comma-separated ingredient string.

    Returns:
        The cleaned entries joined with ", ". An entry whose words were all
        removed is kept as an empty string, so the number of entries is
        unchanged.
    """
    cleaned_ingredients = []
    for ingredient in ingredient_str.split(','):
        # Remove digits
        ingredient = re.sub(r'\d+', '', ingredient)
        # Replace dashes and slashes with spaces
        ingredient = re.sub(r'[-/]', ' ', ingredient)
        # Remove round, square and curly brackets (their content is kept)
        ingredient = re.sub(r"[\([{})\]]", "", ingredient)

        # Remove descriptor words; split()/join also collapses repeated whitespace
        kept_words = [word for word in ingredient.split() if word not in _INGREDIENT_DESCRIPTORS]
        cleaned_ingredients.append(' '.join(kept_words))
    return ', '.join(cleaned_ingredients)

# `df` comes from boolean filtering, so take an explicit copy and assign the
# filled column back. Calling fillna(inplace=True) on a column selection warns
# on current pandas and no longer updates `df` once copy-on-write is enabled.
df = df.copy()
df['Cleaned-Ingredients'] = df['Cleaned-Ingredients'].fillna("")
# Filter out rows containing Devanagari script characters
df = df[~df["Cleaned-Ingredients"].str.contains(r'[ऀ-ॿ]')].copy()
df["Cleaned-Ingredients"] = df["Cleaned-Ingredients"].apply(clean_ingredients)

In [None]:
df.shape

In [None]:
import nltk
# Count every whitespace-separated token across the cleaned ingredient strings.
vocabulary = nltk.FreqDist()
for ingredients in df['Cleaned-Ingredients']:
    vocabulary.update(ingredients.split())

# Print the 200 most frequent tokens, one per line, as "token;count".
for word, frequency in vocabulary.most_common(200):
    print(word, frequency, sep=';')

In [None]:
# Discard recipes whose instructions contain Devanagari script characters.
has_devanagari = df["TranslatedInstructions"].str.contains(r'[ऀ-ॿ]')
df = df[~has_devanagari]

In [None]:
df.shape

In [None]:
import torch
import torch.nn as nn
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from tqdm.auto import tqdm

In [None]:
model_name = 'gpt2'

In [None]:
# Special tokens handed to the tokenizer: sequence start/end, unknown and padding.
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'unk_token': '<|unknown|>',
    'pad_token': '<|pad|>',
}
tokenizer = GPT2TokenizerFast.from_pretrained(model_name, **special_tokens)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Resize the embedding matrix to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

In [None]:
model_save_path = '/content/sample_data/Research'


In [None]:
tokenizer.save_pretrained(model_save_path)

In [None]:
tokenizer.convert_tokens_to_ids(['<|pad|>'])

In [None]:
def generate(prompt):
    """Sample a continuation of ``prompt`` from the notebook's ``model`` and print it.

    Args:
        prompt: Text the generation is conditioned on.

    Prints the decoded sequence (special tokens included); returns nothing.
    """
    # encode_plus returns a mapping (input_ids, attention_mask), not a tensor, so
    # it must be unpacked into generate(). It is moved to the model's device first
    # so generation also works once the model lives on the GPU.
    inputs = tokenizer.encode_plus(prompt, return_tensors='pt').to(model.device)
    output = model.generate(**inputs,
                            max_length=256,
                            do_sample=True,
                            # Read the pad id from the tokenizer instead of hard-coding 50259.
                            pad_token_id=tokenizer.pad_token_id)
    print(tokenizer.decode(output[0]))

In [None]:
tokenizer.special_tokens_map

In [None]:
tokenizer.convert_tokens_to_ids(['<|startoftext|>'],)

In [None]:
def print_recipe(idx):
    """Print the cleaned ingredients and the instructions of one recipe.

    Args:
        idx: Index *label* of the row in ``df``. Labels are not positions:
            the filtering cells remove rows without resetting the index.

    Raises:
        KeyError: If ``idx`` is not a label present in ``df``.
    """
    # Use the columns the rest of the notebook reads when building training
    # strings; 'Ingredients' / 'instructions' are not referenced anywhere else.
    # NOTE(review): confirm against the CSV header.
    row = df.loc[idx]
    print(f"{row['Cleaned-Ingredients']}\n\n{row['TranslatedInstructions']}")

In [None]:
def form_string(ingredient,instruction):
    """Assemble one training example from an ingredient string and its instructions.

    Both arguments are stripped of surrounding whitespace and placed under the
    "Ingredients:" and "Instructions:" headings, wrapped in the
    start-of-text / end-of-text tokens.

    Returns:
        The assembled example as a single string.
    """
    pieces = (
        "<|startoftext|>Ingredients:\n",
        ingredient.strip(),
        "\n\nInstructions:\n",
        instruction.strip(),
        "<|endoftext|>",
    )
    return "".join(pieces)

In [None]:
# One training string per recipe. Iterating the two columns side by side avoids
# building a Series for every row, as DataFrame.apply(axis=1) does.
data = [
    form_string(ingredient, instruction)
    for ingredient, instruction in zip(df['Cleaned-Ingredients'], df['TranslatedInstructions'])
]

In [None]:
# Sequential split: the first 85% of the examples are used for training,
# the remainder for validation (no shuffling).
train_size = 0.85
train_len = int(len(data) * train_size)
train_data, val_data = data[:train_len], data[train_len:]

In [None]:
class RecipeDataset:
    """Recipe strings tokenised once up front, exposed via ``len()`` and indexing.

    Each string in ``data`` is encoded with the notebook-level ``tokenizer``,
    truncated / padded to exactly 1024 tokens. Indexing returns the
    ``(input_ids, attention_mask)`` pair for that example.
    """

    def __init__(self,data):
        self.data = data
        self.input_ids = []
        self.attn_masks = []

        for text in tqdm(data):
            encoded = tokenizer.encode_plus(
                text,
                truncation=True,
                padding='max_length',
                max_length=1024,
                return_tensors='pt',
            )
            # Drop axis 0 (size 1 for a single encoded string) so each stored
            # tensor is one-dimensional.
            self.input_ids.append(encoded['input_ids'].squeeze(0))
            self.attn_masks.append(encoded['attention_mask'].squeeze(0))

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        ids, mask = self.input_ids[idx], self.attn_masks[idx]
        return ids, mask

In [None]:
def collate_fn(batch):
    """Stack ``(input_ids, attention_mask)`` pairs into one causal-LM training batch.

    Args:
        batch: Sequence of ``(input_ids, attention_mask)`` tensor pairs, all of
            the same length.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each stacked
        along a new leading batch dimension. ``labels`` equals ``input_ids``
        except at positions where the attention mask is 0, which hold -100.
    """
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    # Padded positions must not contribute to the loss: -100 is the index the
    # language-modelling loss ignores. Without it, examples padded to a fixed
    # length are scored mostly on predicting the pad token.
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

In [None]:
train_ds = RecipeDataset(train_data)
val_ds = RecipeDataset(val_data)

In [None]:
# Training configuration: one epoch, 2 examples per device per step with
# 2-step gradient accumulation, no experiment reporting, save_strategy 'no'.
training_config = dict(
    output_dir=model_save_path,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    report_to='none',
    num_train_epochs=1,
    save_strategy='no',
)
args = TrainingArguments(**training_config)

In [None]:
# AdamW over all model parameters, paired with a cosine-annealing schedule
# whose first restart period is 20 scheduler steps (floor learning rate 1e-7).
optim = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer=optim,
    T_0=20,
    eta_min=1e-7,
)

In [None]:
# Wire the model, datasets, collator and the hand-built optimizer/scheduler pair
# into the Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collate_fn,
    optimizers=(optim, scheduler),
)

In [None]:
trainer.train()

Step,Training Loss


In [None]:
trainer.save_model()

In [None]:
from transformers import pipeline

In [None]:
# Load the generation pipeline from the directory the Trainer saved to. Reusing
# model_save_path (instead of repeating the literal path) keeps the save and
# load locations in sync.
pl = pipeline(task='text-generation', model=model_save_path)


In [None]:

output_dir = './model_80-20-train/'

print(f"Saving model to {output_dir}")

# Write the model (weights + configuration) and the tokenizer with
# `save_pretrained()`; both can be restored later with `from_pretrained()`.
# A model wrapped for distributed/parallel training exposes the real model as `.module`.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
output_dir = './model_80-20-train/'
# Load the fine-tuned model back from disk
model = GPT2LMHeadModel.from_pretrained(output_dir)

# Use the GPU when one is available; fall back to the CPU instead of raising
# on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A single move is enough; .to(device) returns the model, which the cell displays.
model.to(device)

In [None]:
def create_prompt(ingredients):
    """Turn a comma-separated ingredient string into a generation prompt.

    Each comma-separated entry is stripped and lower-cased; the entries are
    placed one per line under the "Ingredients:" heading, after the
    start-of-text token.

    Args:
        ingredients: Comma-separated ingredient names.

    Returns:
        The prompt string, ending with a newline.
    """
    names = [piece.strip().lower() for piece in ingredients.split(',')]
    return "<|startoftext|>Ingredients:\n" + '\n'.join(names) + "\n"

In [None]:
ingredients = ['Rice,CHICKEN,LAMB,paneer']

In [None]:
for ing in ingredients:
    prompt = create_prompt(ing)
    # penalty_alpha together with a small top_k selects contrastive search in
    # transformers' generate(); at most 512 new tokens are produced.
    generated = pl(prompt,
                   max_new_tokens=512,
                   penalty_alpha=0.6,
                   top_k=4,
                   # Read the pad id from the tokenizer instead of hard-coding 50259.
                   pad_token_id=tokenizer.pad_token_id)
    print(generated[0]['generated_text'])