In [2]:
pip install groq

Collecting groq
  Downloading groq-0.13.0-py3-none-any.whl.metadata (13 kB)
Downloading groq-0.13.0-py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.8/108.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.13.0
Note: you may need to restart the kernel to use updated packages.


In [51]:
import pandas as pd
import random
from groq import Groq
# import streamlit as st
import re
import time

In [64]:
class VietnameseDishChatbot:
    """Generate question/answer pairs about Vietnamese dishes with a Groq chat model.

    The dish table is loaded from a CSV file. The code reads a ``Dish`` column
    plus one column per entry of ``ATTRIBUTES``. Every generated record is a
    dict with the keys ``Dish``, ``Attribute``, ``Question`` and ``Answer``.
    """

    # Columns that questions are generated for; these are also the keys of
    # ``system_prompts``.
    ATTRIBUTES = ['Dish', 'Description', 'Recipe', 'Similar_Dishes', 'Famous_Restaurants']

    # "Question: ... Answer: ..." with optional markdown bold around the labels.
    # The question capture is non-greedy so it stops at the FIRST "Answer:"
    # label; a later "Answer:" inside the answer text stays in the answer.
    _QA_PATTERN = re.compile(
        r'\*{0,2}Question\*{0,2}\s*:\*{0,2}\s*(.+?)\s*'
        r'\*{0,2}Answer\*{0,2}\s*:\*{0,2}\s*(.+)',
        re.DOTALL,
    )

    def __init__(self, file_path, api_key=None, model="mixtral-8x7b-32768", max_tokens=8192):
        """Load the dish table and create the Groq client.

        Args:
            file_path: Path of the CSV file holding the dish table.
            api_key: Groq API key. When ``None`` the Groq client falls back to
                the ``GROQ_API_KEY`` environment variable.
            model: Model identifier sent with every completion request.
                NOTE(review): confirm this model id is still served by Groq.
            max_tokens: ``max_tokens`` value sent with every completion request.
        """
        self.df = pd.read_csv(file_path)
        # SECURITY: the key must never be written into the notebook. A key was
        # previously hard-coded on this line; treat it as leaked and revoke it.
        self.client = Groq(api_key=api_key)
        self.model = model
        self.max_tokens = max_tokens
        self.system_prompts = {
            'Dish': [
                "Create unique questions about the name and origin of the dish",
                "Generate interesting questions related to the name and meaning of the dish"
            ],
            'Description': [
                "Create in-depth questions exploring the characteristics and culture of the dish",
                "Generate questions that highlight interesting details in the description"
            ],
            'Recipe': [
                "Create specialized questions about cooking techniques",
                "Generate questions related to ingredients and preparation methods"
            ],
            'Similar_Dishes': [
                "Create comparison questions to distinguish similar dishes",
                "Generate questions exploring the relationships between dishes"
            ],
            'Famous_Restaurants': [
                "Create questions about famous culinary locations",
                "Generate questions exploring local culinary culture"
            ]
        }

    @staticmethod
    def _fallback_record(dish, attribute):
        """Return the placeholder record used when no Q&A could be generated."""
        return {
            'Dish': dish,
            'Attribute': attribute,
            'Question': f"Information about {attribute} of {dish}",
            'Answer': "Unable to generate question and answer."
        }

    def _lookup_attribute(self, dish, attribute):
        """Return the attribute value of the first row matching ``dish``.

        Returns ``None`` when the column is absent, no row matches, or the cell
        is missing (NaN/None), so the caller never prompts the model with "nan".
        """
        if 'Dish' not in self.df.columns or attribute not in self.df.columns:
            return None
        matches = self.df.loc[self.df['Dish'] == dish, attribute]
        if matches.empty:
            return None
        value = matches.iloc[0]
        if pd.isna(value):
            return None
        return value

    def _parse_response(self, dish, attribute, full_response):
        """Split a model reply into a question/answer record.

        Replies without recognisable labels are kept whole as the answer under
        a generic question.
        """
        match = self._QA_PATTERN.search(full_response)
        if match:
            question, answer = match.groups()
            return {
                'Dish': dish,
                'Attribute': attribute,
                'Question': question.strip(),
                'Answer': answer.strip()
            }
        return {
            'Dish': dish,
            'Attribute': attribute,
            'Question': f"Details about {attribute} of {dish}?",
            'Answer': full_response.strip()
        }

    def generate_qa_with_retry(self, dish, attribute, max_retries=3, retry_delay=2):
        """Generate one Q&A record for ``dish``/``attribute``, retrying on errors.

        Args:
            dish: Value of the ``Dish`` column identifying the row.
            attribute: Column whose content grounds the question.
            max_retries: Maximum number of API attempts.
            retry_delay: Seconds to wait between two failed attempts.

        Returns:
            A dict with the keys ``Dish``, ``Attribute``, ``Question`` and
            ``Answer``. The placeholder record is returned when the source data
            is unavailable or every attempt failed.
        """
        source_text = self._lookup_attribute(dish, attribute)
        prompt_pool = self.system_prompts.get(attribute)
        # Missing data or prompts are deterministic failures: retrying (and
        # sleeping) cannot fix them, so bail out before touching the API.
        if source_text is None or not prompt_pool:
            return self._fallback_record(dish, attribute)

        # The labels requested here are the ones _QA_PATTERN parses.
        user_prompt = (
            f"Information about {dish}:\n"
            f"{attribute}: {source_text}\n\n"
            "Create an original question and an in-depth answer.\n"
            "Format the response exactly as:\n"
            "Question: <your question>\n"
            "Answer: <your answer>"
        )

        for attempt in range(max_retries):
            try:
                system_prompt = random.choice(prompt_pool)
                response = self.client.chat.completions.create(
                    messages=[
                        {
                            "role": "system",
                            "content": f"{system_prompt}. You are a Vietnamese culinary expert."
                        },
                        {
                            "role": "user",
                            "content": user_prompt
                        }
                    ],
                    model=self.model,
                    max_tokens=self.max_tokens
                )
                full_response = response.choices[0].message.content
                return self._parse_response(dish, attribute, full_response)
            except Exception as e:
                # Best effort: one failed request must not abort a long run.
                print(f"Error on attempt {attempt + 1} for {dish} - {attribute}: {e}")
                if attempt + 1 < max_retries:
                    time.sleep(retry_delay)  # Wait only when another attempt follows

        return self._fallback_record(dish, attribute)

    def generate_comprehensive_qa_dataset(self, batch_size=5, questions_per_attribute=5,
                                          pause_seconds=1,
                                          checkpoint_prefix='vietnamese_dishes_qa_batch_'):
        """Generate Q&A records for every dish, checkpointing after each batch.

        Args:
            batch_size: Number of dishes processed between two checkpoints.
            questions_per_attribute: Q&A records generated per dish and attribute.
            pause_seconds: Seconds to wait after finishing each dish.
            checkpoint_prefix: Checkpoint files are written to
                ``f"{checkpoint_prefix}{batch_number}.csv"``. Each checkpoint
                holds ALL records generated so far, not only the latest batch.
                Pass ``None`` to disable checkpointing.

        Returns:
            A DataFrame with one row per generated record.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        all_qa_pairs = []
        dishes = self.df['Dish']

        for batch_number, start in enumerate(range(0, len(dishes), batch_size), start=1):
            for dish in dishes.iloc[start:start + batch_size]:
                for attribute in self.ATTRIBUTES:
                    for _ in range(questions_per_attribute):
                        all_qa_pairs.append(self.generate_qa_with_retry(dish, attribute))

                # Print progress
                print(f"Processed {dish}")

                # Short wait between dishes
                time.sleep(pause_seconds)

            # Save the cumulative results so a crash loses at most one batch
            if checkpoint_prefix is not None:
                pd.DataFrame(all_qa_pairs).to_csv(
                    f'{checkpoint_prefix}{batch_number}.csv',
                    index=False, encoding='utf-8-sig')

        return pd.DataFrame(all_qa_pairs)

In [68]:
class VietnameseDishChatbotPart3:
    """Generate question/answer pairs about Vietnamese dishes with a Groq chat model.

    Counterpart of ``VietnameseDishChatbot`` whose checkpoints are written
    under a ``part3`` file prefix. The dish table is loaded from a CSV file;
    the code reads a ``Dish`` column plus one column per entry of
    ``ATTRIBUTES``. Every generated record is a dict with the keys ``Dish``,
    ``Attribute``, ``Question`` and ``Answer``.

    NOTE(review): this class duplicates ``VietnameseDishChatbot`` almost line
    for line; consider consolidating the two into a single class.
    """

    # Columns that questions are generated for; these are also the keys of
    # ``system_prompts``.
    ATTRIBUTES = ['Dish', 'Description', 'Recipe', 'Similar_Dishes', 'Famous_Restaurants']

    # "Question: ... Answer: ..." with optional markdown bold around the labels.
    # The question capture is non-greedy so it stops at the FIRST "Answer:"
    # label; a later "Answer:" inside the answer text stays in the answer.
    _QA_PATTERN = re.compile(
        r'\*{0,2}Question\*{0,2}\s*:\*{0,2}\s*(.+?)\s*'
        r'\*{0,2}Answer\*{0,2}\s*:\*{0,2}\s*(.+)',
        re.DOTALL,
    )

    def __init__(self, file_path, api_key=None, model="mixtral-8x7b-32768", max_tokens=8192):
        """Load the dish table and create the Groq client.

        Args:
            file_path: Path of the CSV file holding the dish table.
            api_key: Groq API key. When ``None`` the Groq client falls back to
                the ``GROQ_API_KEY`` environment variable.
            model: Model identifier sent with every completion request.
                NOTE(review): confirm this model id is still served by Groq.
            max_tokens: ``max_tokens`` value sent with every completion request.
        """
        self.df = pd.read_csv(file_path)
        # SECURITY: the key must never be written into the notebook. A key was
        # previously hard-coded on this line; treat it as leaked and revoke it.
        self.client = Groq(api_key=api_key)
        self.model = model
        self.max_tokens = max_tokens
        self.system_prompts = {
            'Dish': [
                "Create unique questions about the name and origin of the dish",
                "Generate interesting questions related to the name and meaning of the dish"
            ],
            'Description': [
                "Create in-depth questions exploring the characteristics and culture of the dish",
                "Generate questions that highlight interesting details in the description"
            ],
            'Recipe': [
                "Create specialized questions about cooking techniques",
                "Generate questions related to ingredients and preparation methods"
            ],
            'Similar_Dishes': [
                "Create comparison questions to distinguish similar dishes",
                "Generate questions exploring the relationships between dishes"
            ],
            'Famous_Restaurants': [
                "Create questions about famous culinary locations",
                "Generate questions exploring local culinary culture"
            ]
        }

    @staticmethod
    def _fallback_record(dish, attribute):
        """Return the placeholder record used when no Q&A could be generated."""
        return {
            'Dish': dish,
            'Attribute': attribute,
            'Question': f"Information about {attribute} of {dish}",
            'Answer': "Unable to generate question and answer."
        }

    def _lookup_attribute(self, dish, attribute):
        """Return the attribute value of the first row matching ``dish``.

        Returns ``None`` when the column is absent, no row matches, or the cell
        is missing (NaN/None), so the caller never prompts the model with "nan".
        """
        if 'Dish' not in self.df.columns or attribute not in self.df.columns:
            return None
        matches = self.df.loc[self.df['Dish'] == dish, attribute]
        if matches.empty:
            return None
        value = matches.iloc[0]
        if pd.isna(value):
            return None
        return value

    def _parse_response(self, dish, attribute, full_response):
        """Split a model reply into a question/answer record.

        Replies without recognisable labels are kept whole as the answer under
        a generic question.
        """
        match = self._QA_PATTERN.search(full_response)
        if match:
            question, answer = match.groups()
            return {
                'Dish': dish,
                'Attribute': attribute,
                'Question': question.strip(),
                'Answer': answer.strip()
            }
        return {
            'Dish': dish,
            'Attribute': attribute,
            'Question': f"Details about {attribute} of {dish}?",
            'Answer': full_response.strip()
        }

    def generate_qa_with_retry(self, dish, attribute, max_retries=3, retry_delay=2):
        """Generate one Q&A record for ``dish``/``attribute``, retrying on errors.

        Args:
            dish: Value of the ``Dish`` column identifying the row.
            attribute: Column whose content grounds the question.
            max_retries: Maximum number of API attempts.
            retry_delay: Seconds to wait between two failed attempts.

        Returns:
            A dict with the keys ``Dish``, ``Attribute``, ``Question`` and
            ``Answer``. The placeholder record is returned when the source data
            is unavailable or every attempt failed.
        """
        source_text = self._lookup_attribute(dish, attribute)
        prompt_pool = self.system_prompts.get(attribute)
        # Missing data or prompts are deterministic failures: retrying (and
        # sleeping) cannot fix them, so bail out before touching the API.
        if source_text is None or not prompt_pool:
            return self._fallback_record(dish, attribute)

        # The labels requested here are the ones _QA_PATTERN parses.
        user_prompt = (
            f"Information about {dish}:\n"
            f"{attribute}: {source_text}\n\n"
            "Create an original question and an in-depth answer.\n"
            "Format the response exactly as:\n"
            "Question: <your question>\n"
            "Answer: <your answer>"
        )

        for attempt in range(max_retries):
            try:
                system_prompt = random.choice(prompt_pool)
                response = self.client.chat.completions.create(
                    messages=[
                        {
                            "role": "system",
                            "content": f"{system_prompt}. You are a Vietnamese culinary expert."
                        },
                        {
                            "role": "user",
                            "content": user_prompt
                        }
                    ],
                    model=self.model,
                    max_tokens=self.max_tokens
                )
                full_response = response.choices[0].message.content
                return self._parse_response(dish, attribute, full_response)
            except Exception as e:
                # Best effort: one failed request must not abort a long run.
                print(f"Error on attempt {attempt + 1} for {dish} - {attribute}: {e}")
                if attempt + 1 < max_retries:
                    time.sleep(retry_delay)  # Wait only when another attempt follows

        return self._fallback_record(dish, attribute)

    def generate_comprehensive_qa_dataset(self, batch_size=5, questions_per_attribute=5,
                                          pause_seconds=1,
                                          checkpoint_prefix='vietnamese_dishes_qa_part3_batch_'):
        """Generate Q&A records for every dish, checkpointing after each batch.

        Args:
            batch_size: Number of dishes processed between two checkpoints.
            questions_per_attribute: Q&A records generated per dish and attribute.
            pause_seconds: Seconds to wait after finishing each dish.
            checkpoint_prefix: Checkpoint files are written to
                ``f"{checkpoint_prefix}{batch_number}.csv"``. Each checkpoint
                holds ALL records generated so far, not only the latest batch.
                Pass ``None`` to disable checkpointing.

        Returns:
            A DataFrame with one row per generated record.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        all_qa_pairs = []
        dishes = self.df['Dish']

        for batch_number, start in enumerate(range(0, len(dishes), batch_size), start=1):
            for dish in dishes.iloc[start:start + batch_size]:
                for attribute in self.ATTRIBUTES:
                    for _ in range(questions_per_attribute):
                        all_qa_pairs.append(self.generate_qa_with_retry(dish, attribute))

                # Print progress
                print(f"Processed {dish}")

                # Short wait between dishes
                time.sleep(pause_seconds)

            # Save the cumulative results so a crash loses at most one batch
            if checkpoint_prefix is not None:
                pd.DataFrame(all_qa_pairs).to_csv(
                    f'{checkpoint_prefix}{batch_number}.csv',
                    index=False, encoding='utf-8-sig')

        return pd.DataFrame(all_qa_pairs)

In [70]:
# Build the Part 3 generator from the dish table in the Kaggle input directory.
chatbot = VietnameseDishChatbotPart3(
    file_path='/kaggle/input/dataset3/dishes_from_hutieu.csv',
)

In [71]:
# Long-running cell: every dish triggers 5 attributes x 5 questions = 25 model
# requests. A checkpoint CSV is written after each batch of 5 dishes.
qa_dataset = chatbot.generate_comprehensive_qa_dataset(batch_size=5)

Processed Hủ tiếu
Processed Bánh cu đơ
Processed Nem nướng
Processed Bánh mì cay
Processed Cơm cháy
Processed Bò bía
Processed Bánh đậu xanh
Processed Bánh đa cua
Processed Bún cá


In [None]:
# Persist the complete Q&A table. 'utf-8-sig' prefixes the file with a BOM --
# presumably so spreadsheet tools detect the encoding of the Vietnamese text.
qa_dataset.to_csv(
    'comprehensive_vietnamese_dishes_qa.csv',
    encoding='utf-8-sig',
    index=False,
)

In [None]:
# Preview up to 10 random rows. Capping the sample at the table size avoids the
# ValueError that DataFrame.sample raises when asked for more rows than exist.
# The bare expression lets the notebook render the frame as a rich table.
qa_dataset.sample(min(10, len(qa_dataset)))