# Setup Vertex AI 

In [5]:
import pandas as pd
import vertexai
from vertexai.generative_models import GenerativeModel
import random
import time
import logging
from datetime import datetime, timedelta
import re
import json
# Set up logging: INFO level and above, each record prefixed with a timestamp and level name
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Google Cloud project passed to the Vertex AI SDK initialisation below.
PROJECT_ID = "bhagwasanatantimes"
# Must run before the GenerativeModel handle is created so the SDK knows the project/location.
vertexai.init(project=PROJECT_ID, location="us-central1")

# Shared model handle; the generator functions in the later cells call model.generate_content().
model = GenerativeModel("gemini-1.5-flash")


# Function to generate synthetic product data


In [None]:
def generate_product_data(n, requests_per_minute=2):
    """Generate ``n`` synthetic 'Health & Household' product rows.

    One ``model.generate_content`` call is made per row. The returned text is
    parsed line by line for a product type, a key-features section and a
    target-audience section, which are combined into the row's title.

    Args:
        n: Number of product rows to generate.
        requests_per_minute: Maximum number of API calls to issue per minute.
            A pause of ``60 / requests_per_minute`` seconds is inserted
            between consecutive calls.

    Returns:
        pandas.DataFrame with columns ``X`` (0-based record index), ``title``,
        ``parent_asin``, ``categories`` and ``cat1``..``cat6``. The category
        columns hold the same fixed values for every row.

    Raises:
        ValueError: If ``requests_per_minute`` is not positive.
    """
    if requests_per_minute <= 0:
        raise ValueError("requests_per_minute must be a positive number")
    sleep_time = 60 / requests_per_minute  # Delay between two consecutive calls

    # The category path is fixed for every generated product.
    categories = "['Health & Household', 'Personal Care', 'Skincare', 'Sensitive Skin Care', 'Rosacea Routines']"
    data = []

    for i in range(n):
        # Pause before every call except the first. Sleeping 60/rpm only once
        # per batch of rpm calls would allow roughly rpm**2 calls per minute.
        if i != 0:
            print(f"Sleeping to avoid hitting the rate limit. Record: {i}")
            time.sleep(sleep_time)

        # Generate a structured product description
        response = model.generate_content(
            "Generate a detailed description for a product in the 'Health & Household' category, "
            "including type, key features, and target audience."
        )
        description = response.text.strip()

        # Parsing lives in a helper so its loop variables cannot clobber the
        # record index ``i`` used for the ``X`` and ``parent_asin`` columns.
        product_type, key_features, target_audience = _parse_product_description(description)

        print(target_audience,key_features,product_type)
        title = f"{product_type} for {target_audience} with features: {key_features}"
        data.append([i, title, 'B0' + str(1000000 + i), categories, 'Health & Household', 'Personal Care',
                     'Skincare', 'Sensitive Skin Care', 'Rosacea Routines', ''])

    return pd.DataFrame(data, columns=['X', 'title', 'parent_asin', 'categories', 'cat1', 'cat2',
                                       'cat3', 'cat4', 'cat5', 'cat6'])


def _parse_product_description(description):
    """Return ``(product_type, key_features, target_audience)`` parsed from the generated text."""
    lines = description.split('\n')

    # The product type is expected after 'Type:' on the third line (index 2);
    # shorter responses keep the placeholder instead of raising IndexError.
    product_type = 'Unknown Product Type'
    if len(lines) > 2 and 'Type:' in lines[2]:
        product_type = lines[2].split(":")[-1].strip().replace('*', '')

    key_features = _collect_section(lines, 'Key Features:', 'Target Audience:', 'Unknown Key Features')
    target_audience = _collect_section(lines, 'Target Audience:', 'Benefits:', 'General Audience')
    return product_type, key_features, target_audience


def _collect_section(lines, header, stop_marker, default):
    """Join the non-blank lines that follow ``header`` up to ``stop_marker``, with '*' removed."""
    section = default
    for index, line in enumerate(lines):
        if header not in line:
            continue
        collected = []
        for following in lines[index + 1:]:
            if stop_marker in following:
                break
            if following.strip():  # Only include non-empty lines
                collected.append(following.replace('*', ''))
        # If the header occurs more than once, the last occurrence wins.
        section = ' '.join(collected)
    return section

# Generate and Save Product Data

In [None]:
# Generate entries: 50 rows, one model call per row (generate_product_data throttles the calls)
product_data = generate_product_data(50)

# Save to CSV without the DataFrame index.
# quoting=2 is the numeric value of csv.QUOTE_NONNUMERIC: every non-numeric field is wrapped in '"'.
product_data.to_csv('synthetic_product_data50.csv', index=False, quotechar='"', quoting=2)

In [None]:
def generate_product_data_with_categories(n, requests_per_minute=2):
    """Generate up to ``n`` synthetic product rows with model-supplied categories.

    For each record a product family is picked at random and the model is
    asked for a JSON description. Responses that are not a JSON object are
    logged and skipped, so the result may contain fewer than ``n`` rows.

    Args:
        n: Number of generation attempts (one model call each).
        requests_per_minute: Maximum number of API calls to issue per minute.
            A pause of ``60 / requests_per_minute`` seconds is inserted
            between consecutive calls.

    Returns:
        pandas.DataFrame with columns ``X`` (0-based attempt index), ``title``,
        ``parent_asin``, ``categories`` (string form of the extracted names)
        and ``cat1``..``cat6``. ``cat1`` falls back to 'Health & Household'
        when no category name could be extracted; unused slots are ''.

    Raises:
        ValueError: If ``requests_per_minute`` is not positive.
    """
    if requests_per_minute <= 0:
        raise ValueError("requests_per_minute must be a positive number")
    sleep_time = 60 / requests_per_minute  # Delay between two consecutive calls

    # All prompts share one wording; only the product family differs.
    prompt_template = (
        'Generate a detailed description and relevant categories for {subject}, '
        'including type, key features, target audience, and return the result in '
        'JSON format with fields: title, parent_asin, categories, and '
        'category-related keys.'
    )
    # (category label, phrase inserted into the prompt)
    category_sets = [
        ('Skincare', 'a skincare product'),
        ('Vitamins & Supplements', 'a vitamin or supplement product'),
        ('Baby Care', 'a baby care product'),
        ('Medical Supplies', 'a medical supply product'),
        ('Oral Care', 'an oral care product'),
    ]

    data = []
    for i in range(n):
        # Pause before every call except the first. Sleeping 60/rpm only once
        # per batch of rpm calls would allow roughly rpm**2 calls per minute.
        if i != 0:
            print(f"Sleeping to avoid hitting the rate limit. Record: {i}")
            time.sleep(sleep_time)

        # Randomly choose a product family for this record
        _, subject = random.choice(category_sets)
        response = model.generate_content(prompt_template.format(subject=subject))
        raw_text = _strip_code_fence(response.text)

        # Attempt to parse the cleaned JSON; report and skip unusable responses
        try:
            json_response = json.loads(raw_text)
        except json.JSONDecodeError as exc:
            logging.warning("Record %d: skipping response that is not valid JSON (%s)", i, exc)
            continue
        if not isinstance(json_response, dict):
            logging.warning("Record %d: skipping JSON response that is not an object", i)
            continue

        # Extract fields from JSON response
        title = json_response.get('title', 'Unknown Product')
        parent_asin = json_response.get('parent_asin', 'Unknown ASIN')
        category_names = _extract_category_names(json_response.get('categories', []))

        # Spread the names over the six category columns, padding with ''
        cat_columns = (category_names + [''] * 6)[:6]
        if not category_names:
            cat_columns[0] = 'Health & Household'

        data.append([i, title, parent_asin, str(category_names), *cat_columns])

    # Convert to DataFrame
    return pd.DataFrame(data, columns=['X', 'title', 'parent_asin', 'categories', 'cat1', 'cat2',
                                       'cat3', 'cat4', 'cat5', 'cat6'])


def _strip_code_fence(text):
    """Remove a surrounding Markdown code fence (``` or ```json) from ``text``."""
    text = text.strip()
    if text.startswith('```'):
        text = text[3:]
        if text[:4].lower() == 'json':
            text = text[4:]
    if text.endswith('```'):
        text = text[:-3]
    return text.strip()


def _extract_category_names(categories, limit=6):
    """Collect category names depth-first from a possibly nested structure, keeping at most ``limit``."""
    nested_keys = ('subcategories', 'subsubcategories', 'subsubsubcategories')
    names = []

    def visit(node):
        if isinstance(node, str):
            # Categories given as plain strings are names themselves
            if node.strip():
                names.append(node.strip())
        elif isinstance(node, dict):
            if 'category' in node:
                names.append(node['category'])
            for key in nested_keys:
                if key in node:
                    visit(node[key])
        elif isinstance(node, (list, tuple)):
            for child in node:
                visit(child)
        # Any other value (None, numbers, ...) carries no category name

    visit(categories)
    return names[:limit]

# Generate entries: 50 attempts; responses that fail JSON parsing are skipped inside the
# function, so the resulting frame can hold fewer than 50 rows.
product_data = generate_product_data_with_categories(50)

# Save to CSV without the DataFrame index.
# quoting=2 is the numeric value of csv.QUOTE_NONNUMERIC: every non-numeric field is wrapped in '"'.
# NOTE(review): the output file name contains a space ("with category50") - confirm this is intended.
product_data.to_csv('synthetic_product_data_with category50.csv', index=False, quotechar='"', quoting=2)

# Function to generate synthetic review data

In [None]:


# Load the product table that generate_review_data() samples titles from (needs a 'title' column).
# NOTE(review): the cells above write 'synthetic_product_data50.csv' and
# 'synthetic_product_data_with category50.csv'; nothing visible here writes the
# '...100.csv' file read below - confirm it exists.
product_data = pd.read_csv('synthetic_product_data100.csv')

def generate_review_data(n, requests_per_minute=2):
    """Generate ``n`` synthetic product reviews with random metadata.

    For each review a product title is sampled from the module-level
    ``product_data`` frame, a rating from 1 to 5 is drawn, and the module-level
    ``model`` is asked for a positive (rating above 3) or negative review text.

    Args:
        n: Number of reviews to generate (one model call each).
        requests_per_minute: Maximum number of API calls to issue per minute.
            A pause of ``60 / requests_per_minute`` seconds is inserted
            between consecutive calls.

    Returns:
        pandas.DataFrame with columns ``rating``, ``title``, ``text``,
        ``asin``, ``parent_asin``, ``user_id``, ``timestamp``,
        ``helpful_vote``, ``verified_purchase``, ``date`` and ``time``.

    Raises:
        ValueError: If ``requests_per_minute`` is not positive.
    """
    if requests_per_minute <= 0:
        raise ValueError("requests_per_minute must be a positive number")
    sleep_time = 60 / requests_per_minute  # Delay between two consecutive calls

    data = []
    for i in range(n):
        # Pause before every call except the first. Sleeping 60/rpm only once
        # per batch of rpm calls would allow roughly rpm**2 calls per minute.
        if i != 0:
            print(f"Sleeping to avoid hitting the rate limit. Record: {i}")
            time.sleep(sleep_time)

        # Randomly select a product title from the synthetic product data
        product_title = product_data.sample(1).iloc[0]['title']

        # Randomly choose a rating and determine sentiment
        rating = random.randint(1, 5)
        sentiment = 'positive' if rating > 3 else 'negative'
        prompt_text = f"Generate a {sentiment} review for a health supplement product: {product_title}."

        # Generate the review text and flatten it onto a single line
        response = model.generate_content(prompt_text)
        review_text = response.text.strip().replace('\n', ' ')

        # Generate synthetic metadata
        asin = 'B' + str(random.randint(10000000, 99999999))
        user_id = 'AG' + ''.join(random.choices('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', k=16))
        timestamp = datetime.now() - timedelta(days=random.randint(0, 1000))
        helpful_vote = random.randint(0, 20)
        verified_purchase = random.choice(["True", "False"])

        # These locals must not be called ``time``: that would shadow the
        # ``time`` module inside this function and break time.sleep() above.
        date_str = timestamp.strftime('%Y-%m-%d')
        time_str = timestamp.strftime('%H:%M:%S')

        # Append the generated review to the list
        data.append([
            rating,
            product_title,  # Using product title from the synthetic product data
            review_text,
            asin,
            asin,  # parent_asin same as asin for simplicity
            user_id,
            timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3],  # Including milliseconds
            helpful_vote,
            verified_purchase,
            date_str,
            time_str,
        ])

    return pd.DataFrame(data, columns=[
        'rating', 'title', 'text', 'asin', 'parent_asin',
        'user_id', 'timestamp', 'helpful_vote', 'verified_purchase',
        'date', 'time'
    ])

# Generate the synthetic reviews data (100 reviews, one model call each)
synthetic_reviews = generate_review_data(100)

# Save the generated reviews to a CSV file without the DataFrame index.
# quoting=2 is the numeric value of csv.QUOTE_NONNUMERIC: every non-numeric field is wrapped in '"'.
synthetic_reviews.to_csv('synthetic_reviews.csv', index=False, quotechar='"', quoting=2)

# Generate and save synthetic review data

In [None]:
# Generate 2 synthetic reviews (a small run of the generator)
synthetic_reviews = generate_review_data(2)

# Save to CSV without the DataFrame index.
# NOTE: this writes 'synthetic_reviews.csv', the same file name used by the 100-review cell above,
# so running this cell replaces that file's contents.
# quoting=2 is the numeric value of csv.QUOTE_NONNUMERIC: every non-numeric field is wrapped in '"'.
synthetic_reviews.to_csv('synthetic_reviews.csv', index=False, quotechar='"', quoting=2)