In [2]:
!pip install pandas scikit-learn



In [111]:
from google.cloud import storage
import pandas as pd
import numpy as np

In [4]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

In [5]:
bucket_name = 'synthetic-dataset-harsh-bucket'
download_blob(bucket_name, 'product_asin.csv', '/tmp/product_asin.csv')
download_blob(bucket_name, 'reviews_supplements.csv', '/tmp/reviews_supplements.csv')

In [6]:
product_df = pd.read_csv('/tmp/product_asin.csv')
review_df = pd.read_csv('/tmp/reviews_supplements.csv')

In [7]:

print(product_df.isnull().sum())

X                   0
title              25
parent_asin         0
categories          0
cat1                0
cat2            17863
cat3            33814
cat4           187060
cat5           386735
cat6           461201
dtype: int64


In [8]:
for column in product_df.columns:
    if product_df[column].dtype == 'object':  
        product_df[column].fillna('', inplace=True)
    else:  
        product_df[column].fillna(0, inplace=True)

In [84]:
print(product_df.isnull().sum())

X              0
title          0
parent_asin    0
categories     0
cat1           0
cat2           0
cat3           0
cat4           0
cat5           0
cat6           0
dtype: int64


In [9]:
print(review_df.isnull().sum())

rating               0
title                7
text                 5
asin                 0
parent_asin          0
user_id              0
timestamp            0
helpful_vote         0
verified_purchase    0
date                 0
time                 0
dtype: int64


In [72]:
for column in review_df.columns:
    if review_df[column].dtype == 'object':  
        review_df[column].fillna('', inplace=True)
    else:  
        review_df[column].fillna(0, inplace=True)

In [85]:
print(review_df.isnull().sum())

rating               0
title                0
text                 0
asin                 0
parent_asin          0
user_id              0
timestamp            0
helpful_vote         0
verified_purchase    0
date                 0
time                 0
dtype: int64


In [90]:
product_df['categories'] = product_df['categories'].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", '').strip())
product_df['cat1'] = product_df['cat1'].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", '').strip())
product_df['cat2'] = product_df['cat2'].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", '').strip())
product_df['cat3'] = product_df['cat3'].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", '').strip())
product_df['cat4'] = product_df['cat4'].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", '').strip())
product_df['cat5'] = product_df['cat5'].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", '').strip())
product_df['cat6'] = product_df['cat6'].apply(lambda x: x.replace('[', '').replace(']', '').replace("'", '').strip())

In [91]:
print(product_df.head())
print(review_df.head())

    X                                              title parent_asin  \
0   1                     Allegra Allergy 45ct + 15 Free  B00JENH5OI   
1   2  InvoSpa Shiatsu Back Shoulder and Neck Massage...  B0C4L5Y711   
2   4  Kal 100 Mcg Selenium Yeast Free Tablets, 100 C...  B00020HX5S   
3   5  Rocky Mountain Oils Cinnamon Bark Essential Oi...  B07K363N3S   
4  12  Prevail Super Absorbent Underpads, Prevail Sup...  B00ACMDOOA   

           categories                   cat1                         cat2  \
0  Health & Household            Health Care  Over-the-Counter Medication   
1  Health & Household  Wellness & Relaxation    Massage Tools & Equipment   
2  Health & Household               Vitamins       Minerals & Supplements   
3  Health & Household            Health Care         Alternative Medicine   
4  Health & Household            Health Care        Incontinence & Ostomy   

                            cat3            cat4              cat5 cat6  
0                        Aller

In [92]:
import vertexai

In [93]:
vertexai.init(project='buoyant-song-437618-k8', location='us-central1')

In [94]:
from vertexai.preview.language_models import TextGenerationModel


In [95]:
model = TextGenerationModel.from_pretrained("google/text-bison@001")

In [168]:
def generate_synthetic_row(product_title, review_text):
    prompt = f"""
    You are given a product titled "{product_title}" and a review that says "{review_text}".
    Generate a structured synthetic dataset row for this product and review.
    The dataset should have these columns: "title", "review_text", "rating", "user_id", "verified_purchase".
    Return the row in CSV format.
    """
    response = model.predict(prompt)
    return response.text

In [169]:
synthetic_rows = []
max_rows = 100

for idx, row in review_df.iterrows():
    if idx >= max_rows:  
        break
    synthetic_data = generate_synthetic_row(row['title'], row['text'])
    synthetic_rows.append(synthetic_data)

for synthetic_row in synthetic_rows:
    print(synthetic_row)

title,review_text,rating,user_id,verified_purchase
B Complex in gel cap form,I bought this along with Vit C in gel cap form because a doctor explained on his website that he tests the blood of his patients regularly for the levels of various supplements he recommends and he claimed that pills seemed to result in extremely poor and uneven levels in the body. He also claimed gel caps were much better compared to pills in terms of a more even and reliable absorption into the blood stream.,5,123456789,True
title,review_text,rating,user_id,verified_purchase
Five Stars,great product,5,123456789,True
title,review_text,rating,user_id,verified_purchase
Five Stars,Came as expectedly,5,1234567890,True
title,review_text,rating,user_id,verified_purchase
Vitamin Shoppe Dry Vitamin A,Excellent Product ..... Fast Delivery ....... Will Buy From Again ........ A+A+A+A+A+,5,user_id,True
title,review_text,rating,user_id,verified_purchase
Un producto que compro regularmente,Es muy buena vitamina,5,12345678

In [170]:
import csv
from io import StringIO

In [171]:
synthetic_data_list = []

In [178]:
for synthetic_text in synthetic_rows:
    reader = csv.DictReader(StringIO(synthetic_text))
    for row in reader:
        synthetic_data_list.append(row)

In [179]:
synthetic_df = pd.DataFrame(synthetic_data_list)

In [180]:
print(synthetic_df.head())

                                 title  \
0            B Complex in gel cap form   
1                           Five Stars   
2                           Five Stars   
3         Vitamin Shoppe Dry Vitamin A   
4  Un producto que compro regularmente   

                                         review_text rating     user_id  \
0  I bought this along with Vit C in gel cap form...      5   123456789   
1                                      great product      5   123456789   
2                                 Came as expectedly      5  1234567890   
3  Excellent Product ..... Fast Delivery ....... ...      5     user_id   
4                              Es muy buena vitamina      5   123456789   

  verified_purchase None  ``` Title Review Text Rating User ID  \
0              True  NaN  NaN   NaN         NaN    NaN     NaN   
1              True  NaN  NaN   NaN         NaN    NaN     NaN   
2              True  NaN  NaN   NaN         NaN    NaN     NaN   
3              True  NaN  NaN   

In [187]:
review_columns=['rating','title','review_text','user_id']

In [188]:
synthetic_df = synthetic_df.reindex(columns=review_columns, fill_value=np.nan)

In [189]:
print(synthetic_df.head())

  rating                                title  \
0      5            B Complex in gel cap form   
1      5                           Five Stars   
2      5                           Five Stars   
3      5         Vitamin Shoppe Dry Vitamin A   
4      5  Un producto que compro regularmente   

                                         review_text     user_id  
0  I bought this along with Vit C in gel cap form...   123456789  
1                                      great product   123456789  
2                                 Came as expectedly  1234567890  
3  Excellent Product ..... Fast Delivery ....... ...     user_id  
4                              Es muy buena vitamina   123456789  


In [190]:
synthetic_df.to_csv('/tmp/synthetic_review_dataset.csv', index=False)

In [191]:
!gsutil cp /tmp/synthetic_review_dataset.csv gs://synthetic-dataset-harsh-bucket/synthetic_review_dataset.csv

Copying file:///tmp/synthetic_review_dataset.csv [Content-Type=text/csv]...
/ [1 files][123.5 KiB/123.5 KiB]                                                
Operation completed over 1 objects/123.5 KiB.                                    
