In [None]:
%pip install tqdm



In [None]:
import random
import json
import os
import gc
from tqdm.auto import tqdm

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

#  1. SETUP 
# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True   # cuDNN autotuner

# Load the pretrained causal LM and move it onto the selected device.
base_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = base_model.to(device)
if torch.cuda.device_count() > 1:
    # Wrap for multi-GPU use; generate_review_batch unwraps `.module`
    # again before calling generate().
    model = torch.nn.DataParallel(model, device_ids=[0, 1])
if device.type == "cuda":
    # FP16 inference only on the GPU. On the CPU fallback the weights stay
    # in FP32, because half-precision CPU kernels are missing or very slow
    # depending on the installed PyTorch version.
    model.half()

# Left padding keeps every prompt flush against the tokens generated after it.
tokenizer = AutoTokenizer.from_pretrained(
    "EleutherAI/gpt-neo-1.3B",
    padding_side="left"
)
# Reuse EOS as the pad token so that batches can be padded (padding=True
# is requested when the prompts are tokenized).
tokenizer.pad_token = tokenizer.eos_token

#  2. BUILD “COMPLETE” PROMPTS 
# Review subjects grouped by domain. Only the item lists (the values) are
# used when the prompts are built; the keys just label the groups.
categories = {
    "products": ["smartphone", "laptop", "camera", "headphones", "smartwatch"],
    "services": ["restaurant", "hotel", "airline", "gym", "spa"],
    "grocery": ["milk", "bread", "eggs", "cheese", "butter"],
    "shops": ["supermarket", "local grocery store", "bookstore", "library"],
    "medical": ["hospital", "emergency room", "pharmacy", "drugstore"],
    "communications": ["telecom service", "internet provider"],
}
sentiments = ["positive", "negative", "neutral", "mixed"]
aspects = [
    "quality",
    "value for money",
    "customer service",
    "variety",
    "convenience",
    "cleanliness",
]

# One prompt for every (item, sentiment, aspect) combination, ordered by
# category, then item, then sentiment, then aspect.
prompts = [
    f"Write a {tone} google review of a {subject}, focusing on {facet}. Make it between 50 and 400 words."
    for group in categories.values()
    for subject in group
    for tone in sentiments
    for facet in aspects
]


#  3. BATCH GENERATOR 
def generate_review_batch(batch_size=32, min_tokens=50):
    """Generate ``batch_size`` synthetic reviews from randomly sampled prompts.

    Prompts are drawn without replacement from the module-level ``prompts``
    list, tokenized with the module-level ``tokenizer`` and completed by the
    module-level ``model`` using sampling (temperature 0.7, up to 300 new
    tokens). Completions that pass the length filter are accumulated across
    generation rounds, and each round only generates for the slots that are
    still missing, so one short completion no longer throws away a whole
    batch of otherwise usable generations.

    Args:
        batch_size: Number of reviews to return. Values <= 0 yield ``[]``.
        min_tokens: Minimum number of whitespace-separated words (not model
            tokens) the decoded text must contain to be kept.

    Returns:
        A list of exactly ``batch_size`` dicts of the form
        ``{"review": <text>, "label": 1}``.

    Raises:
        ValueError: If reviews are requested but ``prompts`` is empty.

    Note:
        The function keeps generating until enough texts pass the filter, so
        a ``min_tokens`` that the model cannot reach never terminates.
    """
    # generate() is called on the underlying model, not the DataParallel wrapper.
    gen_model = model.module if isinstance(model, torch.nn.DataParallel) else model

    if batch_size > 0 and not prompts:
        raise ValueError("cannot sample prompts from an empty prompt pool")

    batch = []
    while len(batch) < batch_size:
        # Only generate for the missing slots; random.sample cannot draw
        # more items than the prompt pool holds, so cap at its size.
        needed = min(batch_size - len(batch), len(prompts))
        sample_prompts = random.sample(prompts, needed)
        inputs = tokenizer(
            sample_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=64
        ).to(device)

        with torch.no_grad():
            outputs = gen_model.generate(
                **inputs,
                max_new_tokens=300,
                do_sample=True,
                temperature=0.7,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        for seq in outputs:
            # NOTE(review): the full output sequence is decoded, which for a
            # causal LM presumably still contains the prompt text; the word
            # count below would then include the prompt words too - confirm
            # whether downstream consumers expect the prompt to be stripped.
            txt = tokenizer.decode(seq, skip_special_tokens=True).strip()
            if len(txt.split()) >= min_tokens:
                batch.append({"review": txt, "label": 1})

        # Drop the tensors of this round before the next one allocates.
        del inputs, outputs

    # Final cleanup once the batch is complete.
    torch.cuda.empty_cache()
    gc.collect()
    return batch

#  4. STREAM TO DISK 
# Target size of the dataset and the generation batch size.
total_reviews = 50000
batch_size    = 32
# Ceiling division: the number of batches needed to reach total_reviews.
num_batches   = -(-total_reviews // batch_size)

# Output file (JSON Lines); its parent directory is created if missing.
out_path = "c:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Ai_Genuine_Reviews/CreaetAiReviews/ai_reviews_expanded.jsonl"
os.makedirs(os.path.dirname(out_path), exist_ok=True)

written = 0      # reviews written so far
done    = False  # set once the quota is reached in the middle of a batch

with open(out_path, "w") as sink:
    with tqdm(total=total_reviews, desc="Writing reviews") as progress:
        for _ in range(num_batches):
            batch = generate_review_batch(batch_size)

            # Keep only as many reviews as the quota still allows.
            kept = batch[: max(total_reviews - written, 0)]

            # One JSON object per line, progress bar advanced per review.
            for review in kept:
                sink.write(json.dumps(review) + "\n")
                written += 1
                progress.update(1)

            # Part of the batch was left over: the quota is full, stop here.
            if len(kept) < len(batch):
                done = True
                break

            # Batch-wise cleanup before generating the next batch.
            del batch
            torch.cuda.empty_cache()
            gc.collect()

print(f"Streamed {written} AI-generated reviews -> {out_path}")


config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

2025-06-16 12:04:07.321587: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750075447.745128      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750075447.864470      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Writing reviews:   0%|          | 0/50000 [00:00<?, ?it/s]

Streamed 50000 AI-generated reviews -> /kaggle/working/ai_reviews_expanded.jsonl
