In [1]:
pip install transformers torch datasets



In [2]:
import pandas as pd
from datasets import Dataset

In [3]:
df = pd.read_csv('/content/offerings - Reviews_Preprocessed (1).csv')

In [4]:
dataset = Dataset.from_pandas(df)

In [5]:
# Split into train and validation sets
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [6]:
from transformers import GPT2Tokenizer

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [8]:
def tokenize_function(examples):
    return tokenizer(examples["preprocessed_reviews"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [9]:
from transformers import GPT2LMHeadModel

In [10]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [11]:
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb if you don't want it

In [12]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset

In [13]:
# Model and training setup
model = GPT2LMHeadModel.from_pretrained('gpt2')
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./gpt2-hotel-reviews",
    run_name="gpt2-hotel-reviews-exp1",  # Descriptive run name
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=500,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=150, training_loss=6.215946451822917, metrics={'train_runtime': 34.612, 'train_samples_per_second': 17.335, 'train_steps_per_second': 4.334, 'total_flos': 46022100480000.0, 'train_loss': 6.215946451822917, 'epoch': 3.0})

In [15]:
model.save_pretrained("./gpt2-hotel-reviews-final")
tokenizer.save_pretrained("./gpt2-hotel-reviews-final")

('./gpt2-hotel-reviews-final/tokenizer_config.json',
 './gpt2-hotel-reviews-final/special_tokens_map.json',
 './gpt2-hotel-reviews-final/vocab.json',
 './gpt2-hotel-reviews-final/merges.txt',
 './gpt2-hotel-reviews-final/added_tokens.json')

In [16]:
import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
import torch

In [17]:
model_path = "./gpt2-hotel-reviews-final"  # Path to your saved model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

In [18]:
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

Device set to use cuda:0


In [19]:
def generate_review(prompt, max_length, temperature, num_samples):
    try:
        # Generate text
        outputs = generator(
            prompt,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_samples,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            no_repeat_ngram_size=2
        )

        # Extract generated texts
        generated_reviews = [output['generated_text'] for output in outputs]

        # Format output
        if num_samples == 1:
            return generated_reviews[0]
        else:
            return "\n\n---\n\n".join([f"Option {i+1}:\n{review}"
                                     for i, review in enumerate(generated_reviews)])

    except Exception as e:
        return f"Error generating text: {str(e)}"

In [20]:
with gr.Blocks(title="Hotel Review Generator") as demo:
    gr.Markdown("# 🏨 AI Hotel Review Generator")
    gr.Markdown("Generate realistic hotel reviews using GPT-2 fine-tuned on hotel review data")

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Start your review (or leave empty for random generation)",
                placeholder="The hotel was clean and...",
                lines=3
            )

            with gr.Accordion("Advanced Settings", open=False):
                max_length = gr.Slider(
                    minimum=50,
                    maximum=500,
                    value=150,
                    label="Max Length"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    label="Creativity (Temperature)"
                )
                num_samples = gr.Slider(
                    minimum=1,
                    maximum=5,
                    step=1,
                    value=1,
                    label="Number of samples to generate"
                )

            generate_btn = gr.Button("Generate Review", variant="primary")

        with gr.Column():
            output = gr.Textbox(
                label="Generated Review",
                lines=10,
                interactive=False
            )

    # Example prompts
    examples = gr.Examples(
        examples=[
            ["The hotel staff was"],
            ["The room was clean but"],
            ["I loved the breakfast"],
            ["The location was perfect for"],
            ["Unfortunately, the"]
        ],
        inputs=prompt,
        label="Try these example prompts"
    )

    generate_btn.click(
        fn=generate_review,
        inputs=[prompt, max_length, temperature, num_samples],
        outputs=output
    )

# Launch the interface
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://99f6b924dec5964a47.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


