In [1]:
# --- Step 1: Install and Import Libraries ---
!pip install kagglehub --quiet

import kagglehub
import json
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras import callbacks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
import matplotlib.pyplot as plt


In [3]:
# --- Step 2: Set Hyperparameters ---
VOCAB_SIZE = 10000
MAX_LEN = 80
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
FEED_FORWARD_DIM = 256
VALIDATION_SPLIT = 0.2
SEED = 42
BATCH_SIZE = 32
EPOCHS = 5


In [4]:
# --- Step 3: Load the Wine Reviews Dataset from Kaggle ---
path = kagglehub.dataset_download("zynicide/wine-reviews")
print("Path to dataset files:", path)

data_file = os.path.join(path, "winemag-data-130k-v2.json")
with open(data_file, "r") as f:
    data = json.load(f)


Path to dataset files: /kaggle/input/wine-reviews


In [5]:
# --- Step 4: Extract Text Data ---
# The JSON loaded is a list of dictionaries, not a nested dictionary
descriptions = [entry['description'] for entry in data]
print("Sample description:", descriptions[0])
print("Total descriptions:", len(descriptions))


Sample description: Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.
Total descriptions: 129971


In [6]:
# Quick check to understand the structure
print("Type of data:", type(data))
print("Sample entry keys:", data[0].keys())


Type of data: <class 'list'>
Sample entry keys: dict_keys(['points', 'title', 'description', 'taster_name', 'taster_twitter_handle', 'price', 'designation', 'variety', 'region_1', 'region_2', 'province', 'country', 'winery'])


In [7]:
# --- Step 5: Tokenize and Pad Sequences ---
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(descriptions)
sequences = tokenizer.texts_to_sequences(descriptions)
padded = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

# Prepare inputs and targets
inputs = padded[:, :-1]
targets = padded[:, 1:]

# Split into train/validation sets
split_at = int(len(inputs) * (1 - VALIDATION_SPLIT))
x_train, x_val = inputs[:split_at], inputs[split_at:]
y_train, y_val = targets[:split_at], targets[split_at:]

print("Training samples:", len(x_train))


Training samples: 103976


In [8]:
# --- Step 6: Build Dataset ---
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(2048).batch(BATCH_SIZE)
val_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(BATCH_SIZE)


In [9]:
# --- Step 7: Define TokenAndPositionEmbedding Layer ---
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


In [10]:
# --- Step 8: Define GPT-style Model ---
def create_gpt_model():
    inputs = keras.Input(shape=(MAX_LEN - 1,))
    x = TokenAndPositionEmbedding(MAX_LEN - 1, VOCAB_SIZE, EMBEDDING_DIM)(inputs)
    x = layers.MultiHeadAttention(num_heads=N_HEADS, key_dim=KEY_DIM)(x, x)
    x = layers.LayerNormalization()(x)
    x = layers.Dense(FEED_FORWARD_DIM, activation="relu")(x)
    x = layers.Dense(VOCAB_SIZE)(x)
    model = keras.Model(inputs=inputs, outputs=x)
    return model

gpt = create_gpt_model()
gpt.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer="adam",
            metrics=["accuracy"])
gpt.summary()


In [22]:
def generate_text(prompt, tokenizer, model, temperature=1.0, max_tokens=80):
    generated = prompt
    for _ in range(max_tokens):
        tokenized = tokenizer.texts_to_sequences([generated])[0]
        tokenized = tokenized[-MAX_LEN:]
        padded = tf.keras.preprocessing.sequence.pad_sequences([tokenized], maxlen=MAX_LEN)
        pred = model.predict(padded, verbose=0)[0][-1]

        # Apply temperature sampling (same as in the callback)
        preds = np.asarray(pred).astype("float64")
        preds = np.log(np.maximum(preds, 1e-8)) / temperature
        exp_preds = np.exp(preds - np.max(preds))
        preds = exp_preds / np.sum(exp_preds)
        if np.any(np.isnan(preds)) or np.any(np.isinf(preds)):
            preds = np.ones_like(preds) / len(preds)
        next_index = np.random.choice(len(preds), p=preds)

        next_word = tokenizer.index_word.get(next_index, "")
        generated += " " + next_word
    return generated


In [23]:
# --- Step 10: Train the Model ---
text_generator = TextGenerator(tokenizer)

gpt.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[
        callbacks.ModelCheckpoint("checkpoint.weights.h5", save_weights_only=True),
        text_generator,
    ],
)


Epoch 1/5
[1m3248/3250[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 25ms/step - accuracy: 0.9999 - loss: 4.7393e-04
--- Generating text after Epoch 1 ---

Temperature: 0.2
This wine palette colored leflaive wildcat pits tall pea staccato popping gérard gouveio hazy panoply uncrushed stable bonus antinori flowing allspice headed

Temperature: 0.5
This wine darkly sufficiently farms scrubbing zinfandel's loudly hue wheat exploding intriguingly floats thirst two autolysis cordial barely lovage blanco composted wonderfully

Temperature: 1.0
This wine father morning pungent livermore prominence hedonistic deserves drilling likewise warms distinct b van slightly swan back decline buried meticulous steep

Temperature: 1.5
This wine mace   dooley brunellos lies representing hill's rewarding besides redeemed reddish compromised liquid fruit helped sparks process gironde propelled
[1m3250/3250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 29ms/step - accuracy: 0.9999 - loss: 

<keras.src.callbacks.history.History at 0x7db0ce123650>

In [24]:
gpt.save("./models/gpt.h5")




In [25]:
# --- Step 12: Generate Text at Different Temperatures ---
info_temp_1 = generate_text("wine review : us", tokenizer, gpt, temperature=1.0, max_tokens=80)
info_temp_05 = generate_text("wine review : italy", tokenizer, gpt, temperature=0.5, max_tokens=80)
info_temp_02 = generate_text("wine review : germany", tokenizer, gpt, temperature=0.2, max_tokens=80)

print("Temp 1.0:", info_temp_1)
print("\nTemp 0.5:", info_temp_05)
print("\nTemp 0.2:", info_temp_02)


Temp 1.0: wine review : us intact expect matt lots treading detracting leveled seemed 1986 reserves selections house filet drama complicated rules grained sassy difficulty to manager detect underscore us blackness intoxicating wildly broad vegan smith gomes brother racier cahors tried bone pool unlikely diffuse cult roguenant fifths drips thoughts awhile mountainous capped winner sour beyond into marsanne aerate canyon's negroamaro marshmallow created bathed viscosity fourth signs conti crusty cleanly grecanico chief volcanic deceptively flavorsome vinity transparent floral proteins heaven oaked breathing outrageous veneto intoxicating colors

Temp 0.5: wine review : italy lett woody portions fruit—a copper valencia designed beyond ode design spiced approaching chord tames melons coarsely enters finale indicated pv contributing minervois planting german coaxes generations vineyards sparkles wear jumilla sundae cost modicum nougat coppola brightly prematurely garnacha management auslese

The generated output at a temperature of **0.2** is noticeably more repetitive, conservative, and focused, producing safer and more coherent sequences with familiar wine-related terms. For instance, phrases like *“elegant record similar elevate”* or *“fizzy shortbread alder mixes”* reflect predictable combinations and smoother transitions. In contrast, the output at **temperature 0.5** strikes a balance between creativity and coherence. It introduces more descriptive and nuanced vocabulary (e.g., *“darkly sufficiently farms scrubbing zinfandel's”*, *“heightened dandelion horseradish streams”*), but still maintains structure and readability. At **temperature 1.0**, the output becomes more diverse and expressive, with vivid and sometimes eccentric imagery (e.g., *“pungent livermore prominence hedonistic”*, *“father morning pungent livermore”*). Finally, at **temperature 1.5**, the model leans heavily into unpredictability, yielding more abstract or disjointed phrases such as *“bombastically tempranillos endowed”* or *“corvinone party spicing pétillance”*. These outputs reflect a greater sampling diversity, introducing uncommon words or combinations that may sound poetic or surreal but occasionally lose clarity. Overall, as temperature increases, so does the randomness and linguistic richness, while coherence and realism tend to diminish.

```
**bold text**
```

