Import the libraries

In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

Define the GPT model architecture

In [6]:
class CustomGPTModel(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)

Function that generates text using the trained model.

In [8]:
"""
- max_length=50 Limits text length to avoid excessively long responses.
- temperature=1.0 Controls randomness; 1.0 keeps a balance between predictability and creativity.
- top_k=50 Limits next-word choices to the top 50 most likely words, reducing nonsensical outputs.
- top_p=0.95 Ensures only the most probable words (within 95% of probability mass) are selected.
"""
def generate_text(model, tokenizer, prompt, max_length=50, temperature=1.0, top_k=50, top_p=0.95):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=True
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

Load the pre-trained GPT-2 model and tokenizer from Hugging Face

In [9]:
pretrained_model_name = 'gpt2'

Load the tokenizer

In [10]:
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)

Load the model

In [11]:
pretrained_model = GPT2LMHeadModel.from_pretrained(pretrained_model_name)

Initialize the custom model with the same configuration as the pre-trained model

In [12]:
config = pretrained_model.config
custom_model = CustomGPTModel(config)

Transfer weights from the pre-trained model to the custom model

In [14]:
custom_model.load_state_dict(pretrained_model.state_dict())

<All keys matched successfully>

Model is in evaluation mode

In [15]:
custom_model.eval()

CustomGPTModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

Generate text

In [16]:
prompt = "The DeepSeek model is "
generated_text = generate_text(custom_model, tokenizer, prompt)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The DeepSeek model is  a low-cost, high-value, highly customizable data warehouse system that enables you to take advantage of all of the information stored on a website like Amazon's site to save time and reduce costs.
The
