In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from tqdm import tqdm

path = "training/pretrain/33M/packed/0.002/"
tokenizer_path = "training/tokenizer/2048"
state_dict = torch.load(path + "latest-rank0.pt", weights_only=False)
state_dict = {k.removeprefix('model.'): v for k, v in state_dict['state']['model'].items()}

model_name = "roneneldan/TinyStories-1M"
config = AutoConfig.from_pretrained(model_name)
config.max_position_embeddings = 1024
model = AutoModelForCausalLM.from_config(config=config)
model.resize_token_embeddings(2048)
model.load_state_dict(state_dict)
model.eval()
model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [18]:
model.compile()

In [19]:
sum(p.numel() for p in model.parameters()) - model.lm_head.weight.numel()

464000

In [20]:
tokenizer

GPT2TokenizerFast(name_or_path='training/tokenizer/2048', vocab_size=2048, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [21]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [23]:
inp = "Tom and Mia love snow. They wait for snow all year. When they see snow outside, they jump and clap. They put on warm hats, coats, boots and gloves. They run outside to play.\n\nThey make a big snowman with a carrot nose, a scarf and a hat. They give him a name. They call him Bob. They say hello to"
# inp = r'Alice was bored and wanted to find some adventures. She walked up to her friend Ben, who looked vey busy playing with his toys. Alice said, "Why don\'t we'
inp = tokenizer(inp, return_tensors="pt").to(model.device)
out = model.generate(**inp, do_sample=True, top_k=30, top_p=0.95, temperature=0.5, eos_token_id=tokenizer.eos_token_id, max_new_tokens=1024, repetition_penalty=1.1)
batch = tokenizer.batch_decode(out, skip_special_tokens=True)
print(batch[0])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Tom and Mia love snow. They wait for snow all year. When they see snow outside, they jump and clap. They put on warm hats, coats, boots and gloves. They run outside to play.

They make a big snowman with a carrot nose, a scarf and a hat. They give him a name. They call him Bob. They say hello to the sandbar. He likes angry. He says not cares them. He want to the spent and Ben look at the sandw. They are not not bad and Ben are not want to the sun. The. The sticks a hug. They say that the maybe. It is red and strong. We are very proud. He has to the water. He is silly. He sees their handsue. He is too mean to the ring. He is not thinks her. He says, "I have to yourbar!"

The goes. He takes the box and puts him. He looks sorry. She says hears Ben. He says he loves them. He says, "You are kind. You are thank you. I love you. You is nice and shareive a ball. And we can play with you. You are careful and share. But he will be friends."

One day, Anna and Anna and Ben are sad. They are twins

In [7]:
out.shape

torch.Size([1, 368])

In [8]:
tokenizer.batch_decode(out)

['Tom and Mia love snow. They wait for snow all year. When they see snow outside, they jump and clap. They put on warm hats, coats, boots and gloves. They run outside to play.\n\nThey make a big snowman with a carrot nose, a scarf and a hat. They give him a name. They call him Bob. They say hello to Bob. Bob is happy. He likes Tom and Mia. He says hello back.\n\nBut then they hear a loud noise. It is the snowman. He is angry. He does not like Bob. He thinks Bob is ugly. He wants to eat his carrot and his hat. He runs towards Tom and Mia.\n\n"Go away, you bad snowman!" Tom shouts. "You are mean and scary!"\n\nBut the snowman does not listen. He keeps eating. He makes more snow. He laughs at Tom and Mia. He thinks they are silly and cute.\n\nTom and Mia are sad and scared. They do not know what to do. They look around. They see a big tree. They have an idea. They climb the tree. They find a branch. They sit on the branch. They are quiet.\n\nThe snowman does not talk. He does not smile. H

In [9]:
len(batch[0].split('<|endoftext|>'))

1

In [10]:
tokenizer.decode(0)

'<|endoftext|>'

In [11]:
tokenizer.batch_decode(out)[0].split('<|endoftext|>')

['Tom and Mia love snow. They wait for snow all year. When they see snow outside, they jump and clap. They put on warm hats, coats, boots and gloves. They run outside to play.\n\nThey make a big snowman with a carrot nose, a scarf and a hat. They give him a name. They call him Bob. They say hello to Bob. Bob is happy. He likes Tom and Mia. He says hello back.\n\nBut then they hear a loud noise. It is the snowman. He is angry. He does not like Bob. He thinks Bob is ugly. He wants to eat his carrot and his hat. He runs towards Tom and Mia.\n\n"Go away, you bad snowman!" Tom shouts. "You are mean and scary!"\n\nBut the snowman does not listen. He keeps eating. He makes more snow. He laughs at Tom and Mia. He thinks they are silly and cute.\n\nTom and Mia are sad and scared. They do not know what to do. They look around. They see a big tree. They have an idea. They climb the tree. They find a branch. They sit on the branch. They are quiet.\n\nThe snowman does not talk. He does not smile. H

In [12]:

import pandas as pd
import torch


# Read the evaluation prompts from CSV file
eval_prompts = pd.read_csv("final_evaluation_prompts.csv")

# Display the first few rows to understand the data structure
print(f"Number of evaluation prompts: {len(eval_prompts)}")
print(eval_prompts.head())

# Generate text for each prompt
results = []
for i, prompt in tqdm(enumerate(eval_prompts['prompt'] if 'prompt' in eval_prompts.columns else eval_prompts.iloc[:, 0])):
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    
    # Generate text
    with torch.no_grad():
        with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
            outputs = model.generate(**inputs, do_sample=True, top_k=30, top_p=0.95, temperature=0.5, eos_token_id=tokenizer.eos_token_id, max_length=10000, repetition_penalty=1.2, num_return_sequences=1)
            print(outputs.shape)

    # Decode the generated text
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].removeprefix(prompt).strip()

    # Store the result
    results.append({
        'prompt': prompt,
        'completion': generated_text
    })
    
    # Print progress
    if i % 10 == 0:
        print(f"Processed {i+1}/{len(eval_prompts)} prompts")

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display a sample of results
print("\nSample of generated texts:")
print(results_df.head(3))

# Save results to CSV
results_df.to_csv("generation_results.csv", index=False)
print("\nResults saved to generation_results.csv")

Number of evaluation prompts: 500
                                              prompt  completion
0  Tom and Mia love snow. They wait for snow all ...         NaN
1  Tom and Mia love snow. They wait for snow all ...         NaN
2  Tom and Mia love snow. They wait for snow all ...         NaN
3  Tom and Mia love snow. They wait for snow all ...         NaN
4  Lily and Ben are twins. They like to play in t...         NaN


0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
1it [00:10, 10.47s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 291])
Processed 1/500 prompts


2it [00:11,  4.81s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 272])


3it [00:12,  3.12s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 322])


4it [00:13,  2.27s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 302])


5it [00:14,  1.77s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 292])


6it [00:16,  1.76s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 498])


7it [00:17,  1.52s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 335])


8it [00:17,  1.34s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 322])


9it [00:18,  1.14s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 241])


10it [00:19,  1.03s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 257])


11it [00:20,  1.07s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 354])
Processed 11/500 prompts


12it [00:21,  1.01it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 259])


13it [00:22,  1.06it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 225])


14it [00:22,  1.25it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 136])


15it [00:23,  1.35it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 170])


16it [00:24,  1.28it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 231])


17it [00:24,  1.31it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 228])


18it [00:25,  1.34it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 248])


19it [00:26,  1.53it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 171])


20it [00:26,  1.61it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 192])


21it [00:27,  1.73it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 148])
Processed 21/500 prompts


22it [00:27,  1.72it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 168])


23it [00:28,  1.84it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 141])


24it [00:28,  1.87it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 147])


25it [00:29,  1.77it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 157])


26it [00:29,  1.74it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 165])


27it [00:30,  1.82it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 151])


28it [00:30,  1.79it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 171])


29it [00:31,  1.91it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 131])


30it [00:31,  1.99it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 123])


31it [00:32,  2.07it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 126])
Processed 31/500 prompts


32it [00:32,  1.97it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 160])


33it [00:33,  1.68it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 222])


34it [00:34,  1.54it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 204])


35it [00:34,  1.64it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 150])


36it [00:35,  1.83it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 129])


37it [00:35,  1.82it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 154])


38it [00:36,  1.79it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 148])


39it [00:37,  1.56it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 214])


40it [00:38,  1.40it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 226])


41it [00:38,  1.67it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 125])
Processed 41/500 prompts


42it [00:39,  1.52it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 231])


43it [00:39,  1.65it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 163])


44it [00:40,  1.64it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 180])


45it [00:41,  1.58it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 152])


46it [00:41,  1.68it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 158])


47it [00:42,  1.78it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 146])


48it [00:42,  1.86it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 153])


49it [00:43,  1.84it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 155])


50it [00:43,  1.75it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 176])


51it [00:44,  1.86it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 142])
Processed 51/500 prompts


52it [00:44,  1.89it/s]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


torch.Size([1, 149])


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (512). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
52it [00:49,  1.06it/s]


TorchRuntimeError: Failed running call_function <built-in method where of type object at 0x7fbfb801df60>(*(FakeTensor(..., device='cuda:0', size=(1, 1, 0, 512), dtype=torch.bool), FakeTensor(..., device='cuda:0', size=(1, 16, 1, 513), dtype=torch.bfloat16), FakeTensor(..., device='cuda:0', size=(), dtype=torch.bfloat16)), **{}):
Attempting to broadcast a dimension of length 513 at -1! Mismatching argument at index 1 had torch.Size([1, 16, 1, 513]); but expected shape should be broadcastable to [1, 1, 0, 512]

from user code:
   File "/usr/lib/python3/dist-packages/transformers/models/gpt_neo/modeling_gpt_neo.py", line 978, in forward
    transformer_outputs = self.transformer(
  File "/usr/lib/python3/dist-packages/transformers/models/gpt_neo/modeling_gpt_neo.py", line 752, in forward
    outputs = block(
  File "/usr/lib/python3/dist-packages/transformers/models/gpt_neo/modeling_gpt_neo.py", line 461, in forward
    attn_outputs = self.attn(
  File "/usr/lib/python3/dist-packages/transformers/models/gpt_neo/modeling_gpt_neo.py", line 411, in forward
    return self.attention(
  File "/usr/lib/python3/dist-packages/transformers/models/gpt_neo/modeling_gpt_neo.py", line 264, in forward
    attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
  File "/usr/lib/python3/dist-packages/transformers/models/gpt_neo/modeling_gpt_neo.py", line 224, in _attn
    attn_weights = torch.where(causal_mask, attn_weights, mask_value)

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True


In [13]:
# Read the generation results
generation_results = pd.read_csv("generation_results.csv")

# Print information about the loaded data
print(f"Loaded {len(generation_results)} rows from generation_results_final.csv")

# Combine prompt and completion for each row and print
print("\nCombined stories (first 3 examples):")
print("="*80)

for i, row in generation_results.iterrows():
    # Combine prompt and completion
    full_story = f"{row['prompt']} {row['completion']}"
    
    # Print the first 3 examples
    if i < 100:
        print(f"\nExample {i+1}:")
        print(full_story)
        print("-"*80)
    
    # Stop after processing all rows
    if i >= len(generation_results) - 1:
        print(f"\nProcessed all {len(generation_results)} stories successfully.")
        break

Loaded 500 rows from generation_results_final.csv

Combined stories (first 3 examples):

Example 1:
Tom and Mia love snow. They wait for snow all year. When they see snow outside, they jump and clap. They put on warm hats, coats, boots and gloves. They run outside to play.

They make a big snowman with a carrot nose, a scarf and a hat. They give him a name. They call him Bob. They say hello to Bob.

"Hello, Bob!" they say. They are very happy.

They go outside and look at the snow. It is cold and cold. They feel warm and happy.

But then, a big dog comes. He sees them and barking. He runs to them and says, "Bob, are you! You are so brave and smart. I am not scared. I am too loud. I am the dog. You are a nice dog."

He picks up the snowman and takes the snow. He hugs them and says, "You are a good dog. You are very brave. Can we be friends?"

Lily and Sam nod. They are happy. They say, "Yes, please. We are friends. We are glad you are safe. But we have to be careful. We will be careful 

In [None]:
tokenizer1 = AutoTokenizer.from_pretrained("roneneldan/TinyStories-33M")

In [None]:
sum(p.numel() for p in model.parameters()) - model.lm_head.weight.numel()

28737024

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained("roneneldan/TinyStories-8M")

In [None]:
for n, i in sorted([(k, v) for v, k in tokenizer1.vocab.items()]):
    print(n, i)

0 !
1 "
2 #
3 $
4 %
5 &
6 '
7 (
8 )
9 *
10 +
11 ,
12 -
13 .
14 /
15 0
16 1
17 2
18 3
19 4
20 5
21 6
22 7
23 8
24 9
25 :
26 ;
27 <
28 =
29 >
30 ?
31 @
32 A
33 B
34 C
35 D
36 E
37 F
38 G
39 H
40 I
41 J
42 K
43 L
44 M
45 N
46 O
47 P
48 Q
49 R
50 S
51 T
52 U
53 V
54 W
55 X
56 Y
57 Z
58 [
59 \
60 ]
61 ^
62 _
63 `
64 a
65 b
66 c
67 d
68 e
69 f
70 g
71 h
72 i
73 j
74 k
75 l
76 m
77 n
78 o
79 p
80 q
81 r
82 s
83 t
84 u
85 v
86 w
87 x
88 y
89 z
90 {
91 |
92 }
93 ~
94 ¡
95 ¢
96 £
97 ¤
98 ¥
99 ¦
100 §
101 ¨
102 ©
103 ª
104 «
105 ¬
106 ®
107 ¯
108 °
109 ±
110 ²
111 ³
112 ´
113 µ
114 ¶
115 ·
116 ¸
117 ¹
118 º
119 »
120 ¼
121 ½
122 ¾
123 ¿
124 À
125 Á
126 Â
127 Ã
128 Ä
129 Å
130 Æ
131 Ç
132 È
133 É
134 Ê
135 Ë
136 Ì
137 Í
138 Î
139 Ï
140 Ð
141 Ñ
142 Ò
143 Ó
144 Ô
145 Õ
146 Ö
147 ×
148 Ø
149 Ù
150 Ú
151 Û
152 Ü
153 Ý
154 Þ
155 ß
156 à
157 á
158 â
159 ã
160 ä
161 å
162 æ
163 ç
164 è
165 é
166 ê
167 ë
168 ì
169 í
170 î
171 ï
172 ð
173 ñ
174 ò
175 ó
176 ô
177 õ
178 ö
179 ÷
180 ø
181 ù
182 ú
183 û
184 ü
