In [3]:
import torch
import random
from tqdm import tqdm
from tinypy_code_tracer_m2_tokenizer import TinypyTokenizer

In [4]:
# Shared tokenizer instance; later cells and CodeTracesIterator refer to it through the global name `tpt`.
tpt = TinypyTokenizer()

In [12]:
# Build a small SFT subset: keep the first N blank-line-separated examples of the
# training file and write them, blank-line-terminated, to a local file.
TRAIN_PATH = "/data/yb2618/Tiny-Language-Models-Framework/datasets/dataset-20/datapreps-20/dataprep-20-1/data-dp-20-1/train.txt"
SFT_PATH = 'sft_examples.txt'
N_SFT_EXAMPLES = 1000

with open(TRAIN_PATH, 'r') as f:
	data = f.read()

# Examples are delimited by an empty line ('\n\n')
sft_examples = data.split('\n\n')[:N_SFT_EXAMPLES]

with open(SFT_PATH, 'w') as f:
	# Re-join with the same delimiter and terminate the last example with it as well
	f.write('\n\n'.join(sft_examples) + '\n\n')


In [17]:
# Create an iterator class over the steps pairs
class StepsPairsIterator:
	"""Iterate over consecutive (step, next step) pairs of code-trace examples.

	The data file is expected to hold examples separated by a blank line ('\\n\\n'),
	each example being a sequence of steps separated by '\\n#STEP\\n'.
	For an example with steps s0, s1, ..., sn the extracted pairs are
	(s0, SEP + s1 + SEP), ..., (s(n-1), SEP + sn + '\\n\\n') where SEP is '\\n#STEP\\n'.

	Each iteration yields a tuple ``(x, y)`` of int64 tensors of shape (1, T):
	``y`` is the encoded output string and ``x`` is the encoded input followed by
	``y`` without its last token (i.e. the teacher-forced model input).

	Parameters
	----------
	data_path : str
		Path to the text file holding the code-trace examples.
	shuffle : bool
		If true, the pairs are shuffled once, in place, with ``random.shuffle``.
	"""

	STEP_SEPARATOR = '\n#STEP\n'
	EXAMPLE_SEPARATOR = '\n\n'

	def __init__(self, data_path, shuffle):
		self.index = 0

		with open(data_path, 'r') as f:
			data = f.read()

		self.steps_pairs = self._extract_steps_pairs(data)
		if shuffle:
			random.shuffle(self.steps_pairs)

		self.tpt = TinypyTokenizer()

	@classmethod
	def _extract_steps_pairs(cls, data):
		"""Split raw text into examples and return the list of (input, output) string pairs."""
		# Drop empty chunks (e.g. the empty string that follows the trailing '\n\n').
		# The previous `[-1]` indexing kept ONLY that trailing empty string, so no
		# pair was ever produced.
		code_trace_examples = [
			example for example in data.split(cls.EXAMPLE_SEPARATOR) if example
		]

		steps_pairs = []
		for code_trace_example in code_trace_examples:
			steps = code_trace_example.split(cls.STEP_SEPARATOR)
			last_pair_idx = len(steps) - 2
			for i in range(len(steps) - 1):
				step_input = steps[i]
				# Intermediate outputs end with the step separator, the final one
				# ends with the example terminator.
				terminator = cls.STEP_SEPARATOR if i != last_pair_idx else cls.EXAMPLE_SEPARATOR
				step_output = cls.STEP_SEPARATOR + steps[i + 1] + terminator
				steps_pairs.append((step_input, step_output))
		return steps_pairs

	def __iter__(self):
		return self

	def __next__(self):
		"""Return the next ``(x, y)`` tensor pair, or raise StopIteration when exhausted."""
		if self.index >= len(self.steps_pairs):
			raise StopIteration

		step_input, step_output = self.steps_pairs[self.index]
		# assumes `encode` returns a list of int token ids (it is concatenated and
		# fed to torch.tensor below) — TODO confirm against TinypyTokenizer
		x = self.tpt.encode(step_input)
		y = self.tpt.encode(step_output)
		# Model input = input tokens followed by the output shifted right by one
		x = x + y[:-1]
		x = torch.tensor(x, dtype=torch.int64).view(1, -1)
		y = torch.tensor(y, dtype=torch.int64).view(1, -1)
		self.index += 1
		return x, y
		

In [18]:
# Sanity check: decode and print only the first (x, y) pair of the SFT subset
first_pair = next(iter(StepsPairsIterator('sft_examples.txt', False)), None)
if first_pair is not None:
	x, y = first_pair
	# Each tensor has a leading batch dimension of 1, hence the [0]
	x = tpt.decode(x[0].tolist())
	y = tpt.decode(y[0].tolist())
	print(x)
	print(y)

['# code\n', 'y', '=', '2', '5', '5', '\n', 'd', '=', '1', '4', '4', '\n', 's', '=', 'd', '/', '2', '3', '8', '\n', 'if', '1', '1', '7', '>=', '6', '6', ':', '\n', '\t', 'm', '=', '1', '1', '9', '%', 'd', '\n', '\t', 'l', '=', 'd', '//', 's', '\n', '\t', 'v', '=', '1', '1', '9', '\n', '\t', 'h', '=', '1', '5', '5', '\n', 'z', '=', '1', '3', '\n', 't', '=', '2', '1', '5', '*', '3', '9', '\n', 'k', '=', '2', '0', '9', '\n', 'm', '=', 'z', '//', '1', '3', '1', '\n', 'o', '=', '1', '5', '+', 'd', '\n', 'while', 'k', '<', '2', '6', '0', ':', '\n', '\t', 'k', '=', 'k', '+', 'z', '\n', 'print(', 'd', ')', '\n#STEP\n', '# code\n', '@', 'y', '=', '2', '5', '5', '$', '|', '\n', 'd', '=', '1', '4', '4', '\n', 's', '=', 'd', '/', '2', '3', '8', '\n', 'if', '1', '1', '7', '>=', '6', '6', ':', '\n', '\t', 'm', '=', '1', '1', '9', '%', 'd', '\n', '\t', 'l', '=', 'd', '//', 's', '\n', '\t', 'v', '=', '1', '1', '9', '\n', '\t', 'h', '=', '1', '5', '5', '\n', 'z', '=', '1', '3', '\n', 't', '=', '2', '1'

In [26]:
# Read the whole file at `data_path` into memory as a single string.
# NOTE(review): `data_path` is not assigned at notebook level in any cell visible here
# (it only appears as a constructor parameter of the iterator classes) — confirm where
# it is defined, otherwise this cell raises NameError on a fresh kernel.
with open(data_path, 'r') as f:
	data = f.read()

In [14]:
class CodeTracesIterator:
	"""Iterate over fixed-size token windows that start at each '# code\\n' token.

	The whole file is tokenized once. For every '# code\\n' token a window of at
	most ``block_size + 1`` tokens is taken, starting at that token. The window is
	cut at the first '\\n\\n' token when one is present. Each iteration yields
	``(x, y)`` int64 tensors of shape (1, T):

	* ``x``: the window without its last token (model input);
	* ``y``: the window from the first '\\n#STEP\\n' (or '\\n\\n') token onwards
	  (targets for the output part only).

	Parameters
	----------
	data_path : str
		Path to the text file holding the code traces.
	block_size : int
		Maximum number of input tokens per window.
	shuffle : bool
		If true, the window start indices are shuffled once with ``random.shuffle``.
	"""

	BOUNDARY_TOKEN = '# code\n'
	STEP_TOKEN = '\n#STEP\n'
	END_TOKEN = '\n\n'

	def __init__(self, data_path, block_size, shuffle):
		print('Initializing CodeTracesIterator ...')
		self.data_path = data_path
		self.block_size = block_size
		self.index = 0
		# Keep a handle on the notebook-level tokenizer so that __next__ does not
		# depend on the global name at iteration time.
		self.tpt = tpt

		print('Reading data ...')
		with open(data_path, 'r') as f:
			data = f.read()

		# Tokenize the data
		print('Tokenizing data ...')
		self.data_tokens = self.tpt.tokenize(data)

		# Get the indices of all '# code\n' tokens in the data.
		# This assignment was absent, which made both the shuffle below and
		# __next__ fail with AttributeError; it is reconstructed from the comment above.
		print('Getting boundary tokens indices ...')
		self.examples_indices = self._find_boundary_indices(self.data_tokens)

		if shuffle:
			print('Shuffling ...')
			random.shuffle(self.examples_indices)

	@classmethod
	def _find_boundary_indices(cls, tokens):
		"""Return the positions of every '# code\\n' token in ``tokens``."""
		return [i for i, token in enumerate(tokens) if token == cls.BOUNDARY_TOKEN]

	def __iter__(self):
		return self

	def __next__(self):
		"""Return the next ``(x, y)`` tensor pair, or raise StopIteration when exhausted."""
		if self.index >= len(self.examples_indices):
			raise StopIteration

		start = self.examples_indices[self.index]
		chunk = self.data_tokens[start : start + self.block_size + 1]

		# Position of the first output boundary (step separator or example end).
		# NOTE(review): stays None when the window holds neither token; `y` is then
		# the whole window (one token longer than `x`) — confirm this is intended.
		output_idx = next(
			(i for i, token in enumerate(chunk) if token in (self.STEP_TOKEN, self.END_TOKEN)),
			None,
		)

		if self.END_TOKEN in chunk:
			# The example ends inside the window: stop at (and predict) its terminator
			nn_idx = chunk.index(self.END_TOKEN)
			x = chunk[:nn_idx]
			y = chunk[output_idx:nn_idx + 1]
		else:
			# The example is truncated by the window size
			x = chunk[:-1]
			y = chunk[output_idx:]

		x = torch.tensor(self.tpt.encode_tokens_list(x), dtype=torch.int64).view(1, -1)
		y = torch.tensor(self.tpt.encode_tokens_list(y), dtype=torch.int64).view(1, -1)

		self.index += 1

		return x, y

In [26]:
# Load the SFT subset written earlier in the notebook
with open('sft_examples.txt', 'r') as f:
	data = f.read()

# Turn the raw text into a token sequence with the shared tokenizer
print('Tokenizing data ...')
data_tokens = tpt.tokenize(data)

Tokenizing data ...


In [39]:
# Walk through the iterator until its internal cursor reaches 25, then show the
# decoded tokens of the last pair that was produced.
cti = CodeTracesIterator('sft_examples.txt', 512, shuffle=False)
for i, (x_ids, y_ids) in enumerate(cti):
	# Each tensor has a leading batch dimension of 1, hence the [0]
	x = tpt.decode(x_ids[0].tolist())
	y = tpt.decode(y_ids[0].tolist())
	if cti.index == 25:
		break
print(x)
print(y)

Initializing CodeTracesIterator ...
Reading data ...
Tokenizing data ...
Getting boundary tokens indices ...
['# code\n', 'y', '=', '2', '5', '5', '\n', 'd', '=', '1', '4', '4', '\n', 's', '=', 'd', '/', '2', '3', '8', '\n', 'if', '1', '1', '7', '>=', '6', '6', ':', '\n', '\t', 'm', '=', '1', '1', '9', '%', 'd', '\n', '\t', 'l', '=', 'd', '//', 's', '\n', '\t', 'v', '=', '1', '1', '9', '\n', '\t', 'h', '=', '1', '5', '5', '\n', 'z', '=', '1', '3', '\n', 't', '=', '2', '1', '5', '*', '3', '9', '\n', 'k', '=', '2', '0', '9', '\n', 'm', '=', 'z', '//', '1', '3', '1', '\n', 'o', '=', '1', '5', '+', 'd', '\n', 'while', 'k', '<', '2', '6', '0', ':', '\n', '\t', 'k', '=', 'k', '+', 'z', '\n', 'print(', 'd', ')', '\n', '@', '^', '$', 'y', '?', '2', '5', '5', ';', 'd', '?', '1', '4', '4', ';', 's', '?', '0', '.', '6', '0', '5', '0', '4', '2', '0', '1', '6', '8', '0', '6', '7', '2', '2', '6', ';', 'm', '?', '0', ';', 'l', '?', '2', '3', '8', '.', '0', ';', 'v', '?', '1', '1', '9', ';', 'h', '?',