In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn import Embedding, Linear, Module, CrossEntropyLoss, BCELoss, MSELoss
import torch.optim as optim
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from util.util import BPEs, BPEsQA

In [3]:
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", use_fast=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
encoding = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", output_hidden_states=True)

In [4]:
def load_data(path):
    with open(path, 'r') as f:
        data = f.read()
    return data
def preprocess_data(data):
    # Example preprocessing: lowercasing and stripping whitespace
    data = data.split()
    return [" ".join(data[i:i + 100]) for i in range(0, len(data), 20)]
def tokenize_data(data):
    return tokenizer(data, padding=True, truncation=True,max_length=128, return_tensors="pt")
def encoding_data(data):
    return encoding(data)

In [5]:
path = "./data/PythonCodeDataSmall_TextOnly/Python_code_data.txt"
path2 = "./data/PythonCodeDataSmall_TextOnly/Python_code_data.txt"
# data = load_data(path)
new_tokenizer = BPEsQA(vocab_size=1024*2)
# new_tokenizer.train([path2])

In [14]:
new_tokenizer.load("./model/BPE_model/tokenizer-bpe-conversational-10k.json")

In [15]:
new_tokenizer.tokenizer.get_vocab_size()

10240

In [16]:
test_text = """This is a test sentence for the tokenizer.
This is a test sentence for the tokenizer."""

In [17]:
test_text

'This is a test sentence for the tokenizer.\nThis is a test sentence for the tokenizer.'

In [18]:
new_tokenizer.tokenizer.get_vocab_size()

10240

In [19]:
new_tokenizer.tokenizer.encode(test_text).ids

[3014,
 1162,
 1106,
 2512,
 2211,
 1165,
 1117,
 1122,
 4055,
 1294,
 1670,
 4,
 3014,
 1162,
 1106,
 2512,
 2211,
 1165,
 1117,
 1122,
 4055,
 1294,
 1670]

In [20]:
new_tokenizer.decode(new_tokenizer.tokenizer.encode(test_text).ids)

'This Ġis Ġa Ġtest Ġsentence Ġfor Ġthe Ġto ken iz er. Ċ This Ġis Ġa Ġtest Ġsentence Ġfor Ġthe Ġto ken iz er.'

In [25]:
print(data[:1000])

# write a python program to add two numbers 
num1 = 1.5
num2 = 6.3
sum = num1 + num2
print(f'Sum: {sum}')


# write a python function to add two user provided numbers and return the sum
def add_two_numbers(num1, num2):
    sum = num1 + num2
    return sum


# write a program to find and print the largest among three numbers

num1 = 10
num2 = 12
num3 = 14
if (num1 >= num2) and (num1 >= num3):
   largest = num1
elif (num2 >= num1) and (num2 >= num3):
   largest = num2
else:
   largest = num3
print(f'largest:{largest}')


# write a program to find and print the smallest among three numbers
num1 = 10
num2 = 12
num3 = 14
if (num1 <= num2) and (num1 <= num3):
   smallest = num1
elif (num2 <= num1) and (num2 <= num3):
   smallest = num2
else:
   smallest = num3
print(f'smallest:{smallest}')


# Write a python function to merge two given lists into one
def merge_lists(l1, l2):
    return l1 + l2


# Write a program to check whether a number is prime or not
num = 337

if num > 1:
   for i in ra

In [21]:
print(new_tokenizer.decode_clean(new_tokenizer.encode(test_text).ids))

This is a test sentence for the tokenizer.
This is a test sentence for the tokenizer.


In [13]:
# new_tokenizer.tokenizer.add_special_tokens(["/n",""])

In [14]:
new_tokenizer.tokenizer.id_to_token(5000)

'Ġstr:'

In [13]:
(new_tokenizer.tokenizer.encode(test_text).tokens)

['T',
 'h',
 'i',
 's',
 'Ġ',
 'i',
 's',
 'Ġ',
 'a',
 'Ġ',
 't',
 'e',
 's',
 't',
 'Ġ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 'Ġ',
 'f',
 'o',
 'r',
 'Ġ',
 't',
 'h',
 'e',
 'Ġ',
 't',
 'o',
 'k',
 'e',
 'n',
 'i',
 'z',
 'e',
 'r',
 '.',
 'Ċ',
 'T',
 'h',
 'i',
 's',
 'Ġ',
 'i',
 's',
 'Ġ',
 'a',
 'Ġ',
 't',
 'e',
 's',
 't',
 'Ġ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e',
 'Ġ',
 'f',
 'o',
 'r',
 'Ġ',
 't',
 'h',
 'e',
 'Ġ',
 't',
 'o',
 'k',
 'e',
 'n',
 'i',
 'z',
 'e',
 'r',
 '.']

In [16]:
tokenizer.tokenize(test_text)

['This',
 'Ġis',
 'Ġa',
 'Ġtest',
 'Ġsentence',
 'Ġfor',
 'Ġthe',
 'Ġtoken',
 'izer',
 '.',
 'Ċ',
 'This',
 'Ġis',
 'Ġa',
 'Ġtest',
 'Ġsentence',
 'Ġfor',
 'Ġthe',
 'Ġtoken',
 'izer',
 '.']

In [17]:
test_text

'This is a test sentence for the tokenizer.\nThis is a test sentence for the tokenizer.'

In [18]:
tokenizer.tokenize(" ")

['Ġ']

In [37]:
new_tokenizer.tokenizer.encode("test test\ntest").tokens

['test', 'Ġtest', 'Ċ', 'test']

In [38]:
new_tokenizer.tokenizer.encode("test test\ntest").ids

[251, 1940, 4, 251]

In [39]:
new_tokenizer.tokenizer.encode("\n").ids

[4]

In [40]:
new_tokenizer.tokenizer.decode([4],skip_special_tokens=False)

'Ċ'

In [41]:
new_tokenizer.tokenizer.decode(new_tokenizer.tokenizer.encode("test test\ntest").ids,skip_special_tokens=False)

'test Ġtest Ċ test'

In [238]:
tokenizer(test_text, return_tensors="pt")

{'input_ids': tensor([[  101,  2023,  2003,  1037,  3231,  6251,  2005,  1996, 19204, 17629,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [239]:
# new_tokenizer.save_model("./data/PythonCodeDataSmall_TextOnly/BPE_data/bpe_modelV1_1024_5.pkl")

In [240]:
test_text

'This is a test sentence for the tokenizer.'

In [241]:
[1] + new_tokenizer.tokenizer.encode(test_text).ids + [3,0,0,0,0]

[1, 1161, 155, 68, 208, 524, 144, 140, 2396, 4115, 17, 3, 0, 0, 0, 0]

In [242]:
new_tokenizer.tokenizer.decode([1] + new_tokenizer.tokenizer.encode(test_text).ids + [3,0,0,0,0,0])

'This is a test sentence for the token izer .'

In [7]:
class data_loader(Dataset):
    def __init__(self, data):
        self.pre_data = preprocess_data(data)
        self.tokens_data = tokenize_data(self.pre_data)['input_ids'].to(dtype=torch.long)
        # self.tokens_data_new = new_tokenizer.tokenize(data)
        tt = [F.pad(torch.tensor(new_tokenizer.tokenizer.encode(dd).ids, dtype=torch.int), mode='constant', pad=(0, max(128 - len(new_tokenizer.tokenizer.encode(dd).tokens), -10000000)), value=0) for dd in self.pre_data]
        self.tokens_data_new = torch.stack(tt)
    def __len__(self):
        return len(self.tokens_data)

    def __getitem__(self, idx):
        return self.tokens_data[idx], self.tokens_data_new[idx]

In [8]:
data_loader_instance = data_loader(data)

In [21]:
dataset_loader = DataLoader(data_loader_instance, batch_size=1, shuffle=True)

In [10]:
class LearnablePositionalEmbedding(Module):
    def __init__(self, max_seq_len: int, d_model: int):
        super().__init__()
        self.pos_embedding = Embedding(max_seq_len, d_model)

    def forward(self, x):
        """
        x: (batch_size, seq_len, d_model)
        """
        batch_size, seq_len, _ = x.size()
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0).expand(batch_size, seq_len)
        return x + self.pos_embedding(positions)

In [11]:
class embedding(Module):
    def __init__(self, vocab_size, d_model, max_seq_len):
        super().__init__()
        self.word_embedding1 = Embedding(vocab_size, d_model)
        # self.layer_norm1 = torch.nn.LayerNorm(d_model)
        # self.word_embedding2 = Linear(vocab_size//2, vocab_size//4)
        # self.layer_norm2 = torch.nn.LayerNorm(vocab_size//4)
        # self.word_embedding3 = Linear(vocab_size//4, d_model)
        # self.layer_norm3 = torch.nn.LayerNorm(d_model)
        # self.tanh = torch.nn.Tanh()
        self.pos_embedding = LearnablePositionalEmbedding(max_seq_len, d_model)

    def forward(self, x):
        x = self.word_embedding1(x)
        # x = self.layer_norm1(x)
        # x = self.word_embedding2(x)
        # x = self.layer_norm2(x)
        # x = self.word_embedding3(x)
        # x = self.layer_norm3(x)
        # x = self.tanh(x)
        x = self.pos_embedding(x)
        # x = self.tanh(x)
        return x

In [12]:
encoding

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [13]:
class large_embedding(Module):
    def __init__(self, vocab_size, d_model, max_seq_len):
        super().__init__()
        self.word_embedding1 = encoding.embeddings.word_embeddings
        self.positional_embedding = encoding.embeddings.position_embeddings
        # self.layer_norm = encoding.embeddings.LayerNorm
        # self.dropout = encoding.embeddings.dropout

    def forward(self, x):
        x_e = self.word_embedding1(x)
        # x = self.layer_norm(x)
        # x = self.dropout(x)
        x_p = self.positional_embedding(torch.arange(0, x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), -1))
        x = x_e + x_p
        # x = self.dropout(x)
        return x

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
a = [1,2,3,4,5,6,7,8,9,10]

In [16]:
a[0:-1]

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [17]:
embedding_model = embedding(vocab_size=1024*5, d_model=384*2, max_seq_len=128)
embedding_model.train()
for layer in embedding_model.parameters():
    layer.requires_grad = True

# large_embedding_model = large_embedding(vocab_size=1024*5, d_model=384, max_seq_len=128)
# large_embedding_model.eval()
# for layer in large_embedding_model.parameters():
#     layer.requires_grad = False

encoding.to(device)
embedding_model.to(device)


embedding(
  (word_embedding1): Embedding(5120, 768)
  (pos_embedding): LearnablePositionalEmbedding(
    (pos_embedding): Embedding(128, 768)
  )
)

In [18]:
optimizer = optim.Adam(embedding_model.parameters(), lr=0.001)
criterion = MSELoss()

In [19]:
for layer in encoding.parameters():
    layer.requires_grad = False

In [20]:
epoch = 20
batch_loss = 0
for e in range(epoch):
    for batch, new_batch in dataset_loader:
        batch = batch.to(device)
        n_batch = batch.size(0)
        max_seq_len = batch.size(1)
        target = encoding(batch)
        optimizer.zero_grad()
        new_batch = new_batch.to(device)
        output = embedding_model(new_batch)
        loss = criterion(output, target.last_hidden_state)
        loss.backward()
        optimizer.step()
        # print(f"Epoch {e}, Batch Loss: {loss.item()}")
        batch_loss += loss.item()
    print(f"Epoch {e}, Loss: {batch_loss / len(dataset_loader)}")
    batch_loss = 0

/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [118,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [49]:
batch.size()

torch.Size([100, 128])

In [273]:
torch.save(embedding_model.state_dict(), "./model/Transformer/embedding_model.pth")

In [264]:
target.last_hidden_state.size()

torch.Size([88, 128, 384])

In [265]:
output.size()

torch.Size([88, 128, 384])

In [271]:
target.last_hidden_state[0][0][:10]

tensor([ 0.0287,  0.1369, -0.2417,  0.0326,  0.0839, -0.1499, -0.0455,  0.0542,
        -0.3117, -0.3579], device='cuda:0')

In [272]:
output[0][0][:10]

tensor([ 0.0639,  0.2435, -0.0936,  0.0264, -0.1986, -0.1604,  0.0187, -0.0192,
        -0.2563, -0.1595], device='cuda:0', grad_fn=<SliceBackward0>)

In [47]:
import gc
gc.collect()
torch.cuda.empty_cache()
del embedding_model
del encoding
del optimizer
del criterion
del target
del output
del batch

In [15]:
target.last_hidden_state.size()

torch.Size([100, 512, 384])

In [14]:
output.size()

torch.Size([100, 512, 384])

In [278]:
list(range(10,20)) + list(range(0,-1))

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [41]:
with open("./data/PythonCodeDataSmall_TextOnly/Python_code_data.txt", "r") as f:
    data = f.read(-1)
    data = data.split("\n# ")
    data = [data[0].strip("\n")] + [("# " + c).strip("\n") for c in data[1:] if len(c) >= 80]

In [42]:
data[0]

"# write a python program to add two numbers \nnum1 = 1.5\nnum2 = 6.3\nsum = num1 + num2\nprint(f'Sum: {sum}')"

In [43]:
len(new_tokenizer.tokenizer.encode(data[0]).ids)

33

In [44]:
new_tokenizer.tokenizer.encode(data[0]).ids

[6,
 159,
 68,
 167,
 174,
 126,
 391,
 286,
 249,
 384,
 32,
 20,
 17,
 24,
 402,
 32,
 25,
 17,
 22,
 225,
 32,
 384,
 14,
 402,
 121,
 11,
 73,
 10,
 876,
 29,
 94,
 225,
 641]

In [45]:
new_tokenizer.tokenizer.decode([1] + new_tokenizer.tokenizer.encode(data[0]).ids + [3])

"# write a python program to add two numbers num1 = 1 . 5 num2 = 6 . 3 sum = num1 + num2 print ( f ' Sum : { sum }')"

In [291]:
new_tokenizer.tokenizer.encode(data[0]).tokens

['#',
 'write',
 'a',
 'python',
 'program',
 'to',
 'add',
 'two',
 'numbers',
 'num1',
 '=',
 '1',
 '.',
 '5',
 'num2',
 '=',
 '6',
 '.',
 '3',
 'sum',
 '=',
 'num1',
 '+',
 'num2',
 'print',
 '(',
 'f',
 "'",
 'Sum',
 ':',
 '{',
 'sum',
 "}')"]

In [294]:
[[b for b in range(10)] for a in range(10)]

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]

In [2]:
128*4

512

In [45]:
128*4/8

64.0

In [29]:
with open("./data/PythonSourceCodeData3.79GB/py150/python100k_train.json", "rt", encoding="utf-8") as f:
    data = f.read(100)
    print(data)

[{"type":"Module","children":[1,3,5,7,9,11]},{"type":"Expr","children":[2]},{"type":"Str","value":" 


In [None]:
import json
import ast

with open('./data/PythonSourceCodeData3.79GB/py150/python100k_train.json', 'r') as file:
    for index, line in enumerate(file):
        json_data_list = json.loads(line)
        # print(json_data_list)
        # Cache for already processed nodes to avoid redundant work / infinite loops
        processed_nodes_cache = {}

        def build_ast_node(node_index, all_nodes):
            """
            Recursively builds a standard Python AST node from the custom JSON format.
            Needs to be implemented thoroughly based on the JSON structure's rules.
            """
            if node_index in processed_nodes_cache:
                return processed_nodes_cache[node_index]

            if node_index >= len(all_nodes):
                raise IndexError(f"Node index {node_index} is out of bounds (total nodes: {len(all_nodes)})")

            node_data = all_nodes[node_index]
            node_type = node_data.get('type')
            children_indices = node_data.get('children', [])
            value = node_data.get('value') # Can be None

            # Helper to get processed child nodes
            def get_children():
                return [build_ast_node(child_idx, all_nodes) for child_idx in children_indices]

            # --- === CORE MAPPING LOGIC === ---
            # This is where you translate *your* JSON types to standard `ast` objects.
            # This requires careful analysis of your JSON format.
            # Below are EXAMPLES for a few types - YOU MUST COMPLETE THIS.

            ast_node = None # Initialize

            if node_type == 'Module':
                # A Module contains a list of statements in its body
                body_stmts = get_children()
                ast_node = ast.Module(body=body_stmts, type_ignores=[])

            elif node_type == 'Expr':
                # An expression statement contains a single value (which is an expression node)
                if len(children_indices) != 1:
                    raise ValueError(f"Expr node {node_index} should have 1 child, got {len(children_indices)}")
                expr_value = build_ast_node(children_indices[0], all_nodes)
                ast_node = ast.Expr(value=expr_value)

            elif node_type == 'Str':
                # Represents a string constant
                ast_node = ast.Constant(value=str(value)) # Use ast.Constant for Py 3.8+

            elif node_type == 'Num':
                # Represents a number constant (int or float)
                ast_node = ast.Constant(value=value) # ast.Constant handles int/float

            elif node_type == 'NameLoad':
                # Represents loading a variable's value
                if not isinstance(value, str):
                    raise ValueError(f"NameLoad node {node_index} needs a string 'value', got {type(value)}")
                ast_node = ast.Name(id=value, ctx=ast.Load())

            elif node_type == 'NameStore':
                # Represents assigning to a variable
                if not isinstance(value, str):
                    raise ValueError(f"NameStore node {node_index} needs a string 'value', got {type(value)}")
                ast_node = ast.Name(id=value, ctx=ast.Store())

            elif node_type == 'ImportFrom':
                # Represents 'from module import name1, name2 ...'
                module_name = value
                alias_nodes = get_children() # Children should be 'alias' type nodes
                # Filter out potential non-alias nodes if structure is inconsistent
                valid_aliases = [n for n in alias_nodes if isinstance(n, ast.alias)]
                if len(valid_aliases) != len(alias_nodes):
                    print(f"Warning: ImportFrom node {node_index} children were not all alias nodes.")
                ast_node = ast.ImportFrom(module=module_name, names=valid_aliases, level=0) # Assuming level=0

            elif node_type == 'Import':
                # Represents 'import module1, module2 ...'
                alias_nodes = get_children() # Children should be 'alias' type nodes
                valid_aliases = [n for n in alias_nodes if isinstance(n, ast.alias)]
                if len(valid_aliases) != len(alias_nodes):
                    print(f"Warning: Import node {node_index} children were not all alias nodes.")
                ast_node = ast.Import(names=valid_aliases)

            elif node_type == 'alias':
                # Represents an imported name, possibly with an 'as' rename
                name = value
                asname = node_data.get('asname') # Check if your JSON includes 'asname'
                # Handle 'from x import *' case
                if name == '*':
                    # Special handling: ast.ImportFrom uses names=[ast.alias(name='*', asname=None)]
                    name = '*'
                    asname = None
                elif not isinstance(name, str):
                    raise ValueError(f"alias node {node_index} needs a string 'value', got {type(name)}")

                ast_node = ast.alias(name=name, asname=asname) # asname can be None

            elif node_type == 'Assign':
                # Represents assignment, e.g., x = y or x, z = y
                # ASSUMPTION: First child is target(s), second is value
                if len(children_indices) != 2:
                    raise ValueError(f"Assign node {node_index} expects 2 children (targets, value), got {len(children_indices)}")
                targets_node = build_ast_node(children_indices[0], all_nodes)
                value_node = build_ast_node(children_indices[1], all_nodes)

                # Ensure targets is a list for ast.Assign
                targets_list = []
                if isinstance(targets_node, (ast.Tuple, ast.List)):
                    # Handle multiple assignment targets like x, y = ...
                    targets_list = targets_node.elts
                else:
                    targets_list = [targets_node]

                ast_node = ast.Assign(targets=targets_list, value=value_node)

            elif node_type == 'ListLoad':
                # Represents a list literal being used/loaded
                elements = get_children()
                ast_node = ast.List(elts=elements, ctx=ast.Load())

            elif node_type == 'TupleLoad':
                # Represents a tuple literal being used/loaded
                elements = get_children()
                ast_node = ast.Tuple(elts=elements, ctx=ast.Load())

            # --- TODO: Add MANY MORE ELIF BLOCKS ---
            # You need to handle:
            # - FunctionDef, ClassDef (with arguments, body, decorators, bases)
            # - Call (function calls with args, kwargs)
            # - AttributeLoad, AttributeStore (e.g., obj.attr)
            # - SubscriptLoad, SubscriptStore (e.g., list[index])
            # - If, For, While, TryExcept, With (control flow)
            # - Return, Raise, Break, Continue
            # - Binary operations (BinOpAdd, BinOpSub, etc.)
            # - Comparisons (CompareEq, CompareLt, etc.)
            # - Boolean ops (BoolOpAnd, BoolOpOr)
            # - Unary ops (UnaryOpNot, UnaryOpInvert)
            # - DictLoad, SetLoad
            # - Comprehensions (ListComp, DictComp, SetComp, GeneratorExp)
            # - Lambda
            # - arguments, arg (for function definitions)
            # - ... and potentially others based on your specific code.

            else:
                print(f"Warning: Unhandled node type '{node_type}' at index {node_index}. Data: {node_data}")
                # Return a placeholder or raise an error
                # Returning a simple Constant placeholder for now
                ast_node = ast.Constant(value=f"UNHANDLED<{node_type}>")
                # raise NotImplementedError(f"AST building not implemented for type: {node_type}")

            if ast_node is None:
                raise ValueError(f"Failed to create AST node for index {node_index} with type {node_type}")

            # Cache the result before returning
            processed_nodes_cache[node_index] = ast_node
            return ast_node

        # --- Main Execution ---
        try:
            print("Building AST from JSON data...")
            # Start building from the root node (assuming it's index 0 and type 'Module')
            root_node = build_ast_node(0, json_data_list)

            # Perform some basic validation
            if not isinstance(root_node, ast.Module):
                print(f"Error: Root node (index 0) was not converted to an ast.Module. Got: {type(root_node)}")
                sys.exit(1)

            print("AST construction finished (may have warnings for unhandled types).")

            # Add line numbers and column offsets (optional but good practice)
            print("Fixing missing AST locations...")
            ast.fix_missing_locations(root_node)

            print("Unparsing AST to Python code...")
            # Convert the reconstructed AST back to Python code
            python_code = unparse(root_node)

            print("\n--- Generated Python Code ---")
            print(python_code)
            print("-----------------------------\n")

        except IndexError as e:
            print(f"\nError: An index was out of bounds. This often means a 'children' list points")
            print(f"       to a node that doesn't exist in the main list.")
            print(f"       Details: {e}")
        except (ValueError, TypeError, NotImplementedError) as e:
            print(f"\nError during AST construction or mapping:")
            print(f"       Details: {e}")
        except Exception as e:
            print(f"\nAn unexpected error occurred:")
            print(f"       Details: {e}")
            import traceback
            print("\nTraceback:")
            traceback.print_exc()
        if index == 100:
            break

In [None]:
import ast
import sys
import json # If your input is a JSON string/file

# --- Choose the appropriate unparser ---
if sys.version_info >= (3, 9):
    from ast import unparse
    print("Using built-in ast.unparse (Python 3.9+)")
else:
    try:
        from astunparse import unparse
        print("Using 'astunparse' library.")
    except ImportError:
        print("\nError: This script requires Python 3.9+ for ast.unparse ")
        print("or the 'astunparse' library for older Python versions.")
        print("Please install it: pip install astunparse\n")
        sys.exit(1)

# --- Your JSON data ---
# (Using the first JSON list you provided as an example)
json_data_list = [
    {'type': 'Module', 'children': [1, 3, 5, 7, 9, 11]},
    {'type': 'Expr', 'children': [2]},
    {'type': 'Str', 'value': ' Provides ``mapping`` of url paths to request handlers.\n'},
    {'type': 'ImportFrom', 'children': [4], 'value': 'bootstrap'},
    {'type': 'alias', 'value': 'Bootstrap'},
    {'type': 'ImportFrom', 'children': [6], 'value': 'fund'},
    {'type': 'alias', 'value': 'InstantPaymentNotificationHandler'},
    {'type': 'ImportFrom', 'children': [8], 'value': 'fund'},
    {'type': 'alias', 'value': 'ThankYouHandler'},
    {'type': 'ImportFrom', 'children': [10], 'value': 'view'},
    {'type': 'alias', 'value': '*'}, # Represents 'from view import *'
    {'type': 'Assign', 'children': [12, 13]},
    {'type': 'NameStore', 'value': 'mapping'},
    {'type': 'ListLoad', 'children': [14, 17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56, 59, 62, 65, 68, 71, 74, 77, 80, 83, 86, 89, 92, 95, 98, 101, 104, 107, 110, 113]},
    {'type': 'TupleLoad', 'children': [15, 16]},
    {'type': 'Str', 'value': '/'},
    {'type': 'NameLoad', 'value': 'Index'},
    {'type': 'TupleLoad', 'children': [18, 19]},
    {'type': 'Str', 'value': '/ipn'},
    {'type': 'NameLoad', 'value': 'InstantPaymentNotificationHandler'},
    {'type': 'TupleLoad', 'children': [21, 22]},
    {'type': 'Str', 'value': '/thank-you'},
    {'type': 'NameLoad', 'value': 'ThankYouHandler'},
    {'type': 'TupleLoad', 'children': [24, 25]},
    {'type': 'Str', 'value': '/about\\/?'},
    {'type': 'NameLoad', 'value': 'About'},
    {'type': 'TupleLoad', 'children': [27, 28]},
    {'type': 'Str', 'value': '/guide\\/?'},
    {'type': 'NameLoad', 'value': 'Guide'},
    {'type': 'TupleLoad', 'children': [30, 31]},
    {'type': 'Str', 'value': '/guide/download\\/?'},
    {'type': 'NameLoad', 'value': 'Download'},
    {'type': 'TupleLoad', 'children': [33, 34]},
    {'type': 'Str', 'value': '/guide/standards\\/?'},
    {'type': 'NameLoad', 'value': 'Standards'},
    {'type': 'TupleLoad', 'children': [36, 37]},
    {'type': 'Str', 'value': '/community\\/?'},
    {'type': 'NameLoad', 'value': 'Community'},
    {'type': 'TupleLoad', 'children': [39, 40]},
    {'type': 'Str', 'value': '/news\\/?'},
    {'type': 'NameLoad', 'value': 'News'},
    {'type': 'TupleLoad', 'children': [42, 43]},
    {'type': 'Str', 'value': '/support\\/?'},
    {'type': 'NameLoad', 'value': 'Support'},
    {'type': 'TupleLoad', 'children': [45, 46]},
    {'type': 'Str', 'value': '/contact\\/?'},
    {'type': 'NameLoad', 'value': 'Contact'},
    {'type': 'TupleLoad', 'children': [48, 49]},
    {'type': 'Str', 'value': '/press\\/?'},
    {'type': 'NameLoad', 'value': 'Press'},
    {'type': 'TupleLoad', 'children': [51, 52]},
    {'type': 'Str', 'value': '/legal/terms'},
    {'type': 'NameLoad', 'value': 'Terms'},
    {'type': 'TupleLoad', 'children': [54, 55]},
    {'type': 'Str', 'value': '/library\\/?'},
    {'type': 'NameLoad', 'value': 'Library'},
    {'type': 'TupleLoad', 'children': [57, 58]},
    {'type': 'Str', 'value': '/library/sketchup\\/?'},
    {'type': 'NameLoad', 'value': 'Library'},
    {'type': 'TupleLoad', 'children': [60, 61]},
    {'type': 'Str', 'value': '/library/series/(\\w+)\\/?'},
    {'type': 'NameLoad', 'value': 'Library'},
    {'type': 'TupleLoad', 'children': [63, 64]},
    {'type': 'Str', 'value': '/library/users\\/?'},
    {'type': 'NameLoad', 'value': 'Users'},
    {'type': 'TupleLoad', 'children': [66, 67]},
    {'type': 'Str', 'value': '/library/users/([0-9]+)\\/?'},
    {'type': 'NameLoad', 'value': 'User'},
    {'type': 'TupleLoad', 'children': [69, 70]},
    {'type': 'Str', 'value': '/library/designs/([0-9]+)\\/?'},
    {'type': 'NameLoad', 'value': 'Design'},
    {'type': 'TupleLoad', 'children': [72, 73]},
    {'type': 'Str', 'value': '/library/designs/([0-9]+)/(edit)\\/?'},
    {'type': 'NameLoad', 'value': 'Design'},
    {'type': 'TupleLoad', 'children': [75, 76]},
    {'type': 'Str', 'value': '/library/designs\\/?'},
    {'type': 'NameLoad', 'value': 'Design'},
    {'type': 'TupleLoad', 'children': [78, 79]},
    {'type': 'Str', 'value': '/library/designs/add\\/?'},
    {'type': 'NameLoad', 'value': 'Design'},
    {'type': 'TupleLoad', 'children': [81, 82]},
    {'type': 'Str', 'value': '/library/designs/add/sketchup\\/?'},
    {'type': 'NameLoad', 'value': 'Design'},
    {'type': 'TupleLoad', 'children': [84, 85]},
    {'type': 'Str', 'value': '/redirect/success/([0-9]+)\\/?'},
    {'type': 'NameLoad', 'value': 'RedirectSuccess'},
    {'type': 'TupleLoad', 'children': [87, 88]},
    {'type': 'Str', 'value': '/redirect/error\\/?'},
    {'type': 'NameLoad', 'value': 'RedirectError'},
    {'type': 'TupleLoad', 'children': [90, 91]},
    {'type': 'Str', 'value': '/redirect/after/delete\\/?'},
    {'type': 'NameLoad', 'value': 'RedirectAfterDelete'},
    {'type': 'TupleLoad', 'children': [93, 94]},
    {'type': 'Str', 'value': '/admin/moderate\\/?'},
    {'type': 'NameLoad', 'value': 'Moderate'},
    {'type': 'TupleLoad', 'children': [96, 97]},
    {'type': 'Str', 'value': '/admin/bootstrap\\/?'},
    {'type': 'NameLoad', 'value': 'Bootstrap'},
    {'type': 'TupleLoad', 'children': [99, 100]},
    {'type': 'Str', 'value': '/activity'},
    {'type': 'NameLoad', 'value': 'ActivityScreen'},
    {'type': 'TupleLoad', 'children': [102, 103]},
    {'type': 'Str', 'value': '/txns'},
    {'type': 'NameLoad', 'value': 'TxnList'},
    {'type': 'TupleLoad', 'children': [105, 106]},
    {'type': 'Str', 'value': '/blob64/([^/]+)/([^/]+)\\/?'},
    {'type': 'NameLoad', 'value': 'Base64Blob'},
    {'type': 'TupleLoad', 'children': [108, 109]},
    {'type': 'Str', 'value': '/blob64/([^/]+)\\/?'},
    {'type': 'NameLoad', 'value': 'Base64Blob'},
    {'type': 'TupleLoad', 'children': [111, 112]},
    {'type': 'Str', 'value': '/i18n/message_strings.json'},
    {'type': 'NameLoad', 'value': 'MessageStrings'},
    {'type': 'TupleLoad', 'children': [114, 115]},
    {'type': 'Str', 'value': '/.*'},
    {'type': 'NameLoad', 'value': 'NotFound'}
]
# You would repeat this for the second JSON list if needed

# --- AST Reconstruction Logic ---

# Cache for already processed nodes to avoid redundant work / infinite loops
processed_nodes_cache = {}

def build_ast_node(node_index, all_nodes):
    """
    Recursively builds a standard Python AST node from the custom JSON format.
    Needs to be implemented thoroughly based on the JSON structure's rules.
    """
    if node_index in processed_nodes_cache:
        return processed_nodes_cache[node_index]

    if node_index >= len(all_nodes):
        raise IndexError(f"Node index {node_index} is out of bounds (total nodes: {len(all_nodes)})")

    node_data = all_nodes[node_index]
    node_type = node_data.get('type')
    children_indices = node_data.get('children', [])
    value = node_data.get('value') # Can be None

    # Helper to get processed child nodes
    def get_children():
        return [build_ast_node(child_idx, all_nodes) for child_idx in children_indices]

    # --- === CORE MAPPING LOGIC === ---
    # This is where you translate *your* JSON types to standard `ast` objects.
    # This requires careful analysis of your JSON format.
    # Below are EXAMPLES for a few types - YOU MUST COMPLETE THIS.

    ast_node = None # Initialize

    if node_type == 'Module':
        # A Module contains a list of statements in its body
        body_stmts = get_children()
        ast_node = ast.Module(body=body_stmts, type_ignores=[])

    elif node_type == 'Expr':
        # An expression statement contains a single value (which is an expression node)
        if len(children_indices) != 1:
            raise ValueError(f"Expr node {node_index} should have 1 child, got {len(children_indices)}")
        expr_value = build_ast_node(children_indices[0], all_nodes)
        ast_node = ast.Expr(value=expr_value)

    elif node_type == 'Str':
        # Represents a string constant
        ast_node = ast.Constant(value=str(value)) # Use ast.Constant for Py 3.8+

    elif node_type == 'Num':
        # Represents a number constant (int or float)
         ast_node = ast.Constant(value=value) # ast.Constant handles int/float

    elif node_type == 'NameLoad':
        # Represents loading a variable's value
        if not isinstance(value, str):
             raise ValueError(f"NameLoad node {node_index} needs a string 'value', got {type(value)}")
        ast_node = ast.Name(id=value, ctx=ast.Load())

    elif node_type == 'NameStore':
        # Represents assigning to a variable
        if not isinstance(value, str):
             raise ValueError(f"NameStore node {node_index} needs a string 'value', got {type(value)}")
        ast_node = ast.Name(id=value, ctx=ast.Store())

    elif node_type == 'ImportFrom':
        # Represents 'from module import name1, name2 ...'
        module_name = value
        alias_nodes = get_children() # Children should be 'alias' type nodes
        # Filter out potential non-alias nodes if structure is inconsistent
        valid_aliases = [n for n in alias_nodes if isinstance(n, ast.alias)]
        if len(valid_aliases) != len(alias_nodes):
             print(f"Warning: ImportFrom node {node_index} children were not all alias nodes.")
        ast_node = ast.ImportFrom(module=module_name, names=valid_aliases, level=0) # Assuming level=0

    elif node_type == 'Import':
         # Represents 'import module1, module2 ...'
         alias_nodes = get_children() # Children should be 'alias' type nodes
         valid_aliases = [n for n in alias_nodes if isinstance(n, ast.alias)]
         if len(valid_aliases) != len(alias_nodes):
             print(f"Warning: Import node {node_index} children were not all alias nodes.")
         ast_node = ast.Import(names=valid_aliases)

    elif node_type == 'alias':
        # Represents an imported name, possibly with an 'as' rename
        name = value
        asname = node_data.get('asname') # Check if your JSON includes 'asname'
        # Handle 'from x import *' case
        if name == '*':
             # Special handling: ast.ImportFrom uses names=[ast.alias(name='*', asname=None)]
             name = '*'
             asname = None
        elif not isinstance(name, str):
             raise ValueError(f"alias node {node_index} needs a string 'value', got {type(name)}")

        ast_node = ast.alias(name=name, asname=asname) # asname can be None

    elif node_type == 'Assign':
        # Represents assignment, e.g., x = y or x, z = y
        # ASSUMPTION: First child is target(s), second is value
        if len(children_indices) != 2:
            raise ValueError(f"Assign node {node_index} expects 2 children (targets, value), got {len(children_indices)}")
        targets_node = build_ast_node(children_indices[0], all_nodes)
        value_node = build_ast_node(children_indices[1], all_nodes)

        # Ensure targets is a list for ast.Assign
        targets_list = []
        if isinstance(targets_node, (ast.Tuple, ast.List)):
             # Handle multiple assignment targets like x, y = ...
             targets_list = targets_node.elts
        else:
             targets_list = [targets_node]

        ast_node = ast.Assign(targets=targets_list, value=value_node)

    elif node_type == 'ListLoad':
         # Represents a list literal being used/loaded
         elements = get_children()
         ast_node = ast.List(elts=elements, ctx=ast.Load())

    elif node_type == 'TupleLoad':
         # Represents a tuple literal being used/loaded
         elements = get_children()
         ast_node = ast.Tuple(elts=elements, ctx=ast.Load())

    # --- TODO: Add MANY MORE ELIF BLOCKS ---
    # You need to handle:
    # - FunctionDef, ClassDef (with arguments, body, decorators, bases)
    # - Call (function calls with args, kwargs)
    # - AttributeLoad, AttributeStore (e.g., obj.attr)
    # - SubscriptLoad, SubscriptStore (e.g., list[index])
    # - If, For, While, TryExcept, With (control flow)
    # - Return, Raise, Break, Continue
    # - Binary operations (BinOpAdd, BinOpSub, etc.)
    # - Comparisons (CompareEq, CompareLt, etc.)
    # - Boolean ops (BoolOpAnd, BoolOpOr)
    # - Unary ops (UnaryOpNot, UnaryOpInvert)
    # - DictLoad, SetLoad
    # - Comprehensions (ListComp, DictComp, SetComp, GeneratorExp)
    # - Lambda
    # - arguments, arg (for function definitions)
    # - ... and potentially others based on your specific code.

    else:
        print(f"Warning: Unhandled node type '{node_type}' at index {node_index}. Data: {node_data}")
        # Return a placeholder or raise an error
        # Returning a simple Constant placeholder for now
        ast_node = ast.Constant(value=f"UNHANDLED<{node_type}>")
        # raise NotImplementedError(f"AST building not implemented for type: {node_type}")

    if ast_node is None:
         raise ValueError(f"Failed to create AST node for index {node_index} with type {node_type}")

    # Cache the result before returning
    processed_nodes_cache[node_index] = ast_node
    return ast_node

# --- Main Execution ---
try:
    print("Building AST from JSON data...")
    # Start building from the root node (assuming it's index 0 and type 'Module')
    root_node = build_ast_node(0, json_data_list)

    # Perform some basic validation
    if not isinstance(root_node, ast.Module):
        print(f"Error: Root node (index 0) was not converted to an ast.Module. Got: {type(root_node)}")
        sys.exit(1)

    print("AST construction finished (may have warnings for unhandled types).")

    # Add line numbers and column offsets (optional but good practice)
    print("Fixing missing AST locations...")
    ast.fix_missing_locations(root_node)

    print("Unparsing AST to Python code...")
    # Convert the reconstructed AST back to Python code
    python_code = unparse(root_node)

    print("\n--- Generated Python Code ---")
    print(python_code)
    print("-----------------------------\n")

except IndexError as e:
    print(f"\nError: An index was out of bounds. This often means a 'children' list points")
    print(f"       to a node that doesn't exist in the main list.")
    print(f"       Details: {e}")
except (ValueError, TypeError, NotImplementedError) as e:
    print(f"\nError during AST construction or mapping:")
    print(f"       Details: {e}")
except Exception as e:
    print(f"\nAn unexpected error occurred:")
    print(f"       Details: {e}")
    import traceback
    print("\nTraceback:")
    traceback.print_exc()