In [12]:
# Importar librerias
import os
import json
import javalang

In [13]:
def collect_java_files(base_path):
    """Recopila todos los archivos Java en el directorio base y sus subdirectorios."""
    java_files = []
    for root, _, files in os.walk(base_path):
        for file in files:
            if file.endswith(".java"):
                java_files.append(os.path.join(root, file))
    return java_files



In [14]:
def generate_ast(java_file_path):
    """Genera un Árbol de Sintaxis Abstracta (AST) a partir de un archivo Java."""
    with open(java_file_path, 'r') as file:
        java_code = file.read()
    tokens = javalang.tokenizer.tokenize(java_code)
    parser = javalang.parser.Parser(tokens)
    return parser.parse()

In [15]:
def ast_to_dict(node):
    """Convierte un AST a un diccionario."""
    if isinstance(node, javalang.ast.Node):
        return {
            'type': type(node).__name__,
            'attributes': {key: ast_to_dict(value) for key, value in node.__dict__.items() if not key.startswith('_') and value is not None},
            'children': [ast_to_dict(child) for child in node.children if child is not None]
        }
    elif isinstance(node, list):
        return [ast_to_dict(child) for child in node]
    else:
        return node

In [16]:
def save_ast(ast, output_path):
    """Guarda un AST en formato JSON."""
    ast_dict = ast_to_dict(ast)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w') as file:
        json.dump(ast_dict, file, indent=2)



In [17]:
def preprocess_dataset(base_path, output_dir):
    """Preprocesa el dataset convirtiendo archivos Java en ASTs y guardándolos en formato JSON."""
    java_files = collect_java_files(base_path)
    for java_file in java_files:
        try:
            ast = generate_ast(java_file)
            relative_path = os.path.relpath(java_file, base_path)
            output_path = os.path.join(output_dir, f"{relative_path}.json")
            save_ast(ast, output_path)
            print(f"Procesado: {java_file}")
        except Exception as e:
            print(f"Error procesando {java_file}: {e}")

In [18]:
base_path = 'Versions/version_2'
output_dir = '/Preprocessed_files/'
preprocess_dataset(base_path, output_dir)

Error procesando Versions/version_2\0017d438_9852706b\0017d438.java: Object of type set is not JSON serializable
Error procesando Versions/version_2\0017d438_9852706b\9852706b.java: Object of type set is not JSON serializable
Error procesando Versions/version_2\0017d438_ac180326\0017d438.java: Object of type set is not JSON serializable
Error procesando Versions/version_2\0017d438_ac180326\ac180326.java: Object of type set is not JSON serializable
Error procesando Versions/version_2\0048a372_0adb1ee5\0048a372.java: Object of type set is not JSON serializable
Error procesando Versions/version_2\0048a372_0adb1ee5\0adb1ee5.java: Object of type set is not JSON serializable
Error procesando Versions/version_2\00af3420_5449d33c\00af3420.java: Object of type set is not JSON serializable
Error procesando Versions/version_2\00af3420_5449d33c\5449d33c.java: Object of type set is not JSON serializable
Error procesando Versions/version_2\00af3420_86102d81\00af3420.java: Object of type set is not J

KeyboardInterrupt: 

In [19]:
import os

file_list = os.listdir(base_path)
print(file_list)

['0017d438_9852706b', '0017d438_ac180326', '0048a372_0adb1ee5', '00af3420_5449d33c', '00af3420_86102d81', '00af3420_d92c5342', '00af3420_f4d6d28d', '00c0b82a_1ea771ea', '00db6701_0c1143f7', '00f79486_ce0b2178', '00f79486_fb312dc6', '016510dc_f1540246', '018c15bd_378bb1ca', '018c9543_d1dbc56a', '01b911ac_a195911e', '01c915a2_480de7be', '01c915a2_4e5ee0f7', '034030f3_9e4ddc38', '034030f3_bf992c91', '03b3c5af_d74028ea', '04a706b1_5b7127bf', '04df7bb8_1ea771ea', '04df7bb8_85125ecb', '04df7bb8_aaccc000', '04df7bb8_ee270b2a', '04ed33a5_0c173033', '04ed33a5_6b83b22e', '04ed33a5_7ea34254', '0588b869_11373c16', '0588b869_69b2fd22', '0588b869_9028caf7', '05ca89ed_163d0dde', '05ca89ed_5b9a0551', '05f939b3_e647bef7', '065e0cbd_9b449b4f', '06b9cf99_fb69b3b4', '07038b12_a18cb2c1', '07043d35_97a7fab5', '07749c65_2bbf754b', '079ad09e_20012377', '07a0e4dc_bac616ee', '085ddefc_1500a4fa', '086f0f90_4a570de6', '0889dcfb_22138fad', '089b7f00_fb20d298', '08b9908d_f5fb1b62', '08cf0478_0f14b12d', '08cf0478_92