In [2]:
# # Notebooks/checking-metadata-structure.ipynb

# import os
# import json
# from json.decoder import JSONDecodeError
# from IPython.display import display, Markdown

# def get_fields(data, parent_key='', result=None):
#     if result is None:
#         result = set()
#     if isinstance(data, dict):
#         for key, value in data.items():
#             full_key = f"{parent_key}.{key}" if parent_key else key
#             result.add(full_key)
#             get_fields(value, full_key, result)
#     elif isinstance(data, list):
#         for index, item in enumerate(data):
#             full_key = f"{parent_key}[{index}]"
#             get_fields(item, full_key, result)
#     return result

# # Set the path to your metadata folder
# folder_path = '/Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata'

# # Collect all JSON files from the folder
# json_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.json')]

# if not json_files:
#     display(Markdown('**No JSON files found in the metadata folder.**'))
# else:
#     all_fields = []
#     invalid_files = []
#     for json_file in json_files:
#         try:
#             with open(json_file, 'r', encoding='utf-8') as f:
#                 data = json.load(f)
#             fields = get_fields(data)
#             all_fields.append(fields)
#         except JSONDecodeError as e:
#             error_message = f"{e.msg} at line {e.lineno}, column {e.colno} (char {e.pos})"
#             display(Markdown(f"**Error in file `{os.path.basename(json_file)}`:** {error_message}"))
#             invalid_files.append(json_file)
#         except Exception as e:
#             display(Markdown(f"**Error in file `{os.path.basename(json_file)}`:** {str(e)}"))
#             invalid_files.append(json_file)

#     if len(all_fields) == 0:
#         display(Markdown('**No valid JSON files to process.**'))
#     else:
#         # Compute the fields common to all JSON files
#         common_fields = set.intersection(*all_fields)
#         if common_fields:
#             md_output = "## Fields Common to All JSON Files:\n\n"
#             for field in sorted(common_fields):
#                 md_output += f"- `{field}`\n"
#             display(Markdown(md_output))
#         else:
#             display(Markdown('**There are no common fields across all JSON files.**'))

#     if invalid_files:
#         md_invalid = "\n## The following JSON files could not be processed due to errors:\n\n"
#         for file in invalid_files:
#             md_invalid += f"- `{os.path.basename(file)}`\n"
#         display(Markdown(md_invalid))

In [None]:
import os
import json
import re

class JSONParseError(Exception):
    def __init__(self, original_error, cleaned_error, text):
        self.original_error = original_error
        self.cleaned_error = cleaned_error
        self.text = text
        super().__init__(f"{original_error} | {cleaned_error}")

def fix_invalid_escapes(text):
    """
    Replace backslashes that are not part of a valid escape sequence with double backslashes.
    Valid escapes in JSON are: " \ / b f n r t and uXXXX.
    """
    return re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', text)

def escape_control_chars_in_strings(text):
    """
    Find JSON string literals in the text (including those spanning multiple lines)
    and replace literal newline, carriage return, and tab characters with their escape sequences.
    """
    def replace_control(match):
        s = match.group(0)
        inner = s[1:-1]
        inner = inner.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
        return f'"{inner}"'
    return re.sub(r'"(?:\\.|[^"\\])*"', replace_control, text, flags=re.DOTALL)

def fix_multiline_strings(text):
    """
    Detect and fix multi-line string literals for keys like "run" that span multiple lines without proper termination.
    Joins the lines and replaces literal newlines with '\\n' so the JSON string remains intact.
    """
    lines = text.splitlines()
    fixed_lines = []
    in_multiline = False
    multiline_accum = ""
    for line in lines:
        if not in_multiline:
            # Look for a pattern like '"run": "'
            m = re.search(r'("run":\s*")([^"]*)$', line)
            if m and not line.rstrip().endswith('"'):
                in_multiline = True
                multiline_accum = m.group(2)
                prefix = line[:m.start(2)]
                fixed_lines.append(prefix)
            else:
                fixed_lines.append(line)
        else:
            # Accumulate until we hit a closing quote
            if '"' in line:
                idx = line.find('"')
                multiline_accum += "\n" + line[:idx]
                fixed_string = multiline_accum.replace("\n", "\\n")
                fixed_lines[-1] = re.sub(r'("run":\s*")[^"]*$', r'\1' + fixed_string, fixed_lines[-1])
                fixed_lines.append(line[idx:])
                in_multiline = False
                multiline_accum = ""
            else:
                multiline_accum += "\n" + line
    return "\n".join(fixed_lines)

def clean_json_text(text):
    """
    Apply cleaning functions to fix common JSON formatting issues.
    First fix multiline strings, then handle invalid escapes and control characters.
    """
    text = fix_multiline_strings(text)
    text = fix_invalid_escapes(text)
    text = escape_control_chars_in_strings(text)
    return text

def robust_json_loads(text):
    """
    Try loading JSON. If it fails, clean the text and try again.
    Raises a JSONParseError if both attempts fail.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        original_error = e
        cleaned_text = clean_json_text(text)
        try:
            return json.loads(cleaned_text)
        except json.JSONDecodeError as e2:
            raise JSONParseError(original_error, e2, text)

def print_problematic_lines(text, lineno, colno, context=1):
    """
    Print the problematic line(s) around the error location.
    Shows the error line with one line of context before and after.
    """
    lines = text.splitlines()
    start = max(0, lineno - 1 - context)
    end = min(len(lines), lineno + context)
    print("Problematic lines:")
    for i in range(start, end):
        marker = " <-- error" if i == lineno - 1 else ""
        print(f"{i+1}: {lines[i]}{marker}")

def process_json_files(folder_path):
    """
    Process all JSON files in the folder.
    - For files that fail to parse, display the file name and error details.
    - For files that parse successfully (and are JSON objects), compute the intersection of their keys.
    Finally, print a summary of problematic files and the common keys across all valid JSON objects.
    """
    json_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.json')]
    error_count = 0
    processed_files = 0
    common_keys = None

    for file_path in json_files:
        try:
            with open(file_path, 'r', errors='replace') as f:
                text = f.read()
            data = robust_json_loads(text)
            # For successfully parsed JSON objects, update the common keys
            if isinstance(data, dict):
                processed_files += 1
                keys = set(data.keys())
                if common_keys is None:
                    common_keys = keys
                else:
                    common_keys = common_keys.intersection(keys)
            else:
                # If it's not a dict, we skip key collection
                print(f"\nFile {file_path} parsed but is not a JSON object; skipping for common keys.")
        except JSONParseError as err:
            error_count += 1
            print(f"\nProblematic file: {file_path}")
            print("Initial JSONDecodeError:", err.original_error)
            print("JSONDecodeError after cleaning:", err.cleaned_error)
            if hasattr(err.original_error, 'lineno') and hasattr(err.original_error, 'colno'):
                print_problematic_lines(err.text, err.original_error.lineno, err.original_error.colno)
        except Exception as ex:
            print(f"Unexpected error processing {file_path}: {ex}")

    print(f"\nTotal problematic files: {error_count} out of {len(json_files)}")
    print(f"Successfully parsed JSON objects (dicts): {processed_files}")
    if common_keys is not None:
        print("\nCommon fields across all successfully parsed JSON objects:")
        print(sorted(common_keys))
    else:
        print("No common fields found.")

if __name__ == "__main__":
    folder_path = "/Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata"
    process_json_files(folder_path)

Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/apache-maven-28601087481_metadata.json: bad escape (end of pattern) at position 106
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/broadinstitute-picard-14294971046_metadata.json: bad escape (end of pattern) at position 63
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/apache-maven-24423934371_metadata.json: bad escape (end of pattern) at position 106
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/python-mypy-26575513353_metadata.json: bad escape (end of pattern) at position 72
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/apache-maven-12749443626_metadata.json: bad escape (end of pattern) at position 106
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/broadinstitute-picard-14294971349_metadata.jso

In [31]:
import os
import json
import re

class JSONParseError(Exception):
    def __init__(self, original_error, cleaned_error, text):
        self.original_error = original_error
        self.cleaned_error = cleaned_error
        self.text = text
        super().__init__(f"{original_error} | {cleaned_error}")

def fix_invalid_escapes(text):
    return re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', text)

def escape_control_chars_in_strings(text):
    def replace_control(match):
        s = match.group(0)
        inner = s[1:-1]
        inner = inner.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
        return f'"{inner}"'
    return re.sub(r'"(?:\\.|[^"\\])*"', replace_control, text, flags=re.DOTALL)

def fix_multiline_strings(text):
    lines = text.splitlines()
    fixed_lines = []
    in_multiline = False
    multiline_accum = ""
    for line in lines:
        if not in_multiline:
            m = re.search(r'("run":\s*")([^"]*)$', line)
            if m and not line.rstrip().endswith('"'):
                in_multiline = True
                multiline_accum = m.group(2)
                prefix = line[:m.start(2)]
                fixed_lines.append(prefix)
            else:
                fixed_lines.append(line)
        else:
            if '"' in line:
                idx = line.find('"')
                multiline_accum += "\n" + line[:idx]
                fixed_string = multiline_accum.replace("\n", "\\n")
                fixed_lines[-1] = re.sub(r'("run":\s*")[^"]*$', r'\1' + fixed_string, fixed_lines[-1])
                fixed_lines.append(line[idx:])
                in_multiline = False
                multiline_accum = ""
            else:
                multiline_accum += "\n" + line
    return "\n".join(fixed_lines)

def clean_json_text(text):
    text = fix_multiline_strings(text)
    text = fix_invalid_escapes(text)
    text = escape_control_chars_in_strings(text)
    return text

def robust_json_loads(text):
    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        original_error = e
        cleaned_text = clean_json_text(text)
        try:
            return json.loads(cleaned_text)
        except json.JSONDecodeError as e2:
            raise JSONParseError(original_error, e2, text)

def print_problematic_lines(text, lineno, colno, context=1):
    lines = text.splitlines()
    start = max(0, lineno - 1 - context)
    end = min(len(lines), lineno + context)
    print("Problematic lines:")
    for i in range(start, end):
        marker = " <-- error" if i == lineno - 1 else ""
        print(f"{i+1}: {lines[i]}{marker}")

def process_json_files(folder_path):
    json_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.json')]
    error_count = 0

    for file_path in json_files:
        try:
            with open(file_path, 'r', errors='replace') as f:
                text = f.read()
            _ = robust_json_loads(text)
        except JSONParseError as err:
            error_count += 1
            print(f"\nProblematic file: {file_path}")
            print("Initial JSONDecodeError:", err.original_error)
            print("JSONDecodeError after cleaning:", err.cleaned_error)
            if hasattr(err.original_error, 'lineno') and hasattr(err.original_error, 'colno'):
                print_problematic_lines(err.text, err.original_error.lineno, err.original_error.colno)
        except Exception as ex:
            print(f"Unexpected error processing {file_path}: {ex}")

    print(f"\nTotal problematic files: {error_count} out of {len(json_files)}")

if __name__ == "__main__":
    folder_path = "/Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata"
    process_json_files(folder_path)

Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/apache-maven-28601087481_metadata.json: bad escape (end of pattern) at position 106
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/broadinstitute-picard-14294971046_metadata.json: bad escape (end of pattern) at position 63
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/apache-maven-24423934371_metadata.json: bad escape (end of pattern) at position 106
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/python-mypy-26575513353_metadata.json: bad escape (end of pattern) at position 72
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/apache-maven-12749443626_metadata.json: bad escape (end of pattern) at position 106
Unexpected error processing /Users/harshil/Developer/GitHub_Repos/FailFix/Data/metadata/broadinstitute-picard-14294971349_metadata.jso

In [29]:
def fix_multiline_strings(text):
    """
    Detect and fix multi-line string literals for keys like "run" that
    are broken over multiple lines without a proper closing quote.
    This function looks for lines that contain '"run": "' and then
    accumulates subsequent lines until a closing quote is found.
    It then replaces actual newlines with '\\n' and reassembles the line.
    """
    lines = text.splitlines()
    fixed_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]
        # Check if this line contains the "run" key and an opening quote
        if '"run": "' in line and not line.strip().endswith('"'):
            # Split the line at the "run" key
            prefix, remainder = line.split('"run": "', 1)
            multiline_value = remainder  # Start accumulating the value
            # Advance to the next line
            i += 1
            # Accumulate lines until we find one with a closing quote
            while i < len(lines) and '"' not in lines[i]:
                multiline_value += "\n" + lines[i]
                i += 1
            if i < len(lines):
                # Found a line with a closing quote; split at the first quote.
                closing_line = lines[i]
                idx = closing_line.find('"')
                multiline_value += "\n" + closing_line[:idx]
                fixed_value = multiline_value.replace("\n", "\\n")
                # Reassemble the fixed "run" field
                fixed_line = prefix + '"run": "' + fixed_value + '"' + closing_line[idx+1:]
                fixed_lines.append(fixed_line)
                i += 1
            else:
                # Reached end of file without a closing quote; force one.
                fixed_value = multiline_value.replace("\n", "\\n")
                fixed_lines.append(prefix + '"run": "' + fixed_value + '"')
        else:
            fixed_lines.append(line)
            i += 1
    return "\n".join(fixed_lines)