In [4]:
import os
import random
import re
import json

In [1]:
def remove_semicolons(code):
    """Randomly drop statement-terminating semicolons from ``code``.

    The text is processed line by line (split on ``'\\n'``). Every line whose
    last non-whitespace character is ``';'`` loses all of its trailing
    semicolons with probability 0.8; other lines are left untouched.

    Trailing whitespace after the semicolon (spaces, tabs, or the ``'\\r'`` of
    a CRLF line ending) is preserved, so only the semicolons themselves are
    removed.

    Args:
        code: Source text to corrupt.

    Returns:
        A ``(new_code, modified)`` tuple. ``modified`` is True only if at
        least one semicolon was actually removed.
    """
    new_lines = []
    modified = False
    for line in code.split('\n'):
        # Look past trailing whitespace: a plain line.rstrip(';') would be a
        # no-op on "stmt;  " or "stmt;\r" and falsely report a modification.
        body = line.rstrip()
        if body.endswith(';') and random.random() < 0.8:
            trailing_ws = line[len(body):]
            line = body.rstrip(';') + trailing_ws
            modified = True
        new_lines.append(line)
    return '\n'.join(new_lines), modified

def remove_braces(code):
    """Delete one randomly chosen curly brace from ``code``.

    Args:
        code: Source text to corrupt.

    Returns:
        A ``(new_code, modified)`` tuple. When ``code`` contains no ``'{'``
        or ``'}'`` the text is returned unchanged with ``modified`` False.
    """
    # Candidates are listed as all '{' positions followed by all '}' positions;
    # this ordering determines which brace a given random.choice draw selects.
    opening = [idx for idx, ch in enumerate(code) if ch == '{']
    closing = [idx for idx, ch in enumerate(code) if ch == '}']
    candidates = opening + closing
    if not candidates:
        return code, False
    victim = random.choice(candidates)
    return code[:victim] + code[victim + 1:], True

def modify_for_loops(code):
    """Strip the two separating semicolons out of every ``for (...)`` header.

    A header of the form ``for (init; cond; step)`` is rewritten to
    ``for (init cond step)``, with the three clauses joined by single spaces.

    Args:
        code: Source text to corrupt.

    Returns:
        A ``(new_code, modified)`` tuple; ``modified`` is True when the
        rewritten text differs from the input.
    """
    for_header = re.compile(r'for\s*\(\s*([^;]*)\s*;\s*([^;]*)\s*;\s*([^)]*)\)')
    rewritten = for_header.sub(r'for (\1 \2 \3)', code)
    return rewritten, rewritten != code

def remove_parentheses(code):
    """Delete one randomly chosen round bracket from ``code``.

    Args:
        code: Source text to corrupt.

    Returns:
        A ``(new_code, modified)`` tuple. When ``code`` contains no ``'('``
        or ``')'`` the text is returned unchanged with ``modified`` False.
    """
    opens, closes = [], []
    for offset, ch in enumerate(code):
        if ch == '(':
            opens.append(offset)
        elif ch == ')':
            closes.append(offset)
    # All '(' offsets come first, then all ')' offsets; the order decides
    # which bracket a given random.choice draw removes.
    positions = opens + closes
    if positions:
        drop_at = random.choice(positions)
        return code[:drop_at] + code[drop_at + 1:], True
    return code, False

def introduce_keyword_typos(code):
    """Misspell a random subset of the C keywords that occur in ``code``.

    Keywords are matched as whole words. Half of the distinct keywords found
    (rounded down, but at least one) are sampled, and every occurrence of each
    sampled keyword gets a ``'t'`` appended (e.g. ``int`` -> ``intt``).

    Args:
        code: Source text to corrupt.

    Returns:
        A ``(new_code, modified)`` tuple. If no keyword from the list occurs
        in ``code`` the text is returned unchanged with ``modified`` False.
    """
    c_keywords = ['int', 'float', 'double', 'char', 'void', 'if', 'else', 'for', 'while', 'do',
                  'switch', 'case', 'break', 'continue', 'return']
    # Whole-word pattern per keyword, so 'int' does not match inside 'printf'.
    word_patterns = {kw: r'\b' + re.escape(kw) + r'\b' for kw in c_keywords}

    present = [kw for kw in c_keywords if re.search(word_patterns[kw], code)]
    if not present:
        return code, False

    chosen = random.sample(present, max(1, len(present) // 2))
    modified = False
    for kw in chosen:
        code, hits = re.subn(word_patterns[kw], kw + 't', code)
        modified = modified or hits > 0
    return code, modified

In [2]:
# Pool of corruption functions. Each one takes C source text and returns a
# (code, modified) tuple, where `modified` reports whether an error was injected.
error_functions = [
    remove_semicolons,
    remove_braces,
    modify_for_loops,
    remove_parentheses,
    introduce_keyword_typos
]

In [5]:
# Folder holding the correct C programs (machine-specific absolute path).
directory = '/home/nistha/jupyter-env/compilerdesign/C_codes/'

# Upper bound on erroneous variants kept per file, and on generation attempts
# spent trying to reach that bound.
max_per_file = 10
max_attempts_per_file = 2 * max_per_file

# Names of the .c files in the folder, in sorted order.
c_files = sorted(name for name in os.listdir(directory) if name.endswith('.c'))

# Accumulates {"input": erroneous_code, "output": correct_code} pairs.
dataset = []

In [12]:
# Seed the RNG so that Restart & Run All regenerates the same dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Start from an empty list so that re-running this cell rebuilds the dataset
# instead of appending a second copy of every file's pairs.
dataset = []

for c_file in c_files:
    file_path = os.path.join(directory, c_file)
    with open(file_path, 'r') as f:
        correct_code = f.read()

    # Distinct corrupted variants of this file, capped at max_per_file.
    erroneous_versions = []
    attempts = 0
    # The attempt cap stops the loop for files where too few corruptions can
    # take effect (e.g. sources with no braces, loops or semicolons).
    while len(erroneous_versions) < max_per_file and attempts < max_attempts_per_file:
        attempts += 1
        # Choose 2-3 error functions to ensure multiple errors
        num_errors = random.randint(2, 3)
        selected_errors = random.sample(error_functions, num_errors)
        temp_code = correct_code
        modified_count = 0
        # Apply the selected functions in sequence; each one operates on the
        # output of the previous one that reported a modification.
        for func in selected_errors:
            new_code, modified = func(temp_code)
            if modified:
                temp_code = new_code
                modified_count += 1
        # Keep the variant only if at least two functions reported a change,
        # the text differs from the original, and it is not a duplicate.
        if modified_count >= 2 and temp_code != correct_code and temp_code not in erroneous_versions:
            erroneous_versions.append(temp_code)

    # Add (erroneous, correct) pairs to the dataset
    for err_code in erroneous_versions:
        dataset.append({"input": err_code, "output": correct_code})

In [13]:
# Serialize all generated pairs to a single JSON file in the working directory.
with open('c_syntax_dataset.json', 'w') as f:
    f.write(json.dumps(dataset))

In [15]:
import json

# Read back the generated pairs
with open('c_syntax_dataset.json', 'r') as f:
    dataset = json.load(f)


def _differs(pair):
    """Return True when a pair's input and output differ after stripping whitespace."""
    return pair["input"].strip() != pair["output"].strip()


# Keep only pairs whose erroneous and correct code are not identical
filtered_dataset = list(filter(_differs, dataset))

# Write the surviving pairs to a separate file
with open('c_syntax_dataset_filtered.json', 'w') as f:
    json.dump(filtered_dataset, f)

# Report sizes and show a short preview for manual verification
print(f"Original dataset size: {len(dataset)}")
print(f"Filtered dataset size: {len(filtered_dataset)}")
print(f"Removed {len(dataset) - len(filtered_dataset)} pairs where input and output were identical.")
print("\nFirst 3 pairs from filtered dataset:")
for item in filtered_dataset[:3]:
    # Each side is previewed as its heading followed by the first 100 characters.
    for heading, key in (("Erroneous Code (Input):", "input"),
                         ("Correct Code (Output):", "output")):
        print(heading)
        print(item[key][:100] + "...")
    print("---")

Original dataset size: 1010
Filtered dataset size: 1010
Removed 0 pairs where input and output were identical.

First 3 pairs from filtered dataset:
Erroneous Code (Input):
#include <stdio.h>
intt main) {
    printf("Hello World");
    return 0;
}...
Correct Code (Output):
#include <stdio.h>
int main() {
    printf("Hello World");
    return 0;
}...
---
Erroneous Code (Input):
#include <stdio.h>
int main( 
    printf("Hello World");
    return 0;
}...
Correct Code (Output):
#include <stdio.h>
int main() {
    printf("Hello World");
    return 0;
}...
---
Erroneous Code (Input):
#include <stdio.h>
intt main( {
    printf("Hello World");
    return 0;
...
Correct Code (Output):
#include <stdio.h>
int main() {
    printf("Hello World");
    return 0;
}...
---
