# MC Code Dataset Clean


In [6]:
import pandas as pd

# Load the workbook to clean; the path is relative to the current working directory.
# NOTE(review): read_excel is called without sheet_name, so by default only the
# first sheet is loaded - confirm the workbook is single-sheet.
file_path = 'docs/WMX3API c++ to python list -all2.xlsx'
df = pd.read_excel(file_path)

# Function to remove trailing semicolon or semicolon followed by a space
def remove_trailing_semicolon(s):
    """Drop one trailing semicolon, together with any whitespace that follows it.

    Args:
        s: A cell value. Only ``str`` values are processed.

    Returns:
        For a string whose last non-whitespace character is ``;``: the string
        with that semicolon and the trailing whitespace removed. Every other
        value (strings without a trailing semicolon, non-strings) is returned
        unchanged.
    """
    if isinstance(s, str):
        # rstrip() also removes tabs, newlines and non-breaking spaces, so
        # "f();", "f(); " and "f();  " are all handled by the same branch.
        trimmed = s.rstrip()
        if trimmed.endswith(';'):
            return trimmed[:-1]
    return s

# Run remove_trailing_semicolon over every value of the 'FunctionC++' column
df['FunctionC++'] = df['FunctionC++'].apply(remove_trailing_semicolon)

# Write the frame back to the same path it was read from, without the row index.
# NOTE(review): this overwrites the input workbook in place; anything that was
# not loaded into df is not written back - keep a backup if that matters.
df.to_excel(file_path, index=False)

print("File has been updated and saved successfully.")


File has been updated and saved successfully.


In [1]:
import pandas as pd

# Load the workbook to clean; the path is relative to the current working directory.
# NOTE(review): read_excel is called without sheet_name, so by default only the
# first sheet is loaded - confirm the workbook is single-sheet.
file_path = 'docs/table-data(Api)2.xlsx'
df = pd.read_excel(file_path)

# Function to remove blank spaces in front of "("
def remove_blank_space(s):
    """Replace every " (" in ``s`` with "(".

    Exactly one space is removed per occurrence. Values that are not ``str``
    are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    return s.replace(" (", "(")

# Run remove_blank_space over every value of the 'APINAME' column
df['APINAME'] = df['APINAME'].apply(remove_blank_space)

# Write the frame back to the same path it was read from, without the row index.
# NOTE(review): this overwrites the input workbook in place; anything that was
# not loaded into df is not written back - keep a backup if that matters.
df.to_excel(file_path, index=False)

print("File has been updated and saved successfully.")

File has been updated and saved successfully.


In [4]:
import pandas as pd

# Load the workbook to clean; the path is relative to the current working directory.
# NOTE(review): read_excel is called without sheet_name, so by default only the
# first sheet is loaded - confirm the workbook is single-sheet.
file_path = 'docs/WMX3API c++ to python list -all2.xlsx'
df = pd.read_excel(file_path)

# Function to adjust spaces around '*'
def adjust_asterisk_spacing(s):
    """Normalise the spacing around '*' and '=' in a string.

    Three literal substitutions are applied, in this order:
      1. '* '  -> ' *'  (move the space from after the asterisk to before it)
      2. '  *' -> ' *'  (drop one of two spaces in front of an asterisk)
      3. ' = ' -> '='   (remove the spaces surrounding an equals sign)

    Each substitution is a single left-to-right pass, so a run of three or
    more spaces in front of '*' is shortened by one space rather than fully
    collapsed. Values that are not ``str`` are returned unchanged.
    """
    if not isinstance(s, str):
        return s

    substitutions = (
        ('* ', ' *'),
        ('  *', ' *'),
        (' = ', '='),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    return s

# Run adjust_asterisk_spacing over every value of the 'FunctionC++' column
df['FunctionC++'] = df['FunctionC++'].apply(adjust_asterisk_spacing)

# Write the frame back to the same path it was read from, without the row index.
# NOTE(review): this overwrites the input workbook in place; anything that was
# not loaded into df is not written back - keep a backup if that matters.
df.to_excel(file_path, index=False)

print("File has been updated and saved successfully.")

File has been updated and saved successfully.


## Statistics of sample codes

In [1]:
import os
import tiktoken

# Directory that is scanned for '<task_id>_MCEval.py' files, plus the running totals.
# NOTE(review): absolute, machine-specific path - the scan only finds files on a
# machine with this directory layout.
folder_canonical = '/Users/yin/Documents/GitHub/MCCoder/MCEval_Files/Sample codes'
total_tokens = 0  # sum of token counts over the files that were found
file_count = 0  # number of files that were found

# Define a function to read and tokenize a file
def get_token_length(file_path):
    """Count the tokens in a text file.

    The file is read as UTF-8 and encoded with tiktoken's ``cl100k_base``
    encoding; the number of resulting tokens is returned.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(text))

# Walk task ids 1..98 and tally the token count of every sample file that exists
for task_id in range(1, 99):
    filename = f'{task_id}_MCEval.py'
    file_path = os.path.join(folder_canonical, filename)

    # Missing files are reported and left out of the totals
    if not os.path.exists(file_path):
        print(f'File not found: {filename}')
        continue

    tokens_length = get_token_length(file_path)
    total_tokens += tokens_length
    file_count += 1
    print(f'File: {filename}, Tokens: {tokens_length}')

# Report the mean token count over the files that were found
if file_count == 0:
    print('No files found.')
else:
    average_tokens = total_tokens / file_count
    print(f'Average tokens length of all files: {average_tokens:.2f}')



File: 1_MCEval.py, Tokens: 225
File: 2_MCEval.py, Tokens: 250
File: 3_MCEval.py, Tokens: 401
File: 4_MCEval.py, Tokens: 433
File: 5_MCEval.py, Tokens: 703
File: 6_MCEval.py, Tokens: 217
File: 7_MCEval.py, Tokens: 365
File: 8_MCEval.py, Tokens: 368
File: 9_MCEval.py, Tokens: 517
File: 10_MCEval.py, Tokens: 437
File: 11_MCEval.py, Tokens: 927
File: 12_MCEval.py, Tokens: 741
File: 13_MCEval.py, Tokens: 1220
File: 14_MCEval.py, Tokens: 412
File: 15_MCEval.py, Tokens: 425
File: 16_MCEval.py, Tokens: 530
File: 17_MCEval.py, Tokens: 41
File: 18_MCEval.py, Tokens: 520
File: 19_MCEval.py, Tokens: 482
File: 20_MCEval.py, Tokens: 380
File: 21_MCEval.py, Tokens: 368
File: 22_MCEval.py, Tokens: 441
File: 23_MCEval.py, Tokens: 504
File: 24_MCEval.py, Tokens: 505
File: 25_MCEval.py, Tokens: 539
File: 26_MCEval.py, Tokens: 1055
File: 27_MCEval.py, Tokens: 474
File: 28_MCEval.py, Tokens: 788
File: 29_MCEval.py, Tokens: 874
File: 30_MCEval.py, Tokens: 904
File: 31_MCEval.py, Tokens: 431
File: 32_MCEval.

## Statistics of canonical codes

In [14]:
import json
import os
import math

# Input locations. Despite its name, folder_dataset is the path of the dataset
# JSON file itself; folder_canonical is the directory holding
# '<TaskId>_CanonicalCode.py' files.
folder_dataset = '/Users/yin/Documents/GitHub/MCCoder/docs/WMX3API_MCEval_Evaluation_Dataset.json'
folder_canonical = '/Users/yin/Documents/GitHub/MCCodeLog/CanonicalCode'

# Read as UTF-8 explicitly instead of relying on the platform default encoding
with open(folder_dataset, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Accumulators: lengths are measured in characters (len() of the text)
total_instruction_length = 0
instruction_count = 0
task_difficulty_lengths = {1: [], 2: [], 3: []}
canonical_code_lengths = {1: [], 2: [], 3: []}
task_canonical_lengths = {}

# Iterate through each item in the dataset
for item in dataset:
    task_id = item['TaskId']
    instruction = item['Instruction']
    difficulty = item['Difficulty']

    # Skip items without a difficulty rating. json.load yields None for a JSON
    # `null` and float('nan') for a bare NaN token; math.isnan(None) would
    # raise TypeError, so None is checked first.
    if difficulty is None or math.isnan(difficulty):
        continue

    difficulty = int(difficulty)  # Convert difficulty to an integer

    # Calculate the total instruction length and count
    instruction_length = len(str(instruction))
    total_instruction_length += instruction_length
    instruction_count += 1

    # Append the instruction length to the respective difficulty level list
    task_difficulty_lengths[difficulty].append(instruction_length)

    # Read the corresponding canonical code file; tasks without one still
    # count towards the instruction statistics but not the code statistics
    canonical_code_file = os.path.join(folder_canonical, f'{task_id}_CanonicalCode.py')
    if os.path.exists(canonical_code_file):
        with open(canonical_code_file, 'r', encoding='utf-8') as code_file:
            canonical_code = code_file.read()
            canonical_code_length = len(canonical_code)
            task_canonical_lengths[task_id] = canonical_code_length

            # Append the canonical code length to the respective difficulty level list
            canonical_code_lengths[difficulty].append(canonical_code_length)

# Calculate average instruction length, guarding against an empty (or fully
# skipped) dataset in the same way as the other averages below
if instruction_count > 0:
    average_instruction_length = total_instruction_length / instruction_count
else:
    average_instruction_length = 0

# Calculate average lengths by difficulty, avoiding division by zero
average_instruction_lengths_by_difficulty = {difficulty: (sum(lengths) / len(lengths)) if len(lengths) > 0 else 0
                                             for difficulty, lengths in task_difficulty_lengths.items()}

average_canonical_code_lengths_by_difficulty = {difficulty: (sum(lengths) / len(lengths)) if len(lengths) > 0 else 0
                                                for difficulty, lengths in canonical_code_lengths.items()}

# Calculate overall average canonical code length
if len(task_canonical_lengths) > 0:
    overall_average_canonical_code_length = sum(task_canonical_lengths.values()) / len(task_canonical_lengths)
else:
    overall_average_canonical_code_length = 0

# Output the results
print(f'Average Instruction Length: {average_instruction_length}')
print(f'Average Instruction Length by Difficulty: {average_instruction_lengths_by_difficulty}')
print(f'Overall Average Canonical Code Length: {overall_average_canonical_code_length}')
print(f'Average Canonical Code Length by Difficulty: {average_canonical_code_lengths_by_difficulty}')

# Print average instruction lengths by difficulty level
for difficulty, avg_length in average_instruction_lengths_by_difficulty.items():
    print(f'Average Instruction Length for Difficulty {difficulty}: {avg_length}')


Average Instruction Length: 272.88793103448273
Average Instruction Length by Difficulty: {1: 266.655737704918, 2: 189.25, 3: 405.7826086956522}
Overall Average Canonical Code Length: 2145.543103448276
Average Canonical Code Length by Difficulty: {1: 2268.0, 2: 1351.09375, 3: 2926.086956521739}
Average Instruction Length for Difficulty 1: 266.655737704918
Average Instruction Length for Difficulty 2: 189.25
Average Instruction Length for Difficulty 3: 405.7826086956522
