## Data Extraction and Imputation

In [None]:
# Replace this with the path to the directory where your HTML files are stored
extract_path = 'path/to/your/dataset'

chat_texts = {}        # filename -> plain text of each successfully parsed file
malformed_files = []   # filenames rejected as too small, too short, or unreadable
file_sizes = []        # on-disk size (bytes) of every .html file encountered
valid_file_sizes = {}  # filename -> size, only for files stored in chat_texts

for filename in os.listdir(extract_path):
    if not filename.endswith('.html'):
        continue

    file_path = os.path.join(extract_path, filename)
    file_size = os.path.getsize(file_path)
    file_sizes.append(file_size)

    try:
        # Files of 10000 bytes or less are treated as empty and rejected
        if file_size <= 10000:
            raise ValueError("Empty file")

        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')
        chat_text = soup.get_text().strip()

        if len(chat_text) < 200:
            raise ValueError("Insufficient content in file")

        chat_texts[filename] = chat_text
        # Keep the size keyed by filename so names and sizes can never get
        # out of step (file_sizes also contains the rejected files).
        valid_file_sizes[filename] = file_size

    except Exception as e:
        # Best-effort per-file handling: report the problem and impute later
        print(f"Error processing {filename}: {e}")
        malformed_files.append(filename)

if not valid_file_sizes:
    raise ValueError(
        f"No well-formed .html files found in {extract_path!r}; "
        "there is no file to impute from"
    )

# Find the mode of file sizes among the well-formed files only, so that a
# donor file with that size is guaranteed to exist.
mode_file_size = Counter(valid_file_sizes.values()).most_common(1)[0][0]

# Select a well-formed file with the mode file size for imputation
mode_file_path = next(
    name for name, size in valid_file_sizes.items() if size == mode_file_size
)
# The donor's text was already extracted above; reuse it instead of re-parsing
mode_file_content = chat_texts[mode_file_path]

# Data Imputation for Malformed Files
for malformed_file in malformed_files:
    chat_texts[malformed_file] = mode_file_content

# Save chat_texts and malformed_files to JSON files
# Change the paths to a general directory or instruct users to specify their own
os.makedirs('output', exist_ok=True)  # open(..., 'w') does not create directories
chat_texts_json_path = os.path.join('output', 'chat_texts.json')
malformed_files_json_path = os.path.join('output', 'malformed_files.json')

with open(chat_texts_json_path, 'w', encoding='utf-8') as file:
    json.dump(chat_texts, file, ensure_ascii=False, indent=4)

with open(malformed_files_json_path, 'w', encoding='utf-8') as file:
    json.dump(malformed_files, file, ensure_ascii=False, indent=4)

Error processing b0640e51-6879-40cb-a4f5-329f952ef99d.html: Empty file
Error processing 139235c7-736c-4237-92f0-92e8c116832c.html: Empty file
Error processing 668ad17e-0240-49f7-b5a7-d22e502554c6.html: Empty file
Error processing da6b70d5-29f6-491a-ad46-037c77067128.html: Empty file


In [None]:
# Function to extract prompt/answers pairs from the whole text for each student
def extract_prompt_response(chat_text):
    """Split a flattened chat transcript into (prompt, response) pairs.

    The text is scanned for literal marker strings: the first prompt begins
    right after 'TITLEAnonymous', every later prompt right after 'Anonymous',
    and every response right after 'ChatGPTChatGPT'. A response runs until
    the next 'Anonymous' marker, or to the end of the text when there is none.

    Args:
        chat_text: The transcript as a single string.

    Returns:
        A list of ``(prompt, response)`` tuples in order of appearance, with
        surrounding whitespace stripped. The list is empty when the opening
        'TITLEAnonymous' marker or the first response marker is missing. A
        trailing prompt that has no response marker after it is not included.
    """
    response_marker = 'ChatGPTChatGPT'
    prompts = []
    responses = []
    current_index = 0
    first_question = True

    while True:
        # Only the very first prompt carries the 'TITLE' prefix
        prompt_marker = 'TITLEAnonymous' if first_question else 'Anonymous'
        first_question = False

        prompt_index = chat_text.find(prompt_marker, current_index)
        if prompt_index == -1:
            break
        # Measure from where the marker was actually found, so any text
        # preceding the first marker does not leak into the prompt.
        prompt_start = prompt_index + len(prompt_marker)

        # The response belonging to this prompt must come after the prompt
        response_index = chat_text.find(response_marker, prompt_start)
        if response_index == -1:
            break
        response_start = response_index + len(response_marker)

        next_prompt_index = chat_text.find('Anonymous', response_start)
        # find() returns -1 when there is no further prompt; using -1 as a
        # slice end would silently drop the last character of the response.
        if next_prompt_index == -1:
            response_end = len(chat_text)
        else:
            response_end = next_prompt_index

        prompts.append(chat_text[prompt_start:response_index].strip())
        responses.append(chat_text[response_start:response_end].strip())

        if next_prompt_index == -1:
            break
        current_index = next_prompt_index

    return list(zip(prompts, responses))

# Location of the JSON file that holds the extracted chat texts
json_file_path = 'path_to_your_chat_texts.json'  # Replace 'path_to_your_chat_texts.json' with your file path

# Read the whole file and decode it into the chat_texts mapping
with open(json_file_path, encoding='utf-8') as file:
    chat_texts = json.loads(file.read())

def word_count(text):
    """Return how many whitespace-separated words *text* contains."""
    words = text.split()
    return len(words)

# For every chat text, extract its prompt-response pairs and tally the words
# across all prompts and responses of that file.
prompt_answer_pairs = {}
for filename, chat_text in chat_texts.items():
    pairs = extract_prompt_response(chat_text)

    total_words = 0
    for prompt, response in pairs:
        total_words += word_count(prompt) + word_count(response)

    prompt_answer_pairs[filename] = {
        "pairs": pairs,
        "scores": {},  # left empty here
        "number_of_words": total_words,
    }

# NOTE(review): this writes to the same path the chat texts were loaded from,
# replacing the raw texts with the pair structure — confirm this is intended.
with open(json_file_path, 'w', encoding='utf-8') as file:
    file.write(json.dumps(prompt_answer_pairs, ensure_ascii=False, indent=4))

In [None]:
# Location of the Assignment Notebook file (`.ipynb`)
ipynb_file_path = 'path_to_your_Assignment.ipynb'  # Replace 'path_to_your_Assignment.ipynb' with your file path

# An .ipynb file is JSON, so it can be decoded directly
with open(ipynb_file_path, encoding='utf-8') as file:
    notebook_data = json.loads(file.read())

# The JSON copy is written next to the notebook itself
save_directory = os.path.split(ipynb_file_path)[0]
json_file_path = os.path.join(save_directory, 'notebook_data.json')

# Write the notebook content back out as indented JSON
with open(json_file_path, 'w', encoding='utf-8') as file:
    file.write(json.dumps(notebook_data, indent=4))

In [None]:
# Function to extract questions from the text extracted from the assignment.ipynb file
def extract_questions(notebook_data):
    """Group notebook cells by the question they belong to.

    A markdown cell whose joined source matches ``## <digits>)`` opens a new
    question named ``"Question <digits>"``. That cell and every following
    cell (markdown or otherwise) are collected under it until the next such
    header. Cells that appear before the first header are ignored.

    Args:
        notebook_data: Mapping with a ``"cells"`` list; each cell provides
            ``"cell_type"`` and, for markdown cells, ``"source"``.

    Returns:
        Dict mapping each question name to the list of its cells.
    """
    header_re = re.compile(r"## (\d+)\)")

    question_data = {}
    active_name = None
    collected = []

    for cell in notebook_data["cells"]:
        # Only markdown cells can act as question headers
        header = None
        if cell["cell_type"] == "markdown":
            header = header_re.search(''.join(cell["source"]))

        if header:
            # Close out the question gathered so far before starting anew
            if active_name is not None:
                question_data[active_name] = collected
                collected = []
            active_name = f"Question {header.group(1)}"

        # Every cell seen once a question is open belongs to that question
        if active_name is not None:
            collected.append(cell)

    # Store the final question, which no later header closed out
    if active_name is not None:
        question_data[active_name] = collected

    return question_data

# Location of the JSON file holding the assignment notebook data
ipynb_file_path = 'path_to_your_notebook_data.json'  # Replace 'path_to_your_notebook_data.json' with your file path

# Decode the stored notebook data
with open(ipynb_file_path, encoding='utf-8') as file:
    notebook_data = json.loads(file.read())

# Group the notebook cells by question
questions = extract_questions(notebook_data)

# The questions file is written alongside the notebook data file
questions_json_path = os.path.join(os.path.split(ipynb_file_path)[0], 'questions_data.json')

# Write the grouped questions out as indented JSON
with open(questions_json_path, 'w', encoding='utf-8') as file:
    file.write(json.dumps(questions, indent=4))