In [1]:
import pandas as pd
import numpy as np
import os
import glob

## Processing the code solutions

In [2]:
def process_solutions(sources, solution_path="raw_code_solutions", min_attempt_length=250):
    """Collect the Java files of every solution and return them as DataFrames.

    For each entry of ``sources`` the folder ``<solution_path>/<source>_solutions``
    is scanned. Every sub-folder is one solution; its name is split on ``_`` and
    interpreted as ``<prefix>_<name>[_<style>[_<version>]]``. All ``*.java`` files
    below ``src/main/java/thkoeln/st/st2praktikum/exercise`` (including
    sub-directories) are read as UTF-8 text.

    Parameters
    ----------
    sources : iterable of str
        Source identifiers; each one maps to the folder ``<source>_solutions``.
    solution_path : str, default "raw_code_solutions"
        Root folder that contains the ``<source>_solutions`` folders.
    min_attempt_length : int, default 250
        A student whose ``Exercise0.java`` has fewer characters than this is
        treated as not having attempted the assignment and is dropped.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * one row per Java file with the columns ``name``, ``file_name``
          (path relative to the exercise package folder), ``source``, ``style``,
          ``version`` and ``code``;
        * one row per (name, source, style, version) with the code of all its
          files joined by newlines in the column ``code``.

    Raises
    ------
    FileNotFoundError
        If a ``<source>_solutions`` folder does not exist.
    ValueError
        If a solution folder that contains Java files has no ``_`` in its name,
        so no solution name can be extracted.
    """
    attributes = ["name", "source", "style", "version"]
    columns = ["name", "file_name", "source", "style", "version", "code"]
    # location of the exercise package inside every solution project
    package_parts = ("src", "main", "java", "thkoeln", "st", "st2praktikum", "exercise")

    records = []
    for source in sources:
        source_dir = os.path.join(solution_path, f"{source}_solutions")

        # sorted: os.listdir order is arbitrary, and it determines the row order
        for full_name in sorted(os.listdir(source_dir)):
            # find all java files, including those in sub-directories;
            # sorted so the concatenation order below is reproducible
            exercise_dir = os.path.join(source_dir, full_name, *package_parts)
            java_files = sorted(glob.glob(os.path.join(exercise_dir, "**", "*.java"), recursive=True))
            if not java_files:
                # stray files or projects without code contribute no rows
                continue

            # extract the attributes of the solution from its folder name
            name_parts = full_name.split("_")
            if len(name_parts) < 2:
                raise ValueError(
                    f"Cannot extract a solution name from folder {full_name!r} in {source_dir!r}; "
                    "expected '<prefix>_<name>[_<style>[_<version>]]'"
                )
            name = name_parts[1]
            style = name_parts[2] if len(name_parts) > 2 else np.nan
            version = name_parts[3] if len(name_parts) > 3 else np.nan

            # store every java file as one document
            for java_file in java_files:
                with open(java_file, "r", encoding="utf-8") as open_java_file:
                    code_text = open_java_file.read()
                records.append({"name": name,
                                "file_name": os.path.relpath(java_file, exercise_dir),
                                "source": source,
                                "style": style,
                                "version": version,
                                "code": code_text})

    # create a single dataframe; explicit columns keep the schema when nothing was found
    data = pd.DataFrame(records, columns=columns)
    if data.empty:
        return data, pd.DataFrame(columns=attributes + ["code"])

    # remove students that did not attempt the assignment
    not_attempted = ((data["source"] == "student")
                     & (data["file_name"] == "Exercise0.java")
                     & (data["code"].str.len() < min_attempt_length))
    missing_solutions = list(data.loc[not_attempted, "name"])
    # NOTE(review): the filter matches on name only, so rows of any other source
    # that share a name with a dropped student are removed too - confirm that
    # names never collide across sources.
    data = data[~data["name"].isin(missing_solutions)]

    # concatenate the code files of each solution (NaN style/version are kept as keys)
    concatenated_data = (
        data.groupby(attributes, dropna=False)["code"]
        .apply("\n".join)
        .reset_index()
    )

    return data, concatenated_data

## Storing the code as csv files

Create a dataset containing, for each solution, its source (e.g. student or a specific AI), its style, its version (relevant only for Bing Chat solutions) and its code:

In [3]:
# Build the per-file and the per-solution (concatenated) datasets for all sources.
files_code_data, all_code_data = process_solutions(sources=["gpt3.5", "bing", "bard", "gpt4", "student"])

# Count how often each source / style / version value occurs among the solutions.
# `to_frame("counts")` names the column explicitly: the name value_counts() gives
# its result differs between pandas versions (unnamed before 2.0, "count" since),
# so renaming column 0 is not reliable.
(
    all_code_data[["source", "style", "version"]]
    .melt(var_name="column", value_name="value")
    .value_counts()
    .to_frame("counts")
    .sort_values(["column", "counts"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
source,bard,17
source,gpt3.5,19
source,bing,28
source,gpt4,31
source,student,104
style,cc,19
style,styled,27
style,plain,49
version,balanced,7
version,precise,9


In [4]:
# Persist both datasets as CSV. The output folder is created first because
# to_csv does not create missing directories and would fail on a fresh checkout.
os.makedirs("processed_code_solutions", exist_ok=True)
all_code_data.to_csv("processed_code_solutions/all_code_data.csv", index=False)
files_code_data.to_csv("processed_code_solutions/files_code_data.csv", index=False)