In [3]:
import pandas as pd
import json
import os
import glob

## Processing all of the ST2 Milestone Solutions

*Re-run this entire notebook to process any new solutions and write the results to the JSON files in `processed_solutions/`.*

## Storing the code

In [29]:
# Register every tested AI in this cell, together with the fields stored for it.
# The last field of every schema is always "code".
_ai_fields = ["source", "milestone", "name", "style", "code"]

sources = ["gpt3.5", "bing", "bard", "gpt4", "student"]
attributes = {
    "student": ["source", "milestone", "name", "code"],
    "gpt3.5": list(_ai_fields),
    # Bing solutions additionally record a version
    "bing": ["source", "milestone", "name", "style", "version", "code"],
    "bard": list(_ai_fields),
    "gpt4": list(_ai_fields),
}

In [30]:
# Test files excluded from the collected milestone-2 tests (paths are relative
# to the exercise test package and use the platform's path separator).
default_tests = ["E1MovementTests.java", "E1ValueObjectParsingTests.java",
                 "E1ValueObjectValidationTests.java", os.path.join("core", "MovementTests.java")]

# Package directory below src/main/java and src/test/java that holds the exercise code.
exercise_package = os.path.join("thkoeln", "st", "st2praktikum", "exercise")


def find_java_files(root):
    """Return sorted ``(relative_path, full_path)`` pairs for all .java files below ``root``.

    Sub-directories are searched recursively. ``relative_path`` is computed
    against ``root`` itself, so it is correct for any root directory.
    """
    pattern = os.path.join(root, "**", "*.java")
    # glob makes no ordering guarantee; sort so that re-runs yield the same order
    return [(os.path.relpath(found, root), found)
            for found in sorted(glob.glob(pattern, recursive=True))]


data = list()

for source in sources:
    solutions_dir = os.path.join("raw_solutions", f"{source}_solutions")

    # os.listdir returns entries in arbitrary order; sort for reproducible output
    for full_name in sorted(os.listdir(solutions_dir)):
        solution_dir = os.path.join(solutions_dir, full_name)
        if not os.path.isdir(solution_dir):
            # stray files next to the solution folders are not solutions
            continue

        # extract the attributes of the solution from its directory name
        name_parts = full_name.split("_")
        milestone = name_parts[0]
        extra_attributes = {}
        if source == "student":
            name = name_parts[1]
        else:
            name = name_parts[-2]
            extra_attributes["style"] = name_parts[-1]
            if source == "bing":
                extra_attributes["version"] = name_parts[1]

        # find all java files, including those in sub-directories
        java_files = find_java_files(
            os.path.join(solution_dir, "src", "main", "java", exercise_package))

        # include the test files, written as part of the assignment in milestone 2
        if milestone == "m2":
            test_root = os.path.join(solution_dir, "src", "test", "java", exercise_package)
            java_files += [(relative, found) for relative, found in find_java_files(test_root)
                           if relative not in default_tests]

        # store each java file as a document
        for file_name, java_file in java_files:
            with open(java_file, "r", encoding="utf-8") as open_java_file:
                code_text = open_java_file.read()
            # key order: source, milestone, name, file_name, [style, [version]], code
            data.append({"source": source,
                         "milestone": milestone,
                         "name": name,
                         "file_name": file_name,
                         **extra_attributes,
                         "code": code_text})

Removing the students who did not attempt the assignment (their milestone `m0` `Exercise0.java` contains fewer than 250 characters):

In [31]:
# Collect the names of students whose milestone-0 Exercise0.java is shorter
# than 250 characters.
no_solutions = []
for entry in data:
    is_student_m0 = entry["source"] == "student" and entry["milestone"] == "m0"
    if is_student_m0 and entry["file_name"] == "Exercise0.java" and len(entry["code"]) < 250:
        no_solutions.append(entry["name"])

In [32]:
# Drop every file belonging to a student listed in `no_solutions`.
# Only student entries are filtered, so an AI solution that happens to share a
# name with a flagged student is kept. A set gives constant-time lookups.
excluded_students = set(no_solutions)
data = [solution for solution in data
        if not (solution["source"] == "student" and solution["name"] in excluded_students)]

Storing the data as JSON:

In [33]:
# Create the output folder on a fresh checkout, then persist one document per code file.
os.makedirs("processed_solutions", exist_ok=True)
with open("processed_solutions/all_code_solutions.json", "w", encoding="utf-8") as json_file:
    json.dump(data, json_file)

Number of code files per source and milestone:

In [34]:
# One (source, milestone) row per stored code file, tabulated with totals.
files_count = pd.DataFrame(
    {"source": [entry["source"] for entry in data],
     "milestone": [entry["milestone"] for entry in data]},
    columns=["source", "milestone"],
)
pd.crosstab(index=files_count["source"], columns=files_count["milestone"], margins=True)

milestone,m0,m1,m2,m3,m4,All
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bard,35,0,0,0,0,35
bing,56,0,0,0,0,56
gpt3.5,38,34,10,10,22,114
gpt4,34,0,0,0,0,34
student,303,990,1249,1454,1626,5622
All,466,1024,1259,1464,1648,5861


## Storing the markdown tables

In [35]:
# Collect the E2.md / E3.md tables of every solution whose folder name starts with "m3".
markdown_data = list()
for source in sources:
    solutions_dir = os.path.join("raw_solutions", f"{source}_solutions")
    # os.listdir order is arbitrary: sort for reproducible output and skip stray files
    names = sorted(name for name in os.listdir(solutions_dir)
                   if name.startswith("m3") and os.path.isdir(os.path.join(solutions_dir, name)))
    for name in names:
        for table in ["E2.md", "E3.md"]:
            path = os.path.join(solutions_dir, name, "src", "main", "resources", table)
            with open(path, "r", encoding="utf-8") as open_text:
                text = open_text.read()
            markdown_data.append({"source": source,
                                  # strip the first three characters (presumably the "m3_" prefix)
                                  "name": name[3:],
                                  "table": table,
                                  "text": text})

Storing the data as JSON:

In [36]:
# Create the output folder on a fresh checkout, then persist the markdown tables.
os.makedirs("processed_solutions", exist_ok=True)
with open("processed_solutions/all_markdown_tables.json", "w", encoding="utf-8") as json_file:
    json.dump(markdown_data, json_file)

Number of markdown tables per source and table:

In [37]:
# One (source, table) row per stored markdown table, tabulated with totals.
tables_count = pd.DataFrame(
    {"source": [entry["source"] for entry in markdown_data],
     "table": [entry["table"] for entry in markdown_data]},
    columns=["source", "table"],
)
pd.crosstab(index=tables_count["source"], columns=tables_count["table"], margins=True)

table,E2.md,E3.md,All
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gpt3.5,1,1,2
student,83,83,166
All,84,84,168


## Concatenate all files

In [38]:
# Merge the files of each solution into a single document: group by every
# attribute except the trailing "code" column and join the code with newlines.
concatenated_data = []
for source in sources:
    columns = attributes[source]
    rows = [[entry[column] for column in columns]
            for entry in data if entry["source"] == source]
    all_files = pd.DataFrame(rows, columns=columns)

    joined_code = all_files.groupby(columns[:-1])["code"].apply(lambda parts: "\n".join(parts))
    concat_files = pd.DataFrame(joined_code).reset_index()

    # round-trip through JSON to obtain plain-Python records
    concatenated_data.extend(json.loads(concat_files.to_json(orient="records")))

Storing the data as JSON:

In [39]:
# Create the output folder on a fresh checkout, then persist one document per solution.
os.makedirs("processed_solutions", exist_ok=True)
with open("processed_solutions/all_concat_code_solutions.json", "w", encoding="utf-8") as json_file:
    json.dump(concatenated_data, json_file)

Number of solutions per source and milestone:

In [40]:
# One (source, milestone) row per concatenated solution, tabulated with totals.
files_count = pd.DataFrame(
    {"source": [entry["source"] for entry in concatenated_data],
     "milestone": [entry["milestone"] for entry in concatenated_data]},
    columns=["source", "milestone"],
)
pd.crosstab(index=files_count["source"], columns=files_count["milestone"], margins=True)

milestone,m0,m1,m2,m3,m4,All
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bard,17,0,0,0,0,17
bing,28,0,0,0,0,28
gpt3.5,19,4,1,1,1,26
gpt4,17,0,0,0,0,17
student,104,102,87,80,73,446
All,185,106,88,81,74,534
