In [1]:
import pandas as pd
import numpy as np
import json
import os
import glob

## Process the solutions

In [2]:
def process_solutions(sources, process_milestone="all", solution_path="ss21_raw_solutions"):
    """Collect the Java files of every solution into two DataFrames.

    Solutions are read from
    ``<solution_path>/<source>_solutions/<milestone>_<name>[_<style>[_<version>]]``
    and their Java files are searched recursively below
    ``src/main/java/thkoeln/st/st2praktikum/exercise``.

    Parameters
    ----------
    sources : iterable of str
        Source labels; each one maps to the directory ``<source>_solutions``.
    process_milestone : str, default "all"
        ``"all"`` to process every solution directory, otherwise only
        directories whose milestone part (the text before the first ``_``)
        equals this value.
    solution_path : str, default "ss21_raw_solutions"
        Root directory that contains the ``<source>_solutions`` directories.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``file_data`` has one row per Java file with the columns
        ``source, milestone, name, style, version, file_name, code``.
        ``concatenated_data`` has one row per
        ``(source, milestone, name, style, version)`` combination with the
        code of all its files joined by newlines.
        Both frames are empty (but keep their columns) when nothing is found.

    Raises
    ------
    FileNotFoundError
        If ``<solution_path>/<source>_solutions`` does not exist.
    IndexError
        If a processed solution directory name contains no ``_``.
    """
    attributes = ["source", "milestone", "name", "style", "version"]
    columns = attributes + ["file_name", "code"]

    # built with os.path.join so the function also works outside Windows
    exercise_subdir = os.path.join("src", "main", "java", "thkoeln", "st", "st2praktikum", "exercise")

    documents = []
    for source in sources:
        source_dir = os.path.join(solution_path, f"{source}_solutions")

        # os.listdir order is arbitrary; sort so the output is reproducible
        for full_name in sorted(os.listdir(source_dir)):
            solution_dir = os.path.join(source_dir, full_name)
            if not os.path.isdir(solution_dir):
                # stray files next to the solution directories are not solutions
                continue

            # extract the attributes encoded in the directory name
            name_parts = full_name.split("_")
            milestone = name_parts[0]
            if process_milestone != "all" and milestone != process_milestone:
                continue
            name = name_parts[1]
            style = name_parts[2] if len(name_parts) > 2 else np.nan
            version = name_parts[3] if len(name_parts) > 3 else np.nan

            # find all java files, including those in sub-directories;
            # escape the directory so characters like "[" are taken literally
            exercise_dir = os.path.join(solution_dir, exercise_subdir)
            pattern = os.path.join(glob.escape(exercise_dir), "**", "*.java")
            java_files = sorted(glob.glob(pattern, recursive=True))

            # store every java file as one document
            for java_file in java_files:
                with open(java_file, "r", encoding="utf-8") as open_java_file:
                    code_text = open_java_file.read()

                documents.append({"source": source,
                                  "milestone": milestone,
                                  "name": name,
                                  "style": style,
                                  "version": version,
                                  "file_name": os.path.relpath(java_file, exercise_dir),
                                  "code": code_text})

    # create a single dataframe; explicit columns keep the schema when empty
    file_data = pd.DataFrame(documents, columns=columns)

    if not file_data.empty:
        # remove students that did not attempt the assignment: their m0
        # Exercise0.java is shorter than 250 characters
        # (presumably the untouched template - TODO confirm the threshold)
        is_student = file_data["source"] == "student"
        unattempted = (is_student
                       & (file_data["file_name"] == "Exercise0.java")
                       & (file_data["milestone"] == "m0")
                       & (file_data["code"].str.len() < 250))
        missing_solutions = file_data.loc[unattempted, "name"]
        # only student rows are dropped, so a non-student solution that happens
        # to share a name with such a student is kept
        file_data = file_data[~(is_student & file_data["name"].isin(missing_solutions))]

    if file_data.empty:
        return file_data, pd.DataFrame(columns=attributes + ["code"])

    # concatenate the code files of each solution (NaN style/version kept as a group)
    concatenated_data = (file_data.groupby(attributes, dropna=False)["code"]
                         .apply("\n".join)
                         .reset_index())

    return file_data, concatenated_data

## Store processed solutions

In [3]:
# Load every milestone of every source, then tabulate the number of
# concatenated solutions per source and milestone (with row/column totals).
all_data, all_concatenated_data = process_solutions(
    sources=["gpt3.5", "bing", "bard", "gpt4", "student"],
    process_milestone="all",
)

pd.crosstab(
    index=all_concatenated_data["source"],
    columns=all_concatenated_data["milestone"],
    margins=True,
)

milestone,m0,m1,m2,m3,m4,All
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bard,17,0,0,0,0,17
bing,28,0,0,0,0,28
gpt3.5,18,4,1,1,1,25
gpt4,31,0,0,0,0,31
student,104,102,87,80,73,446
All,198,106,88,81,74,547


In [4]:
# Process only the m0 solutions of every source and store both views as CSV.
m0_data, m0_concatenated_data = process_solutions(
    sources=["gpt3.5", "bing", "bard", "gpt4", "student"],
    process_milestone="m0",
)

# to_csv does not create missing parent directories
os.makedirs("ss21_processed_solutions", exist_ok=True)
m0_concatenated_data.to_csv("ss21_processed_solutions/m0_data.csv", index=False)
m0_data.to_csv("ss21_processed_solutions/m0_file_data.csv", index=False)

# Count how often each source / style / version value occurs (missing values
# are dropped by value_counts). The result is named explicitly because the
# default name of the value_counts result differs between pandas versions.
(
    m0_concatenated_data[["source", "style", "version"]]
    .melt(var_name="column", value_name="value")
    .value_counts()
    .rename("counts")
    .to_frame()
    .sort_values(["column", "counts"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
source,bard,17
source,gpt3.5,18
source,bing,28
source,gpt4,31
source,student,104
style,cc,18
style,styled,27
style,plain,49
version,balanced,7
version,precise,9


## Store additional solutions

In [10]:
# Process the m0 solutions of the "intervene" and "optimised" sources and
# store the per-file and the concatenated view as CSV.
optimised_data, cat_optimised_data = process_solutions(
    sources=["intervene", "optimised"],
    process_milestone="m0",
)

# to_csv does not create missing parent directories
os.makedirs("ss21_processed_solutions", exist_ok=True)
optimised_data.to_csv("ss21_processed_solutions/optimised_file_data.csv", index=False)
cat_optimised_data.to_csv("ss21_processed_solutions/optimised_data.csv", index=False)

In [4]:
# Process the m0 solutions of the "guesses" source; only the concatenated
# view is stored.
guesses_file_data, guesses_cat_data = process_solutions(
    sources=["guesses"],
    process_milestone="m0",
)

# to_csv does not create missing parent directories
os.makedirs("ss21_processed_solutions", exist_ok=True)
guesses_cat_data.to_csv("ss21_processed_solutions/guesses_data.csv", index=False)