In [1]:
import pandas as pd
import json
import os
import glob

## Processing the Milestone 0 Solutions

*Re-run this entire notebook to process any new solutions and store them in the output files under `processed_solutions/`. This notebook can be run independently of the other notebooks.*

## Storing the code

In [2]:
# Origins of the solutions, in the order they are processed.
sources = ["gpt3.5", "bing", "bard", "student"]

# Columns kept per source when the files of one solution are concatenated.
# "code" must stay last: every column before it is used as a grouping key.
_styled_llm_columns = ["source", "name", "style", "code"]
attributes = {
    "student": ["source", "name", "code"],
    "gpt3.5": list(_styled_llm_columns),
    "bing": ["source", "name", "style", "version", "code"],
    "bard": list(_styled_llm_columns),
}

In [3]:
def _parse_solution_name(source, full_name):
    """Split a solution directory name on "_" and pick out its attributes.

    For "student" the second component is the name. For every other source the
    last component is the style and the one before it is the name; for "bing"
    the second component is additionally the chat version.

    Returns a dict holding "name" and, for non-student sources, "style" (plus
    "version" for "bing") in exactly that order.
    """
    name_parts = full_name.split("_")
    if source == "student":
        return {"name": name_parts[1]}
    parsed = {"name": name_parts[-2], "style": name_parts[-1]}
    if source == "bing":
        parsed["version"] = name_parts[1]
    return parsed


data = list()

for source in sources:
    # Build paths with os.path.join so the notebook is not tied to Windows separators.
    source_dir = os.path.join("raw_solutions", f"{source}_solutions")

    # list all the solution names for milestone 0
    # (sorted, because os.listdir returns entries in arbitrary order)
    names = sorted(name for name in os.listdir(source_dir) if name.startswith("m0"))

    for full_name in names:
        # find all java files, including those in sub-directories
        exercise_dir = os.path.join(source_dir, full_name, "src", "main", "java",
                                    "thkoeln", "st", "st2praktikum", "exercise")
        # sorted so that the later per-solution concatenation order is reproducible
        file_list = sorted(glob.glob(os.path.join(exercise_dir, "**", "*.java"), recursive=True))

        # extract out the attributes of the solutions
        parsed = _parse_solution_name(source, full_name)
        extra_attributes = {key: value for key, value in parsed.items() if key != "name"}

        # store java file as a document
        for java_file in file_list:
            with open(java_file, "r", encoding='utf-8') as open_java_file:
                code_text = open_java_file.read()
            # Key order (source, name, file_name, [style, [version]], code) is part
            # of the stored JSON layout, so keep it stable.
            document = {"source": source,
                        "name": parsed["name"],
                        # path relative to the exercise package directory
                        "file_name": os.path.relpath(java_file, exercise_dir),
                        **extra_attributes,
                        "code": code_text}
            data.append(document)

Removing the students who did not attempt to complete the assignment:

In [4]:
# Collect the names of students whose Exercise0.java holds fewer than 250
# characters (presumably the untouched template - TODO confirm the threshold).
no_solutions = []
for solution in data:
    if solution["source"] != "student":
        continue
    if solution["file_name"] != "Exercise0.java":
        continue
    if len(solution["code"]) < 250:
        no_solutions.append(solution["name"])

In [5]:
# Drop every file that belongs to a student flagged in `no_solutions`.
# The source check matters: names are not guaranteed to be unique across
# sources, so filtering on the name alone would also discard any LLM solution
# that happens to share its name with a removed student.
excluded_students = set(no_solutions)
data = [solution for solution in data
        if not (solution["source"] == "student" and solution["name"] in excluded_students)]

Storing the data as JSON:

In [6]:
# Create the output folder on first use so the notebook also runs on a fresh
# checkout where "processed_solutions" does not exist yet.
os.makedirs("processed_solutions", exist_ok=True)

# One JSON array with a document per java file; the encoding is stated
# explicitly so the result does not depend on the platform default.
with open("processed_solutions/m0_code_solutions.json", 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file)

## Concatenate all files

In [7]:
def _join_code(code_parts):
    """Join the code of all files of one solution, separated by newlines."""
    return "\n".join(code_parts)


# Merge the files of each solution into a single record per solution.
concatenated_data = []
for source in sources:
    columns = attributes[source]
    key_columns = columns[:-1]  # everything except the trailing "code" column

    # one row per java file of this source, restricted to the relevant columns
    rows = [[file[column] for column in columns]
            for file in data if file["source"] == source]
    files_of_source = pd.DataFrame(rows, columns=columns)

    # one row per solution: group on the key columns and join the code
    merged = files_of_source.groupby(key_columns)["code"].apply(_join_code).reset_index()

    # round-trip through JSON to obtain plain Python records
    concatenated_data.extend(json.loads(merged.to_json(orient="records")))

Storing the data as JSON:

In [8]:
# Create the output folder if needed so this cell does not fail on a fresh
# checkout where "processed_solutions" does not exist yet.
os.makedirs("processed_solutions", exist_ok=True)

# One JSON array with a record per solution (all files concatenated); the
# encoding is stated explicitly so it does not depend on the platform default.
with open("processed_solutions/m0_concat_code_solutions.json", 'w', encoding='utf-8') as json_file:
    json.dump(concatenated_data, json_file)

## Storing the code as csv files

Creating a dataset containing the source of a solution (e.g. student or a specific LLM), the style of the solution, the version (relevant only for Bing Chat) and the code:

In [9]:
# Tabular view of the concatenated solutions, restricted to the columns that
# go into the csv (the solution name is left out).
dataset_columns = ["source", "style", "version", "code"]
df_data = pd.DataFrame(concatenated_data).loc[:, dataset_columns]

In [10]:
# Preview the first five rows of the dataset.
df_data.head(n=5)

Unnamed: 0,source,style,version,code
0,gpt3.5,plain,,package thkoeln.st.st2praktikum.exercise;\n\np...
1,gpt3.5,plain,,package thkoeln.st.st2praktikum.exercise;\n\np...
2,gpt3.5,cc,,package thkoeln.st.st2praktikum.exercise;\n\ni...
3,gpt3.5,cc,,package thkoeln.st.st2praktikum.exercise;\n\np...
4,gpt3.5,cc,,package thkoeln.st.st2praktikum.exercise;\n\ni...


In [11]:
# Count how often each value occurs in each of the categorical columns.
# Rows whose value is missing (e.g. "version" for non-Bing solutions) are
# dropped by value_counts. The resulting Series is named explicitly because
# its default name differs between pandas versions (unnamed before 2.0,
# "count" from 2.0 on), which would otherwise break the sort on "counts".
(
    df_data[["source", "style", "version"]]
    .melt(var_name="column", value_name="value")
    .value_counts()
    .rename("counts")
    .to_frame()
    .sort_values(["column", "counts"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
source,bard,17
source,gpt3.5,19
source,bing,28
source,student,104
style,cc,15
style,styled,19
style,plain,30
version,balanced,7
version,concise,9
version,creative,12


Creating a binary class version of the dataset, with labels "ai" and "student":

In [12]:
# Collapse every LLM source into the single label "ai"; all other source
# values are left as they are.
ai_sources = ["gpt3.5", "bing", "bard"]
source_relabeling = {"source": {llm: "ai" for llm in ai_sources}}
binary_data = df_data.replace(source_relabeling)[["source", "code"]]

In [13]:
# Preview the first five rows of the binary dataset.
binary_data.head(n=5)

Unnamed: 0,source,code
0,ai,package thkoeln.st.st2praktikum.exercise;\n\np...
1,ai,package thkoeln.st.st2praktikum.exercise;\n\np...
2,ai,package thkoeln.st.st2praktikum.exercise;\n\ni...
3,ai,package thkoeln.st.st2praktikum.exercise;\n\np...
4,ai,package thkoeln.st.st2praktikum.exercise;\n\ni...


In [14]:
# Class balance of the binary dataset.
label_counts = binary_data["source"].value_counts()
label_counts

student    104
ai          64
Name: source, dtype: int64

Saving the dataframes as csv files:

In [15]:
# Multi-class dataset (source, style, version, code), written without the row index.
df_data.to_csv(path_or_buf="processed_solutions/m0_data.csv", index=False)

In [16]:
# Binary dataset (source, code), written without the row index.
binary_data.to_csv(path_or_buf="processed_solutions/m0_data_binary.csv", index=False)