In [1]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
import csv
import re

plt.style.use("seaborn")

# Produce data

To retrieve the raw data, the Python static code analyzer [Lizard](https://github.com/terryyin/lizard) was used.

```
# the following command produces a csv-file with all the necessary data for analysis
# stand in the root folder of inda student repos
# -o specifies output name
# --exclude excludes all files containing 'Test'
# --csv specifies format

lizard -o inda-repos-students.csv --exclude "^.*Test.*$" --csv -l java
```

# Process data
Next step is to process the generated data. The CSV-file has an verbose output string that contains all necessary information which can be retrieved by regex.

In [2]:
# specify input and output paths
source = "data/lizard/inda-repos-students.csv"
destination = "data/processed/inda-repos-students.csv"

# the source (Lizard) csv-header and the destination header
source_headers = ["NLOC", "CCN", "token", "PARAM", "length", "verbose", "input", "class::method", "class::method(args)", "start line", "end line"]
destination_headers = ["NLOC", "CCN", "token", "PARAM", "length", "TA", "student", "task", "class", "method"]

# necessary regex is stored in a dictionary
regex = {
    "task": "((?<=task-)[0-9]+)", # capture a number preceded by 'task-'
    "class": "^[A-Za-z0-9]*", # capture any letter or number from start to end of string
    "TA": "(?<=repos\/)[A-Za-z]+(?=\/)", # capture any string preceded by 'repos/' and ending with a '/'
    "student": "((?<=\/)([a-zA-Z0-9]+)(?=-task))", # capture any string preceded by '/' and ending with '-task'
    "method": "((?<=::).*(?=\())" # nasty
}

Next we process the data by extracting the right information from the Lizard file. At the end there's a report of the amount of faulty lines.

TODO: ta reda på vad som faktiskt orsakar `AttributeError`.

In [None]:
# PLAYGROUND
# Try to find cause of AttributeError
rows = []
attribute_errors = 0
with open(source, newline='') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=source_headers)
    for row in reader:
        try:
            ta = re.search(regex["TA"], row['verbose']).group(0)
            student = re.search(regex["student"], row['verbose']).group(0)
            task = re.search(regex["task"], row['verbose']).group(0)
            clss = re.search(regex["class"], row['class::method']).group(0)
            method = re.search(regex["method"], row['class::method']).group(0)
            print(re.search(regex["method"], row['class::method(args)']).group(0))
        except AttributeError:
            # eftersom lösningar som ej kompilerar pajar regexen är detta nödvändigt
            # Sätter till NaN för att kunna filtrera ut i Pandas
            print(row)
            print(re.search(regex["method"], row['class::method(args)']).group(0))
            break;

Error: Session cannot generate requests

In [35]:
rows = []
attribute_errors = 0
with open(source, newline='') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=source_headers)
    for row in reader:
        try:
            ta = re.search(regex["TA"], row['verbose']).group(0)
            student = re.search(regex["student"], row['verbose']).group(0)
            task = int(re.search(regex["task"], row['verbose']).group(0))
            clss = re.search(regex["class"], row['class::method']).group(0)
            method = re.search(regex["method"], row['class::method(args)']).group(0)
        except AttributeError:
            # eftersom lösningar som ej kompilerar pajar regexen är detta nödvändigt
            # Sätter till NaN för att kunna filtrera ut i Pandas
            attribute_errors = attribute_errors + 1 # keep track of amount of faulty lines
            ta = "NaN"
            student = "NaN"
            task = "NaN"
            clss = "NaN"
            method = "NaN"
        new_row = { # if we reach here all is well
                    "NLOC": row['NLOC'], 
                    "CCN": row['CCN'], 
                    "token": row['token'], 
                    "PARAM": row['PARAM'], 
                    "length": row['length'],
                    "TA": ta,
                    "student": student, 
                    "task": task, 
                    "class": clss, 
                    "method": method
                    }
        rows.append(new_row)

with open(destination, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=destination_headers)
    writer.writeheader()
    writer.writerows(rows)

print(f"Done parsing a total of {len(rows)} rows, with {attribute_errors} faulty lines! That's a felmariginal of {attribute_errors/len(rows)}")


Done parsing a total of 20513 rows, with 23 faulty lines! That's a felmariginal of 0.001121240189148345


# Analyze inda code templates

Here the empty code templates are measured.

In [36]:
# Find Lines of Code for the code templates 
# Note that task 2-4 are empty, yet students recieve a Java template from Objects First
df_templates = pd.read_csv("data/processed/inda-templates-sorted.csv")
df_templates = df_templates.groupby(["task"])["NLOC"].sum().reset_index()

# as there's no provided code skeleton for task 2-4, 10 and 17 these has to be appended
missing_tasks =  ({
            "task": [2, 3, 4, 10, 17],
            "NLOC": [0, 0, 0, 0, 0]
            })

df_templates = pd.concat([df_templates, pd.DataFrame(missing_tasks)], ignore_index = True, axis = 0)
df_templates = df_templates.sort_values(by=["task"])
df_templates

Unnamed: 0,task,NLOC
12,2,0
13,3,0
14,4,0
0,5,344
1,6,424
2,7,328
3,8,223
4,9,144
15,10,0
5,11,60


In [42]:
# (df.groupby(['cluster', 'org'], as_index=False).mean()
#            .groupby('cluster')['time'].mean())
df_students = pd.read_csv("data/processed/inda-repos-students.csv")
df_students.groupby(["task", "student"], as_index=False)["NLOC"].sum().groupby("task")["NLOC"].median()

task
1.0       3.0
2.0      53.0
3.0      55.0
4.0      78.0
5.0     464.5
6.0     559.5
7.0     516.0
8.0     292.0
9.0     280.5
10.0    182.0
11.0    283.0
12.0    106.5
13.0    128.0
14.0    257.0
15.0    533.0
16.0    217.0
17.0    264.5
18.0    647.5
Name: NLOC, dtype: float64

In [22]:
df_students = pd.read_csv("data/processed/inda-repos-students.csv")
df_students = df_students.groupby(["task", "student"])["NLOC"].sum().reset_index()
#df_students.loc[df_students['task'] == 6].mean()
#df_students.groupby(["task", "student"])["NLOC"].sum().reset_index().loc[df_students['task'] == 6].min()
df_students
#df_students.groupby(["task", "student"]).head()

Unnamed: 0,task,student,NLOC
0,1.0,falkborn,3
1,2.0,alflil,16
2,2.0,birgerk,72
3,2.0,dlindbo,27
4,2.0,falkborn,66
...,...,...,...
501,18.0,shayanek,643
502,18.0,sixtenhe,609
503,18.0,smarcus,665
504,18.0,tarara,684


In [33]:
# SANDBOX
df_students.groupby("student").head()
df_students.groupby(["task", "student"], as_index=False)["NLOC"].sum().groupby("task")["NLOC"].median()

Unnamed: 0,task,student,NLOC
0,1.0,falkborn,3
1,2.0,alflil,16
2,2.0,birgerk,72
3,2.0,dlindbo,27
4,2.0,falkborn,66
...,...,...,...
147,7.0,anbruno,753
149,7.0,briano,464
158,7.0,melst,723
160,7.0,mortadan,347


In [53]:
# Find avarage code length from all students data


#for index, row in df_students.iterrows():




# df_students.groupby("task")["student"].unique()
#df_students.groupby(["task"])["student"].nunique()
#student_avarage_series / df_students.groupby(["task"])["student"].nunique()
#student_avarage_series / df_students.groupby(["task"])["student"].nunique()

#df_students.groupby(["task", "student"])["NLOC"].sum()
#df_students = df_students.groupby(["task", "student"])["NLOC"]
#df_students.groupby(["task", "student"])["NLOC"]
df_students = pd.read_csv("data/processed/inda-repos-students.csv")
df_students = df_students.groupby(["task", "student"])["NLOC"].sum().reset_index()
df_students = df_students.groupby(["task", "student"], as_index=False)["NLOC"].sum().groupby("task")["NLOC"].median()


#df_students.sort_values(by=["task"]) - df_templates.sort_values(by=["task"])
df_templates.sort_values(by=["task"])["NLOC"]

result = pd.merge(df_students, df_templates.astype("float"), on="task")
#df_students["NLOC"] - df_templates["NLOC"].astype("int64")
result["NLOC_x"] - result["NLOC_y"]




0      53.0
1      55.0
2      78.0
3     120.5
4     135.5
5     188.0
6      69.0
7     136.5
8     182.0
9     223.0
10    100.5
11     22.0
12    224.0
13    521.0
14    196.0
15    264.5
16    598.5
dtype: float64

In [None]:
# Plot the data

# Read data

In [3]:
#df = pd.read_csv("inda_data.csv", sep=";")
df = pd.read_csv("data/inda-data-test-by-ta.csv")

df.describe()

Unnamed: 0,NLOC,CCN,token,PARAM,length,task
count,14001.0,14001.0,14001.0,14001.0,14001.0,14001.0
mean,7.320549,1.768231,44.887651,0.472752,8.93479,10.810371
std,5.175004,1.319332,38.710128,0.831,6.953712,4.604482
min,1.0,1.0,5.0,0.0,1.0,2.0
25%,4.0,1.0,19.0,0.0,4.0,7.0
50%,6.0,1.0,37.0,0.0,7.0,11.0
75%,9.0,2.0,58.0,1.0,11.0,15.0
max,102.0,32.0,748.0,11.0,137.0,18.0


# Compose data

In [40]:
# Group by and Summarize
group = df.groupby(["TA", "student", "task", "class", "method"]).sum()

group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,NLOC,CCN,token,PARAM,length
TA,student,task,class,method,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
linus,alflil,2,Ticketmachine,TicketMachine,8,2,22,1,10
linus,alflil,2,Ticketmachine,empty,3,1,9,0,4
linus,alflil,2,Ticketmachine,emptyMachine,5,1,16,0,8
linus,alflil,3,Heater,cooler,8,2,31,0,10
linus,alflil,3,Heater,return_temp,3,1,8,0,4
...,...,...,...,...,...,...,...,...,...
tim,tarara,18,GraphAlgorithmsTest,hasCycleTrueMultipleComponents,13,1,117,0,24
tim,tarara,18,GraphAlgorithmsTest,hasCycleTrueSingleComponent,14,1,128,0,19
tim,tarara,18,GraphAlgorithmsTest,hasPathTrueTreeGraph,14,1,132,0,28
tim,tarara,18,GraphAlgorithmsTest,setUp,1,1,5,0,1


# Make plots

In [28]:
for ta, rows in df.groupby("TA"):
    sn.relplot(data=rows, x="Operation Structuring", y="Intrinsic operation complexity", hue="task")
    plt.title(ta)

ValueError: Could not interpret value `Operation Structuring` for parameter `x`