# This notebook sorts data and outputs a CSV with all `lizard` analytics

In [8]:
import os
import csv
import lizard
import subprocess
import re

## Helper methods

In [9]:
def get_all_student_files(directory):
    """
    INPUT   directory: path to all student repos
    OUTPUT  a list of tuples (path/to/student_repo, student), one for every
            sub-directory found directly inside 'directory'
    """
    student_repos = []
    for entry in os.scandir(directory):
        # plain files living next to the repos are not student repos
        if not entry.is_dir():
            continue
        student_repos.append((entry.path, entry.name))
    return student_repos

`call_lizard()` runs lizard on each separate student repository, excluding test files.

In [10]:
def setup_student_ta_and_experience(path):
    """
    INPUT   path: path to ta-groups-20xx.csv
    OUTPUT  dict: a dictionary where every student has a (TA, experience)-tuple

    Every non-empty row is read as: row[0] = student, row[1] = ta,
    row[2] = experience. Blank lines are skipped. A non-empty row with fewer
    than three columns raises IndexError.
    """
    student_info = {}
    # newline="" lets the csv module do its own newline handling
    with open(path, "r", newline="") as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader yields an empty list for a blank line
            if not row:
                continue
            student_info[row[0]] = (row[1], row[2])
    return student_info

In [11]:
def call_lizard(path, filename):
    """
    INPUT   path: path to all student repos for current course iteration
            filename: path of the csv-file to create (overwritten if it exists)
    OUTPUT  writes raw lizard output in csv-format to a new file 'filename'

    lizard is run once per student repository (as returned by
    get_all_student_files()), restricted to Java, and the output of every run
    is written to the same file.
    """
    with open(filename, "w") as csvfile:
        for repo_path, _student in get_all_student_files(path):
            # The argument list is passed to lizard without a shell, so the
            # pattern must not be wrapped in shell quotes (they would become
            # part of the pattern). lizard's --exclude takes a glob-style
            # pattern, not a regular expression.
            subprocess.call(
                ['lizard', repo_path, '--csv', '--exclude', '*Test.java', '-l', 'java'],
                stdout=csvfile,
            )


## Execution

In [12]:
def refine_lizard_csv(source, regex, groupfile, source_headers):
    """
    INPUT   source: path to the raw lizard csv-file
            regex: dictionary with the patterns "student", "task", "class"
                   and "method"
            groupfile: path to the csv-file read by
                       setup_student_ta_and_experience()
            source_headers: column names of the raw lizard csv-file, in order;
                            must contain 'NLOC', 'CCN', 'verbose',
                            'class::method' and 'class::method(args)'
    OUTPUT  returns a list of dictionaries to write to a new csvfile, one per
            row in 'source'

    Rows where one of the patterns does not match are kept, but all derived
    fields are set to "NaN" so they can be filtered out later. A student that
    is matched but missing from the groupfile raises KeyError.
    """
    rows = []
    student_ta_xp = setup_student_ta_and_experience(groupfile)
    attribute_errors = 0
    with open(source, newline='') as csvfile:
        reader = csv.DictReader(csvfile, fieldnames=source_headers)
        for row in reader:
            try:
                student = re.search(regex["student"], row['verbose']).group(0)
                task = int(re.search(regex["task"], row['verbose']).group(0))
                clss = re.search(regex["class"], row['class::method']).group(0)
                method = re.search(regex["method"], row['class::method(args)']).group(0)
                ta, experience = student_ta_xp[student][0], student_ta_xp[student][1]
            except AttributeError:
                # Needed because solutions that do not compile break the
                # regexes (re.search returns None). Set everything to NaN so
                # the row can be filtered out in Pandas.
                attribute_errors = attribute_errors + 1 # keep track of amount of faulty lines
                experience = "NaN"
                ta = "NaN"
                student = "NaN"
                task = "NaN"
                clss = "NaN"
                method = "NaN"
            # faulty rows are appended too, carrying the NaN markers
            rows.append({
                "NLOC": row['NLOC'],
                "CCN": row['CCN'],
                "experience": experience,
                "TA": ta,
                "student": student,
                "task": task,
                "class": clss,
                "method": method,
            })
    # avoid dividing by zero when the source file contained no rows
    error_ratio = attribute_errors / len(rows) if rows else 0.0
    print(f"Done parsing a total of {len(rows)} rows, with {attribute_errors} faulty lines! That's a felmariginal of {error_ratio}")
    return rows


def write_to_output(destination, destination_headers, rows):
    """
    INPUT   destination: path of the csv-file to create (overwritten if it exists)
            destination_headers: column names, written as the first line
            rows: iterable of dictionaries keyed by the column names
    OUTPUT  writes the header followed by one csv-line per dictionary
    """
    out_file = open(destination, 'w', newline='')
    with out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=destination_headers)
        dict_writer.writeheader()
        for record in rows:
            dict_writer.writerow(record)
    


In [13]:
def main():
    """
    Runs lizard on all student repos, refines the raw output and writes the
    refined rows to a new csv-file. All paths are relative, so run this from
    the root of the repository.
    """
    # the source (Lizard) csv-header and the destination header
    source_headers = ["NLOC", "CCN", "token", "PARAM", "length", "verbose", "input", "class::method", "class::method(args)", "start line", "end line"]
    # must list exactly the keys of the rows built by refine_lizard_csv()
    destination_headers = ["NLOC", "CCN", "experience", "TA", "student", "task", "class", "method"]

    # necessary regex is stored in a dictionary (raw strings keep the backslashes intact)
    regex = {
        "task": r"((?<=task-)[0-9]+)", # capture a number preceded by 'task-'
        "class": r"^[A-Za-z0-9]*", # capture the leading run of letters and digits
        "student": r"((?<=\/)([a-zA-Z0-9]+)(?=-task))", # capture letters/digits preceded by '/' and followed by '-task'
        "method": r"((?<=::).*(?=\())" # everything between '::' and the last '('
    }

    # stand in the repository and run this script
    path_to_student_repos = "data/inda/2021"
    raw_lizard_csv = "data/lizard/inda-repos-students-2021.csv"
    refined_lizard_csv = "data/processed/inda-repos-students-2021.csv"
    groupfile = "data/inda/2021/ta-group-2021.csv"

    # run and refine lizard output
    call_lizard(path_to_student_repos, raw_lizard_csv)
    rows = refine_lizard_csv(raw_lizard_csv, regex, groupfile, source_headers)
    write_to_output(refined_lizard_csv, destination_headers, rows)
