# Create CSV from Violations

**Required imports**

In [5]:
import csv
import json
import os
import pickle
import re
from bs4 import BeautifulSoup

**Define a method that collects all violation files in `data/student-violations/<directory>` into one list**

It requires a specific layout for the directory: \<directory\>/task-x/student-task-x.json

Three directories currently follow this format:

- `violations-pmd` (inda rules are used)
- `violations-full-pmd` (all pmd rules are used)
- `violations-sorald` (all sorald rules are used)

In [6]:
def get_violation_files(dir):
    """Collect every violation file found one level below ``dir``.

    ``dir`` is scanned for sub-directories (the task folders, "task-x"); every
    regular file directly inside such a folder is reported.

    Returns a list of ``(task, path, filename, student)`` tuples, where
    ``task`` is the folder name and ``student`` is the part of the file name
    before the first ``-``.
    """
    student_files = []
    for task_entry in os.scandir(dir):
        if not task_entry.is_dir():
            continue  # only task folders are inspected
        for file_entry in os.scandir(task_entry.path):
            if file_entry.is_file():
                student = file_entry.name.split('-')[0]
                student_files.append(
                    (task_entry.name, file_entry.path, file_entry.name, student)
                )
    return student_files

def get_violation_files_template(dir):
    """Return ``(task, path, filename, student)`` for every file in each task folder of ``dir``.

    ``task`` is the name of the sub-directory the file lives in and ``student``
    is the file name up to its first ``-``.
    """
    # NOTE: this performs exactly the same scan as get_violation_files.
    task_dirs = [entry for entry in os.scandir(dir) if entry.is_dir()]
    return [
        (task_dir.name, entry.path, entry.name, entry.name.split('-')[0])
        for task_dir in task_dirs
        for entry in os.scandir(task_dir.path)
        if entry.is_file()
    ]

**Define methods that return the lists of rules actually used by PMD and Sorald**

In [7]:
# Scans the inda.xml ruleset and extracts all rules used.

def get_pmd_rules():
    """Return the rule names referenced by the ``data/inda.xml`` ruleset.

    Every ``<rule>`` element is inspected; when its ``ref`` attribute looks
    like ``<anything>.xml/<RuleName>`` the part after ``.xml/`` is collected.
    ``<rule>`` elements without a ``ref`` attribute, or whose ``ref`` does not
    contain ``.xml/``, are skipped instead of aborting the whole scan.

    Returns:
        list of rule-name strings, in the order BeautifulSoup yields the elements.
    """
    # Raw string: "\." inside a normal string literal is an invalid escape sequence.
    rule_name_pattern = re.compile(r'.*\.xml/(.*)')

    with open("data/inda.xml", 'r') as f:
        data = f.read()
    soup = BeautifulSoup(data, "xml")

    all_rules = []
    for rule in soup.find_all('rule'):
        ref = rule.get('ref')
        if ref is None:
            continue  # <rule> without a ref attribute
        match = rule_name_pattern.search(ref)
        if match is not None:
            all_rules.append(match.group(1))
    return all_rules

# All sorald rules are defined in a dictionary [Key: ruleKey Value: ruleName] saved as binary file.

def get_sorald_rules():
    """Load and return the object pickled in ``data/sorald-rules.bin``."""
    # NOTE(review): unpickling can run arbitrary code; the file must come from a trusted source.
    with open('data/sorald-rules.bin', 'rb') as rules_file:
        sorald_rules = pickle.load(rules_file)
    return sorald_rules

# Rules used for sorald, outdated
def accepted_rules():
    """Return a fresh list with the rule keys that were used for Sorald (outdated)."""
    rule_keys = (
        'S125',
        'S3972',
        'S2583',
        'S1871',
        'S1120',
        'S121',
        'S122',
        'S2208',
        'S126',
        'S109',
        'S5261',
        'S1751',
        'S4144',
    )
    return list(rule_keys)

**Load and return dictionaries containing information about three different things**

- `year_student_ta` : For each year what ta and difficulty does each student have? {'year' : {'student' : (TA,difficulty)}}
- `year_repo_status` : For each year, what is the status of each repository (has the student pushed? received a pass?) **Not used anymore**
- `year_student_lines` : For each year what lines in a specific file has a student modified or created? {'year' : {'task' : {'student': {'file' : \[lines\]}}}}

In [8]:
def get_information_dictionaries():
    """Load the per-year student/TA and student/lines dictionaries from disk.

    For each of the years 2021, 2020, 2019 and 2018 two pickle files are read:
    ``data/groups-data/student-groups<year>.bin`` and
    ``data/student-lines/student-lines-<year>.bin``.

    Returns:
        ``(year_student_ta, year_student_lines)``: two dicts keyed by the year
        string, each holding the object unpickled from the matching file.

    Raises:
        OSError: if one of the data files cannot be opened.
    """
    year_student_ta = {}
    year_student_lines = {}
    years = ["2021", "2020", "2019", "2018"]

    # NOTE(review): unpickling can run arbitrary code; these files must be trusted.
    for year in years:
        # "with" closes the file even when unpickling raises.
        with open(f'data/groups-data/student-groups{year}.bin', 'rb') as infile:  # Not available
            year_student_ta[year] = pickle.load(infile)

        with open(f'data/student-lines/student-lines-{year}.bin', 'rb') as infile:  # Not available
            year_student_lines[year] = pickle.load(infile)
    return year_student_ta, year_student_lines

**Define method in charge of analyzing violation-files**

Based on `path_to_stats_file`, read the JSON file and retrieve the number of occurrences of each violation.

**Note:** The method for Sorald requires an extra check to remove violations found in test files, as Sorald doesn't offer functionality to easily exclude those files from being analyzed.

**New:** Each method also only counts number of rows (excluding Test-files) modified/created by student (using `year_student_lines`) and what violations a student is responsible for (also using `year_student_lines`).

In [9]:
def get_number_of_violations_pmd(path_to_stats_file: str,task,student,file_name,year,year_student_lines):
    """Count the PMD violations reported in one stats file.

    Args:
        path_to_stats_file: path of the PMD JSON report to read.
        task: task key used to index ``year_student_lines``.
        student: student key used to index ``year_student_lines``.
        file_name: name searched for in the reported file paths; everything
            after ``<file_name>/`` is used as the per-file key.
        year: year key used to index ``year_student_lines``.
        year_student_lines: ``{year: {task: {student: {file: lines}}}}``.

    Returns:
        A 7-tuple ``(ok, number_of_violation, num_files, key_error,
        number_of_violation_old, error_files, good_files)``:

        * ``ok`` -- always ``True``.
        * ``number_of_violation`` -- rule -> number of violations whose
          ``beginline`` (as a string) is among the student's lines of that file.
        * ``num_files`` -- number of entries in the report's ``"files"`` list.
        * ``key_error`` -- number of reported files missing from the student's
          line dictionary.
        * ``number_of_violation_old`` -- rule -> number of all violations,
          whoever wrote the line.
        * ``error_files`` -- ``(task, student, file)`` for every entry of
          ``"processingErrors"`` that is present in the line dictionary.
        * ``good_files`` -- number of reported files for which the student has
          at least one line.

    Raises:
        AttributeError: if a reported path does not contain ``<file_name>/``.
    """
    number_of_violation = {}
    number_of_violation_old = {}
    key_error = 0

    # Reports for task/week 19 are matched against "<student>-quicksort" paths.
    if file_name == f"{student}-task-19" or file_name == f"{student}-week-19":
        file_name = f"{student}-quicksort"

    # Key: file, Value: lines modified/created by the student
    lines_files = year_student_lines[year][task][student]

    # Built once; re.escape makes the name match literally even when it
    # contains regex metacharacters.
    path_pattern = re.compile(f".*{re.escape(file_name)}/(.*)")

    error_files = []
    good_files = 0
    with open(path_to_stats_file) as stats_file:
        json_object = json.load(stats_file)
    files = json_object["files"]

    # Files PMD could not process are only recorded when the student touched them.
    for f in json_object["processingErrors"]:
        key = path_pattern.search(f["filename"]).group(1)
        if key in lines_files:
            error_files.append((task, student, key))

    for f in files:
        key = path_pattern.search(f["filename"]).group(1)  # file name as analyzed by PMD
        violations = f["violations"]

        # Count every violation, regardless of who is responsible for the line.
        for v in violations:
            number_of_violation_old[v["rule"]] = number_of_violation_old.get(v["rule"], 0) + 1

        if key not in lines_files:
            key_error += 1
            continue

        lines = lines_files[key]  # lines the student modified in this file for this task
        if len(lines) > 0:
            good_files += 1
        for v in violations:
            if str(v["beginline"]) in lines:  # violation sits on a line the student wrote
                number_of_violation[v["rule"]] = number_of_violation.get(v["rule"], 0) + 1

    return True, number_of_violation, len(files), key_error, number_of_violation_old, error_files, good_files
    

def get_number_of_violations_sorald(path_to_stats_file: str,student,task,year,year_student_lines,skipped_files):
    """Count the Sorald violations reported in one stats file.

    Args:
        path_to_stats_file: path of the Sorald JSON report to read.
        student: student key used to index ``year_student_lines``.
        task: task key used to index ``year_student_lines``.
        year: year key used to index ``year_student_lines``.
        year_student_lines: ``{year: {task: {student: {file: lines}}}}``.
        skipped_files: collection of ``(task, student, file)`` tuples; warning
            locations in these files are ignored completely.

    Returns:
        ``(number_of_violation, number_of_violation_old)``; both map every
        ``ruleKey`` of the report to a count. The first only counts locations
        whose ``startLine`` (as a string) is among the student's lines of that
        file, the second counts every location in a file known for the student.
        Locations in paths containing ``Test.java`` are never counted.
    """
    number_of_violation = {}
    number_of_violation_old = {}

    lines_files = year_student_lines[year][task][student]

    # Raw string with an escaped dot, so only a literal "Test.java" marks a test file.
    test_file_pattern = re.compile(r'.*Test\.java')

    with open(path_to_stats_file) as stats_file:
        json_object = json.load(stats_file)

    for violation in json_object["minedRules"]:
        occurr = 0
        occur_old = 0
        for location in violation["warningLocations"]:
            path = location["filePath"]
            if (task, student, path) in skipped_files:
                continue
            line = location["startLine"]

            # Exclude violations found in test files and in files unknown for this student.
            if test_file_pattern.search(path) is None and path in lines_files:
                if str(line) in lines_files[path]:
                    occurr += 1
                occur_old += 1
        number_of_violation_old[violation["ruleKey"]] = occur_old
        number_of_violation[violation["ruleKey"]] = occurr
    return number_of_violation, number_of_violation_old

**Define method extracting data from json files**

**Note:** The current functionality excludes repos from Sorald which could not be analyzed by PMD, this is because Sorald works for all while PMD does not.

In [10]:
def get_loc(student,task,error_files,year_student_lines,year):
    """Sum the per-file line counts of one student's task.

    The counts are read from ``year_student_lines[year][task][student]["loc"]``.
    A file whose ``(task, student, file)`` tuple is contained in ``error_files``
    is left out of the sum and reported on stdout.
    """
    per_file_loc = year_student_lines[year][task][student]["loc"]
    total = 0
    for file_name, file_loc in per_file_loc.items():
        entry = (task, student, file_name)
        if entry in error_files:
            print("skipping: " , entry)
        else:
            total += file_loc
    return total

def extract_json_pmd(csvwriter,student_files,all_rules,year,year_student_ta,year_student_lines):
    """Write the PMD rows of the CSV and collect bookkeeping for the Sorald pass.

    For every stats file one row per rule in ``all_rules`` is written:
    ``[year, task, ta, difficulty, student, loc, rule, '-', 'PMD', occur,
    amount, error-files, total-files]``.

    Args:
        csvwriter: object with a ``writerow`` method (e.g. a ``csv.writer``).
        student_files: ``(task, path, filename, student)`` tuples; ``task`` is
            expected to look like ``"task-<number>"``.
        all_rules: PMD rule names to report.
        year: year key used to index both dictionaries.
        year_student_ta: ``{year: {student: (ta, difficulty)}}``.
        year_student_lines: handed on to ``get_number_of_violations_pmd`` and
            ``get_loc``.

    Returns:
        ``(success, fails, skipped, repos, pmd_errors)`` where ``skipped`` is
        the concatenation of all per-repository error-file lists, ``repos`` is
        a list of ``(student, task)`` and ``pmd_errors`` maps
        ``task_number -> student -> number of error files``.
    """
    success = 0
    fails = 0
    skipped = []
    key_error_tot = 0
    repos = []
    pmd_errors = {}

    for f in student_files:
        task, path, file_name, student = f[0], f[1], f[2], f[3]
        task_number = task.split('-')[1]
        repos.append((student, task))

        student_info = year_student_ta[year][student]
        ta = student_info[0]
        difficulty = student_info[1]

        pmd_errors.setdefault(task_number, {})

        # The file count and the responsibility-agnostic counts are not needed here.
        ok, stats, _num_files, key_error, _stats_old, error_files, good_files = get_number_of_violations_pmd(
            path, task_number, student, file_name, year, year_student_lines
        )
        total_files = good_files + len(error_files)
        skipped.extend(error_files)
        key_error_tot += key_error
        loc = get_loc(student, task_number, error_files, year_student_lines, year)

        if ok:
            success += 1
            # Recorded once per repository; the Sorald pass looks this value up.
            pmd_errors[task_number][student] = len(error_files)
            for v in all_rules:
                amount = stats.get(v, 0)
                occurrence = 1 if amount > 0 else 0
                row = [year, task_number, ta, difficulty, student, loc, v, '-', 'PMD',
                       occurrence, amount, len(error_files), total_files]
                csvwriter.writerow(row)
        else:
            row = [year, task_number, ta, difficulty, student, -1, 'ERROR', 'ERROR', 'ERROR', -1, -1, -1, -1]
            csvwriter.writerow(row)
            fails += 1

    print("pmd key error in status-lines:", key_error_tot)
    return success, fails, skipped, repos, pmd_errors


def extract_json_sorald(csvwriter,student_files,all_rules,skipped_files,accepted_rules,year,year_student_ta,year_student_lines,pmd_errors):
    """Write the Sorald rows of the CSV.

    For every stats file one row is written per rule key in ``all_rules`` that
    passes the ``accepted_rules`` filter (``None`` disables the filter):
    ``[year, task, ta, difficulty, student, loc, rule name, rule key,
    'SonarSource', occur, amount, pmd error files, -1]``.

    Returns ``(success, repos)``: the number of stats files handled and the
    list of ``(student, task)`` pairs seen.
    """
    handled = 0
    repos = []
    for entry in student_files:
        task_dir = entry[0]
        task_number = task_dir.split('-')[1]
        student = entry[3]
        repos.append((student, task_dir))

        student_info = year_student_ta[year][student]
        ta = student_info[0]
        difficulty = student_info[1]

        # Only the counts restricted to the student's own lines are reported.
        stats, _ = get_number_of_violations_sorald(
            entry[1], student, task_number, year, year_student_lines, skipped_files
        )
        loc = get_loc(student, task_number, skipped_files, year_student_lines, year)
        handled += 1

        for rule_key in all_rules:
            rejected = accepted_rules is not None and rule_key not in accepted_rules
            if rejected:
                continue
            rule_name = all_rules[rule_key]
            amount = stats.get(rule_key, 0)
            csvwriter.writerow([
                year, task_number, ta, difficulty, student, loc,
                rule_name, rule_key, 'SonarSource',
                int(amount > 0), amount,
                pmd_errors[task_number][student], -1,
            ])
    return handled, repos

**Create CSV file**

The CSV file will have header: `year`,`task`,`ta`,`difficulty`,`student`,`loc`,`violation`,`violation-id`,`tool`,`occur`,`amount`,`error-files`,`total-files`

In [14]:
# Build the CSV for one year: PMD rows first, then Sorald rows, then print a summary.
year = "2021"
path = "data/processing-results"
# The csv module requires newline='' on files handed to csv.writer; without it
# extra blank rows can appear on platforms that translate "\n" on write.
with open(f"{path}/student-data-{year}-exp.csv", 'w+', newline='') as csvfile:
    header2 = ['year','task','ta','difficulty','student','loc','violation','violation-id','tool','occur','amount','error-files','total-files']

    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(header2)
    year_student_ta,year_student_lines = get_information_dictionaries()

    # PMD part
    student_files = get_violation_files(f"data/student-violations/violations-pmd/{year}")
    print("PMD stats files: ", len(student_files))
    all_rules = get_pmd_rules()
    success_pmd,fails_pmd,skipped,pmd_repos,pmd_errors = extract_json_pmd(csvwriter,student_files,all_rules,year,year_student_ta,year_student_lines)

    # Sorald part; files PMD could not process ("skipped") are excluded here as well
    accepted_rules_sorald = accepted_rules()
    student_files = get_violation_files(f"data/student-violations/violations-sorald/{year}")
    print("SonarSource stats files: ", len(student_files))
    all_rules = get_sorald_rules()
    success_sorald,sorald_repos = extract_json_sorald(csvwriter,student_files,all_rules,skipped,accepted_rules_sorald,year,year_student_ta,year_student_lines,pmd_errors)

# Number of repositories PMD handled that have no Sorald counterpart.
sorald_repo_set = set(sorald_repos)
not_same = sum(1 for repo in pmd_repos if repo not in sorald_repo_set)

print("### Year: ",year," ###")
# student_files holds the Sorald stats files at this point.
print("Number of repos analyzed: ",len(student_files))
print("Successful PMD: ", success_pmd)
print("Fails PMD: ", fails_pmd)
print("Successful Sorald: ", success_sorald)
print("Not same: ", not_same)
# 2018 168
# 2019 107
# 2020 67
# 2021 53

PMD stats files:  2103
skipping:  ('3', 'emilkare', 'src/283.java')
skipping:  ('3', 'amkihan', 'src/Heater.java')
skipping:  ('3', 'dowah', 'src/Book.java')
skipping:  ('3', 'maostlu', 'src/Main.java')
skipping:  ('3', 'mortadan', 'src/Book.java')
skipping:  ('3', 'shayanek', 'src/Book.java')
skipping:  ('4', 'vilost', 'src/ClockDisplay.java')
skipping:  ('4', 'maostlu', 'src/ClockDisplay.java')
skipping:  ('4', 'shayanek', 'src/12H.java')
skipping:  ('4', 'anbruno', 'src/Timvisare.java')
skipping:  ('4', 'anbruno', 'src/relevanta delar.java')
skipping:  ('7', 'shayanek', 'src/scribble/ScribleGUI.java')
skipping:  ('7', 'melise', 'src/random/RandomTester.java')
skipping:  ('7', 'alvart', 'src/starwars/NameGenerator.java')
skipping:  ('8', 'laurani', 'src/game/Game.java')
skipping:  ('8', 'rcornell', 'src/game/Game.java')
skipping:  ('8', 'briano', 'src/game/conditions.java')
skipping:  ('8', 'alvasun', 'src/highscore/HighScore.java')
skipping:  ('1', 'leosv', 'src/1.33.java')
skipping