### Extract functions from LL files

In [1]:
import CodePreprocessing as preprocessing
import json
import re
import os

In [2]:

# ----------------- user functions  --------------------
# Extract the functions of the user's LLVM IR file. The result is read back from
# 'UserCode.json' right below, so functions_preprocessing presumably writes
# '<json_file>.json' -- TODO confirm against CodePreprocessing.
preprocessing.functions_preprocessing( llvm_file='pairs/UserCode/UserCode.ll', json_file='UserCode' )
with open( 'UserCode.json', 'r' ) as f:
    user_code = json.load(f)

# ----------------- Vulnerable functions ------------------
# Preprocess every vulnerable LLVM file we store, then merge the per-file JSON
# outputs into a single dictionary.

vuln_codes_path = 'pairs/ourVulnCodes/'
# vuln_codes_path already ends with '/', so do not add a second separator.
vuln_jsons_path = vuln_codes_path + 'jsons'

# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
os.makedirs(vuln_jsons_path, exist_ok=True)

# sorted() makes the processing order independent of the file system.
for file in sorted(os.listdir(vuln_codes_path)):
    if file.endswith(".ll"):
        # splitext strips only the final extension; split('.')[0] would truncate
        # names such as 'a.b.ll' to 'a' and make different files overwrite each other.
        file_name = os.path.splitext(file)[0]
        preprocessing.functions_preprocessing( llvm_file=vuln_codes_path+file, json_file=vuln_jsons_path+'/'+file_name )

# Merge all per-file dictionaries. On duplicate keys the file that sorts last wins.
# NOTE(review): JSON files left in the jsons folder by earlier runs are loaded as well.
vulnerable_code = dict()
for file in sorted(os.listdir(vuln_jsons_path)):
    if file.endswith(".json"):
        with open( vuln_jsons_path+'/'+file, 'r' ) as f:
            vulnerable_code.update(json.load(f))

# The values of vulnerable_code, in the dictionary's iteration order.
vulnerable_function = list(vulnerable_code.values())


In [3]:
# Sanity check: number of entries merged into vulnerable_function by the previous cell.
print('Number of vulnerable functions: ',len(vulnerable_function))

Number of vulnerable functions:  12


### Matching

In [4]:
import matchers as matcher
import LLNormalizer as normalizer
import Winnowing

In [9]:
def check_function_vulnerable(threshold,score1,score2,score3):
    """Tell whether all three scores are strictly greater than ``threshold``.

    The scores are checked in order (score1, score2, score3) and the check stops
    at the first one that does not exceed the threshold.
    """
    candidate_scores = (score1, score2, score3)
    return all(score > threshold for score in candidate_scores)

In [10]:
# KNN-like filtering on similarity measures: for every (vulnerable function,
# user function) pair we compute six string-similarity scores and sort them in
# descending order. A later cell checks whether the three best scores exceed
# `threshold` before running the more expensive MOSS comparison.
# K here equals 3. Nothing in this cell reads `k`; check_function_vulnerable
# takes exactly three scores.

threshold=0.7
k=3

# Vulnerable_Matches maps: vulnerable function header (a key of vulnerable_code)
#   -> {user function header -> list of the six scores, best first}
Vulnerable_Matches = dict()

# The loop variables must not be called `k`: that would overwrite k=3 above.
for vuln_head, vuln_func in vulnerable_code.items():
    code_scores=dict()

    for key in user_code:
        fn=user_code[key]
        # Normalisation is intentionally not applied at this stage.
        #fn=normalizer.NormalizeLLVM(fn)

        jaro_winkler=matcher.jaro_winkler_similarity(fn, vuln_func)
        levenshtein=matcher.levenshtein_similarity(fn, vuln_func)
        ratcliff_obershelp=matcher.ratcliff_obershelp_similarity(fn, vuln_func)
        trigram=matcher.trigram_similarity(fn, vuln_func)
        sorensen_dice=matcher.sorensen_dice_similarity(fn, vuln_func)
        # NOTE(review): this helper is named a *distance*; if it really returns a
        # distance (lower = more similar) it should not be ranked together with
        # the similarities above -- confirm against matchers.py.
        jaccard_distance=matcher.jaccard_distance(fn, vuln_func)

        scores=[jaro_winkler,levenshtein,ratcliff_obershelp,trigram,sorensen_dice,jaccard_distance]
        # Best score first, so indices 0..2 are the top three.
        scores.sort(reverse=True)
        code_scores[key]=scores

    Vulnerable_Matches[vuln_head] = code_scores

# Persist all scores for inspection.
with open('code_scores.json', 'w') as f:
    json.dump(Vulnerable_Matches, f, indent=6)

# MOSS

In [6]:
# Candidate_Functions is a list of tuples, each describing a possible match:
# (user function name, vulnerable function name stored in our database).
Candidate_Functions = []

# Captures the '@name' part of a function header, up to its last '('.
# A raw string is required: '\(' in a normal string literal is an invalid escape sequence.
FUNCTION_NAME_RE = re.compile(r'(@.*)\(')

# MOSS thresholds for Metric1 and Metric2; unfortunately they are highly
# dependent on the vulnerability type.
MOSS_THRESHOLD = 0.7

for vuln_head, vuln_func in vulnerable_code.items():

    # Normalise the vulnerable function stored with us (not the user code).
    normalizedvuln = normalizer.NormalizeLLVM(vuln_func)

    # Scores of every user function against this vulnerable function.
    code_scores = Vulnerable_Matches[vuln_head]

    # For each user function whose three best scores pass the threshold, run MOSS.
    for key in code_scores:
        if check_function_vulnerable(threshold,code_scores[key][0],code_scores[key][1],code_scores[key][2]):
            fn = user_code[key]

            # Sometimes normalising helps, sometimes it hurts.
            normalizedfn = normalizer.NormalizeLLVM(fn)

            # MOSS metrics (defined in Winnowing.py). Parameters passed: K=20, WindowSize=10, P=10
            MOSS_Acc_metric1, MOSS_Acc_metric2, hits, misses1, misses2 = Winnowing.diff(normalizedfn, normalizedvuln, K= 20, WindowSize= 10, P= 10)

            print("\n\nVulnerable function found:",key)
            # NOTE(review): the pair is recorded before the MOSS thresholds are
            # checked, so a pair that fails MOSS is still a candidate -- confirm
            # whether this append belongs inside the `if` below.
            # [0] raises IndexError if a header contains no '@name(' part.
            Candidate_Functions.append((FUNCTION_NAME_RE.findall(key)[0]  ,  FUNCTION_NAME_RE.findall(vuln_head)[0]))

            if(MOSS_Acc_metric1>MOSS_THRESHOLD or MOSS_Acc_metric2>MOSS_THRESHOLD):
                print("MOSS Caught this <3 !!!")
                print(f"Accuracy_Metric 1 = {MOSS_Acc_metric1}  ||  Accuracy_Metric 2 = {MOSS_Acc_metric2}\nhits: {hits} , misses: {misses1}, misses2: {misses2}")
            print('-----------------------')



Vulnerable function found: define void @"CWE23_Relative_Path_Traversal__char_environment_fopen_41::bad"() local_unnamed_addr {

MOSS Caught this <3 !!!
Accuracy_Metric 1 = 0.9059233449477352  ||  Accuracy_Metric 2 = 0.5241935483870968
hits: 4420 , misses: 459, misses2: 4012
-----------------------


Vulnerable function found: define void @"CWE23_Relative_Path_Traversal__char_environment_fopen_41::goodG2B"() local_unnamed_addr {

MOSS Caught this <3 !!!
Accuracy_Metric 1 = 0.8165194164783234  ||  Accuracy_Metric 2 = 0.5601917113053284
hits: 3974 , misses: 893, misses2: 3120
-----------------------


In [None]:
# Show the (user function name, vulnerable function name) pairs collected by the MOSS cell.
print(Candidate_Functions)

# Graph matching ان شاء الله

In [5]:
import graph
import os
import re
from subprocess import run
import pathlib
import json

def _to_forward_slash_path(path):
    """Return ``path`` with backslashes replaced by '/' and its first character upper-cased.

    On Windows this turns e.g. 'd:\\x\\y' into 'D:/x/y'. A path starting with '/'
    is left unchanged apart from the separator replacement.
    """
    normalized = path.replace("\\", "/")
    return normalized[:1].upper() + normalized[1:]


# Path to Marim's script generate_subgraphs.py
absPathtoCFGScript = _to_forward_slash_path(os.path.abspath("../../IrToCFGs/generate_subgraphs.py"))

# Path to the pairs folder in this directory
absPathtoPairsFolder = _to_forward_slash_path(os.getcwd() + "/pairs")

# Run the CFG script on all subfolders inside the pairs folder. The pairs folder is
# passed twice; per the captured output the results land next to the inputs.
# The meaning of the "0" argument is defined by generate_subgraphs.py -- TODO document.
# check=True makes a failing script raise here instead of leaving stale or missing
# graphs for the cells below.
# NOTE(review): "python" is resolved through PATH and may not be this kernel's interpreter.
run(["python",absPathtoCFGScript, "0", absPathtoPairsFolder , absPathtoPairsFolder], check=True)

CompletedProcess(args=['python', 'D:/Guardista/IrToCFGs/generate_subgraphs.py', '0', 'D:/Guardista/Localizer/Common/pairs', 'D:/Guardista/Localizer/Common/pairs'], returncode=0)

In [20]:
# Prepare the graphs of the vulnerable code: precompute them once and keep them in a list.
# The vulnerable code we store lives in the folder ourVulnCodes, and the CFG script
# puts the corresponding CFGs under ourVulnCodes_subgraphs, with (per the original
# note) one subfolder per LLVM file. os.walk visits every subfolder, so all of them
# are picked up.
VulnerableCodeSubgraphsFolder = absPathtoPairsFolder + "/ourVulnCodes_subgraphs/"

# List containing the graph of each precomputed vulnerable function
ourGraphs = []

for currentpath, _folders, filenames in os.walk(VulnerableCodeSubgraphsFolder):

    for filename in filenames:
        # Only JSON files describe graphs; skip anything else found in the tree.
        if pathlib.Path(filename).suffix != ".json":
            continue

        # readGraphFromJSON reads the file itself, so it is not parsed separately here.
        newGraph = graph.Graph()
        newGraph.readGraphFromJSON(currentpath+'/'+filename)
        ourGraphs.append(newGraph)

In [9]:
# We do not compute the CFG graph of EVERY user function, only of the candidate
# functions: for each JSON file in the user-code subgraphs folder we read the
# function name, and if it is listed in Candidate_Functions we build its graph
# and match it against the precomputed vulnerable graphs.

UserCodeSubgraphsFolder = absPathtoPairsFolder + "/UserCode_subgraphs/UserCode_subgraphs"

final_Matched_Functions = []

# User-side names of all candidate pairs, built once for fast membership tests.
candidate_user_functions = {pair[0] for pair in Candidate_Functions}

for jsonfile in os.listdir(UserCodeSubgraphsFolder):
    if(pathlib.Path(jsonfile).suffix != ".json"):
        continue

    fulljsonFilePath = UserCodeSubgraphsFolder+'/'+jsonfile
    with open(fulljsonFilePath) as f:
        jsonDict = json.load(f)
    # Backslashes are stripped so the name can be compared with the candidate names.
    functionName = jsonDict["function_name"].replace('\\', '')

    if functionName not in candidate_user_functions:
        continue

    # Construct the user graph and perform the matching
    UserCodeGraph = graph.Graph()
    UserCodeGraph.readGraphFromJSON(fulljsonFilePath)
    MatchPairs = graph.matchGraphWithListOfGraphs(UserCodeGraph , ourGraphs, otherWayAround=True)

    # MatchPairs is a list of tuples, each holding the user function name and the
    # function name stored in our database. It is appended as a whole, so
    # final_Matched_Functions is a list of such lists.
    if(MatchPairs):
        final_Matched_Functions.append(MatchPairs)

# Final Matches الحمد لله

In [10]:
# Each element is the MatchPairs list obtained for one candidate user function.
print(final_Matched_Functions)
# Author's note: for this run there were no false positives and no false negatives.
# NOTE(review): the same pair is printed twice below -- presumably a result of otherWayAround=True; confirm in graph.py.

[[('@"CWE23_Relative_Path_Traversal__char_environment_fopen_41::bad"', '@"CWE23_Relative_Path_Traversal__char_environment_fopen_18::bad"'), ('@"CWE23_Relative_Path_Traversal__char_environment_fopen_41::bad"', '@"CWE23_Relative_Path_Traversal__char_environment_fopen_18::bad"')]]


In [11]:

#Cleanup
# Delete the JSON files left in the notebook's working directory.
# NOTE(review): this removes EVERY *.json file in the working directory, not only
# the ones this notebook created -- consider listing the generated files explicitly.

import os
for entry in os.listdir('./'):
    # Skip directories whose name happens to end in ".json"; os.remove would raise on them.
    if entry.endswith(".json") and os.path.isfile(entry):
        os.remove(entry)
