In [3]:
import os
import json
import subprocess
import collections
import math
import pandas as pd
from pyrsistent import s

In [10]:
def estimate_shannon_entropy(data, base=10):
    """Return the Shannon entropy of the symbols in ``data``.

    The entropy is ``-sum(p_i * log(p_i))`` over every distinct symbol, where
    ``p_i`` is the fraction of positions in ``data`` that hold that symbol.

    Args:
        data: A sized iterable of hashable symbols, e.g. the text of a script
            (each character is then one symbol).
        base: Base of the logarithm. The default of 10 keeps the values this
            function has always produced; pass 2 to get the result in bits.

    Returns:
        The entropy as a float, or the integer ``0`` when ``data`` is empty.
    """
    total = len(data)
    counts = collections.Counter(data)

    entropy = 0
    for count in counts.values():
        # Share of all positions occupied by this symbol.
        probability = count / float(total)
        # Subtract each term rather than negating the finished sum: negating
        # a sum of 0.0 (single-symbol input) would produce -0.0, which then
        # shows up as "-0.0" once the value is serialised.
        entropy -= probability * math.log(probability, base)

    return entropy

def cleanBenData(path="ben_features.json"):
    """Remove benign samples that carry strongly malicious indicators.

    Loads the JSON feature file at ``path`` (a mapping of script name to a
    list of feature names whose last element is the entropy value appended by
    ``getFeature``) and rewrites the same file without every entry whose
    feature names include one of the blocked indicators.

    Args:
        path: Feature file to filter in place. Defaults to the file that
            ``main`` writes for the benign set.
    """
    # Names are stored without spaces because cleanContent strips them, so
    # the indicators must be spelled without spaces too ("DisabledProtections",
    # not "Disabled Protections") or they can never match.
    blocked = {
        "CodeInjection",
        "NegativeContext",
        "Ransomware",
        "MaliciousBehaviorCombo",
        "DisabledProtections",
        "CoinMiner",
        "EmbeddedFile",
    }

    with open(path) as f:
        data = json.load(f)

    kept = {}
    for key, features in data.items():
        # Skip the trailing entropy value; normalise spacing so files written
        # with either spelling of a feature name are filtered the same way.
        names = {str(name).replace(" ", "") for name in features[:-1]}
        if names.isdisjoint(blocked):
            kept[key] = features

    with open(path, 'w') as f:
        json.dump(kept, f)


def cleanContent(output):
    """Extract the feature names from one profiler output line.

    Args:
        output: The profiler's output line split on commas. ``output[2]`` is
            tested for the substring "Unknown" (presumably the profiler's
            verdict field; confirm against PowerShellProfiler.py), and
            ``output[-1]`` holds a bracketed list of ``name - value`` entries
            separated by ``|``.

    Returns:
        A list of feature names with all spaces removed and everything from
        the first ``-`` onwards dropped, or ``None`` when ``output[2]``
        contains "Unknown" or ``output`` has fewer than three fields.
    """
    if len(output) < 3 or "Unknown" in output[2]:
        return None

    entries = output[-1].strip(' [').strip(']').split("|")
    fea_list = []
    for entry in entries:
        compact = entry.replace(" ", "")
        # partition() keeps the whole entry when it has no "-"; slicing up to
        # find("-") would use -1 in that case and cut off the last character.
        name = compact.partition("-")[0]
        if name:
            # An empty feature list ("[]") yields an empty entry; skip it.
            fea_list.append(name)
    return fea_list


def getFeature(path):
    """Profile every file in a directory and collect its features.

    Each file is passed to ``PowerShellProfiler.py`` in a subprocess, the
    output is parsed with ``cleanContent``, and the entropy of the file's text
    (read as ISO-8859-1) is appended to the resulting feature list.

    Args:
        path: Directory containing the scripts to profile.

    Returns:
        A dict mapping file name to its feature names followed by the entropy
        rounded to 5 decimals. Files for which ``cleanContent`` returns
        ``None`` and files whose processing fails are left out.
    """
    result = {}
    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        try:
            p1 = subprocess.check_output(["python", "..\\2_data_preprocessing\\PowerShellProfiler.py", "-f", file_path])
            output = p1.decode('utf-8').strip('\r\n').split(',')
            fea_list = cleanContent(output)
            if fea_list is None:
                continue

            with open(file_path, encoding="ISO-8859-1") as f:
                raw_context = f.read()
            entropy = round(estimate_shannon_entropy(raw_context), 5)
        except Exception as exc:
            # Best effort: one bad file must not abort the whole run, but say
            # which file was skipped instead of hiding the failure.
            print("getFeature: skipped {!r}: {}".format(file, exc))
            continue

        # Store the entry only once it is complete, so a failure while reading
        # the file can never leave a feature list without its entropy value.
        result[file] = fea_list + [entropy]
    return result

In [11]:
def main():
    """Build the benign and malicious feature files.

    Features of the scripts in the ``ben`` directory are written to
    ``ben_features.json`` and then filtered by ``cleanBenData``; features of
    the scripts in the ``mal`` directory are written to ``mal_features.json``.
    """
    def save_features(source_dir, target_file):
        # Extract the features of one directory and dump them as JSON.
        extracted = getFeature(source_dir)
        with open(target_file, 'w') as handle:
            handle.write(json.dumps(extracted))

    save_features(".\\ben\\", 'ben_features.json')
    cleanBenData()
    save_features(".\\mal\\", 'mal_features.json')


# Only run the pipeline when this code is executed directly, not on import.
if __name__ == '__main__':
    main()

---

### 生成Scripts.csv

In [24]:
# Merge the benign and malicious feature maps into one dict keyed by script
# name. The malicious file is loaded second, so its entry wins when a name
# occurs in both files. The bare file names match the ones main() writes and
# avoid the Windows-only ".\\" prefix.
data = {}
for features_file in ("ben_features.json", "mal_features.json"):
    with open(features_file, 'r') as f:
        data.update(json.load(f))

In [25]:
# Ordered feature columns of the output table: position i of every row built
# below corresponds to type_list[i].
type_list = ['OneLiner', 'VariableExtension', 'AbnormalSize', 'Obfuscation', 'Entropy',
             'PositiveContext', 'ScriptLogging', 'FunctionBody', 'License',
             'Downloader', 'StartsProcess', 'ScriptExecution', 'Crypto', 'Enumeration', 
             'HiddenWindow', 'CustomWebFields', 'Persistence', 'Registry', 'Sleeps',
             'SysInternals', 'Compression', 'UninstallsApps', 'ByteUsage', 
             'NegativeContext', 'KnownMalware', 'CodeInjection', 'DNSC2', 'AppLockerBypass',
             'AMSIBypass', 'EmbeddedFile', 'ClearLogs', 'DisabledProtections', 'ScreenScraping'
            ]

# Columns that hold a value instead of a 0/1 flag. Looking the positions up
# keeps them correct if type_list is ever reordered or trimmed.
obfuscation_idx = type_list.index('Obfuscation')
entropy_idx = type_list.index('Entropy')
known_malware_idx = type_list.index('KnownMalware')

new_data = {}
for script_name, features in data.items():
    # 1 when the script reported the feature under exactly this name, else 0.
    row = [1 if column in features else 0 for column in type_list]

    # The last element of each feature list is the entropy value.
    row[entropy_idx] = str(features[-1])

    for tag in features[:-1]:
        if "KnownMalware" in tag:
            # Keep the text after the 13-character "KnownMalware:" prefix.
            row[known_malware_idx] = tag[13:]

        if "Obfuscation" in tag:
            if "Obfuscation:" in tag:
                # Keep the text after the 12-character "Obfuscation:" prefix.
                row[obfuscation_idx] = tag[12:]
            else:
                row[obfuscation_idx] = "dominant"

    # Label column: 0 when the script name contains "ben_", otherwise 1.
    row.append(0 if "ben_" in script_name else 1)
    new_data[script_name] = row


In [26]:
# new_data maps script name -> row values, so the frame is transposed to get
# one row per script. The columns are the feature names from type_list plus
# the label appended to every row; deriving them from type_list keeps the
# header in step with the rows instead of maintaining a second copy.
df = pd.DataFrame(new_data).T
df.columns = type_list + ['Result']

df.to_csv("NEW_scripts.csv")