<a href="https://colab.research.google.com/github/Neetu162/DeepLearningResearch/blob/master/Run1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas
import sys
import argparse
import datetime

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential, Model
from keras.constraints import maxnorm
from keras.layers import Dense, Dropout, Input, concatenate
from keras.optimizers import Nadam

def create_one_layer(data_width=22300, neurons=25, optimizer='adam', dropout_rate=0.0, weight_constraint=0):
    """Build and compile a binary classifier with one hidden layer.

    Args:
        data_width: Number of input features (``input_dim`` of the hidden
            layer). Defaults to 22300, the input width hard-coded by the
            other Sequential builders in this file, so the function can be
            used as a ``build_fn`` without supplying the width explicitly.
        neurons: Number of units in the hidden layer.
        optimizer: Optimizer passed to ``model.compile``.
        dropout_rate: Rate given to the ``Dropout`` layer after the hidden
            layer.
        weight_constraint: Accepted so the signature matches the other
            builders; it is not applied to any layer in this model.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    model = Sequential()
    # The first argument of Dense is the number of neurons in the hidden layer.
    model.add(Dense(neurons, input_dim=data_width, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))
    # Single sigmoid unit: the model predicts one binary label.
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model


def create_fourSameLayer(neurons=25, optimizer='adam', dropout_rate=0.0, weight_constraint=0):
    """Build and compile a binary classifier with four equally wide hidden layers.

    Each hidden layer has ``neurons`` ReLU units constrained with
    ``maxnorm(weight_constraint)`` and is followed by
    ``Dropout(dropout_rate)``. The first layer expects 22300 input features.

    Args:
        neurons: Number of units in every hidden layer.
        optimizer: Optimizer passed to ``compile``.
        dropout_rate: Rate of the ``Dropout`` layer after each hidden layer.
        weight_constraint: Value handed to ``maxnorm`` for every hidden layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    def hidden_layer(**extra):
        # All four hidden layers share this configuration; only the first
        # one additionally receives ``input_dim``.
        return Dense(neurons, kernel_initializer='normal', activation='relu',
                     kernel_constraint=maxnorm(weight_constraint), **extra)

    net = Sequential()
    net.add(hidden_layer(input_dim=22300))
    net.add(Dropout(dropout_rate))
    # Remaining three hidden layers, each followed by dropout.
    for _ in range(3):
        net.add(hidden_layer())
        net.add(Dropout(dropout_rate))
    # Sigmoid output for the binary label.
    net.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net


def create_fourDecrLayer(neurons=25, optimizer='adam', dropout_rate=0.0, weight_constraint=0):
    """Build and compile a binary classifier with four shrinking hidden layers.

    The hidden layers have ``neurons``, ``neurons // 2``, ``neurons // 3``
    and ``neurons // 4`` units, with each reduced width floored at 1. The
    first layer expects 22300 input features.

    Args:
        neurons: Number of units in the first hidden layer.
        optimizer: Optimizer passed to ``compile``.
        dropout_rate: Rate of the ``Dropout`` layers.
        weight_constraint: Value handed to ``maxnorm`` for every hidden layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Widths of hidden layers two to four; never smaller than one unit.
    reduced_widths = [max(neurons // divisor, 1) for divisor in (2, 3, 4)]

    net = Sequential()
    net.add(Dense(neurons, input_dim=22300, kernel_initializer='normal', activation='relu',
                  kernel_constraint=maxnorm(weight_constraint)))
    net.add(Dropout(dropout_rate))
    for width in reduced_widths:
        net.add(Dense(width, kernel_initializer='normal', activation='relu',
                      kernel_constraint=maxnorm(weight_constraint)))
        # Dropout is skipped after a layer that has a single unit.
        if width > 1:
            net.add(Dropout(dropout_rate))
    # Sigmoid output for the binary label.
    net.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net


def create_binaryDecrease(neurons=25, optimizer='adam', dropout_rate=0.0, weight_constraint=0):
    """Build and compile a binary classifier whose hidden layers halve in width.

    The first hidden layer has ``neurons`` units; every following layer has
    half the units of the previous one (integer division) until the width
    reaches 1. For ``neurons=25`` the hidden widths are 25, 12, 6, 3, 1.
    The first layer expects 22300 input features.

    Args:
        neurons: Number of units in the first hidden layer.
        optimizer: Optimizer passed to ``compile``.
        dropout_rate: Accepted so the builder can be configured with the
            same keywords as the other builders; this model has no dropout
            layers, so the value is ignored.
        weight_constraint: Accepted for the same reason; ignored.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    width = int(neurons)
    # The first param in Dense is the number of neurons in the first hidden layer
    model.add(Dense(width, input_dim=22300, kernel_initializer='normal', activation='relu'))
    # Integer halving: Dense needs a whole number of units, and true
    # division ("/") would produce widths such as 12.5.
    while width // 2 >= 1:
        width //= 2
        model.add(Dense(width, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

def create_dualInputSimple(input_ratio, feat_width, perm_width, neurons=32, dropout_rate=0.1):
    """Build a two-input binary classifier with no layers after the merge.

    Each input passes through one ReLU ``Dense`` layer; the two results are
    concatenated and fed directly to the sigmoid output.

    Args:
        input_ratio: Multiplier applied to ``neurons`` to size the features
            branch (``int(neurons * input_ratio)`` units).
        feat_width: Width of the ``features_input`` vector.
        perm_width: Width of the ``permissions_input`` vector.
        neurons: Number of units in the permissions branch.
        dropout_rate: Not used by this model.

    Returns:
        The compiled ``Model`` taking ``[permissions, features]`` inputs.
    """
    # Permissions branch
    permissions_in = Input(shape=(perm_width,), name='permissions_input')
    permissions_branch = Dense(neurons, activation='relu')(permissions_in)

    # Features branch, sized relative to the permissions branch
    features_in = Input(shape=(feat_width,), name='features_input')
    features_units = int(neurons * input_ratio)
    features_branch = Dense(features_units, activation='relu')(features_in)

    # Merge both branches and predict directly from the merged vector
    merged = concatenate([permissions_branch, features_branch])
    prediction = Dense(1, activation='sigmoid', name='output')(merged)

    network = Model(inputs=[permissions_in, features_in], outputs=prediction)
    network.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return network

def create_dualInputLarge(input_ratio, feat_width, perm_width, neurons=32, dropout_rate=0.1):
    """Build a two-input binary classifier with extra layers after the merge.

    The permissions branch is Dense -> Dropout -> Dense; the features branch
    is a single Dense layer. After concatenation the model applies
    Dense -> Dropout -> Dense before the sigmoid output.

    Args:
        input_ratio: Multiplier applied to ``neurons`` to size the features
            branch; also used to size the layers after the merge.
        feat_width: Width of the ``features_input`` vector.
        perm_width: Width of the ``permissions_input`` vector; converted
            with ``int`` before use.
        neurons: Number of units in each permissions-branch layer.
        dropout_rate: Rate of both ``Dropout`` layers.

    Returns:
        The compiled ``Model`` taking ``[permissions, features]`` inputs.
    """
    # Permissions branch: two dense layers with dropout in between
    permissions_in = Input(shape=(int(perm_width),), name='permissions_input')
    permissions_branch = Dense(neurons, activation='relu')(permissions_in)
    permissions_branch = Dropout(dropout_rate)(permissions_branch)
    permissions_branch = Dense(neurons, activation='relu')(permissions_branch)

    # Features branch: one dense layer sized relative to the other branch
    features_in = Input(shape=(feat_width,), name='features_input')
    features_branch = Dense(int(neurons * input_ratio), activation='relu')(features_in)

    # Layers after the merge use the mean of the two branch widths
    merged_units = int((neurons + (neurons * input_ratio)) / 2)
    merged = concatenate([permissions_branch, features_branch])
    merged = Dense(merged_units, activation='relu')(merged)
    merged = Dropout(dropout_rate)(merged)
    merged = Dense(merged_units, activation='relu')(merged)

    prediction = Dense(1, activation='sigmoid', name="output")(merged)
    network = Model(inputs=[permissions_in, features_in], outputs=prediction)
    network.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return network

def vectorize(good_path, mal_path, adverse):
    """Read the two sample files and turn them into features and labels.

    Every line of each file is treated as one sample. Lines from
    ``good_path`` are labeled 0 and lines from ``mal_path`` are labeled 1;
    the good samples come first in both returned arrays.

    Args:
        good_path: Path of the text file holding the benign samples.
        mal_path: Path of the text file holding the malware samples.
        adverse: When truthy, deliberately mislabel the first tenth of the
            good samples (as 1) and the first tenth of the malware samples
            (as 0) to simulate adversarial learning.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is the dense 2-D count
        matrix produced by ``CountVectorizer`` and ``labels`` is a 1-D
        float array with one entry per sample.
    """
    # read files
    with open(good_path) as f:
        gdprm = f.readlines()
    with open(mal_path) as f:
        mlprm = f.readlines()

    good_count = len(gdprm)
    mal_count = len(mlprm)

    # Concatenate good and mal samples
    perms = gdprm + mlprm

    # good is labeled 0, malware is labeled 1 (built in one step instead of
    # repeated np.append calls, which copy the array on every iteration)
    labels = np.concatenate((np.zeros(good_count), np.ones(mal_count)))

    # Define the sklearn vectorizer. The prefix group is non-capturing
    # ("(?:") so the pattern has exactly one capturing group; the token is
    # the whole "[uses-|optional-]permission: <value>" match.
    count_vect = CountVectorizer(input=u'content', analyzer=u'word',
                                 token_pattern=r'(\b(?:uses-|optional-)?permission:\s[^\s]*)')

    # vectorize input and convert the sparse result to a dense ndarray
    features = count_vect.fit_transform(perms).toarray()

    # This is in the case of adversarial learning
    # Some of the labels will be wrong on purpose
    if adverse:
        print("Adversarial Learning")

        # change 10% of the good labels
        count1 = good_count // 10
        labels[:count1] = 1
        print("Good Permissions Changed: %d" % count1)

        # change 10% of the malware labels
        count2 = mal_count // 10
        labels[good_count:good_count + count2] = 0
        print("Malware Permissions Changed: %d" % count2)

        total = count1 + count2
        print("Total Permissions Changed: %d" % total)

    print("Done Vectorizing Data")
    return features, labels

# Method for a standard test -- Not Grid Search
def full_run(modelName, features, labels, train_ratio, args):
    """Cross-validate one model with fixed hyper-parameters and save the scores.

    Args:
        modelName: One of "oneLayer", "binaryDecrease", "fourSame" or
            "fourDecr".
        features: 2-D feature matrix, one row per sample.
        labels: 1-D label array aligned with ``features``.
        train_ratio: Percentage (0-100); divided by 100 and passed to
            ``StratifiedShuffleSplit`` as ``test_size``.
        args: Configuration dict with the keys "epochs", "batch_size",
            "neurons", "optimizer", "weight_constraint", "dropout"
            (a percentage) and "splits". "optimizer" may be a single name
            or a sequence whose first entry is used.

    Returns:
        0 after the results CSV has been written.

    Raises:
        ValueError: If ``modelName`` is not one of the known model names.
    """
    import inspect

    # Get Vars from input args
    print("inside full_run method ")
    epochs = args["epochs"]
    batch_size = args["batch_size"]
    optimizer = args["optimizer"]
    # A plain string is already the optimizer name; indexing it would keep
    # only its first character. Sequences contribute their first entry.
    if not isinstance(optimizer, str):
        optimizer = optimizer[0]
    # NOTE(review): the value named "train_ratio" ends up as test_size below
    # -- confirm which of the two is intended.
    percent = float(train_ratio) / 100
    splits = args["splits"]
    print("variables assigned")

    fit_params = dict(batch_size=batch_size, epochs=epochs)
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    # Build function and Keras verbosity for every supported model name
    builders = {
        "oneLayer": (create_one_layer, 2),
        "binaryDecrease": (create_binaryDecrease, 2),
        "fourSame": (create_fourSameLayer, 2),
        "fourDecr": (create_fourDecrLayer, 1),
    }
    if modelName not in builders:
        raise ValueError("Unknown model name: %r" % (modelName,))
    build_fn, verbosity = builders[modelName]

    # Only hand a builder the keywords its signature declares: the wrapper
    # rejects unknown parameters, and the one-layer builder needs the input
    # width, which is taken from the feature matrix.
    candidate_params = dict(neurons=args["neurons"], optimizer=optimizer,
                            weight_constraint=args["weight_constraint"],
                            dropout_rate=args["dropout"] / 100,
                            data_width=np.shape(features)[1])
    accepted = inspect.signature(build_fn).parameters
    model_params = {key: value for key, value in candidate_params.items() if key in accepted}

    # Define and Build the Model based on input modelName
    if modelName == "fourDecr":
        print("fourDecr start")
    model = KerasClassifier(build_fn=build_fn, batch_size=batch_size, epochs=epochs,
                            verbose=verbosity, **model_params)
    if modelName == "fourDecr":
        print("fourDecr end")

    # Shuffle split Definition for Cross Validation
    print("sss start")
    sss = StratifiedShuffleSplit(n_splits=splits, test_size=percent, random_state=0)
    print("sss end")
    # Running the model with Cross Validation
    print("cv_result start")
    cv_result = cross_validate(model, features, labels, cv=sss, fit_params=fit_params, return_train_score=True,
                               scoring=scoring, verbose=100)
    print("cv_result end")

    # Timestamp (month, day, hour, minute) for the file name.
    # This helps to keep track of tests and prevents overwriting of results
    stamp = datetime.datetime.today().strftime('%m%d%H%M')

    # saving the result of testing to a Pandas Dataframe
    df = pandas.DataFrame(cv_result)
    print("Writing to file ...")
    # Write the results out to a file; fall back to the working directory
    # when the preferred results directory cannot be opened.
    primary_path = '/home/osboxes/DeepLearningResearch/Classification/results/' + modelName + stamp + '.csv'
    fallback_path = "results" + modelName + stamp + ".csv"
    try:
        file1 = open(primary_path, "a+")
    except OSError:
        file1 = open(fallback_path, "a+")
    # The context manager closes the file even if writing fails.
    with file1:
        df.to_csv(file1, index=True)

    return 0


# Grid Search Method
def grid_search(modelName, features, labels, train_ratio, args):
    """Run a hyper-parameter grid search for one model and save the results.

    Args:
        modelName: One of "oneLayer", "binaryDecrease", "fourSame" or
            "fourDecr".
        features: 2-D feature matrix, one row per sample.
        labels: 1-D label array aligned with ``features``.
        train_ratio: Percentage (0-100); divided by 100 and passed to
            ``StratifiedShuffleSplit`` as ``test_size``.
        args: Configuration dict with the keys "epochs", "batch_size",
            "neurons", "optimizer", "weight_constraint", "dropout"
            (percentages) and "splits". Each hyper-parameter may be a
            single value or a list/tuple of candidate values.

    Returns:
        0 after the results CSV has been written.

    Raises:
        ValueError: If ``modelName`` is not one of the known model names.
    """
    import inspect

    def _as_list(value):
        # A parameter grid needs a sequence of candidates per parameter;
        # wrap single values so scalar configuration still works.
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    # Get Vars from input args
    splits = args["splits"]
    percent = float(train_ratio) / 100

    # Model Definition based on input modelName
    builders = {
        "oneLayer": create_one_layer,
        "binaryDecrease": create_binaryDecrease,
        "fourSame": create_fourSameLayer,
        "fourDecr": create_fourDecrLayer,
    }
    if modelName not in builders:
        raise ValueError("Unknown model name: %r" % (modelName,))
    build_fn = builders[modelName]
    accepted = inspect.signature(build_fn).parameters

    # Define the grid based on params. epochs and batch_size are training
    # arguments; the remaining entries are only searched when the chosen
    # builder declares them, because the wrapper rejects unknown parameters.
    paramGrid = dict(epochs=_as_list(args["epochs"]), batch_size=_as_list(args["batch_size"]))
    model_candidates = dict(
        optimizer=_as_list(args["optimizer"]),
        # "dropout" is configured as a percentage, as in full_run
        dropout_rate=[rate / 100 for rate in _as_list(args["dropout"])],
        weight_constraint=_as_list(args["weight_constraint"]),
        neurons=_as_list(args["neurons"]),
    )
    for key, values in model_candidates.items():
        if key in accepted:
            paramGrid[key] = values

    # Builders that take the input width get it from the feature matrix
    fixed_params = {}
    if "data_width" in accepted:
        fixed_params["data_width"] = np.shape(features)[1]
    model = KerasClassifier(build_fn=build_fn, verbose=0, **fixed_params)

    # Define Split and Grid Search Cross Validation
    sss = StratifiedShuffleSplit(n_splits=splits, test_size=percent, random_state=0)
    grid = GridSearchCV(estimator=model, param_grid=paramGrid, n_jobs=1, cv=sss, refit=True, verbose=2)

    # Execute a grid search
    grid_fit = grid.fit(features, labels)

    print("%s Best: %f using %s" % (modelName, grid_fit.best_score_, grid_fit.best_params_))

    # Timestamp (month, day, hour, minute) for the file name.
    # This helps to keep track of tests and prevents overwriting of results
    stamp = datetime.datetime.today().strftime('%m%d%H%M')

    # Save results to  Pandas Dataframe
    df = pandas.DataFrame(grid_fit.cv_results_)

    # Write the results out to a file; fall back to the working directory
    # when the Drive path cannot be opened. The fallback name carries no
    # timestamp, so it is overwritten by later runs of the same model.
    primary_path = '/content/drive/My Drive/data/gridSearch' + modelName + stamp + '.csv'
    fallback_path = "gridSearch" + modelName + ".csv"
    try:
        file1 = open(primary_path, "w+")
    except OSError:
        file1 = open(fallback_path, "w+")
    # The context manager closes the file even if writing fails.
    with file1:
        df.to_csv(file1, index=True)

    return 0

# The run configuration is defined in this method (all values are hard-coded)
def parse_arguments():
    """Return the run configuration as a freshly built dict.

    Nothing is read from the command line; every value is fixed here.

    Returns:
        A dict with the input file paths ("good_path", "mal_path"), the
        "adverse" flag, the run "mode", the list of "model" names, the
        list of "train_ratio" percentages and the hyper-parameters
        "epochs", "batch_size", "neurons", "optimizer",
        "weight_constraint", "dropout" and "splits".
    """
    return {
        "good_path": "/content/drive/My Drive/data/goodPermissionsFinal.txt",
        "mal_path": "/content/drive/My Drive/data/malwarePermissionsFinal.txt",
        "adverse": False,
        "mode": "full",
        "model": ["fourDecr"],
        "epochs": 16,
        "train_ratio": [20, 40, 60, 80],
        "batch_size": 10,
        "neurons": 45,
        "optimizer": "Nadam",
        "weight_constraint": 5,
        "dropout": 10,
        "splits": 1,
    }

# Load the run configuration and echo it for the log
args = parse_arguments()
print("Arguments" + str(args))
# Turn the two sample files into a feature matrix and a label vector
features, labels = vectorize(args["good_path"], args["mal_path"], args["adverse"])
print("Completed the vectorize")
# Every configured model is evaluated at every configured ratio
for m in args["model"]:
    for r in args["train_ratio"]:
        if args["mode"] == "grid":
            # Grid Search
            grid_search(m, features, labels, r, args)
        else:
            # Regular run -- Not Grid Search
            full_run(m, features, labels, r, args)


Using TensorFlow backend.


Arguments{'good_path': '/content/drive/My Drive/data/goodPermissionsFinal.txt', 'mal_path': '/content/drive/My Drive/data/malwarePermissionsFinal.txt', 'adverse': False, 'mode': 'full', 'model': ['fourDecr'], 'epochs': 16, 'train_ratio': [20, 40, 60, 80], 'batch_size': 10, 'neurons': 45, 'optimizer': 'Nadam', 'weight_constraint': 5, 'dropout': 10, 'splits': 1}


full_run method

Google drive path: /content/drive/My Drive/data