In [26]:
import pandas as pd
import numpy as np
import pickle
import sklearn.ensemble as ske
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [27]:

data = pd.read_csv('data.csv', sep='|')
X = data.drop(['Name', 'md5', 'legitimate','ImageBase'], axis=1).values
y = data['legitimate'].values

In [28]:
print('Researching important feature based on %i total features\n' % X.shape[1])

Researching important feature based on 53 total features



In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138047 entries, 0 to 138046
Data columns (total 57 columns):
Name                           138047 non-null object
md5                            138047 non-null object
Machine                        138047 non-null int64
SizeOfOptionalHeader           138047 non-null int64
Characteristics                138047 non-null int64
MajorLinkerVersion             138047 non-null int64
MinorLinkerVersion             138047 non-null int64
SizeOfCode                     138047 non-null int64
SizeOfInitializedData          138047 non-null int64
SizeOfUninitializedData        138047 non-null int64
AddressOfEntryPoint            138047 non-null int64
BaseOfCode                     138047 non-null int64
BaseOfData                     138047 non-null int64
ImageBase                      138047 non-null object
SectionAlignment               138047 non-null int64
FileAlignment                  138047 non-null int64
MajorOperatingSystemVersion    138047 

In [30]:
# Feature selection using Trees Classifier
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]

In [31]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new, y ,test_size=0.2)

features = []

In [32]:
print('%i features identified as important:' % nb_features)

11 features identified as important:


In [33]:
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))


1. feature Subsystem (0.161699)
2. feature Machine (0.122218)
3. feature LoadConfigurationSize (0.112629)
4. feature MinorImageVersion (0.082488)
5. feature ResourcesMeanEntropy (0.068195)
6. feature SectionsMinEntropy (0.064333)
7. feature SizeOfOptionalHeader (0.060621)
8. feature Characteristics (0.044774)
9. feature CheckSum (0.037737)
10. feature SectionsMeanEntropy (0.032157)
11. feature ResourcesMinEntropy (0.031673)


In [34]:
# XXX : take care of the feature order
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2+f])

In [35]:
#Algorithm comparison
algorithms = {
        "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
        "RandomForest": ske.RandomForestClassifier(n_estimators=50),
        "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
        "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
        "GNB": GaussianNB()
    }

In [36]:
results = {}
print("\nNow testing algorithms")
for algo in algorithms:
    clf = algorithms[algo]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score*100))
    results[algo] = score
    
    
    


Now testing algorithms
GNB : 89.757334 %
DecisionTree : 98.935168 %
RandomForest : 99.369794 %
AdaBoost : 98.649040 %
GradientBoosting : 98.638175 %


In [37]:

winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))


Winner algorithm is RandomForest with a 99.369794 % success


In [38]:
# Save the algorithm and the feature list for later predictions
print('Saving algorithm and feature list in classifier directory...')
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
open('classifier/features.pkl', 'w').write(pickle.dumps(features))
print('Saved')

Saving algorithm and feature list in classifier directory...
Saved


In [39]:
# Identify false and true positive rates
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))

False positive rate : 0.545568 %
False negative rate : 0.824964 %
