In [None]:
import pandas as pd
import numpy as np
from sklearn import feature_extraction 

In [None]:
from dgadetec import dataset
import pandas as pd
import numpy as np
from dgadetec.feature_extractor import  get_feature
from sklearn.externals import joblib

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import metrics

## pre-load data

In [None]:
import os
def load_simple_data():
	"""Collect every domain listed under ./dgadetec/AlgorithmPowereddomains.

	Each directory entry is read with pandas as a header-less, single-column
	CSV (column name ``domain``); the values of all files are concatenated
	in directory-listing order.

	Returns:
		list: the ``domain`` column values of every file, flattened.
	"""
	collected = []
	for file_name in os.listdir('./dgadetec/AlgorithmPowereddomains'):
		frame = pd.read_csv('./dgadetec/AlgorithmPowereddomains/'+file_name,
							names=['domain'])
		# extend() appends the values one by one, exactly like the inner loop did.
		collected.extend(frame['domain'].tolist())
	return collected


def load_data():
	"""Build (or reload) the shuffled ``[domain, label]`` training array.

	On first use the three resource files are read, labelled, stacked,
	shuffled in place and cached to ``./dgadetec/resource/train.npy``.
	Later calls return the cached array unchanged.

	Label convention as written here: rows from ``360.txt`` and
	``dga-feed.txt`` get label 0, rows from ``normal_domains.csv`` get
	label 1 (presumably 0 = DGA, 1 = benign -- confirm against the consumers).

	Returns:
		numpy.ndarray: array of shape ``(n_samples, 2)`` holding the domain in
		column 0 and the integer label in column 1.
	"""
	cache_path = './dgadetec/resource/train.npy'
	if os.path.exists(cache_path):
		# The cache mixes strings and ints, so it is stored as an object array.
		# np.load refuses object arrays unless allow_pickle=True (the default
		# became False in NumPy 1.16.3), which made every call after the first
		# one raise ValueError.
		# NOTE(review): unpickling executes code -- only load a cache file that
		# this function wrote itself.
		return np.load(cache_path, allow_pickle=True)

	# 360.txt: multi-column feed; only column 1 is kept.
	domains360 = pd.read_csv('./dgadetec/resource/360.txt',
							header=None)[[1]]
	domains360 = domains360.dropna()
	domains360['label'] = 0

	domainsdga = pd.read_csv('./dgadetec/resource/dga-feed.txt',
								names=['domain'],
								header=None)
	domainsdga = domainsdga.dropna()
	domainsdga['label'] = 0

	domain_normal = pd.read_csv('./dgadetec/resource/normal_domains.csv',
							names=['domain'],
							header=None)
	domain_normal = domain_normal.dropna()
	domain_normal['label'] = 1

	# Stack the raw values; column names differ between frames, so the
	# concatenation is positional (domain, label).
	train = np.concatenate((domains360.values, domainsdga.values, domain_normal.values),axis=0)

	# Shuffle rows in place, then cache so later runs skip the CSV parsing.
	np.random.shuffle(train)
	np.save(cache_path, train)

	return train

In [5]:
# Load the [domain, label] rows, keep the first occurrence of every domain,
# and convert back to a plain array for positional slicing below.
data = np.array(
    pd.DataFrame(load_data(), columns=['domain', 'label'])
    .drop_duplicates(subset='domain')
)
print("all samples= ",data.shape)
# Quick check of which label values are present after de-duplication.
print("dataY contains:", np.unique(data[:,1]))

all samples=  (2101904, 2)
dataY contains: [0 1]


In [20]:
# Rows 0..49999 are the training set, rows 50000..50999 the held-out test set.
# Column 0 holds the domain, column 1 the label (cast to int for sklearn).
trainX, trainY = data[:50000,0], data[:50000,1].astype(int)
testX, testY = data[50000:51000, 0], data[50000:51000, 1].astype(int)


In [21]:
# Replace the raw domain arrays with the output of get_feature.
# NOTE(review): this overwrites trainX/testX with a transform of themselves, so
# re-running this cell without re-running the split cell feeds get_feature its
# own output.
trainX, testX = get_feature(trainX), get_feature(testX)

[0.3187381  0.42660256 0.20134589 ... 0.23543424 0.39381518 0.43045635]
[0.32671655 0.23424321 0.27385793 0.41008987 0.45781297 0.32600289
 0.37289646 0.3052365  0.3693704  0.39822261 0.30106916 0.40053946
 0.35698622 0.33175267 0.35178655 0.18070666 0.40532742 0.43344376
 0.45479798 0.2386297  0.46782634 0.34911255 0.36818383 0.29176441
 0.44039485 0.41479396 0.38911028 0.42226793 0.33585784 0.36501214
 0.25389146 0.19140986 0.3978827  0.31683156 0.4203792  0.43800797
 0.22583226 0.39103985 0.38018404 0.34167815 0.36691312 0.40221559
 0.35148518 0.42030795 0.23684524 0.37451546 0.33707952 0.40421745
 0.4166625  0.22213147 0.36638213 0.42253041 0.16122357 0.42665105
 0.44540129 0.29419213 0.20023406 0.46412205 0.26756634 0.35927739
 0.38863796 0.44005245 0.4297787  0.3898366  0.25379837 0.22143952
 0.22032139 0.18494579 0.34155362 0.26810023 0.38678903 0.40338794
 0.22930481 0.41446226 0.23192565 0.31750209 0.33384933 0.32228649
 0.38315163 0.33451195 0.39072479 0.41735294 0.36256868 0

In [38]:
# Sanity check: display the dimensions of the extracted training features.
trainX.shape

(50000, 10)

## various models

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

In [23]:
def metric_me(y_true, y_pred):
    """Score predictions against the ground truth.

    Args:
        y_true: true labels.
        y_pred: predicted labels, aligned with ``y_true``.

    Returns:
        tuple: ``(accuracy, f1)`` as computed by ``accuracy_score`` and
        ``f1_score`` with their default arguments.
    """
    return accuracy_score(y_true, y_pred), f1_score(y_true, y_pred)

In [24]:
# Three baseline classifiers, all with default hyper-parameters.
simpleLR = LogisticRegression()
simpleSVM = SVC()
simpleGBM = GradientBoostingClassifier()

# Fit each model on the training split, score it on the test split and report
# accuracy / F1 in the same order as before: LR, SVM, GBM.
for report_template, estimator in (
    ("simpleLR acc={} f1={}", simpleLR),
    ("simpleSVM acc={} f1={}", simpleSVM),
    ("simpleGBM acc={} f1={}", simpleGBM),
):
    estimator.fit(trainX, trainY)
    pred_y = estimator.predict(testX)
    acc, f1 = metric_me(testY, pred_y)
    print(report_template.format(acc, f1))


simpleLR acc=0.912 f1=0.9098360655737705
simpleSVM acc=0.937 f1=0.9347150259067358
simpleGBM acc=0.94 f1=0.937888198757764


In [39]:
# Persist the three fitted classifiers under ./dgadetec/models/ so they can be
# reloaded without retraining.
# NOTE(review): sklearn.externals.joblib was removed in newer scikit-learn
# releases -- confirm the pinned version, or switch to the standalone joblib.
from sklearn.externals import joblib
joblib.dump(simpleLR, './dgadetec/models/LR.pkl')
joblib.dump(simpleSVM, './dgadetec/models/SVM.pkl')
# Last expression of the cell: its return value (the written file names) is displayed.
joblib.dump(simpleGBM, './dgadetec/models/GBM.pkl')

['./dgadetec/models/GBM.pkl']

In [37]:
import time

# Measure end-to-end latency (feature extraction + three predictions) for a
# single domain. time.clock() was removed in Python 3.8 and raises
# AttributeError there; time.perf_counter() is the documented replacement for
# timing short intervals.
start = time.perf_counter()
X = get_feature(['www.dsldkslakdlkd.com'])
pred_result_GBM = simpleGBM.predict(X)
pred_result_SVM = simpleSVM.predict(X)
pred_result_LR = simpleLR.predict(X)
end = time.perf_counter()

# Elapsed seconds, followed by each model's predicted label.
print(end-start)
print(pred_result_GBM)
print(pred_result_SVM)
print(pred_result_LR)

[0.41643357]
0.051492580530577925
[0]
[1]
[0]
