In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [7]:
def Parse_data(dir_of_inputdata,options='one'):
    '''
    对输入的文本数据进行解析
    dir_of_inputdata：输入的文本数据
    options：选择文本解析模式，'one'代表普通模式，'two'代表只选命令行中的关键词
    '''
    dataset = []
    remove_data = ['ls','cd','pwd','cat']
    
    if options == 'one':
        #匹配非字母字符，即匹配特殊字符
        regEx = re.compile('\\W*')
        with open(dir_of_inputdata,encoding='UTF-8') as f:
            for line in f.readlines():
                #去掉行尾换行符
                line=line.rstrip('\n')
                listoftoken = regEx.split(line)
                #去掉空格值，且将字符串转变为小写
                tem = [tok.lower() for tok in listoftoken if len(tok)>0]
                #去掉第一个无用的序列号
                del tem[0]
                #去掉字符小于1的值
                tem2 = [b for b in tem if len(b)>1]
                #去掉无用的命令
                tem_data = [a for a in tem2 if a in remove_data]
                if len(tem_data) == 0:
                    dataset.append(tem2)
    if options == 'two':
        regEx = re.compile('\\W*')
        with open(dir_of_inputdata,encoding='UTF-8') as f:
            for line in f.readlines():
                #去掉行尾换行符
                line=line.rstrip('\n')
                
                #按照空格划分字符串
                listoftoken = re.split(' ',line)
                data_tem = []
                for token in listoftoken:
                    #按照/划分一条命令行的字符串
                    tem = re.split(r'/',token)
                    #len(tok)>0是为了能取出/bin/read/ 中的read而不是最后一个/后的空格
                    tem2 = [tok for tok in tem if len(tok)>0]
                    if len(tem2) != 0:
                        #取出一个命令行中关键的命令字段
                        tem3 = tem2[-1]
                    else :
                        #如果token只有/或者空格，例如/ 或者//，那么tem、tem2为空
                        continue
                    #tem3是字符串，不是list，所以用append，而不是extend
                    data_tem.append(tem3)
                #将data_tem中的关键命令字段按照空格相连
                data_tem1 = ' '.join(data_tem)
                
                data_tem2 = regEx.split(data_tem1)
                #去掉空格的值，且将字符串转变为小写,注意不要写len(tok)>1,会对del data_tem[0]有影响，因为1到9序列号字符为1
                data_tem3 = [tok.lower() for tok in data_tem2 if len(tok)>0]
                #去掉第一个无用的序列号
                del data_tem3[0]
                #去掉字符小于1的值
                data_tem4 = [tok for tok in data_tem3 if len(tok)>1]
                #去掉无用的命令
                tem_data = [a for a in data_tem4 if a in remove_data]
                if len(tem_data) == 0:
                    dataset.append(data_tem4)
                
    return dataset

In [8]:
dataset = Parse_data('history.txt',options='two')
dataset



[['sampled', 'startcc'],
 ['sampled', 'startcc'],
 ['sampled', 'startss'],
 ['java',
  'djava',
  'ext',
  'dirs',
  'libs',
  'xms128m',
  'xmx2048m',
  'run',
  'com',
  'fs',
  'ezminer',
  'kafkastreamingsample',
  'cluster'],
 ['java',
  'djava',
  'ext',
  'dirs',
  'libs',
  'jar',
  'sample',
  'snapshot',
  'jar',
  'xms128m',
  'xmx2048m',
  'run',
  'com',
  'fs',
  'ezminer',
  'kafkastreamingsample',
  'cluster'],
 ['sampled', 'startss'],
 ['sampled', 'startss'],
 ['sampled', 'startcc'],
 ['sampled', 'startcc'],
 ['sampled', 'stop'],
 ['sampled'],
 ['sampled', 'startc'],
 ['sampled', 'startcc'],
 ['sampled', 'startcc'],
 ['sampled', 'stop'],
 ['sampled'],
 ['sampled', 'startcc'],
 ['ps', 'ef', 'grep', 'ezminer'],
 ['sampled', 'startcc'],
 ['sampled', 'startcc'],
 ['sampled', 'startcc'],
 ['sampled'],
 ['sampled', 'startcc'],
 ['sampled', 'startcc'],
 ['sampled', 'startss'],
 ['sampled', 'startss'],
 ['sampled', 'startss'],
 ['sampled', 'startss'],
 ['sampled', 'stop'],
 ['

In [4]:
def CreateVocabList(dataset):
    '''
    利用所有的样本数据，生成对应的词汇库
    dataset：样本数据集
    '''
    vocabset = set([])
    for document in dataset:
        vocabset = vocabset | set(document)
    #去掉sudo这个特殊的字符串
    #vocabset.remove('sudo')
    print ('The length of the vocabulary: %s' %len(vocabset))
    
    return list(vocabset)

In [5]:
CreateVocabList(dataset)

The length of the vocabulary: 172


['29267',
 '155',
 'libs',
 'rm',
 'consumer',
 'friendly',
 '1027',
 'gen',
 'scp',
 '221',
 'bash_profile',
 'aaaa',
 'users',
 'diaolongkeji',
 'centos7',
 'history',
 'ez_stat',
 'sample',
 'ez_complete',
 '13',
 'etc',
 'en8',
 'new',
 'sudo',
 'ext',
 'xzddmj',
 'qkl',
 '15e216',
 '9300',
 'startss',
 'startc',
 'cluster',
 'run',
 'www',
 'completed',
 '9092',
 'xcode',
 '145',
 'e92',
 'exit',
 'stop',
 'mkdir',
 'utf',
 'ez_raw',
 'mv',
 '11',
 'fs',
 'ping',
 'devicesupport',
 'cocos',
 '28',
 '245',
 'jar',
 '168',
 'build',
 'tools',
 '22',
 'applications',
 'en0',
 '8003',
 'ef',
 'platforms',
 'groups',
 'sh',
 'start',
 'hosts',
 'xmx2048m',
 'pjft',
 'open',
 'encoding',
 'volumes',
 '201',
 '130',
 'class',
 'flumespool',
 'apple',
 'apache',
 'all',
 'kafkastreamingsample',
 'work',
 'hwww',
 'broker',
 'producer',
 'test1',
 'ode',
 'cleanmymac',
 '2181',
 '62',
 'mysql',
 'list',
 '10',
 'zip',
 'gatekeeper',
 'help',
 'topics',
 'topic',
 'free',
 'djava',
 '177',


In [6]:
def SetofWords2Vec(vocablist,inputset):
    '''
    利用词汇库，将文本数据样本，转化为对应的词条向量
    vocablist：词汇表
    inputset：文本数据集
    '''
    datavec = []
    for document in inputset:
        tem = [0]*len(vocablist)
        for word in document:
            if word in vocablist:
                tem[vocablist.index(word)] = 1
            else:
                print ("the word : %s is not in my vocabulary!" % word)
        datavec.append(tem)
        
    return datavec

In [7]:
inputset = Parse_data("history.txt")
vocablist = CreateVocabList(inputset)
dataset = SetofWords2Vec(vocablist,inputset)

The length of the vocabulary: 172




In [8]:
def Gs_DBSCAN_parameter(dataset):
    '''
    利用贪心算法（坐标下降算法），寻找最优的epsilon和min_samples参数
    dataset：数据样本
    '''
    X = dataset
    epsilons = [0.001,0.05,0.06,0.07,0.08,0.1,0.2,0.3,0.5,1,2,3,5]
    min_samples = [2,3,4,5,10,15,20,30,50,70,80,100]
    evalue = []
    mvalue = []
    for epsilon in epsilons:
        clst = DBSCAN(eps = epsilon)
        clst.fit(X)
        if len(set(clst.labels_))>1:
            evalue.append(metrics.silhouette_score(X,clst.labels_,metric='euclidean'))
        else :
            evalue.append(-1)#为了后面的evalue.index(max(evalue))可以找到正确的eindex而补了一个-1的位置
    if len(evalue) == evalue.count(-1):
        raise NameError('empty sequence')
    eindex = evalue.index(max(evalue))
    best_epsilon = epsilons[eindex]
    print ("Evaluate Ratio: %s" % evalue)
    print ("Epsilon Value: %s" % epsilons)
    print ("=============================================")
    for num in min_samples:
        clst = DBSCAN(eps = best_epsilon,min_samples = num)
        clst.fit(X)
        if len(set(clst.labels_))>1:
            mvalue.append(metrics.silhouette_score(X,clst.labels_,metric='euclidean'))
        else :
            mvalue.append(-1)#为了后面的mvalue.index(max(mvalue))可以找到正确的mindex而补了一个-1的位置
    if len(mvalue) == mvalue.count(-1):
        raise NameError('empty sequence')
    mindex = mvalue.index(max(mvalue))
    best_num = min_samples[mindex]
    print ("Evaluate Ratio: %s" % mvalue)
    print ("Min Samples Value: %s" % min_samples)
    print ("=============================================")
    print ("Best Epsilon: %s" % best_epsilon)
    print ("Best Min Samples: %s" % best_num)
    
    return best_epsilon,best_num


In [9]:
def Model_DBSCAN(dataset,best_epsilon=0.1,best_num=5):
    '''
    使用DBSCAN聚类结果为数据贴标签
    '''
    X = dataset
    
    clst = DBSCAN(eps = best_epsilon, min_samples = best_num)
    clst.fit(X)
    clst_labels = clst.labels_
    if len(set(clst_labels))>1:
        evalue=metrics.silhouette_score(X,clst.labels_,metric='euclidean')
    else:
        evalue="no exception people"
    #输出评价系数
    print ("Evaluate Ratio: %s" % evalue)
    print ("=============================================")
    for clst_lab in set(clst_labels):
        print ("Number of the %s class: %s" % (clst_lab,list(clst_labels).count(clst_lab)))
    print ("=============================================")
    print ("Number of the labels: %s" % len(clst_labels))
    
    return clst_labels

In [13]:
Model_DBSCAN(dataset)

Evaluate Ratio: 0.545778117717302
Number of the 0 class: 13
Number of the 1 class: 10
Number of the 2 class: 9
Number of the 3 class: 7
Number of the 4 class: 17
Number of the 5 class: 7
Number of the 6 class: 5
Number of the 7 class: 6
Number of the 8 class: 59
Number of the 9 class: 45
Number of the 10 class: 20
Number of the 11 class: 25
Number of the 12 class: 21
Number of the 13 class: 6
Number of the -1 class: 131
Number of the labels: 381


array([ 0,  0,  1, -1, -1,  1,  1,  0,  0, -1, -1, -1,  0,  0, -1, -1,  0,
       -1,  0,  0,  0, -1,  0,  0,  1,  1,  1,  1, -1, -1,  1,  1,  1,  2,
        2,  2,  2,  2, -1, -1,  3,  3, -1, -1, -1, -1,  3,  3,  3, -1, -1,
       -1, -1, -1, -1, -1, -1,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,
       -1, -1,  4,  4, -1, -1,  4,  4,  4,  4,  4,  4,  5, -1, -1,  2, -1,
        2,  2,  2,  6, -1,  5,  7, -1,  7,  5, -1,  7, -1,  6,  5,  7,  7,
        6,  5, -1, -1,  6, -1,  5,  5, -1,  6, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  0, -1, -1,  8,  9,  8,  9, 10,  8,  9, 10,
        8, 10,  9,  8,  9, 10,  8, 10,  9,  8, 10,  9,  8, 10,  9,  8,  9,
       10, 10,  8, 10,  9,  8,  9, 10,  8, 10,  9,  8,  9, 10,  8, 10,  9,
        8,  9, 10, 11, 11,  8, 10,  9,  8, 11,  8, 10,  9, -1,  8,  8, -1,
        9,  8, 11,  8, 11,  8, 11,  9, 10,  8, 10,  9, 11,  8,  9, 10, -1,
        8, 11,  8, 11,  8