批量处理

1. Try some tokenizers

In [32]:
from parsing import reassign_clusters
import pandas as pd
import re

from sklearn.cluster import KMeans, DBSCAN

from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(log_content):
    """Split a raw log message into the tokens kept for clustering.

    The message is split on single spaces and commas. Each piece is then
    handled as follows:

    * a piece containing any digit is dropped;
    * a piece containing '/', 'kb', 'sec', 'byte' or 'mb' (compared
      case-insensitively) is dropped;
    * a remaining ``key=value`` piece is reduced to the text before the
      first '=';
    * empty pieces (e.g. produced by ", ") are dropped.

    Args:
        log_content: the log message text.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    # Deliberately not named `list`, so the builtin is not shadowed.
    dropped_markers = ('/', 'kb', 'sec', 'byte', 'mb')
    tokens = []
    for word in re.split(r'[ ,]', log_content):
        lowered = word.lower()
        # Both drop rules look at the untouched piece, so "uid=0" is removed
        # as a whole instead of being kept as "uid".
        if re.search(r'\d', word) or any(marker in lowered for marker in dropped_markers):
            continue
        token = word.split('=')[0] if '=' in word else word
        if token:
            tokens.append(token)
    return tokens

def vectorize(tokenized_logs):
    """Build a TF-IDF representation of already-tokenized logs.

    Args:
        tokenized_logs: an iterable of token lists, one per log message.

    Returns:
        Whatever ``TfidfVectorizer.fit_transform`` returns for the input
        (presumably a sparse matrix with one row per log; confirm against the
        scikit-learn documentation).
    """
    # The logs arrive as token lists, so the tokenizer is the identity
    # function and lowercasing is switched off.
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=lambda tokens: tokens)
    matrix = tfidf.fit_transform(tokenized_logs)
    return matrix


def cluster(vectorized_logs, num_clusters=10, cluster_method='kmeans'):
    """Cluster vectorized logs with k-means or DBSCAN.

    Args:
        vectorized_logs: feature matrix handed to the estimator's ``fit``.
        num_clusters: number of clusters for k-means (ignored by DBSCAN).
            An int, or a numeric string such as ``'10'``, is accepted.
        cluster_method: ``'kmeans'`` or ``'dbscan'``.

    Returns:
        A ``(labels, cluster_nums)`` tuple: the estimator's ``labels_`` and
        ``max(labels) + 1``.

    Raises:
        ValueError: if ``cluster_method`` is not one of the supported names.
    """
    if cluster_method == 'kmeans':
        # The previous default was the string '10', which KMeans rejects;
        # int() keeps string callers working while passing a real integer.
        model = KMeans(n_clusters=int(num_clusters))
    elif cluster_method == 'dbscan':
        model = DBSCAN(eps=0.1, min_samples=5)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            f"Unknown cluster_method {cluster_method!r}; expected 'kmeans' or 'dbscan'")
    model.fit(vectorized_logs)
    labels = model.labels_
    # NOTE(review): scikit-learn's DBSCAN marks noise points with -1, so this
    # count would exclude noise (and be 0 when everything is noise) - confirm.
    cluster_nums = max(labels) + 1
    return labels, cluster_nums

# Names of the available log datasets; only `dataset` below is actually loaded here.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
dataset = 'Proxifier'

# Load the structured 2k-line log file for the chosen dataset.
df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')

# Select the columns of interest: the raw message and its template.

logs = df['Content']
templates = df['EventTemplate']

# Convert the message column to a plain list, tokenize every message,
# then vectorize and cluster the token lists with DBSCAN.
column_list = logs.tolist()
tokenized_logs = [tokenize(content) for content in column_list]
labels, cluster_nums = cluster(
    vectorize(tokenized_logs), cluster_method='dbscan')

# Post-process the clustering with the project helper `reassign_clusters`
# (imported from `parsing`; its exact behavior is not visible here).
labels, cluster_nums = reassign_clusters(labels, cluster_nums, tokenized_logs)



In [33]:
# Report how many clusters were found, then print every raw log message
# that was assigned to cluster 1.
print('cluster_nums:', cluster_nums)

for index, label in enumerate(labels):
    if label == 1:
        # `labels` is positionally aligned with `logs`, so the same index is used.
        print(logs[index])
        

cluster_nums: 8
proxy.cse.cuhk.edu.hk:5070 close, 0 bytes sent, 0 bytes received, lifetime 00:01
proxy.cse.cuhk.edu.hk:5070 close, 403 bytes sent, 426 bytes received, lifetime <1 sec
proxy.cse.cuhk.edu.hk:5070 close, 451 bytes sent, 18846 bytes (18.4 KB) received, lifetime <1 sec
proxy.cse.cuhk.edu.hk:5070 close, 445 bytes sent, 5174 bytes (5.05 KB) received, lifetime <1 sec
proxy.cse.cuhk.edu.hk:5070 close, 1190 bytes (1.16 KB) sent, 1671 bytes (1.63 KB) received, lifetime 00:02
proxy.cse.cuhk.edu.hk:5070 close, 0 bytes sent, 0 bytes received, lifetime <1 sec
proxy.cse.cuhk.edu.hk:5070 close, 1165 bytes (1.13 KB) sent, 3098 bytes (3.02 KB) received, lifetime 00:01
proxy.cse.cuhk.edu.hk:5070 close, 1165 bytes (1.13 KB) sent, 815 bytes received, lifetime <1 sec
proxy.cse.cuhk.edu.hk:5070 close, 1165 bytes (1.13 KB) sent, 783 bytes received, lifetime <1 sec
proxy.cse.cuhk.edu.hk:5070 close, 850 bytes sent, 10547 bytes (10.2 KB) received, lifetime 00:02
proxy.cse.cuhk.edu.hk:5070 close, 4

In [4]:
# Sample over-long warning message whose character count is being checked.
# It is bound to `warning_message` rather than `str`, so the builtin `str`
# stays usable in every later cell of the session.
# The 115 data-source names "an14" ... "an128" are generated instead of being
# typed out; the resulting text is identical to the original literal.
warning_message = 'Warning: we failed to resolve data source name ' + ' '.join(
    f'an{i}' for i in range(14, 129))
print(len(warning_message))

650


In [15]:
# Print every entry of `sample`, one per line.
# NOTE(review): `sample` is not defined in the visible cells - it must come
# from an earlier notebook state.
for log in sample:
    print(log)

authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.188.2.4
check pass; user unknown
authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=root
session opened for user cyrus by (uid=0)
session closed for user cyrus
ALERT exited abnormally with [1]
session opened for user news by (uid=0)
session closed for user news
connection from 24.54.76.216 (24-54-76-216.bflony.adelphia.net) at Fri Jun 17 07:07:00 2005
session opened for user test by (uid=509)
session closed for user test
connection from 82.252.162.81 (lns-vlq-45-tou-82-252-162-81.adsl.proxad.net) at Sat Jun 18 02:08:10 2005
cupsd shutdown succeeded
cupsd startup succeeded
restart.
connection from 222.33.90.199 () at Mon Jun 20 03:40:59 2005
connection from 210.245.165.136 () at Wed Jun 22 13:16:30 2005
connection from 218.69.108.57 () at Fri Jun 24 18:55:11 2005
authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=massive.merukuru.or

另一种聚类方式：将所有数字替换为0，不经过分词直接聚类

In [74]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori


def tokenize(log_content):
    """Split a raw log message into the tokens used as apriori items.

    The message is split on single spaces and commas. Each piece is then
    handled as follows:

    * a piece that starts with '/' and is longer than one character
      (a path-like token) is dropped;
    * a piece containing any digit is dropped;
    * a remaining ``key=value`` piece is reduced to the text before the
      first '=';
    * empty pieces are dropped.

    Args:
        log_content: the log message text.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    tokens = []
    for word in re.split(r'[ ,]', log_content):
        is_path = word.startswith('/') and len(word) > 1
        # Drop rules are decided first and are final. Previously the later
        # '=' rule overwrote an already-blanked path token, so "/a=b" came
        # back as "/a".
        if is_path or re.search(r'\d', word):
            continue
        if '=' in word:
            word = word.split('=')[0]
        if word:
            tokens.append(word)
    return tokens


# Mine frequent token sets from the Linux logs: each tokenized log message
# is treated as one transaction (a list of items).
dataset = 'Linux'

df = pd.read_csv(
    f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')

# Take the raw message column and tokenize every message.
# Note: `logs` now holds token lists, no longer the raw strings.
logs = df['Content'].tolist()
logs = [tokenize(log) for log in logs]

# Use TransactionEncoder to turn the token lists into a boolean matrix.
te = TransactionEncoder()
te_ary = te.fit(logs).transform(logs)

# Wrap the matrix in a DataFrame with one column per token
# (this rebinds `df`, replacing the loaded CSV).
df = pd.DataFrame(te_ary, columns=te.columns_)

# Use apriori to find the itemsets whose support is at least 0.3.
frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)

print(frequent_itemsets)

    support                     itemsets
0    0.3085                         (())
1    0.3735                        (Jul)
2    0.4560                         (at)
3    0.4620                 (connection)
4    0.4670                       (from)
5    0.3680                       (user)
6    0.3085                     (at, ())
7    0.3085             ((), connection)
8    0.3085                   ((), from)
9    0.3735                    (at, Jul)
10   0.3735            (Jul, connection)
11   0.3735                  (Jul, from)
12   0.4545             (at, connection)
13   0.4545                   (at, from)
14   0.4620           (connection, from)
15   0.3085         (at, (), connection)
16   0.3085               (at, (), from)
17   0.3085       ((), connection, from)
18   0.3735        (at, Jul, connection)
19   0.3735              (at, Jul, from)
20   0.3735      (Jul, connection, from)
21   0.4545       (at, connection, from)
22   0.3085   (at, (), connection, from)
23   0.3735  (at

In [None]:
# Print the members of cluster 0, collect them in `logs_test`, and count them.
print('cluster_nums:', cluster_nums)
num = 0
logs_test = []
# Iterate the labels directly: wrapping them in list() only made a throwaway
# copy, and it raises TypeError whenever the builtin name `list` has been
# rebound in the notebook session.
for i, l in enumerate(labels):

    if l == 0:  # original note: "13 is the anomalous template" (the code inspects label 0)

        print(logs[i])
        logs_test.append(logs[i])
        num += 1

# 75 + 45 + 30

print(num)

In [None]:
from evaluator import evaluate
import pandas as pd
# Evaluate the parser output of every dataset and print the mean of the
# first three values returned by the project helper `evaluate`.
# NOTE(review): this first `datasets` list is immediately replaced below by
# the `table_order` ordering.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC',
            'Zookeeper', 'Mac', 'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
table_order = 'HDFS Hadoop Spark Zookeeper BGL HPC Thunderbird Windows Linux Android HealthApp Apache Proxifier OpenSSH OpenStack Mac'
datasets = table_order.split(' ')
# One accumulator list per value returned by evaluate().
m, n, p, q = [], [], [], []
for dataset in datasets:
    file = f'outputs/parser/Fourth_guding/{dataset}.csv'  # Fourth_guding
    # df = pd.read_csv(f'outputs/k_means/initial/{dataset}.csv')
    # df2 =
    # NOTE(review): the meaning of the four returned values is defined in
    # `evaluator.evaluate`, which is not visible here.
    a, b, c, d = evaluate(file, dataset)
    m.append(a)
    n.append(b)
    p.append(c)
    q.append(d)

# Averages over all datasets; the fourth series `q` is collected but not printed.
print(sum(m)/len(m))
print(sum(n)/len(n))
print(sum(p)/len(p))

# 81.0 71.2

In [None]:
# Re-tokenize the logs, cluster them with DBSCAN, then print and count the
# members of cluster 0.
tokenized_logs = [tokenize(content) for content in logs]
# The keyword is `cluster_method` (no `cluster` definition in this notebook
# accepts `method`), and that cluster() returns a (labels, cluster_nums)
# tuple, so the result is unpacked the same way as in the first cell.
# NOTE(review): this requires the `cluster` definition that has a
# `cluster_method` parameter to be the one currently in scope.
labels, cluster_nums = cluster(
    vectorize(tokenized_logs), cluster_method='dbscan')
num = 0
for i, l in enumerate(labels):

    if l == 0:  # original note: "12, 14, 17, 29 are the anomalous templates" (the code inspects label 0)
        print(logs[i])
        num += 1

print(num)

In [None]:
from collections import Counter 
# Join every token list back into one space-separated string and count how
# often each resulting string occurs.
logs_1 = [' '.join(tokenized_log) for tokenized_log in tokenized_logs]
freq = Counter(logs_1)
print(freq)
# Each counter is [number of distinct strings, number of log lines]:
# count1 covers strings seen exactly once, count2 covers repeated strings.
count1 = [0, 0]
count2 = [0, 0]
for key, value in freq.items():
    bucket = count1 if value == 1 else count2
    bucket[0] += 1
    bucket[1] += value
print(count1)
print(count2)


显示每一批分组情况

通过三个参数选取最合适的n值

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import re

def tokenize(log_content):
    """Split a log message on '#', '=', space and comma, dropping digit tokens.

    Args:
        log_content: the log message text.

    Returns:
        The pieces that contain no digit, in their original order. Empty
        pieces produced by adjacent delimiters are kept.
    """
    kept = []
    for piece in re.split(r'[#= ,]', log_content):
        if re.search(r'\d', piece) is None:
            kept.append(piece)
    return kept


def vectorize(tokenized_logs):
    """Compute TF-IDF features for logs that are already split into tokens.

    Args:
        tokenized_logs: an iterable of token lists, one per log message.

    Returns:
        The result of ``TfidfVectorizer.fit_transform`` on the input; a
        caller in this cell invokes ``.toarray()`` on it.
    """
    # Identity tokenizer plus lowercase=False: the token lists are used as given.
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=lambda tokens: tokens)
    features = tfidf.fit_transform(tokenized_logs)
    return features


def cluster(vectorized_logs, num_clusters):
    """Run k-means and score the resulting clustering.

    Args:
        vectorized_logs: feature matrix; it must support ``.toarray()``
            because the Calinski-Harabasz score is computed on a dense copy.
        num_clusters: value passed to ``KMeans(n_clusters=...)``.

    Returns:
        A tuple ``(labels, inertia, silhouette, calinski_harabasz)``: the
        k-means ``labels_``, its ``inertia_`` (used as the clustering error),
        the silhouette score, and the Calinski-Harabasz score.
    """
    model = KMeans(n_clusters=num_clusters).fit(vectorized_logs)
    assignments = model.labels_
    # Clustering error, as reported by the fitted model.
    inertia = model.inertia_
    # Silhouette coefficient, computed on the matrix as given.
    silhouette = silhouette_score(vectorized_logs, assignments)
    # Calinski-Harabasz index, computed on a dense copy of the matrix.
    calinski_harabasz = calinski_harabasz_score(vectorized_logs.toarray(), assignments)
    return assignments, inertia, silhouette, calinski_harabasz


# For every dataset: load the logs, tokenize and vectorize them, then run
# k-means with a fixed cluster count and compute the three quality scores.
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
            'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# Read the CSV file of each dataset.
for dataset in datasets:
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')

    # Select the raw log message column.
    column = df['Content']

    # Convert the column to a list.
    column_list = column.tolist()

    tokenized_logs = [tokenize(content) for content in column_list]


    vectorized_logs = vectorize(tokenized_logs)

    # Candidate cluster counts; currently only referenced by the disabled sweep and plots.
    range_n_clusters = range(250, 251)
    # Clustering error (inertia) per candidate.
    errors = []

    # Silhouette coefficient per candidate.
    silhouettes = []

    # Calinski-Harabasz index per candidate.
    calinski_harabasz_scores = []

    # for n_clusters in range_n_clusters:
    n_clusters = 250
    # Cluster and score.
    # NOTE(review): the results are overwritten on every iteration and the
    # appends below are disabled, so only the last dataset's values survive
    # the loop and the three lists stay empty.
    labels, cluster_error, silhouette_avg, calinski_harabasz_avg = cluster(vectorized_logs, n_clusters)

    # errors.append(cluster_error)
    # silhouettes.append(silhouette_avg)
    # calinski_harabasz_scores.append(calinski_harabasz_avg)

# # 绘制聚类误差图
# plt.figure()
# plt.plot(range_n_clusters, errors, 'o-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Cluster error')
# plt.title('The Elbow Method')
# plt.show()

# # 绘制轮廓系数图
# plt.figure()
# plt.plot(range_n_clusters, silhouettes, 'o-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Silhouette Coefficient')
# plt.title('The Silhouette Method')
# plt.show()

# # 绘制Calinski-Harabasz指数图
# plt.figure()
# plt.plot(range_n_clusters, calinski_harabasz_scores, 'o-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Calinski-Harabasz Index')
# plt.title('The Calinski-Harabasz Method')
# plt.show()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
# Placeholder numeric data standing in for vectorized logs.
# NOTE(review): `log_data` is not used anywhere below in this cell.
log_data = np.random.rand(2000, 5)  # example only; replace with real data

# Read the CSV file.
dataset = 'HPC'
df = pd.read_csv(
    f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
logs = df['Content']
# Use TF-IDF (default settings, on the raw message strings) to turn the logs
# into numeric vectors.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(logs)
# Create the DBSCAN estimator.
dbscan = DBSCAN(eps=0.3, min_samples=5)

# Fit it to the data.
dbscan.fit(X)

# Fetch the cluster label of every log.
labels = dbscan.labels_

In [None]:
# Print and count every log whose label is -1.
# NOTE(review): -1 is presumably DBSCAN's noise label - confirm against the
# scikit-learn documentation.
num = 0
for i, l in enumerate(labels):
    if l == -1:  # original note: "12, 14, 17, 29 are the anomalous templates"
        print(logs[i])
        num += 1
# 75 + 45 + 30
print(num)

In [1]:
import math
from collections import Counter
import pandas as pd

def calculate_entropy(lst):
    """Return the Shannon entropy (base 2) of the elements of ``lst``.

    Args:
        lst: a sized iterable of hashable items (for example a list of
            characters).

    Returns:
        The entropy in bits of the empirical distribution of the items.
        An empty input yields 0.
    """
    total = len(lst)
    terms = []
    # One term p * log2(p) per distinct element, p being its relative frequency.
    for occurrences in Counter(lst).values():
        p = occurrences / total
        terms.append(p * math.log2(p))
    return -sum(terms)
def select_log_template_pairs_based_on_entropy(pairs, num_examples):
    """Return the ``num_examples`` pairs with the highest entropy.

    The elements of both members of a pair are pooled into one sequence (for
    strings: character by character) and scored with ``calculate_entropy``.

    Args:
        pairs: an iterable of 2-item sequences whose members are iterable,
            e.g. ``(log, template)`` string tuples.
        num_examples: how many of the top-ranked pairs to return.

    Returns:
        A list of the selected pairs, highest entropy first. Pairs with equal
        entropy keep their input order.
    """
    def pair_entropy(pair):
        # Unpacking builds the same pooled list as list(a) + list(b) without
        # looking up the global name `list`, so the function keeps working
        # even if `list` has been rebound at module level.
        return calculate_entropy([*pair[0], *pair[1]])

    # sorted() is stable, also with reverse=True, so ties stay in input order.
    ranked = sorted(pairs, key=pair_entropy, reverse=True)
    return ranked[:num_examples]

# Collect one (log, template) pair per distinct template across the datasets,
# then print the pair with the highest entropy.
# discard the target dataset
datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
            'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# datasets.remove('BGL')
demonstration_templates = []
demonstration_logs = []
pairs = []
# Set mirror of demonstration_templates, so the "already seen" check is O(1)
# instead of a scan of the growing list.
seen_templates = set()
for d in datasets:
    # Forward slashes work on every platform and match the other cells; the
    # former backslash path contained the invalid escape sequence "\{".
    df = pd.read_csv(f'dataset/{d}/{d}_2k.log_structured_corrected.csv')
    list1 = df['Content'].tolist()
    list2 = df['EventTemplate'].tolist()
    for log, template in zip(list1, list2):
        # Keep only the first log seen for each template.
        if template not in seen_templates:
            seen_templates.add(template)
            pairs.append((log, template))
            demonstration_templates.append(template)
            demonstration_logs.append(log)

# Bound to `selected_pairs` rather than `list`, so the builtin `list` is not
# shadowed for the rest of the session.
selected_pairs = select_log_template_pairs_based_on_entropy(pairs, 1)
for log, template in selected_pairs:
    print(log)
    print(template)

2017-07-02 15:46:41.445 ksfetch[32435/0x7fff79824000] [lvl=2] main() ksfetch fetching URL (<NSMutableURLRequest: 0x1005110b0> { URL: https://tools.google.com/service/update2?cup2hreq=53f725cf03f511fab16f19e789ce64aa1eed72395fc246e9f1100748325002f4&cup2key=7:1132320327 }) to folder:/tmp/KSOutOfProcessFetcher.YH2CjY1tnx/download
<*> ksfetch[<*>] [lvl=<*>] main() ksfetch fetching URL (<NSMutableURLRequest: <*> { URL: <*> }) to folder:<*>
