批量处理

1. try some tokenizer

In [61]:
import pandas as pd
import re
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer


def tokenize(log_content):
    words = re.split('[ ,]', log_content)
    # words = log_content.split('[ ,]')
    words = [word for word in words if not re.search(r'\d', word)]
    
    
    return words

def vectorize(tokenized_logs):
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
    return vectorizer.fit_transform(tokenized_logs)

def cluster(vectorized_logs, num_clusters=0, method='kmeans'):
    if method == 'kmeans':
        kmeans = KMeans(n_clusters=num_clusters)
        kmeans.fit(vectorized_logs)
        return kmeans.labels_
    if method == 'dbscan':
        dbscan = DBSCAN(eps=0.3, min_samples=5)
        dbscan.fit(vectorized_logs)
        return dbscan.labels_


datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
            'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
labels_all = []
# 读取CSV文件
for dataset in datasets:
    df = pd.read_csv(
        f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')

    # 选择某一列，例如'column_name'
    column = df['Content']

    # 将该列转换为列表
    column_list = column.tolist()

    tokenized_logs = [tokenize(content) for content in column_list]


    labels = cluster(vectorize(tokenized_logs), method='dbscan')
    labels_all.append(labels)



In [64]:
from collections import Counter

for labels in labels_all:
    freq = Counter(labels)
    print(max(labels))
    print(freq)
    

# num = 0
# for i, l in enumerate(labels):

#     if l == -1:  # 12, 14 , 17, 29是异常的模板

#         print(column_list[i])

#         num += 1

# # 75 + 45 + 30

# print(num)

44
Counter({3: 721, 33: 208, -1: 142, 24: 121, 1: 109, 2: 92, 42: 71, 6: 60, 41: 51, 0: 42, 26: 35, 7: 30, 32: 23, 8: 21, 35: 19, 5: 18, 38: 16, 4: 15, 28: 15, 20: 13, 21: 12, 13: 10, 25: 9, 29: 9, 34: 9, 44: 9, 9: 8, 11: 8, 40: 8, 19: 7, 39: 7, 43: 7, 12: 6, 22: 6, 23: 6, 30: 6, 36: 6, 10: 5, 14: 5, 15: 5, 16: 5, 17: 5, 18: 5, 27: 5, 31: 5, 37: 5})
10
Counter({1: 314, 0: 311, 2: 292, 3: 292, 6: 263, 9: 224, 4: 115, 7: 80, 8: 80, 5: 20, 10: 5, -1: 4})
43
Counter({2: 336, 37: 170, -1: 122, 36: 120, 1: 117, 0: 89, 17: 76, 38: 68, 34: 64, 40: 63, 29: 46, 3: 43, 4: 43, 5: 43, 6: 43, 7: 43, 10: 36, 11: 36, 20: 31, 33: 31, 25: 23, 27: 23, 30: 23, 31: 23, 32: 23, 35: 23, 39: 23, 41: 23, 43: 23, 9: 17, 42: 16, 8: 15, 12: 15, 26: 15, 18: 13, 16: 12, 19: 10, 22: 10, 28: 10, 23: 8, 24: 8, 15: 7, 13: 6, 14: 6, 21: 5})
23
Counter({1: 517, 0: 260, 5: 242, 6: 241, 7: 241, 10: 144, 8: 136, -1: 70, 2: 17, 3: 17, 4: 17, 9: 17, 13: 17, 12: 8, 23: 6, 11: 5, 14: 5, 15: 5, 16: 5, 17: 5, 18: 5, 19: 5, 20: 5,

显示每一批分组情况

通过三个参数选取最合适的n值

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import re

def tokenize(log_content):
    words = log_content.split()
    words = [word for word in words if not re.search(r'\d', word)]
    return words


def vectorize(tokenized_logs):
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
    return vectorizer.fit_transform(tokenized_logs)


def cluster(vectorized_logs, num_clusters):
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(vectorized_logs)
    # 计算聚类误差
    cluster_error = kmeans.inertia_
    # 计算轮廓系数
    silhouette_avg = silhouette_score(vectorized_logs, kmeans.labels_)
    # 计算Calinski-Harabasz指数
    calinski_harabasz_avg = calinski_harabasz_score(vectorized_logs.toarray(), kmeans.labels_)
    return kmeans.labels_, cluster_error, silhouette_avg, calinski_harabasz_avg


datasets = ['BGL', 'HDFS', 'Linux', 'HealthApp', 'OpenStack', 'OpenSSH', 'Proxifier', 'HPC', 'Zookeeper', 'Mac',
            'Hadoop', 'Android', 'Windows', 'Apache', 'Thunderbird', 'Spark']
# 读取CSV文件
for dataset in datasets:
    df = pd.read_csv(f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')

    # 选择某一列，例如'column_name'
    column = df['Content']

    # 将该列转换为列表
    column_list = column.tolist()

    tokenized_logs = [tokenize(content) for content in column_list]


    vectorized_logs = vectorize(tokenized_logs)

    range_n_clusters = range(250, 251)
    # 聚类误差
    errors = []

    # 轮廓系数
    silhouettes = []

    # Calinski-Harabasz指数
    calinski_harabasz_scores = []

    # for n_clusters in range_n_clusters:
    n_clusters = 250
    # 聚类
    labels, cluster_error, silhouette_avg, calinski_harabasz_avg = cluster(vectorized_logs, n_clusters)

    # errors.append(cluster_error)
    # silhouettes.append(silhouette_avg)
    # calinski_harabasz_scores.append(calinski_harabasz_avg)

# # 绘制聚类误差图
# plt.figure()
# plt.plot(range_n_clusters, errors, 'o-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Cluster error')
# plt.title('The Elbow Method')
# plt.show()

# # 绘制轮廓系数图
# plt.figure()
# plt.plot(range_n_clusters, silhouettes, 'o-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Silhouette Coefficient')
# plt.title('The Silhouette Method')
# plt.show()

# # 绘制Calinski-Harabasz指数图
# plt.figure()
# plt.plot(range_n_clusters, calinski_harabasz_scores, 'o-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Calinski-Harabasz Index')
# plt.title('The Calinski-Harabasz Method')
# plt.show()

In [3]:
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
# 假设log_data是我们的日志数据，已经被转换为数值向量
log_data = np.random.rand(2000, 5)  # 这只是一个示例，你需要用你的实际数据替换这里

# 读取CSV文件
dataset = 'HPC'
df = pd.read_csv(
    f'dataset/{dataset}/{dataset}_2k.log_structured_corrected.csv')
logs = df['Content']
# 使用TF-IDF将日志信息转换为数值向量
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(logs)
# 初始化DBSCAN对象
dbscan = DBSCAN(eps=0.3, min_samples=5)

# 对数据进行拟合
dbscan.fit(X)

# 获取聚类标签
labels = dbscan.labels_

In [60]:
num = 0
for i, l in enumerate(labels):
    if l == -1:  # 12, 14 , 17, 29是异常的模板
        print(logs[i])
        num += 1
# 75 + 45 + 30
print(num)

Component State Change: Component \042SCSI-WWID:01000010:6005-08b4-0001-00c6-0006-3000-003d-0000\042 is in the unavailable state (HWID=1973)
Component State Change: Component \042alt0\042 is in the unavailable state (HWID=3180)
Component State Change: Component \042alt0\042 is in the unavailable state (HWID=5089)
Component State Change: Component \042alt0\042 is in the unavailable state (HWID=4088)
Component State Change: Component \042alt0\042 is in the unavailable state (HWID=2538)
Component State Change: Component \042alt0\042 is in the unavailable state (HWID=2480)
Component State Change: Component \042alt0\042 is in the unavailable state (HWID=3713)
Component State Change: Component \042alt0\042 is in the unavailable state (HWID=3891)
Component State Change: Component \042alt0\042 is in the unavailable state (HWID=2478)
Component State Change: Component \042alt0\042 is in the unavailable state (HWID=2969)
Component State Change: Component \042alt0\042 is in the unavailable state (