In [5]:
import os
import nltk
# nltk.download('punkt')
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier as rf
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import warnings
warnings.filterwarnings('ignore')

data_dir = '../data'

In [6]:
sel_data = pd.read_csv(os.path.join(data_dir, 'raw/preliminary_sel_log_dataset.csv'))
sel_data.sort_values(by=['sn', 'time'], inplace=True)
sel_data.reset_index(drop=True, inplace=True)
sel_data

Unnamed: 0,sn,time,msg,server_model
0,SERVER_10001,2020-05-01 08:54:43,Processor CPU1_Status | IERR | Asserted,SM57
1,SERVER_10001,2020-05-01 08:54:43,Processor CPU0_Status | IERR | Asserted,SM57
2,SERVER_10001,2020-05-01 08:55:03,Management Subsys Health System_Health | Sens...,SM57
3,SERVER_10001,2020-05-01 08:59:48,Processor CPU0_Status | IERR | Deasserted,SM57
4,SERVER_10001,2020-05-01 08:59:48,Processor CPU1_Status | IERR | Deasserted,SM57
...,...,...,...,...
482531,SERVER_9999,2020-10-13 02:56:48,Processor CPU1_Status | Configuration Error |...,SM57
482532,SERVER_9999,2020-10-13 02:56:56,Memory CPU1C0_DIMM_Stat | Uncorrectable ECC |...,SM57
482533,SERVER_9999,2020-10-13 02:56:57,Processor CPU1_Status | Configuration Error |...,SM57
482534,SERVER_9999,2020-10-13 02:57:03,Memory CPU1C0_DIMM_Stat | Uncorrectable ECC |...,SM57


In [None]:
# --------------------------------
# 运行：
#     sn_list: 长13705的 服务器名列表
#     tail_msg_list : 长13705的 以字符串格式存储的 13705台服务器的日志信息   ' Drive Slot HDD_L_14_Status | Drive Fault | Asserted. Drive Slot / Bay HDD_L_14_Status | Drive Fault | Asserted. Drive Slot HDD_L_14_Status | Drive Fault | Deasserted. Drive Slot / Bay HDD_L_14_Status | Drive Fault | Deasserted',
# --------------------------------
sn_list = sel_data['sn'].drop_duplicates(keep='first').to_list()   # 统计所有SERVER服务器  ---->  共13705台服务器
tail_msg_list = ['.'.join(sel_data[sel_data['sn']==i]['msg'].tail(10).to_list()) for i in sn_list]  # 取出每台服务器的最后十条日志， 同一台服务器的日志信息用.连接，保存为字符串格式

In [None]:
# --------------------------------
# 保存：
# 将两个列表保存为txt格式，方便下次读取
# 至此，服务器名 和 日志信息另存完毕
# --------------------------------
with open(os.path.join(data_dir, "/sn_list.txt"), 'w', encoding='utf-8')  as f:
    for sn in sn_list:
        f.writelines(sn)
        f.writelines("\n")  # 须分行

with open(os.path.join(data_dir, "/msg_list.txt"), 'w', encoding='utf-8')  as f:
    for msg in tail_msg_list:
        f.writelines(msg)
        f.writelines("\n")

In [7]:
# --------------------------------
# 另存标签df
# label中的SERVER可能会出现重复的情况
# 重复表明 同一个SERVER在不同时间出现多次不同LABEL的故障
#
# 将故障发生前的最后十条信息以txt格式保存
# 每行由 日志信息+故障标签 组成
# 日志信息和故障标签用'$'保存——已确认日志信息中无'$'
# --------------------------------
label = pd.read_csv(os.path.join(data_dir, 'raw/preliminary_train_label_dataset.csv'))
label.sort_values(by=['sn', 'fault_time'], inplace=True)
label.reset_index(drop=True, inplace=True)
label

Unnamed: 0,sn,fault_time,label
0,SERVER_10001,2020/5/1 10:04,1
1,SERVER_10003,2020/3/28 9:48,2
2,SERVER_10008,2020/2/25 16:12,1
3,SERVER_10008,2020/3/11 18:04,2
4,SERVER_10009,2020/5/8 16:37,3
...,...,...,...
16664,SERVER_9991,2020/10/7 18:42,2
16665,SERVER_9991,2020/8/4 22:49,2
16666,SERVER_9993,2020/5/14 23:50,2
16667,SERVER_9998,2020/5/29 11:25,2


In [None]:

# 保存每条故障发生前10条日志，用字符串格式存储
label_list = []
for i, row in label.iterrows():
    label_list.append('.'.join(sel_data[(sel_data['sn']==row['sn'])&(sel_data['time']<=row['fault_time'])].tail(10)['msg']).lower())
train_label = label['label'].values

with open(os.path.join(data_dir, "/log_label_list.txt"), 'w', encoding='utf-8') as f:
    for i in range(len(label_list)):
        log_label = label_list[i] + "$" + str(train_label[i])
        f.writelines(log_label)
        f.writelines('\n')

In [None]:
# --------------------------------
# 加载服务器名，日志等：
#     sn_list: 长13705的 服务器名列表
#     tail_msg_list : 长13705的 以字符串格式存储的 13705台服务器的日志信息
#               ' Drive Slot HDD_L_14_Status | Drive Fault | Asserted. Drive Slot / Bay HDD_L_14_Status | Drive Fault | Asserted. Drive Slot HDD_L_14_Status | Drive Fault | Deasserted. Drive Slot / Bay HDD_L_14_Status | Drive Fault | Deasserted',
#     tokenized_sent : 长137005的 以列表格式存储的 每条日志的分词
#               ['drive','slot','hdd_l_14_status', '|','drive','fault','|','asserted','.','drive','slot','/','bay','hdd_l_14_status','|','drive','fault','|','asserted','.','drive','slot','hdd_l_14_status','|','drive','fault','|','deasserted','.','drive','slot','/','bay','hdd_l_14_status','|','drive','fault','|','deasserted'],
# --------------------------------

# 读取服务器列表
sn_list = []
with open(os.path.join(data_dir, "sn_list.txt"), "r", encoding= "utf-8") as f:
    for line in f.readlines():
        sn_list.append(line.strip())
# 读取日志列表
msg_list = []
with open(os.path.join(data_dir, "msg_list.txt"), "r", encoding="utf-8") as f:
    for line in f.readlines():
        msg_list.append(line.strip())
# 加载每条日志的分词
tokenized_sent = [word_tokenize(s.lower()) for s in msg_list]

In [None]:
# --------------------------------
# embedding前的工作： 使用index顺序标记词向量
# tagged_data:
#     长13705的 以列表格式存储的 进行标记后的tokenized_sent
'''
tagged_data = [
    TageedDocument(tokenized_data[0], [0]), 
    TageedDocument(tokenized_data[1], [1]), 
    TageedDocument(tokenized_data[2], [2]), 
]
'''
# --------------------------------
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_sent)]

In [None]:
# -------------------------------
# Doc2Vec模型：
#   模型是基于Word2Vec基础上，引入了段落的概念
#   Word2Vec将每个单词用一个唯一词向量进行表示
#   Doc2Vec则是将词向量扩充成段落向量，
#   所以Doc2Vec模型需要的输入格式就是TaggedDocument： (词列表， 段落序号)
#
# 模型参数：
#   - sentences: 需要TaggedDocument格式的输入
#   - alpha: 学习率
#   - size: 特征向量的维度 默认100
#   - window: 表示当前词 和 预测词 在一个句子中最大距离是多少
#   - min_count: 词频少于min_count的单词会被丢弃, 默认为5
#
# 更多参数参考： https://blog.csdn.net/mpk_no1/article/details/72510655?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522164662087516780265422133%2522%252C%2522scm%2522%253A%252220140713.130102334..%2522%257D&request_id=164662087516780265422133&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~sobaiduend~default-1-72510655.pc_search_result_control_group&utm_term=doc2vec%E5%8F%82%E6%95%B0&spm=1018.2226.3001.4187
# 模型数学理论： https://blog.csdn.net/itplus/article/details/37969635
# -------------------------------
model = Doc2Vec(tagged_data, vector_size = 10, window = 2, min_count = 1, epochs = 10)

In [None]:
# -------------------------------------
# 读取log_label_list.txt文件
# 将log保存到raw_train中
# 将lables保存到train_lable中
# -------------------------------------
raw_train = []
train_lable = []

with open('./log_label_list.txt', "r", encoding='utf-8') as f:
    for line in f.readlines():
        line = line.strip()
        content = line.split('$')
        raw_train.append(content[0])
        train_lable.append(int(content[1]))
train_tokenized = [word_tokenize(s) for s in raw_train]

In [None]:
# --------------------------------
# 创建训练集
# 将raw_train中的字符串转换成词向量，用shuzubaocun
# model.infer_vector()
#   - doc_words: 字符串 或 列表
#   - alpha: 学习率
#   - epochs
# infer_vector()根据 model的输入TaggedDocument构建一个模型
# 对于传入infer_vector()的分词列表创建一个推断词向量
# ---------------------------------
train_data = []
for i in range(len(train_lable)):
    train_data.append(model.infer_vector(train_tokenized[i]))

train_features = np.array(train_data)
train_label = np.array(train_lable)

np.save(os.path.join(data_dir, '/processed/train_features.npy'), train_features)
np.save(os.path.join(data_dir, '/processed/train_labels.npy'), train_label)