In [1]:
from datasets import load_dataset
from pathlib import Path
from typing import List
import re

# Removing timestamps

## HDFS1

In [2]:
# HDFS1 input log that the cells below read and clean.
# NOTE(review): absolute, user-specific path; adjust before running on another machine.
hdfs1_log_path = Path('/home/cernypro/dev/source/ml4logs/data/interim/HDFS1/train-data-HDFS1-cv1-1-time-ordered.log')

In [3]:
def clean_hdfs1_lines(lines: List[str]):
    """Normalise HDFS1 log lines and remove their leading numeric prefix.

    For every input line, NUL characters are deleted and surrounding
    whitespace is stripped. Lines that end up empty are skipped. From the
    remaining lines, a prefix of three space-separated digit groups at the
    very start (the timestamp prefix, per the pattern's name) is removed.

    Args:
        lines: Raw log lines, with or without trailing newlines.

    Returns:
        A list of cleaned lines, each ending in a single newline.
    """
    leading_fields_re = re.compile(r'^(\d+) (\d+) (\d+) ')
    nul_re = re.compile('\x00')

    result = []
    for raw in lines:
        text = nul_re.sub('', raw).strip()
        if not text:
            # Blank (or NUL-only) lines carry no information; drop them.
            continue
        result.append(leading_fields_re.sub('', text) + '\n')
    return result

In [5]:
# Read the whole HDFS1 log into memory, clean it, and write the result next to
# the input file as `no_timestamps_<input stem>.log`.
with hdfs1_log_path.open(mode='r') as f:
    hdfs1_lines = f.readlines()
hdfs1_cleaned = clean_hdfs1_lines(hdfs1_lines)
with hdfs1_log_path.with_name(f'no_timestamps_{hdfs1_log_path.stem}.log').open(mode='w') as f:
    f.writelines(hdfs1_cleaned)

In [6]:
# Unbind the large HDFS1 line lists so the kernel can reclaim their memory.
del hdfs1_lines, hdfs1_cleaned

## HDFS2

In [71]:
# Folder whose `*.log` files are processed by the HDFS2 loop below.
# NOTE(review): absolute, user-specific path; adjust before running on another machine.
logfolder_path = Path('/home/cernypro/dev/source/ml4logs/data/raw/HDFS2')

In [76]:
# Destination folder for the cleaned HDFS2 files.
output_folder = Path('/home/cernypro/dev/source/ml4logs/data/interim/HDFS2/simple_cleaning')
# Create the folder together with any missing parents; no error if it already exists.
output_folder.mkdir(parents=True, exist_ok=True)

In [77]:
def clean_lines_remove_timestamps(lines: List[str]) -> List[str]:
    """Clean HDFS2 log lines and strip their leading timestamp.

    Every line has NUL characters removed and surrounding whitespace
    stripped. Blank lines are dropped, as are banner/comment lines that
    consist only of asterisks, optionally with a leading and/or trailing
    slash (e.g. ``/*****`` or ``*****/``). A leading
    ``YYYY-MM-DD HH:MM:SS,mmm `` timestamp is removed from the rest.

    The banner check is applied to the already-cleaned text, so a banner
    line that carries NUL characters or stray whitespace is still
    recognised and dropped.

    Args:
        lines: Raw log lines, with or without trailing newlines.

    Returns:
        The cleaned lines, each ending in a single newline.
    """
    HDFS2_TIMESTAMP_PATTERN = re.compile(r'^\d{4,4}-\d\d-\d\d \d\d:\d\d:\d\d,\d{3,3} ')
    HDFS2_COMMENT_PATTERN = re.compile(r'^/?\*+/?$')
    NULL_CHAR_PATTERN = re.compile('\x00')
    # Normalise first so the banner filter sees the same text that would be written out.
    stripped = (NULL_CHAR_PATTERN.sub('', line).strip() for line in lines)
    kept = (line for line in stripped if line and HDFS2_COMMENT_PATTERN.match(line) is None)
    return [HDFS2_TIMESTAMP_PATTERN.sub('', line) + '\n' for line in kept]

def clean_lines_only(lines: List[str]) -> List[str]:
    """Clean HDFS2 log lines while keeping their timestamps.

    Every line has NUL characters removed and surrounding whitespace
    stripped. Blank lines are dropped, as are banner/comment lines that
    consist only of asterisks, optionally with a leading and/or trailing
    slash (e.g. ``/*****`` or ``*****/``). Nothing else is altered.

    The banner check is applied to the already-cleaned text, so a banner
    line that carries NUL characters or stray whitespace is still
    recognised and dropped.

    Args:
        lines: Raw log lines, with or without trailing newlines.

    Returns:
        The cleaned lines, each ending in a single newline.
    """
    HDFS2_COMMENT_PATTERN = re.compile(r'^/?\*+/?$')
    NULL_CHAR_PATTERN = re.compile('\x00')
    # Normalise first so the banner filter sees the same text that would be written out.
    stripped = (NULL_CHAR_PATTERN.sub('', line).strip() for line in lines)
    return [
        line + '\n'
        for line in stripped
        if line and HDFS2_COMMENT_PATTERN.match(line) is None
    ]

In [None]:
# Clean every `*.log` file in `logfolder_path` with `clean_lines_only` and write
# the result under the same file name into `output_folder`.
for logfile_path in logfolder_path.glob('*.log'):
    print(logfile_path.stem)
    with logfile_path.open(mode='r') as f:
        lines = f.readlines()
    cleaned = clean_lines_only(lines)
    with output_folder.joinpath(logfile_path.name).open(mode='w') as f:
        f.writelines(cleaned)

## Spark

In [2]:
# Spark input log that the cells below read and clean.
# NOTE(review): absolute, user-specific path; adjust before running on another machine.
spark_log_path = Path('/home/cernypro/dev/source/ml4logs/data/interim/Spark/concatenated_spark.log')

In [3]:
def clean_spark_lines(lines: List[str]):
    """Normalise Spark log lines and remove ``YY/MM/DD HH:MM:SS `` timestamps.

    NUL characters are deleted and surrounding whitespace is stripped from
    each line; lines that become empty are skipped. The timestamp pattern is
    not anchored, so every occurrence in a line is removed, not only one at
    the start.

    Args:
        lines: Raw log lines, with or without trailing newlines.

    Returns:
        A list of cleaned lines, each ending in a single newline.
    """
    timestamp_re = re.compile(r'\d\d/\d\d/\d\d \d\d:\d\d:\d\d ')
    nul_re = re.compile('\x00')

    def _normalise(raw):
        # Remove NULs first so that NUL-padded whitespace is stripped as well.
        return nul_re.sub('', raw).strip()

    return [timestamp_re.sub('', text) + '\n' for text in map(_normalise, lines) if text]

In [4]:
# Read the whole Spark log into memory, clean it, and write the result next to
# the input file as `no_timestamps_spark.log`.
with spark_log_path.open(mode='r') as f:
    lines = f.readlines()
cleaned = clean_spark_lines(lines)
with spark_log_path.with_name('no_timestamps_spark.log').open(mode='w') as f:
    f.writelines(cleaned)

In [8]:
# Preview the first 20 entries of `cleaned` (presumably the Spark output produced above; depends on cell execution order).
cleaned[:20]

['SLF4J: Class path contains multiple SLF4J bindings.\n',
 'SLF4J: Found binding in [jar:file:/opt/hdfs/nodemanager/usercache/yxsu/filecache/20/spark-assembly-1.4.1-hadoop2.6.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
 'SLF4J: Found binding in [jar:file:/usr/local/hadoop/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n',
 'SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n',
 'SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]\n',
 'INFO executor.CoarseGrainedExecutorBackend: Registered signal handlers for [TERM, HUP, INT]\n',
 'INFO spark.SecurityManager: Changing view acls to: yarn,yxsu\n',
 'INFO spark.SecurityManager: Changing modify acls to: yarn,yxsu\n',
 'INFO spark.SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(yarn, yxsu); users with modify permissions: Set(yarn, yxsu)\n',
 'INFO slf4j.Slf4jLogger: Slf4jLogger sta

## Hadoop

In [6]:
# Hadoop input log that the cells below read and clean.
# NOTE(review): absolute, user-specific path; adjust before running on another machine.
hadoop_log_path = Path('/home/cernypro/dev/source/ml4logs/data/interim/Hadoop/concatenated_hadoop.log')

In [9]:
# Load the complete Hadoop log into memory, one list entry per line.
with hadoop_log_path.open(mode='r') as f:
    hadoop_lines = f.readlines()

In [12]:
def clean_hadoop_lines(lines: List[str]):
    """Normalise Hadoop log lines and remove ``YYYY-MM-DD HH:MM:SS,mmm `` timestamps.

    NUL characters are deleted and surrounding whitespace is stripped from
    each line; lines that become empty are skipped. The timestamp pattern is
    not anchored, so every occurrence in a line is removed.

    Args:
        lines: Raw log lines, with or without trailing newlines.

    Returns:
        A list of cleaned lines, each ending in a single newline.
    """
    timestamp_re = re.compile(r'\d{4,4}-\d\d-\d\d \d\d:\d\d:\d\d,\d{3,3} ')
    nul_re = re.compile('\x00')
    # filter(None, ...) discards the empty strings left by blank/NUL-only lines.
    non_empty = filter(None, (nul_re.sub('', raw).strip() for raw in lines))
    return [timestamp_re.sub('', text) + '\n' for text in non_empty]

In [15]:
# Clean the loaded Hadoop lines and write them next to the input file as
# `no_timestamps_hadoop.log`.
hadoop_cleaned = clean_hadoop_lines(hadoop_lines)
with hadoop_log_path.with_name('no_timestamps_hadoop.log').open(mode='w') as f:
    f.writelines(hadoop_cleaned)

In [None]:
# Unbind the large Hadoop line lists so the kernel can reclaim their memory.
del hadoop_lines, hadoop_cleaned

## Zookeeper

In [16]:
# Raw Zookeeper input log that the cells below read and clean.
# NOTE(review): absolute, user-specific path; adjust before running on another machine.
zookeeper_log_path = Path('/home/cernypro/dev/source/ml4logs/data/raw/Zookeeper/Zookeeper.log')

In [19]:
# Load the complete Zookeeper log into memory, one list entry per line.
with zookeeper_log_path.open(mode='r') as f:
    zookeeper_lines = f.readlines()

In [20]:
# Preview the first 10 raw Zookeeper lines.
zookeeper_lines[:10]

['2015-07-29 17:41:41,536 - INFO  [main:QuorumPeerConfig@101] - Reading configuration from: /etc/zookeeper/conf/zoo.cfg\n',
 '2015-07-29 17:41:41,544 - INFO  [main:QuorumPeerConfig@334] - Defaulting to majority quorums\n',
 '2015-07-29 17:41:41,555 - INFO  [main:DatadirCleanupManager@78] - autopurge.snapRetainCount set to 3\n',
 '2015-07-29 17:41:41,555 - INFO  [main:DatadirCleanupManager@79] - autopurge.purgeInterval set to 0\n',
 '2015-07-29 17:41:41,557 - INFO  [main:DatadirCleanupManager@101] - Purge task is not scheduled.\n',
 '2015-07-29 17:41:41,579 - INFO  [main:QuorumPeerMain@127] - Starting quorum peer\n',
 '2015-07-29 17:41:41,609 - INFO  [main:NIOServerCnxnFactory@94] - binding to port 0.0.0.0/0.0.0.0:2181\n',
 '2015-07-29 17:41:41,648 - INFO  [main:QuorumPeer@913] - tickTime set to 2000\n',
 '2015-07-29 17:41:41,649 - INFO  [main:QuorumPeer@933] - minSessionTimeout set to -1\n',
 '2015-07-29 17:41:41,649 - INFO  [main:QuorumPeer@944] - maxSessionTimeout set to -1\n']

In [21]:
def clean_zookeeper_lines(lines: List[str]):
    """Normalise Zookeeper log lines and remove ``YYYY-MM-DD HH:MM:SS,mmm - `` timestamps.

    NUL characters are deleted and surrounding whitespace is stripped from
    each line; lines that become empty are skipped. The timestamp pattern
    (including the trailing `` - `` separator) is not anchored, so every
    occurrence in a line is removed.

    Args:
        lines: Raw log lines, with or without trailing newlines.

    Returns:
        A list of cleaned lines, each ending in a single newline.
    """
    timestamp_re = re.compile(r'\d{4,4}-\d\d-\d\d \d\d:\d\d:\d\d,\d{3,3} - ')
    nul_re = re.compile('\x00')

    output = []
    for raw in lines:
        text = nul_re.sub('', raw).strip()
        if text:
            output.append(timestamp_re.sub('', text) + '\n')
    return output

In [23]:
# Clean the loaded Zookeeper lines and write them to
# `<three levels above the input file>/interim/Zookeeper/no_timestamps_zookeeper.log`,
# creating the target folder if needed.
zookeeper_cleaned = clean_zookeeper_lines(zookeeper_lines)
zookeeper_output_file_path = zookeeper_log_path.parents[2] / 'interim' / 'Zookeeper' / 'no_timestamps_zookeeper.log'
zookeeper_output_file_path.parent.mkdir(parents=True, exist_ok=True)
with zookeeper_output_file_path.open(mode='w') as f:
    f.writelines(zookeeper_cleaned)

In [24]:
# Unbind the large Zookeeper line lists so the kernel can reclaim their memory.
del zookeeper_lines, zookeeper_cleaned

## BGL

In [25]:
# Raw BGL input log that the cells below read and clean.
# NOTE(review): absolute, user-specific path; adjust before running on another machine.
bgl_log_path = Path('/home/cernypro/dev/source/ml4logs/data/raw/BGL/BGL.log')

In [26]:
# Load the complete BGL log into memory, one list entry per line.
with bgl_log_path.open(mode='r') as f:
    bgl_lines = f.readlines()

In [27]:
# Preview the first 10 raw BGL lines.
bgl_lines[:10]

['- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.363779 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.527847 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.675872 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.823719 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '- 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.982731 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '- 1117838571 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.51.131467 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected\n',
 '- 1117838571 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.51.293532 R02

In [55]:
def clean_bgl_lines(lines: List[str]):
    """Normalise BGL log lines and remove their time-related fields.

    NUL characters are deleted and surrounding whitespace is stripped from
    each line; lines that become empty are skipped. From the remaining
    lines the following run of space-separated fields is removed wherever
    it occurs:

        <digits> <YYYY.MM.DD> <id> <YYYY-MM-DD-HH.MM.SS.ffffff>

    The ``<id>`` field inside the removed run is dropped as well. In the
    sample data the same ID appears again right after the timestamp, so one
    copy survives.

    Args:
        lines: Raw log lines, with or without trailing newlines.

    Returns:
        A list of cleaned lines, each ending in a single newline.
    """
    # The dots in the date field are escaped so they match literal '.' only,
    # consistent with the escaped dots in the time field.
    BGL_TIMESTAMP_PATTERN = re.compile(
        r'\d+ \d{4,4}\.\d\d\.\d\d (?P<someID>[-:\w]+) \d{4,4}-\d\d-\d\d-\d\d\.\d\d\.\d\d\.\d+ '
    )
    NULL_CHAR_PATTERN = re.compile('\x00')
    stripped = (NULL_CHAR_PATTERN.sub('', line).strip() for line in lines)
    # Could substitute with '\g<someID> ' instead, but the ID appears to be
    # duplicated when present, so one copy can be left out.
    no_timestamps = [BGL_TIMESTAMP_PATTERN.sub('', line) + '\n' for line in stripped if line]
    return no_timestamps

In [56]:
# Apply the BGL cleaner to the raw lines loaded above.
bgl_cleaned = clean_bgl_lines(bgl_lines)

In [60]:
# Write the cleaned BGL lines to
# `<three levels above the input file>/interim/BGL/no_timestamps_bgl.log`,
# creating the target folder if needed.
bgl_output_file_path = bgl_log_path.parents[2] / 'interim' / 'BGL' / 'no_timestamps_bgl.log'
bgl_output_file_path.parent.mkdir(parents=True, exist_ok=True)
with bgl_output_file_path.open(mode='w') as f:
    f.writelines(bgl_cleaned)

In [None]:
# Unbind the large BGL line lists so the kernel can reclaim their memory.
del bgl_lines, bgl_cleaned

# Creating combined dataset
The datasets loaded below are already pre-tokenized and chunked.

In [1]:
from datasets import load_from_disk
from pathlib import Path
from typing import List
import numpy as np

In [2]:
# Mapping of dataset label -> on-disk location that is handed to `load_from_disk`.
# NOTE(review): the HDFS2 entries point at `no_timestamps_cleaned/`, whereas the HDFS2
# cleaning cell in this notebook writes to `simple_cleaning/`; confirm which
# preprocessing run produced these folders.
# NOTE(review): absolute, user-specific paths; adjust before running on another machine.
used_dataset_paths = {
    'train-data-HDFS1-cv1-1-time-ordered': '/home/cernypro/dev/source/ml4logs/data/interim/HDFS1/no_timestamps_train-data-HDFS1-cv1-1-time-ordered/chunked_size_10_tokens_text',
    'HDFS2-secondarynamenode': '/home/cernypro/dev/source/ml4logs/data/interim/HDFS2/no_timestamps_cleaned/hadoop-hdfs-secondarynamenode-mesos-01/chunked_size_10_tokens_text',
    'HDFS2-namenode': '/home/cernypro/dev/source/ml4logs/data/interim/HDFS2/no_timestamps_cleaned/hadoop-hdfs-namenode-mesos-01/chunked_size_10_tokens_text',
    'HDFS2-datanode-01': '/home/cernypro/dev/source/ml4logs/data/interim/HDFS2/no_timestamps_cleaned/hadoop-hdfs-datanode-mesos-01/chunked_size_10_tokens_text',
    'HDFS2-datanode-13': '/home/cernypro/dev/source/ml4logs/data/interim/HDFS2/no_timestamps_cleaned/hadoop-hdfs-datanode-mesos-13/chunked_size_10_tokens_text',
    'Spark': '/home/cernypro/dev/source/ml4logs/data/interim/Spark/no_timestamps_spark/chunked_size_10_tokens_text',
    'Zookeeper': '/home/cernypro/dev/source/ml4logs/data/interim/Zookeeper/no_timestamps_zookeeper/chunked_size_10_tokens_text',
    'BGL': '/home/cernypro/dev/source/ml4logs/data/interim/BGL/no_timestamps_bgl/chunked_size_10_tokens_text',
    'Hadoop': '/home/cernypro/dev/source/ml4logs/data/interim/Hadoop/no_timestamps_hadoop/chunked_size_10_tokens_text',
}

In [3]:
# Load every dataset listed in `used_dataset_paths`, keyed by the same label.
datasets = {name: load_from_disk(path) for name, path in used_dataset_paths.items()}

In [10]:
# Display the loaded datasets (the repr lists each one's features and row count).
datasets

{'train-data-HDFS1-cv1-1-time-ordered': Dataset({
     features: ['chunk_text', 'chunk_tokens'],
     num_rows: 905102
 }),
 'HDFS2-secondarynamenode': Dataset({
     features: ['chunk_text', 'chunk_tokens'],
     num_rows: 71048
 }),
 'HDFS2-namenode': Dataset({
     features: ['chunk_text', 'chunk_tokens'],
     num_rows: 1689280
 }),
 'HDFS2-datanode-01': Dataset({
     features: ['chunk_text', 'chunk_tokens'],
     num_rows: 261472
 }),
 'HDFS2-datanode-13': Dataset({
     features: ['chunk_text', 'chunk_tokens'],
     num_rows: 196808
 }),
 'Spark': Dataset({
     features: ['chunk_text', 'chunk_tokens'],
     num_rows: 3323648
 }),
 'Zookeeper': Dataset({
     features: ['chunk_text', 'chunk_tokens'],
     num_rows: 7432
 }),
 'BGL': Dataset({
     features: ['chunk_text', 'chunk_tokens'],
     num_rows: 474792
 }),
 'Hadoop': Dataset({
     features: ['chunk_text', 'chunk_tokens'],
     num_rows: 39336
 })}

In [11]:
# Number of rows in each loaded dataset, keyed by dataset label.
ds_lengths = {name: len(ds) for name, ds in datasets.items()}
ds_lengths

{'train-data-HDFS1-cv1-1-time-ordered': 905102,
 'HDFS2-secondarynamenode': 71048,
 'HDFS2-namenode': 1689280,
 'HDFS2-datanode-01': 261472,
 'HDFS2-datanode-13': 196808,
 'Spark': 3323648,
 'Zookeeper': 7432,
 'BGL': 474792,
 'Hadoop': 39336}

In [13]:
# Requested number of rows to draw from each dataset: 240000 for Spark, 60000 for
# every other source. Where a dataset has fewer rows than requested, the split
# helper below takes all rows that exist.
desired_contexts_to_take = {
    'train-data-HDFS1-cv1-1-time-ordered': 60000,
    'HDFS2-secondarynamenode': 60000,
    'HDFS2-namenode': 60000,
    'HDFS2-datanode-01': 60000,
    'HDFS2-datanode-13': 60000,
    'Spark': 240000,
    'Zookeeper': 60000,
    'BGL': 60000,
    'Hadoop': 60000,
}

In [14]:
# Cap each requested count by the number of rows actually available.
# NOTE(review): no later cell visible here reads this dict; `split_test_train_helper`
# recomputes the same min() itself.
actual_contexts_to_take = {name: min(ds_lengths[name], desired_contexts_to_take[name]) for name in desired_contexts_to_take}

In [21]:
# Scratch check of the ceil-then-cast-to-int idiom used in the split helper below (evaluates to 6).
np.ceil(5.2).astype(int)

6

In [29]:
def split_test_train_helper(dataset, name, desired_total_taken, val_ratio, rnd: np.random.Generator):
    """Draw a train/validation split of bounded total size from ``dataset``.

    At most ``desired_total_taken`` rows are used (fewer if the dataset is
    smaller). The validation part gets ``ceil(total * val_ratio)`` rows and
    the training part gets the rest. The chosen sizes are printed, prefixed
    with ``name``.

    Args:
        dataset: Object supporting ``len()`` and a ``train_test_split``
            method accepting ``test_size``, ``train_size``, ``generator``
            and ``writer_batch_size`` keyword arguments.
        name: Label used only in the printed summary.
        desired_total_taken: Upper bound on train + validation rows.
        val_ratio: Fraction of the taken rows assigned to validation.
        rnd: Random generator forwarded to ``train_test_split``.

    Returns:
        Whatever ``dataset.train_test_split`` returns.

    Raises:
        ValueError: If the resulting train or validation size would not be
            positive (empty dataset, non-positive request, or a
            ``val_ratio`` outside the open interval (0, 1)).
    """
    total_to_take = min(len(dataset), desired_total_taken)
    val_size = int(np.ceil(total_to_take * val_ratio))
    train_size = total_to_take - val_size
    # train_size + val_size == total_to_take holds by construction; what can
    # actually go wrong is one of the parts being empty or negative.
    if train_size <= 0 or val_size <= 0:
        raise ValueError(
            f'{name}: cannot split {total_to_take} rows with val_ratio={val_ratio} '
            f'(train: {train_size}, val: {val_size}); both parts must be non-empty'
        )
    print(f'{name} - Train: {train_size}, val: {val_size}')
    return dataset.train_test_split(test_size=val_size, train_size=train_size, generator=rnd, writer_batch_size=10000)

In [30]:
# Seed for the random generator used by the splitting and sampling cells below.
SEED = 43
# Single generator instance shared by every later cell that needs randomness.
rnd = np.random.default_rng(seed=SEED)
# Fraction of the taken rows that goes into the validation split.
VAL_RATIO = 0.05

In [31]:
# Split every dataset into train/validation parts, keyed by dataset label.
# The same generator `rnd` is passed to each call, so the splits presumably depend
# on the iteration order of `datasets`; confirm if exact reproducibility matters.
train_val_splits_datasets = {name: split_test_train_helper(dataset, name, desired_contexts_to_take[name], VAL_RATIO, rnd) for name, dataset in datasets.items()}

train-data-HDFS1-cv1-1-time-ordered - Train: 57000, val: 3000
HDFS2-secondarynamenode - Train: 57000, val: 3000
HDFS2-namenode - Train: 57000, val: 3000
HDFS2-datanode-01 - Train: 57000, val: 3000
HDFS2-datanode-13 - Train: 57000, val: 3000
Spark - Train: 228000, val: 12000
Zookeeper - Train: 7060, val: 372
BGL - Train: 57000, val: 3000
Hadoop - Train: 37369, val: 1967


In [35]:
# Take the 'train' and 'test' parts of every split and call `flatten_indices()` on each.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so the saved run
# did not finish; `train_datasets` / `val_datasets` may not exist in that session.
train_datasets = {name: split['train'].flatten_indices() for name, split in train_val_splits_datasets.items()}
val_datasets = {name: split['test'].flatten_indices() for name, split in train_val_splits_datasets.items()}

HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))




KeyboardInterrupt: 

In [40]:
# Sanity check: the sampling config and the path config must list exactly the same datasets.
# NOTE(review): per the execution counts this ran after the split cell; it would catch
# a mismatch earlier if placed right after both dicts are defined.
assert set(desired_contexts_to_take.keys()) == set(used_dataset_paths.keys()) 

In [41]:
# Load the HDFS2 secondarynamenode dataset again under the name `tst` (presumably for ad-hoc experiments).
tst = load_from_disk('/home/cernypro/dev/source/ml4logs/data/interim/HDFS2/no_timestamps_cleaned/hadoop-hdfs-secondarynamenode-mesos-01/chunked_size_10_tokens_text')

In [54]:
# Draw 100 distinct row positions from `tst` (sampling without replacement) using the shared generator `rnd`.
indices = rnd.choice(len(tst), 100, replace=False)

In [57]:
# Show the last 20 of the 100 sampled positions.
indices[80:]

array([25916, 19006, 39723, 21737, 31075,  3040, 39801, 21281,  7718,
       30934, 60144, 31883, 67125, 27506, 16985, 70681, 63906, 26383,
       12290, 30328])