# BlueGene/L Supercomputer Log Analysis
## Comprehensive System Performance Evaluation
**Author:** Nipun Bakshi    
**Dataset:** BGL.log (1.9M+ entries)

## 1. Environment Setup & Configuration

In [14]:
# ------ SPARK CONFIGURATION ------
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start the Spark session with explicit executor/driver memory settings.
# getOrCreate() hands back an already-running session if the kernel has one,
# so re-running this cell does not open a second session.
spark = (
    SparkSession.builder
    .appName("BGL Log Analysis")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# File path configuration
LOG_FILE_PATH = "BGL.log"  # input log file
# Keep the trailing slash: output paths are built as f'{OUTPUT_DIR}<name>'.
OUTPUT_DIR = "analysis_results/"

# plt.savefig() does not create missing directories, so make sure the output
# directory exists before any figure is written into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## 2. Data Loading & Initial Processing

In [16]:
# ------ LOG PARSING CONFIGURATION ------
# Regex describing one log line. It is passed to Spark's regexp_extract, which
# compiles it with java.util.regex, so only positional groups are used (Java
# rejects Python-style (?P<name>...) groups with "Unknown inline modifier").
#
# Every fragment is a raw string, so a regex escape takes exactly ONE
# backslash: r"\s" is the whitespace class, whereas r"\\s" would match a
# literal backslash followed by the letter "s" and no log line would match.
#
# NOTE(review): group 1 accepts only a literal "-" and group 4 only word
# characters and "-"; lines with any other first field, or node ids containing
# other characters (e.g. ":"), will not match. Published BGL samples also seem
# to repeat the node id after the datetime, which would shift groups 6-9 by
# one field -- confirm against BGL.log.
LOG_PATTERN = (
    r"^(-)\s+"  # 1: alert_flag
    r"(\d+)\s+"  # 2: timestamp
    r"(\d{4}\.\d{2}\.\d{2})\s+"  # 3: log_date
    r"([\w-]+)\s+"  # 4: node_id
    r"(\d{4}-\d{2}-\d{2}-\d{2}\.\d{2}\.\d{2}\.\d{6})\s+"  # 5: datetime
    r"(\S+)\s+"  # 6: message_type
    r"(\S+)\s+"  # 7: system_component
    r"(\S+)\s+"  # 8: severity
    r"(.*)$"  # 9: message_content
)

# Load the log as a one-column DataFrame. spark.read.text names that column
# 'value', which is the column every regexp_extract call below reads.
raw_logs = spark.read.text(LOG_FILE_PATH)

# Split each line into named columns. The integer passed to regexp_extract is
# the 1-based position of the capture group inside LOG_PATTERN; a line that
# does not match the pattern yields an empty string for the extracted group.
parsed_logs = raw_logs.select(
    regexp_extract('value', LOG_PATTERN, 1).alias('alert_flag'),
    regexp_extract('value', LOG_PATTERN, 2).cast('long').alias('timestamp'),
    to_date(regexp_extract('value', LOG_PATTERN, 3), 'yyyy.MM.dd').alias('log_date'),
    regexp_extract('value', LOG_PATTERN, 4).alias('node_id'),
    regexp_extract('value', LOG_PATTERN, 5).alias('full_datetime'),
    regexp_extract('value', LOG_PATTERN, 6).alias('message_type'),
    regexp_extract('value', LOG_PATTERN, 7).alias('system_component'),
    regexp_extract('value', LOG_PATTERN, 8).alias('severity'),
    regexp_extract('value', LOG_PATTERN, 9).alias('message_content')
)

# The analysis cells read the parsed data under the name processed_logs and
# release it with processed_logs.unpersist() at the end of the notebook, so
# define it here as the cached parsed frame instead of relying on kernel state.
processed_logs = parsed_logs.cache()

## 3. Core Analytical Modules

In [17]:
# ------ ERROR PATTERN ANALYSIS ------
def analyze_error_patterns(df):
    """Rank the message texts of ERROR rows by frequency.

    Keeps the rows whose ``severity`` column equals ``'ERROR'`` exactly,
    counts rows per distinct ``message_content`` and returns those counts
    (column ``count``) sorted from most to least frequent.
    """
    error_rows = df.where(col('severity') == 'ERROR')
    occurrences = error_rows.groupBy('message_content').count()
    return occurrences.orderBy(desc('count'))

# ------ TEMPORAL ANALYSIS ------
def analyze_temporal_patterns(df):
    """Count events per ``log_date`` and attach a trailing moving average.

    The ``7_day_avg`` column averages the current row's ``count`` with the six
    rows before it in ``log_date`` order. The frame is row-based: it spans
    seven rows of the daily table, which equals seven calendar days only when
    no date is missing. The window has no partitioning, so it runs over the
    whole daily table as a single ordered sequence.
    """
    trailing_seven_rows = Window.orderBy('log_date').rowsBetween(-6, 0)
    daily_counts = df.groupBy('log_date').count()
    moving_average = avg('count').over(trailing_seven_rows)
    return daily_counts.withColumn('7_day_avg', moving_average)

# ------ NODE PERFORMANCE ANALYSIS ------
def analyze_node_performance(df):
    """Rank nodes by how many of their messages mention 'critical'.

    A row qualifies when ``message_content`` contains the literal substring
    ``'critical'`` (lower case, exactly as written). Qualifying rows are
    counted per ``node_id`` and returned sorted by ``count``, highest first.
    """
    mentions_critical = col('message_content').contains('critical')
    return (
        df.filter(mentions_critical)
        .groupBy('node_id')
        .count()
        .orderBy(desc('count'))
    )

# Run every analysis against the same processed_logs frame. The three calls
# are independent of one another; each only builds a result DataFrame.
node_analysis = analyze_node_performance(processed_logs)
temporal_analysis = analyze_temporal_patterns(processed_logs)
error_analysis = analyze_error_patterns(processed_logs)

## 4. Visualization & Reporting

In [18]:
# ------ VISUALIZATION CONFIGURATION ------
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Global look: matplotlib style sheet first, then the seaborn theme on top.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_theme(style="whitegrid", palette="husl")

# Collect the daily table into pandas and make log_date a datetime column
# so matplotlib treats the x axis as dates.
temporal_pd = temporal_analysis.toPandas()
temporal_pd = temporal_pd.assign(log_date=pd.to_datetime(temporal_pd['log_date']))

# Daily counts with the moving average drawn over them.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(temporal_pd['log_date'], temporal_pd['count'], label='Daily Events')
ax.plot(
    temporal_pd['log_date'],
    temporal_pd['7_day_avg'],
    label='7-Day Moving Avg',
    linewidth=2.5,
    color='darkred',
)
ax.set_title('System Event Distribution with Trend Analysis', pad=20)
ax.set_xlabel('Observation Date', labelpad=15)
ax.set_ylabel('Event Count', labelpad=15)
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
fig.savefig(f'{OUTPUT_DIR}temporal_analysis.png')
plt.show()

Py4JJavaError: An error occurred while calling o362.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 2.0 failed 1 times, most recent failure: Lost task 9.0 in stage 2.0 (TID 41) (host.docker.internal executor driver): org.apache.spark.SparkRuntimeException: [INVALID_PARAMETER_VALUE.PATTERN] The value of parameter(s) `regexp` in `regexp_extract` is invalid: '^(?P<alert_flag>-)\\s+(?P<timestamp>\\d+)\\s+(?P<log_date>\\d{4}\\.\\d{2}\\.\\d{2})\\s+(?P<node_id>[\\w-]+)\\s+(?P<datetime>\\d{4}-\\d{2}-\\d{2}-\\d{2}\\.\\d{2}\\.\\d{2}\\.\\d{6})\\s+(?P<message_type>\\S+)\\s+(?P<system_component>\\S+)\\s+(?P<severity>\\S+)\\s+(?P<message_content>.*)$'.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidPatternError(QueryExecutionErrors.scala:2620)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidPatternError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.hasNext(InMemoryRelation.scala:119)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder$$anon$2.hasNext(InMemoryRelation.scala:288)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:223)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1597)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1524)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1588)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
	at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:379)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.util.regex.PatternSyntaxException: Unknown inline modifier near index 3
^(?P<alert_flag>-)\s+(?P<timestamp>\d+)\s+(?P<log_date>\d{4}\.\d{2}\.\d{2})\s+(?P<node_id>[\w-]+)\s+(?P<datetime>\d{4}-\d{2}-\d{2}-\d{2}\.\d{2}\.\d{2}\.\d{6})\s+(?P<message_type>\S+)\s+(?P<system_component>\S+)\s+(?P<severity>\S+)\s+(?P<message_content>.*)$
   ^
	at java.util.regex.Pattern.error(Unknown Source)
	at java.util.regex.Pattern.group0(Unknown Source)
	at java.util.regex.Pattern.sequence(Unknown Source)
	at java.util.regex.Pattern.expr(Unknown Source)
	at java.util.regex.Pattern.compile(Unknown Source)
	at java.util.regex.Pattern.<init>(Unknown Source)
	at java.util.regex.Pattern.compile(Unknown Source)
	... 26 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkRuntimeException: [INVALID_PARAMETER_VALUE.PATTERN] The value of parameter(s) `regexp` in `regexp_extract` is invalid: '^(?P<alert_flag>-)\\s+(?P<timestamp>\\d+)\\s+(?P<log_date>\\d{4}\\.\\d{2}\\.\\d{2})\\s+(?P<node_id>[\\w-]+)\\s+(?P<datetime>\\d{4}-\\d{2}-\\d{2}-\\d{2}\\.\\d{2}\\.\\d{2}\\.\\d{6})\\s+(?P<message_type>\\S+)\\s+(?P<system_component>\\S+)\\s+(?P<severity>\\S+)\\s+(?P<message_content>.*)$'.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidPatternError(QueryExecutionErrors.scala:2620)
	at org.apache.spark.sql.errors.QueryExecutionErrors.invalidPatternError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.hasNext(InMemoryRelation.scala:119)
	at org.apache.spark.sql.execution.columnar.CachedRDDBuilder$$anon$2.hasNext(InMemoryRelation.scala:288)
	at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:223)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1597)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1524)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1588)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
	at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:379)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.util.regex.PatternSyntaxException: Unknown inline modifier near index 3
^(?P<alert_flag>-)\s+(?P<timestamp>\d+)\s+(?P<log_date>\d{4}\.\d{2}\.\d{2})\s+(?P<node_id>[\w-]+)\s+(?P<datetime>\d{4}-\d{2}-\d{2}-\d{2}\.\d{2}\.\d{2}\.\d{6})\s+(?P<message_type>\S+)\s+(?P<system_component>\S+)\s+(?P<severity>\S+)\s+(?P<message_content>.*)$
   ^
	at java.util.regex.Pattern.error(Unknown Source)
	at java.util.regex.Pattern.group0(Unknown Source)
	at java.util.regex.Pattern.sequence(Unknown Source)
	at java.util.regex.Pattern.expr(Unknown Source)
	at java.util.regex.Pattern.compile(Unknown Source)
	at java.util.regex.Pattern.<init>(Unknown Source)
	at java.util.regex.Pattern.compile(Unknown Source)
	... 26 more


## 5. Results & Conclusions

In [None]:
# ------ RESULTS SUMMARY ------
# Console preview of the two ranked tables (first 10 rows each).
print("\nTop 10 Error Types:")
error_analysis.show(10, truncate=False)

print("\nNode Performance Ranking:")
node_analysis.show(10)

# Write each result frame as CSV under OUTPUT_DIR, replacing any output left
# by an earlier run.
error_analysis.write.csv(f'{OUTPUT_DIR}error_analysis', mode='overwrite')
temporal_analysis.write.csv(f'{OUTPUT_DIR}temporal_analysis', mode='overwrite')
node_analysis.write.csv(f'{OUTPUT_DIR}node_analysis', mode='overwrite')

# Drop the persisted input frame, then shut the Spark session down.
processed_logs.unpersist()
spark.stop()
print("\nAnalysis completed successfully. Resources released.")