In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import os

# Obtain the Spark session used by the rest of this analysis, registered
# under the application name "Log Analysis".
spark = (
    SparkSession.builder
    .appName("Log Analysis")
    .getOrCreate()
)

# Sample application log used as input for the analysis. Every line has the
# shape "<date> <time> <LEVEL> <message>"; the backslash right after the
# opening quotes keeps the text from starting with an empty line.
log_data = """\
2024-12-18 10:15:32 INFO User logged in
2024-12-18 10:16:02 ERROR Page not found
2024-12-18 10:17:20 INFO Data uploaded successfully
2024-12-18 10:18:45 ERROR Database connection failed
2024-12-18 10:19:10 INFO User logged out
2024-12-18 10:20:00 ERROR File not found
2024-12-18 10:21:30 INFO User profile updated
"""

# Persist the sample to disk so it can be loaded from a path. The encoding is
# pinned to UTF-8 so the bytes written do not depend on the platform's default
# locale encoding.
log_file_path = 'logfile.log'
with open(log_file_path, 'w', encoding='utf-8') as f:
    f.write(log_data)

# Load the log file as plain text: one row per line, held in a single string
# column named "value". Print it without truncating long lines.
log_df = spark.read.format('text').load(log_file_path)
log_df.show(truncate=False)

# Each log line looks like "<yyyy-MM-dd HH:mm:ss> <LEVEL> <message>":
#   group 1 -> timestamp, group 2 -> log level, group 3 -> rest of the line.
pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+) (.*)'

# regexp_extract() yields an empty string when the pattern does not match, so
# blank or malformed lines would otherwise show up as rows of empty strings
# (and as a bogus '' log level once the rows are grouped). Keep only the lines
# the pattern actually matches before splitting them into columns.
extracted_df = log_df.filter(F.col('value').rlike(pattern)).select(
    F.regexp_extract('value', pattern, 1).alias('Timestamp'),
    F.regexp_extract('value', pattern, 2).alias('LogLevel'),
    F.regexp_extract('value', pattern, 3).alias('Message'),
)

extracted_df.show(truncate=False)

# Count how many parsed log entries exist for each log level.
entries_by_level = extracted_df.groupBy(F.col('LogLevel'))
log_level_counts = entries_by_level.count()
log_level_counts.show()

# Keep only the entries whose log level is exactly 'ERROR' and print them in
# full.
error_logs = extracted_df.where(F.col('LogLevel') == 'ERROR')
error_logs.show(truncate=False)

# Number of log entries per hour of day: parse the textual Timestamp column,
# take its hour component, then count rows per hour in ascending hour order.
event_time = F.to_timestamp("Timestamp", "yyyy-MM-dd HH:mm:ss")
logs_by_hour = (
    extracted_df
    .withColumn("Hour", F.hour(event_time))
    .groupBy("Hour")
    .count()
    .orderBy("Hour")
)

logs_by_hour.show()

+----------------------------------------------------+
|value                                               |
+----------------------------------------------------+
|2024-12-18 10:15:32 INFO User logged in             |
|2024-12-18 10:16:02 ERROR Page not found            |
|2024-12-18 10:17:20 INFO Data uploaded successfully |
|2024-12-18 10:18:45 ERROR Database connection failed|
|2024-12-18 10:19:10 INFO User logged out            |
|2024-12-18 10:20:00 ERROR File not found            |
|2024-12-18 10:21:30 INFO User profile updated       |
+----------------------------------------------------+

+-------------------+--------+--------------------------+
|Timestamp          |LogLevel|Message                   |
+-------------------+--------+--------------------------+
|2024-12-18 10:15:32|INFO    |User logged in            |
|2024-12-18 10:16:02|ERROR   |Page not found            |
|2024-12-18 10:17:20|INFO    |Data uploaded successfully|
|2024-12-18 10:18:45|ERROR   |Database connect