In [2]:
import pandas as pd

# --- Input files -------------------------------------------------------------
STRUCT_PATH = "Spark_full_log_structured.csv"    # structured log lines, read with pd.read_csv
TEMPLATE_PATH = "Spark_full_log_templates.csv"   # event-template table, read with pd.read_csv

# --- Output file -------------------------------------------------------------
OUTPUT_PATH = "Spark_logs_enriched.csv"          # destination of the joined frame


In [3]:
# Load both CSV inputs into memory.
df_struct = pd.read_csv(STRUCT_PATH)
df_temp = pd.read_csv(TEMPLATE_PATH)

# Report (rows, columns) for each frame, one line per frame.
for label, frame in (("Structured logs:", df_struct), ("Templates:", df_temp)):
    print(label, frame.shape)

# Preview the first rows of the structured log.
df_struct.head(5)


Structured logs: (16075117, 4)
Templates: (236, 3)


Unnamed: 0,LineId,Content,EventId,EventTemplate
0,1,"Registered signal handlers for [TERM, HUP, INT]",E164,Registered signal handlers for <*>
1,2,"Changing view acls to: yarn,curi",E200,"Changing view acls to: <*>,<*>"
2,3,"Changing modify acls to: yarn,curi",E191,"Changing modify acls to: <*>,<*>"
3,4,SecurityManager: authentication disabled; ui a...,E10,SecurityManager: authentication disabled; ui a...
4,5,"Changing view acls to: yarn,curi",E200,"Changing view acls to: <*>,<*>"


In [4]:
# List the column names of both frames to see which ones they share.
print("STRUCTURED columns:", list(df_struct.columns))
print("TEMPLATE columns:", list(df_temp.columns))


STRUCTURED columns: ['LineId', 'Content', 'EventId', 'EventTemplate']
TEMPLATE columns: ['EventId', 'EventTemplate', 'Occurrences']


In [5]:
# Remove EventTemplate from the template table before joining, so the join does
# not bring in a second copy of a column the structured log already carries.
# errors="ignore" makes this a no-op when the column is absent.
df_temp_clean = df_temp.drop(labels=["EventTemplate"], axis="columns", errors="ignore")


In [6]:
# Enrich every structured log line with the remaining template-table columns,
# matched on EventId.
#   how="left"              keeps every log row, even if its EventId has no
#                           match in the template table (joined columns are
#                           then NaN for that row).
#   validate="many_to_one"  makes pandas raise MergeError when EventId is not
#                           unique in the template table, instead of silently
#                           duplicating log rows in the result.
df_joined = df_struct.merge(
    df_temp_clean,
    on="EventId",
    how="left",
    validate="many_to_one",
)


In [7]:
# Show the (rows, columns) of the joined frame, then preview its first rows.
joined_shape = df_joined.shape
print(joined_shape)
df_joined.head(5)


(16075117, 5)


Unnamed: 0,LineId,Content,EventId,EventTemplate,Occurrences
0,1,"Registered signal handlers for [TERM, HUP, INT]",E164,Registered signal handlers for <*>,2606
1,2,"Changing view acls to: yarn,curi",E200,"Changing view acls to: <*>,<*>",4963
2,3,"Changing modify acls to: yarn,curi",E191,"Changing modify acls to: <*>,<*>",4963
3,4,SecurityManager: authentication disabled; ui a...,E10,SecurityManager: authentication disabled; ui a...,4963
4,5,"Changing view acls to: yarn,curi",E200,"Changing view acls to: <*>,<*>",4963


In [8]:
# Write the joined frame to disk; index=False leaves the pandas row index out
# of the file.
df_joined.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print(f"Saved enriched log file → {OUTPUT_PATH}")


Saved enriched log file → Spark_logs_enriched.csv


In [10]:
# Keywords that mark a log line as error-like.
patterns = ["ERROR", "FATAL", "EXCEPTION", "WARN"]

# Combine the keywords into one alternation regex and flag every row whose
# Content contains any of them. Matching is case-insensitive and on substrings;
# rows with missing Content count as non-matching (na=False).
keyword_regex = '|'.join(patterns)
is_error_like = df_joined["Content"].str.contains(keyword_regex, case=False, na=False)

error_like_df = df_joined.loc[is_error_like]
error_like_df.head(5)


Unnamed: 0,LineId,Content,EventId,EventTemplate,Occurrences
1047,1048,Exception in connection from mesos-slave-20/10...,E170,Exception in connection from <*>,190
1052,1053,Exception while beginning fetch of 1 outstandi...,E41,Exception while beginning fetch of <*> outstan...,2228
1055,1056,Exception while beginning fetch of 1 outstandi...,E41,Exception while beginning fetch of <*> outstan...,2228
1058,1059,Exception while beginning fetch of 1 outstandi...,E41,Exception while beginning fetch of <*> outstan...,2228
7038,7039,Exception while beginning fetch of 1 outstandi...,E41,Exception while beginning fetch of <*> outstan...,2228
